/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

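/* Generate the initial sequence number for an incoming connection from the
 * packet's address/port 4-tuple, using the keyed secure_seq generator so
 * that ISNs are hard to predict off path.
 */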
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

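	/* The ephemeral source port is only known after inet_hash_connect(),
	 * so redo the route lookup with the final port pair before the SYN
	 * is sent.
	 */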
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

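/* An ICMP redirect arrived for this socket: let the route's redirect
 * handler update the cached next hop.
 */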
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, f.e., if SYNs crossed,
			       or Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

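/* Fill in the TCP checksum of an outgoing skb: leave it partial when the
 * device will finish it (CHECKSUM_PARTIAL), otherwise compute it in
 * software over the pseudo-header and payload.
 */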
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

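/* ACK a segment that landed on a TIME-WAIT bucket, echoing the bucket's
 * cached window, timestamps and (if configured) MD5 key.
 */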
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);

	if (!res)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return res;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

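/* Illustrative sketch (not part of this file): userspace installs a key for
 * a peer roughly like this, assuming the uapi TCP_MD5SIG definitions from
 * <linux/tcp.h> / <netinet/tcp.h>:
 *
 *	struct sockaddr_in peer = { .sin_family = AF_INET };
 *	struct tcp_md5sig md5 = { .tcpm_keylen = strlen(secret) };
 *
 *	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which ends up in tcp_v4_parse_md5_keys() above.
 */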
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

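/* Validate (or generate) a TCP Fast Open cookie for an incoming SYN and
 * decide whether its payload may be accepted before the 3WHS completes.
 */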
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validate the cookie, in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}
	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
				   TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
	}
	return false;
}

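/* Complete a validated Fast Open request: create the child socket, send the
 * SYN-ACK, queue any data carried in the SYN and put the child straight on
 * the listener's accept queue.
 */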
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
				    ireq->rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listener sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_buffer_space(child);
	tcp_init_metrics(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}

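/* Handle an incoming SYN on a listening socket: allocate a request sock,
 * apply syncookie/tw-recycle/backlog policy and send the SYN-ACK, taking
 * the Fast Open path when a valid cookie and data are present.
 */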
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sk_buff *skb_synack;
	int do_fastopen;

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb, sock_net(sk));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (dst == NULL) {
		dst = inet_csk_route_req(sk, &fl4, req);
		if (dst == NULL)
			goto drop_and_free;
	}
	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

	/* We don't call tcp_v4_send_synack() directly because we need
	 * to make sure a child socket can be created successfully before
	 * sending back synack!
	 *
	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
	 * (or better yet, call tcp_send_synack() in the child context
	 * directly, but will have to fix bunch of other code first)
	 * after syn_recv_sock() except one will need to first fix the
	 * latter to remove its dependency on the current implementation
	 * of tcp_v4_send_synack()->tcp_select_initial_window().
	 */
	skb_synack = tcp_make_synack(sk, dst, req,
	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

	if (skb_synack) {
		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
	} else
		goto drop_and_free;

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
					    ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
		goto drop_and_free;

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


1613/*
1614 * The three way handshake has completed - we got a valid synack -
1615 * now create the new socket.
1616 */
1617struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
60236fdd 1618 struct request_sock *req,
1da177e4
LT
1619 struct dst_entry *dst)
1620{
2e6599cb 1621 struct inet_request_sock *ireq;
1da177e4
LT
1622 struct inet_sock *newinet;
1623 struct tcp_sock *newtp;
1624 struct sock *newsk;
cfb6eeb4
YH
1625#ifdef CONFIG_TCP_MD5SIG
1626 struct tcp_md5sig_key *key;
1627#endif
f6d8bd05 1628 struct ip_options_rcu *inet_opt;
1da177e4
LT
1629
1630 if (sk_acceptq_is_full(sk))
1631 goto exit_overflow;
1632
1da177e4
LT
1633 newsk = tcp_create_openreq_child(sk, req, skb);
1634 if (!newsk)
093d2823 1635 goto exit_nonewsk;
1da177e4 1636
bcd76111 1637 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1638 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1639
1640 newtp = tcp_sk(newsk);
1641 newinet = inet_sk(newsk);
2e6599cb 1642 ireq = inet_rsk(req);
c720c7e8
ED
1643 newinet->inet_daddr = ireq->rmt_addr;
1644 newinet->inet_rcv_saddr = ireq->loc_addr;
1645 newinet->inet_saddr = ireq->loc_addr;
f6d8bd05
ED
1646 inet_opt = ireq->opt;
1647 rcu_assign_pointer(newinet->inet_opt, inet_opt);
2e6599cb 1648 ireq->opt = NULL;
463c84b9 1649 newinet->mc_index = inet_iif(skb);
eddc9ec5 1650 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1651 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1652 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1653 if (inet_opt)
1654 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
c720c7e8 1655 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1656
dfd25fff
ED
1657 if (!dst) {
1658 dst = inet_csk_route_child_sock(sk, newsk, req);
1659 if (!dst)
1660 goto put_and_exit;
1661 } else {
1662 /* syncookie case: see end of cookie_v4_check() */
1663 }
0e734419
DM
1664 sk_setup_caps(newsk, dst);
1665
5d424d5a 1666 tcp_mtup_init(newsk);
1da177e4 1667 tcp_sync_mss(newsk, dst_mtu(dst));
0dbaee3b 1668 newtp->advmss = dst_metric_advmss(dst);
f5fff5dc
TQ
1669 if (tcp_sk(sk)->rx_opt.user_mss &&
1670 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1671 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1672
1da177e4
LT
1673 tcp_initialize_rcv_mss(newsk);
1674
cfb6eeb4
YH
1675#ifdef CONFIG_TCP_MD5SIG
1676 /* Copy over the MD5 key from the original socket */
a915da9b
ED
1677 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1678 AF_INET);
c720c7e8 1679 if (key != NULL) {
cfb6eeb4
YH
1680 /*
1681 * We're using one, so create a matching key
1682 * on the newsk structure. If we fail to get
1683 * memory, then we end up not copying the key
1684 * across. Shucks.
1685 */
a915da9b
ED
1686 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1687 AF_INET, key->key, key->keylen, GFP_ATOMIC);
a465419b 1688 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1689 }
1690#endif
1691
0e734419
DM
1692 if (__inet_inherit_port(sk, newsk) < 0)
1693 goto put_and_exit;
9327f705 1694 __inet_hash_nolisten(newsk, NULL);
1da177e4
LT
1695
1696 return newsk;
1697
1698exit_overflow:
de0744af 1699 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1700exit_nonewsk:
1701 dst_release(dst);
1da177e4 1702exit:
de0744af 1703 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4 1704 return NULL;
0e734419 1705put_and_exit:
e337e24d
CP
1706 inet_csk_prepare_forced_close(newsk);
1707 tcp_done(newsk);
0e734419 1708 goto exit;
1da177e4 1709}
4bc2f18b 1710EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4
LT
1711
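/* Handle a segment arriving on a listening socket: first look for a
 * matching request_sock in the SYN queue, then for an already
 * established socket, and finally fall back to syncookie validation
 * for a bare ACK when syncookies are enabled.
 */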
1712static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1713{
aa8223c7 1714 struct tcphdr *th = tcp_hdr(skb);
eddc9ec5 1715 const struct iphdr *iph = ip_hdr(skb);
1da177e4 1716 struct sock *nsk;
60236fdd 1717 struct request_sock **prev;
1da177e4 1718 /* Find possible connection requests. */
463c84b9
ACM
1719 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1720 iph->saddr, iph->daddr);
1da177e4 1721 if (req)
8336886f 1722 return tcp_check_req(sk, skb, req, prev, false);
1da177e4 1723
3b1e0a65 1724 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
c67499c0 1725 th->source, iph->daddr, th->dest, inet_iif(skb));
1da177e4
LT
1726
1727 if (nsk) {
1728 if (nsk->sk_state != TCP_TIME_WAIT) {
1729 bh_lock_sock(nsk);
1730 return nsk;
1731 }
9469c7b4 1732 inet_twsk_put(inet_twsk(nsk));
1da177e4
LT
1733 return NULL;
1734 }
1735
1736#ifdef CONFIG_SYN_COOKIES
af9b4738 1737 if (!th->syn)
1da177e4
LT
1738 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1739#endif
1740 return sk;
1741}
1742
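/* Validate the TCP checksum on receive: trust CHECKSUM_COMPLETE from the
 * device if it verifies against the pseudo-header; otherwise store the
 * pseudo-header sum and check short packets immediately, deferring full
 * verification for larger ones until the data is copied.
 */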
b51655b9 1743static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1da177e4 1744{
eddc9ec5
ACM
1745 const struct iphdr *iph = ip_hdr(skb);
1746
84fa7933 1747 if (skb->ip_summed == CHECKSUM_COMPLETE) {
eddc9ec5
ACM
1748 if (!tcp_v4_check(skb->len, iph->saddr,
1749 iph->daddr, skb->csum)) {
fb286bb2 1750 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4 1751 return 0;
fb286bb2 1752 }
1da177e4 1753 }
fb286bb2 1754
eddc9ec5 1755 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
fb286bb2
HX
1756 skb->len, IPPROTO_TCP, 0);
1757
1da177e4 1758 if (skb->len <= 76) {
fb286bb2 1759 return __skb_checksum_complete(skb);
1da177e4
LT
1760 }
1761 return 0;
1762}
1763
1764
1765/* The socket must have its spinlock held when we get
1766 * here.
1767 *
1768 * We have a potential double-lock case here, so even when
1769 * doing backlog processing we use the BH locking scheme.
1770 * This is because we cannot sleep with the original spinlock
1771 * held.
1772 */
1773int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1774{
cfb6eeb4
YH
1775 struct sock *rsk;
1776#ifdef CONFIG_TCP_MD5SIG
1777 /*
1778 * We really want to reject the packet as early as possible
1779 * if:
1780 * o We're expecting an MD5'd packet and there is no MD5 TCP option
1781 * o There is an MD5 option and we're not expecting one
1782 */
7174259e 1783 if (tcp_v4_inbound_md5_hash(sk, skb))
cfb6eeb4
YH
1784 goto discard;
1785#endif
1786
1da177e4 1787 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1788 struct dst_entry *dst = sk->sk_rx_dst;
1789
bdeab991 1790 sock_rps_save_rxhash(sk, skb);
404e0a8b 1791 if (dst) {
505fbcf0
ED
1792 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1793 dst->ops->check(dst, 0) == NULL) {
92101b3b
DM
1794 dst_release(dst);
1795 sk->sk_rx_dst = NULL;
1796 }
1797 }
aa8223c7 1798 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1799 rsk = sk;
1da177e4 1800 goto reset;
cfb6eeb4 1801 }
1da177e4
LT
1802 return 0;
1803 }
1804
ab6a5bb6 1805 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1da177e4
LT
1806 goto csum_err;
1807
1808 if (sk->sk_state == TCP_LISTEN) {
1809 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1810 if (!nsk)
1811 goto discard;
1812
1813 if (nsk != sk) {
bdeab991 1814 sock_rps_save_rxhash(nsk, skb);
cfb6eeb4
YH
1815 if (tcp_child_process(sk, nsk, skb)) {
1816 rsk = nsk;
1da177e4 1817 goto reset;
cfb6eeb4 1818 }
1da177e4
LT
1819 return 0;
1820 }
ca55158c 1821 } else
bdeab991 1822 sock_rps_save_rxhash(sk, skb);
ca55158c 1823
aa8223c7 1824 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1825 rsk = sk;
1da177e4 1826 goto reset;
cfb6eeb4 1827 }
1da177e4
LT
1828 return 0;
1829
1830reset:
cfb6eeb4 1831 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1832discard:
1833 kfree_skb(skb);
1834 /* Be careful here. If this function gets more complicated and
1835 * gcc suffers from register pressure on the x86, sk (in %ebx)
1836 * might be destroyed here. This current version compiles correctly,
1837 * but you have been warned.
1838 */
1839 return 0;
1840
1841csum_err:
6a5dc9e5 1842 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
63231bdd 1843 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1844 goto discard;
1845}
4bc2f18b 1846EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1847
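/* Early demux: called before the routing decision, so a hit on an
 * established socket lets the cached sk->sk_rx_dst be reused and a full
 * route lookup be skipped for this packet.
 */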
160eb5a6 1848void tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1849{
41063e9d
DM
1850 const struct iphdr *iph;
1851 const struct tcphdr *th;
1852 struct sock *sk;
41063e9d 1853
41063e9d 1854 if (skb->pkt_type != PACKET_HOST)
160eb5a6 1855 return;
41063e9d 1856
45f00f99 1857 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
160eb5a6 1858 return;
41063e9d
DM
1859
1860 iph = ip_hdr(skb);
45f00f99 1861 th = tcp_hdr(skb);
41063e9d
DM
1862
1863 if (th->doff < sizeof(struct tcphdr) / 4)
160eb5a6 1864 return;
41063e9d 1865
45f00f99 1866 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1867 iph->saddr, th->source,
7011d085 1868 iph->daddr, ntohs(th->dest),
9cb429d6 1869 skb->skb_iif);
41063e9d
DM
1870 if (sk) {
1871 skb->sk = sk;
1872 skb->destructor = sock_edemux;
1873 if (sk->sk_state != TCP_TIME_WAIT) {
1874 struct dst_entry *dst = sk->sk_rx_dst;
505fbcf0 1875
41063e9d
DM
1876 if (dst)
1877 dst = dst_check(dst, 0);
92101b3b 1878 if (dst &&
505fbcf0 1879 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1880 skb_dst_set_noref(skb, dst);
41063e9d
DM
1881 }
1882 }
41063e9d
DM
1883}
1884
b2fb4f54
ED
1885/* Packet is added to VJ-style prequeue for processing in process
1886 * context, if a reader task is waiting. Apparently, this exciting
1887 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1888 * failed somewhere. Latency? Burstiness? Well, at least now we will
1889 * see why it failed. 8)8) --ANK
1890 *
1891 */
1892bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1893{
1894 struct tcp_sock *tp = tcp_sk(sk);
1895
1896 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1897 return false;
1898
1899 if (skb->len <= tcp_hdrlen(skb) &&
1900 skb_queue_len(&tp->ucopy.prequeue) == 0)
1901 return false;
1902
58717686 1903 skb_dst_force(skb);
b2fb4f54
ED
1904 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1905 tp->ucopy.memory += skb->truesize;
1906 if (tp->ucopy.memory > sk->sk_rcvbuf) {
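		/* The prequeue has outgrown the receive buffer: drain it
		 * inline through the normal receive path and account each
		 * segment as a prequeue drop.
		 */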
1907 struct sk_buff *skb1;
1908
1909 BUG_ON(sock_owned_by_user(sk));
1910
1911 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1912 sk_backlog_rcv(sk, skb1);
1913 NET_INC_STATS_BH(sock_net(sk),
1914 LINUX_MIB_TCPPREQUEUEDROPPED);
1915 }
1916
1917 tp->ucopy.memory = 0;
1918 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
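		/* First segment on the prequeue: wake any waiting reader and,
		 * if no ACK is already scheduled, arm a delayed-ACK timer so
		 * the sender is not stalled if the reader is slow.
		 */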
1919 wake_up_interruptible_sync_poll(sk_sleep(sk),
1920 POLLIN | POLLRDNORM | POLLRDBAND);
1921 if (!inet_csk_ack_scheduled(sk))
1922 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1923 (3 * tcp_rto_min(sk)) / 4,
1924 TCP_RTO_MAX);
1925 }
1926 return true;
1927}
1928EXPORT_SYMBOL(tcp_prequeue);
1929
1da177e4
LT
1930/*
1931 * From tcp_input.c
1932 */
1933
1934int tcp_v4_rcv(struct sk_buff *skb)
1935{
eddc9ec5 1936 const struct iphdr *iph;
cf533ea5 1937 const struct tcphdr *th;
1da177e4
LT
1938 struct sock *sk;
1939 int ret;
a86b1e30 1940 struct net *net = dev_net(skb->dev);
1da177e4
LT
1941
1942 if (skb->pkt_type != PACKET_HOST)
1943 goto discard_it;
1944
1945 /* Count it even if it's bad */
63231bdd 1946 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1947
1948 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1949 goto discard_it;
1950
aa8223c7 1951 th = tcp_hdr(skb);
1da177e4
LT
1952
1953 if (th->doff < sizeof(struct tcphdr) / 4)
1954 goto bad_packet;
1955 if (!pskb_may_pull(skb, th->doff * 4))
1956 goto discard_it;
1957
1958 /* An explanation is required here, I think.
1959 * Packet length and doff are validated by header prediction,
caa20d9a 1960 * provided the case of th->doff==0 is eliminated.
1da177e4 1961 * So, we defer the checks. */
60476372 1962 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
6a5dc9e5 1963 goto csum_error;
1da177e4 1964
aa8223c7 1965 th = tcp_hdr(skb);
eddc9ec5 1966 iph = ip_hdr(skb);
1da177e4
LT
1967 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1968 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1969 skb->len - th->doff * 4);
1970 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1971 TCP_SKB_CB(skb)->when = 0;
b82d1bb4 1972 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1973 TCP_SKB_CB(skb)->sacked = 0;
1974
9a1f27c4 1975 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1976 if (!sk)
1977 goto no_tcp_socket;
1978
bb134d5d
ED
1979process:
1980 if (sk->sk_state == TCP_TIME_WAIT)
1981 goto do_time_wait;
1982
6cce09f8
ED
1983 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1984 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1985 goto discard_and_relse;
6cce09f8 1986 }
d218d111 1987
1da177e4
LT
1988 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1989 goto discard_and_relse;
b59c2701 1990 nf_reset(skb);
1da177e4 1991
fda9ef5d 1992 if (sk_filter(sk, skb))
1da177e4
LT
1993 goto discard_and_relse;
1994
8b80cda5 1995 sk_mark_napi_id(sk, skb);
1da177e4
LT
1996 skb->dev = NULL;
1997
c6366184 1998 bh_lock_sock_nested(sk);
1da177e4
LT
1999 ret = 0;
2000 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
2001#ifdef CONFIG_NET_DMA
2002 struct tcp_sock *tp = tcp_sk(sk);
2003 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
a2bd1140 2004 tp->ucopy.dma_chan = net_dma_find_channel();
1a2449a8 2005 if (tp->ucopy.dma_chan)
1da177e4 2006 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
2007 else
2008#endif
2009 {
2010 if (!tcp_prequeue(sk, skb))
ae8d7f88 2011 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 2012 }
da882c1f
ED
2013 } else if (unlikely(sk_add_backlog(sk, skb,
2014 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 2015 bh_unlock_sock(sk);
6cce09f8 2016 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
2017 goto discard_and_relse;
2018 }
1da177e4
LT
2019 bh_unlock_sock(sk);
2020
2021 sock_put(sk);
2022
2023 return ret;
2024
2025no_tcp_socket:
2026 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2027 goto discard_it;
2028
2029 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
6a5dc9e5
ED
2030csum_error:
2031 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 2032bad_packet:
63231bdd 2033 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 2034 } else {
cfb6eeb4 2035 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2036 }
2037
2038discard_it:
2039 /* Discard frame. */
2040 kfree_skb(skb);
e905a9ed 2041 return 0;
1da177e4
LT
2042
2043discard_and_relse:
2044 sock_put(sk);
2045 goto discard_it;
2046
2047do_time_wait:
2048 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 2049 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2050 goto discard_it;
2051 }
2052
6a5dc9e5 2053 if (skb->len < (th->doff << 2)) {
9469c7b4 2054 inet_twsk_put(inet_twsk(sk));
6a5dc9e5
ED
2055 goto bad_packet;
2056 }
2057 if (tcp_checksum_complete(skb)) {
2058 inet_twsk_put(inet_twsk(sk));
2059 goto csum_error;
1da177e4 2060 }
9469c7b4 2061 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2062 case TCP_TW_SYN: {
c346dca1 2063 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 2064 &tcp_hashinfo,
da5e3630 2065 iph->saddr, th->source,
eddc9ec5 2066 iph->daddr, th->dest,
463c84b9 2067 inet_iif(skb));
1da177e4 2068 if (sk2) {
9469c7b4
YH
2069 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2070 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2071 sk = sk2;
2072 goto process;
2073 }
2074 /* Fall through to ACK */
2075 }
2076 case TCP_TW_ACK:
2077 tcp_v4_timewait_ack(sk, skb);
2078 break;
2079 case TCP_TW_RST:
2080 goto no_tcp_socket;
2081 case TCP_TW_SUCCESS:;
2082 }
2083 goto discard_it;
2084}
2085
ccb7c410
DM
2086static struct timewait_sock_ops tcp_timewait_sock_ops = {
2087 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2088 .twsk_unique = tcp_twsk_unique,
2089 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2090};
1da177e4 2091
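/* Cache the input route of this packet on the socket for later
 * early-demux hits, and remember the interface it arrived on.
 */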
63d02d15 2092void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2093{
2094 struct dst_entry *dst = skb_dst(skb);
2095
2096 dst_hold(dst);
2097 sk->sk_rx_dst = dst;
2098 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2099}
63d02d15 2100EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2101
3b401a81 2102const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2103 .queue_xmit = ip_queue_xmit,
2104 .send_check = tcp_v4_send_check,
2105 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2106 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2107 .conn_request = tcp_v4_conn_request,
2108 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2109 .net_header_len = sizeof(struct iphdr),
2110 .setsockopt = ip_setsockopt,
2111 .getsockopt = ip_getsockopt,
2112 .addr2sockaddr = inet_csk_addr2sockaddr,
2113 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 2114 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 2115#ifdef CONFIG_COMPAT
543d9cfe
ACM
2116 .compat_setsockopt = compat_ip_setsockopt,
2117 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2118#endif
1da177e4 2119};
4bc2f18b 2120EXPORT_SYMBOL(ipv4_specific);
1da177e4 2121
cfb6eeb4 2122#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2123static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2124 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2125 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2126 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2127};
b6332e6c 2128#endif
cfb6eeb4 2129
1da177e4
LT
2130/* NOTE: A lot of things set to zero explicitly by call to
2131 * sk_alloc() so need not be done here.
2132 */
2133static int tcp_v4_init_sock(struct sock *sk)
2134{
6687e988 2135 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2136
900f65d3 2137 tcp_init_sock(sk);
1da177e4 2138
8292a17a 2139 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2140
cfb6eeb4 2141#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2142 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2143#endif
1da177e4 2144
1da177e4
LT
2145 return 0;
2146}
2147
7d06b2e0 2148void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2149{
2150 struct tcp_sock *tp = tcp_sk(sk);
2151
2152 tcp_clear_xmit_timers(sk);
2153
6687e988 2154 tcp_cleanup_congestion_control(sk);
317a76f9 2155
1da177e4 2156 /* Clean up the write buffer. */
fe067e8a 2157 tcp_write_queue_purge(sk);
1da177e4
LT
2158
2159 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 2160 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 2161
cfb6eeb4
YH
2162#ifdef CONFIG_TCP_MD5SIG
2163 /* Clean up the MD5 key list, if any */
2164 if (tp->md5sig_info) {
a915da9b 2165 tcp_clear_md5_list(sk);
a8afca03 2166 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
2167 tp->md5sig_info = NULL;
2168 }
2169#endif
2170
1a2449a8
CL
2171#ifdef CONFIG_NET_DMA
2172 /* Cleans up our sk_async_wait_queue */
e905a9ed 2173 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
2174#endif
2175
1da177e4
LT
2176 /* Clean up the prequeue; it really should already be empty */
2177 __skb_queue_purge(&tp->ucopy.prequeue);
2178
2179 /* Clean up a referenced TCP bind bucket. */
463c84b9 2180 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2181 inet_put_port(sk);
1da177e4 2182
168a8f58 2183 BUG_ON(tp->fastopen_rsk != NULL);
435cf559 2184
cf60af03
YC
2185 /* If the socket was aborted during a connect, free any pending Fast Open request */
2186 tcp_free_fastopen_req(tp);
2187
180d8cd9 2188 sk_sockets_allocated_dec(sk);
d1a4c0b3 2189 sock_release_memcg(sk);
1da177e4 2190}
1da177e4
LT
2191EXPORT_SYMBOL(tcp_v4_destroy_sock);
2192
2193#ifdef CONFIG_PROC_FS
2194/* Proc filesystem TCP sock list dumping. */
2195
3ab5aee7 2196static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1da177e4 2197{
3ab5aee7 2198 return hlist_nulls_empty(head) ? NULL :
8feaf0c0 2199 list_entry(head->first, struct inet_timewait_sock, tw_node);
1da177e4
LT
2200}
2201
8feaf0c0 2202static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1da177e4 2203{
3ab5aee7
ED
2204 return !is_a_nulls(tw->tw_node.next) ?
2205 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1da177e4
LT
2206}
2207
a8b690f9
TH
2208/*
2209 * Get the next listener socket after cur. If cur is NULL, get the first socket
2210 * starting from bucket given in st->bucket; when st->bucket is zero the
2211 * very first socket in the hash table is returned.
2212 */
1da177e4
LT
2213static void *listening_get_next(struct seq_file *seq, void *cur)
2214{
463c84b9 2215 struct inet_connection_sock *icsk;
c25eb3bf 2216 struct hlist_nulls_node *node;
1da177e4 2217 struct sock *sk = cur;
5caea4ea 2218 struct inet_listen_hashbucket *ilb;
5799de0b 2219 struct tcp_iter_state *st = seq->private;
a4146b1b 2220 struct net *net = seq_file_net(seq);
1da177e4
LT
2221
2222 if (!sk) {
a8b690f9 2223 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 2224 spin_lock_bh(&ilb->lock);
c25eb3bf 2225 sk = sk_nulls_head(&ilb->head);
a8b690f9 2226 st->offset = 0;
1da177e4
LT
2227 goto get_sk;
2228 }
5caea4ea 2229 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2230 ++st->num;
a8b690f9 2231 ++st->offset;
1da177e4
LT
2232
2233 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 2234 struct request_sock *req = cur;
1da177e4 2235
72a3effa 2236 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
2237 req = req->dl_next;
2238 while (1) {
2239 while (req) {
bdccc4ca 2240 if (req->rsk_ops->family == st->family) {
1da177e4
LT
2241 cur = req;
2242 goto out;
2243 }
2244 req = req->dl_next;
2245 }
72a3effa 2246 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
2247 break;
2248get_req:
463c84b9 2249 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 2250 }
1bde5ac4 2251 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2252 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2253 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2254 } else {
e905a9ed 2255 icsk = inet_csk(sk);
463c84b9
ACM
2256 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2257 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2258 goto start_req;
463c84b9 2259 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2260 sk = sk_nulls_next(sk);
1da177e4
LT
2261 }
2262get_sk:
c25eb3bf 2263 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2264 if (!net_eq(sock_net(sk), net))
2265 continue;
2266 if (sk->sk_family == st->family) {
1da177e4
LT
2267 cur = sk;
2268 goto out;
2269 }
e905a9ed 2270 icsk = inet_csk(sk);
463c84b9
ACM
2271 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2272 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2273start_req:
2274 st->uid = sock_i_uid(sk);
2275 st->syn_wait_sk = sk;
2276 st->state = TCP_SEQ_STATE_OPENREQ;
2277 st->sbucket = 0;
2278 goto get_req;
2279 }
463c84b9 2280 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2281 }
5caea4ea 2282 spin_unlock_bh(&ilb->lock);
a8b690f9 2283 st->offset = 0;
0f7ff927 2284 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2285 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2286 spin_lock_bh(&ilb->lock);
c25eb3bf 2287 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2288 goto get_sk;
2289 }
2290 cur = NULL;
2291out:
2292 return cur;
2293}
2294
2295static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2296{
a8b690f9
TH
2297 struct tcp_iter_state *st = seq->private;
2298 void *rc;
2299
2300 st->bucket = 0;
2301 st->offset = 0;
2302 rc = listening_get_next(seq, NULL);
1da177e4
LT
2303
2304 while (rc && *pos) {
2305 rc = listening_get_next(seq, rc);
2306 --*pos;
2307 }
2308 return rc;
2309}
2310
a2a385d6 2311static inline bool empty_bucket(struct tcp_iter_state *st)
6eac5604 2312{
3ab5aee7
ED
2313 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2314 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
6eac5604
AK
2315}
2316
a8b690f9
TH
2317/*
2318 * Get first established socket starting from bucket given in st->bucket.
2319 * If st->bucket is zero, the very first socket in the hash is returned.
2320 */
1da177e4
LT
2321static void *established_get_first(struct seq_file *seq)
2322{
5799de0b 2323 struct tcp_iter_state *st = seq->private;
a4146b1b 2324 struct net *net = seq_file_net(seq);
1da177e4
LT
2325 void *rc = NULL;
2326
a8b690f9
TH
2327 st->offset = 0;
2328 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2329 struct sock *sk;
3ab5aee7 2330 struct hlist_nulls_node *node;
8feaf0c0 2331 struct inet_timewait_sock *tw;
9db66bdc 2332 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2333
6eac5604
AK
2334 /* Lockless fast path for the common case of empty buckets */
2335 if (empty_bucket(st))
2336 continue;
2337
9db66bdc 2338 spin_lock_bh(lock);
3ab5aee7 2339 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2340 if (sk->sk_family != st->family ||
878628fb 2341 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2342 continue;
2343 }
2344 rc = sk;
2345 goto out;
2346 }
2347 st->state = TCP_SEQ_STATE_TIME_WAIT;
8feaf0c0 2348 inet_twsk_for_each(tw, node,
dbca9b27 2349 &tcp_hashinfo.ehash[st->bucket].twchain) {
28518fc1 2350 if (tw->tw_family != st->family ||
878628fb 2351 !net_eq(twsk_net(tw), net)) {
1da177e4
LT
2352 continue;
2353 }
2354 rc = tw;
2355 goto out;
2356 }
9db66bdc 2357 spin_unlock_bh(lock);
1da177e4
LT
2358 st->state = TCP_SEQ_STATE_ESTABLISHED;
2359 }
2360out:
2361 return rc;
2362}
2363
2364static void *established_get_next(struct seq_file *seq, void *cur)
2365{
2366 struct sock *sk = cur;
8feaf0c0 2367 struct inet_timewait_sock *tw;
3ab5aee7 2368 struct hlist_nulls_node *node;
5799de0b 2369 struct tcp_iter_state *st = seq->private;
a4146b1b 2370 struct net *net = seq_file_net(seq);
1da177e4
LT
2371
2372 ++st->num;
a8b690f9 2373 ++st->offset;
1da177e4
LT
2374
2375 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2376 tw = cur;
2377 tw = tw_next(tw);
2378get_tw:
878628fb 2379 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1da177e4
LT
2380 tw = tw_next(tw);
2381 }
2382 if (tw) {
2383 cur = tw;
2384 goto out;
2385 }
9db66bdc 2386 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2387 st->state = TCP_SEQ_STATE_ESTABLISHED;
2388
6eac5604 2389 /* Look for the next non-empty bucket */
a8b690f9 2390 st->offset = 0;
f373b53b 2391 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
6eac5604
AK
2392 empty_bucket(st))
2393 ;
f373b53b 2394 if (st->bucket > tcp_hashinfo.ehash_mask)
6eac5604
AK
2395 return NULL;
2396
9db66bdc 2397 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
3ab5aee7 2398 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
1da177e4 2399 } else
3ab5aee7 2400 sk = sk_nulls_next(sk);
1da177e4 2401
3ab5aee7 2402 sk_nulls_for_each_from(sk, node) {
878628fb 2403 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1da177e4
LT
2404 goto found;
2405 }
2406
2407 st->state = TCP_SEQ_STATE_TIME_WAIT;
dbca9b27 2408 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
1da177e4
LT
2409 goto get_tw;
2410found:
2411 cur = sk;
2412out:
2413 return cur;
2414}
2415
2416static void *established_get_idx(struct seq_file *seq, loff_t pos)
2417{
a8b690f9
TH
2418 struct tcp_iter_state *st = seq->private;
2419 void *rc;
2420
2421 st->bucket = 0;
2422 rc = established_get_first(seq);
1da177e4
LT
2423
2424 while (rc && pos) {
2425 rc = established_get_next(seq, rc);
2426 --pos;
7174259e 2427 }
1da177e4
LT
2428 return rc;
2429}
2430
2431static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2432{
2433 void *rc;
5799de0b 2434 struct tcp_iter_state *st = seq->private;
1da177e4 2435
1da177e4
LT
2436 st->state = TCP_SEQ_STATE_LISTENING;
2437 rc = listening_get_idx(seq, &pos);
2438
2439 if (!rc) {
1da177e4
LT
2440 st->state = TCP_SEQ_STATE_ESTABLISHED;
2441 rc = established_get_idx(seq, pos);
2442 }
2443
2444 return rc;
2445}
2446
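/* Resume a /proc/net/tcp dump from the bucket and in-bucket offset saved
 * by the previous read, so large hash tables need not be rescanned from
 * the beginning on every read() call.
 */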
a8b690f9
TH
2447static void *tcp_seek_last_pos(struct seq_file *seq)
2448{
2449 struct tcp_iter_state *st = seq->private;
2450 int offset = st->offset;
2451 int orig_num = st->num;
2452 void *rc = NULL;
2453
2454 switch (st->state) {
2455 case TCP_SEQ_STATE_OPENREQ:
2456 case TCP_SEQ_STATE_LISTENING:
2457 if (st->bucket >= INET_LHTABLE_SIZE)
2458 break;
2459 st->state = TCP_SEQ_STATE_LISTENING;
2460 rc = listening_get_next(seq, NULL);
2461 while (offset-- && rc)
2462 rc = listening_get_next(seq, rc);
2463 if (rc)
2464 break;
2465 st->bucket = 0;
2466 /* Fallthrough */
2467 case TCP_SEQ_STATE_ESTABLISHED:
2468 case TCP_SEQ_STATE_TIME_WAIT:
2469 st->state = TCP_SEQ_STATE_ESTABLISHED;
2470 if (st->bucket > tcp_hashinfo.ehash_mask)
2471 break;
2472 rc = established_get_first(seq);
2473 while (offset-- && rc)
2474 rc = established_get_next(seq, rc);
2475 }
2476
2477 st->num = orig_num;
2478
2479 return rc;
2480}
2481
1da177e4
LT
2482static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2483{
5799de0b 2484 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2485 void *rc;
2486
2487 if (*pos && *pos == st->last_pos) {
2488 rc = tcp_seek_last_pos(seq);
2489 if (rc)
2490 goto out;
2491 }
2492
1da177e4
LT
2493 st->state = TCP_SEQ_STATE_LISTENING;
2494 st->num = 0;
a8b690f9
TH
2495 st->bucket = 0;
2496 st->offset = 0;
2497 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2498
2499out:
2500 st->last_pos = *pos;
2501 return rc;
1da177e4
LT
2502}
2503
2504static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2505{
a8b690f9 2506 struct tcp_iter_state *st = seq->private;
1da177e4 2507 void *rc = NULL;
1da177e4
LT
2508
2509 if (v == SEQ_START_TOKEN) {
2510 rc = tcp_get_idx(seq, 0);
2511 goto out;
2512 }
1da177e4
LT
2513
2514 switch (st->state) {
2515 case TCP_SEQ_STATE_OPENREQ:
2516 case TCP_SEQ_STATE_LISTENING:
2517 rc = listening_get_next(seq, v);
2518 if (!rc) {
1da177e4 2519 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2520 st->bucket = 0;
2521 st->offset = 0;
1da177e4
LT
2522 rc = established_get_first(seq);
2523 }
2524 break;
2525 case TCP_SEQ_STATE_ESTABLISHED:
2526 case TCP_SEQ_STATE_TIME_WAIT:
2527 rc = established_get_next(seq, v);
2528 break;
2529 }
2530out:
2531 ++*pos;
a8b690f9 2532 st->last_pos = *pos;
1da177e4
LT
2533 return rc;
2534}
2535
2536static void tcp_seq_stop(struct seq_file *seq, void *v)
2537{
5799de0b 2538 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2539
2540 switch (st->state) {
2541 case TCP_SEQ_STATE_OPENREQ:
2542 if (v) {
463c84b9
ACM
2543 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2544 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2545 }
2546 case TCP_SEQ_STATE_LISTENING:
2547 if (v != SEQ_START_TOKEN)
5caea4ea 2548 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4
LT
2549 break;
2550 case TCP_SEQ_STATE_TIME_WAIT:
2551 case TCP_SEQ_STATE_ESTABLISHED:
2552 if (v)
9db66bdc 2553 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2554 break;
2555 }
2556}
2557
73cb88ec 2558int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2559{
d9dda78b 2560 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2561 struct tcp_iter_state *s;
52d6f3f1 2562 int err;
1da177e4 2563
52d6f3f1
DL
2564 err = seq_open_net(inode, file, &afinfo->seq_ops,
2565 sizeof(struct tcp_iter_state));
2566 if (err < 0)
2567 return err;
f40c8174 2568
52d6f3f1 2569 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2570 s->family = afinfo->family;
a8b690f9 2571 s->last_pos = 0;
f40c8174
DL
2572 return 0;
2573}
73cb88ec 2574EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2575
6f8b13bc 2576int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2577{
2578 int rc = 0;
2579 struct proc_dir_entry *p;
2580
9427c4b3
DL
2581 afinfo->seq_ops.start = tcp_seq_start;
2582 afinfo->seq_ops.next = tcp_seq_next;
2583 afinfo->seq_ops.stop = tcp_seq_stop;
2584
84841c3c 2585 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2586 afinfo->seq_fops, afinfo);
84841c3c 2587 if (!p)
1da177e4
LT
2588 rc = -ENOMEM;
2589 return rc;
2590}
4bc2f18b 2591EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2592
6f8b13bc 2593void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2594{
ece31ffd 2595 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2596}
4bc2f18b 2597EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2598
cf533ea5 2599static void get_openreq4(const struct sock *sk, const struct request_sock *req,
a7cb5a49 2600 struct seq_file *f, int i, kuid_t uid, int *len)
1da177e4 2601{
2e6599cb 2602 const struct inet_request_sock *ireq = inet_rsk(req);
a399a805 2603 long delta = req->expires - jiffies;
1da177e4 2604
5e659e4c 2605 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
71338aa7 2606 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
1da177e4 2607 i,
2e6599cb 2608 ireq->loc_addr,
c720c7e8 2609 ntohs(inet_sk(sk)->inet_sport),
2e6599cb
ACM
2610 ireq->rmt_addr,
2611 ntohs(ireq->rmt_port),
1da177e4
LT
2612 TCP_SYN_RECV,
2613 0, 0, /* could print option size, but that is af dependent. */
2614 1, /* timers active (only the expire timer) */
a399a805 2615 jiffies_delta_to_clock_t(delta),
e6c022a4 2616 req->num_timeout,
a7cb5a49 2617 from_kuid_munged(seq_user_ns(f), uid),
1da177e4
LT
2618 0, /* non standard timer */
2619 0, /* open_requests have no inode */
2620 atomic_read(&sk->sk_refcnt),
5e659e4c
PE
2621 req,
2622 len);
1da177e4
LT
2623}
2624
5e659e4c 2625static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
1da177e4
LT
2626{
2627 int timer_active;
2628 unsigned long timer_expires;
cf533ea5 2629 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2630 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2631 const struct inet_sock *inet = inet_sk(sk);
168a8f58 2632 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2633 __be32 dest = inet->inet_daddr;
2634 __be32 src = inet->inet_rcv_saddr;
2635 __u16 destp = ntohs(inet->inet_dport);
2636 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2637 int rx_queue;
1da177e4 2638
6ba8a3b1
ND
2639 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2640 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2641 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2642 timer_active = 1;
463c84b9
ACM
2643 timer_expires = icsk->icsk_timeout;
2644 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2645 timer_active = 4;
463c84b9 2646 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2647 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2648 timer_active = 2;
cf4c6bf8 2649 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2650 } else {
2651 timer_active = 0;
2652 timer_expires = jiffies;
2653 }
2654
49d09007
ED
2655 if (sk->sk_state == TCP_LISTEN)
2656 rx_queue = sk->sk_ack_backlog;
2657 else
2658 /*
2659 * because we don't lock the socket, we might find a transient negative value
2660 */
2661 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2662
5e659e4c 2663 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
71338aa7 2664 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
cf4c6bf8 2665 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2666 tp->write_seq - tp->snd_una,
49d09007 2667 rx_queue,
1da177e4 2668 timer_active,
a399a805 2669 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2670 icsk->icsk_retransmits,
a7cb5a49 2671 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2672 icsk->icsk_probes_out,
cf4c6bf8
IJ
2673 sock_i_ino(sk),
2674 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2675 jiffies_to_clock_t(icsk->icsk_rto),
2676 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2677 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2678 tp->snd_cwnd,
168a8f58
JC
2679 sk->sk_state == TCP_LISTEN ?
2680 (fastopenq ? fastopenq->max_qlen : 0) :
2681 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
5e659e4c 2682 len);
1da177e4
LT
2683}
2684
cf533ea5 2685static void get_timewait4_sock(const struct inet_timewait_sock *tw,
5e659e4c 2686 struct seq_file *f, int i, int *len)
1da177e4 2687{
23f33c2d 2688 __be32 dest, src;
1da177e4 2689 __u16 destp, srcp;
a399a805 2690 long delta = tw->tw_ttd - jiffies;
1da177e4
LT
2691
2692 dest = tw->tw_daddr;
2693 src = tw->tw_rcv_saddr;
2694 destp = ntohs(tw->tw_dport);
2695 srcp = ntohs(tw->tw_sport);
2696
5e659e4c 2697 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
71338aa7 2698 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
1da177e4 2699 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2700 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
5e659e4c 2701 atomic_read(&tw->tw_refcnt), tw, len);
1da177e4
LT
2702}
2703
2704#define TMPSZ 150
2705
2706static int tcp4_seq_show(struct seq_file *seq, void *v)
2707{
5799de0b 2708 struct tcp_iter_state *st;
5e659e4c 2709 int len;
1da177e4
LT
2710
2711 if (v == SEQ_START_TOKEN) {
2712 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2713 " sl local_address rem_address st tx_queue "
2714 "rx_queue tr tm->when retrnsmt uid timeout "
2715 "inode");
2716 goto out;
2717 }
2718 st = seq->private;
2719
2720 switch (st->state) {
2721 case TCP_SEQ_STATE_LISTENING:
2722 case TCP_SEQ_STATE_ESTABLISHED:
5e659e4c 2723 get_tcp4_sock(v, seq, st->num, &len);
1da177e4
LT
2724 break;
2725 case TCP_SEQ_STATE_OPENREQ:
5e659e4c 2726 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
1da177e4
LT
2727 break;
2728 case TCP_SEQ_STATE_TIME_WAIT:
5e659e4c 2729 get_timewait4_sock(v, seq, st->num, &len);
1da177e4
LT
2730 break;
2731 }
5e659e4c 2732 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
1da177e4
LT
2733out:
2734 return 0;
2735}
2736
73cb88ec
AV
2737static const struct file_operations tcp_afinfo_seq_fops = {
2738 .owner = THIS_MODULE,
2739 .open = tcp_seq_open,
2740 .read = seq_read,
2741 .llseek = seq_lseek,
2742 .release = seq_release_net
2743};
2744
1da177e4 2745static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2746 .name = "tcp",
2747 .family = AF_INET,
73cb88ec 2748 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2749 .seq_ops = {
2750 .show = tcp4_seq_show,
2751 },
1da177e4
LT
2752};
2753
2c8c1e72 2754static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2755{
2756 return tcp_proc_register(net, &tcp4_seq_afinfo);
2757}
2758
2c8c1e72 2759static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2760{
2761 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2762}
2763
2764static struct pernet_operations tcp4_net_ops = {
2765 .init = tcp4_proc_init_net,
2766 .exit = tcp4_proc_exit_net,
2767};
2768
1da177e4
LT
2769int __init tcp4_proc_init(void)
2770{
757764f6 2771 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2772}
2773
2774void tcp4_proc_exit(void)
2775{
757764f6 2776 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2777}
2778#endif /* CONFIG_PROC_FS */
2779
2780struct proto tcp_prot = {
2781 .name = "TCP",
2782 .owner = THIS_MODULE,
2783 .close = tcp_close,
2784 .connect = tcp_v4_connect,
2785 .disconnect = tcp_disconnect,
463c84b9 2786 .accept = inet_csk_accept,
1da177e4
LT
2787 .ioctl = tcp_ioctl,
2788 .init = tcp_v4_init_sock,
2789 .destroy = tcp_v4_destroy_sock,
2790 .shutdown = tcp_shutdown,
2791 .setsockopt = tcp_setsockopt,
2792 .getsockopt = tcp_getsockopt,
1da177e4 2793 .recvmsg = tcp_recvmsg,
7ba42910
CG
2794 .sendmsg = tcp_sendmsg,
2795 .sendpage = tcp_sendpage,
1da177e4 2796 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2797 .release_cb = tcp_release_cb,
563d34d0 2798 .mtu_reduced = tcp_v4_mtu_reduced,
ab1e0a13
ACM
2799 .hash = inet_hash,
2800 .unhash = inet_unhash,
2801 .get_port = inet_csk_get_port,
1da177e4 2802 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2803 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2804 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2805 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2806 .memory_allocated = &tcp_memory_allocated,
2807 .memory_pressure = &tcp_memory_pressure,
1da177e4
LT
2808 .sysctl_wmem = sysctl_tcp_wmem,
2809 .sysctl_rmem = sysctl_tcp_rmem,
2810 .max_header = MAX_TCP_HEADER,
2811 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2812 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2813 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2814 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2815 .h.hashinfo = &tcp_hashinfo,
7ba42910 2816 .no_autobind = true,
543d9cfe
ACM
2817#ifdef CONFIG_COMPAT
2818 .compat_setsockopt = compat_tcp_setsockopt,
2819 .compat_getsockopt = compat_tcp_getsockopt,
2820#endif
c255a458 2821#ifdef CONFIG_MEMCG_KMEM
d1a4c0b3
GC
2822 .init_cgroup = tcp_init_cgroup,
2823 .destroy_cgroup = tcp_destroy_cgroup,
2824 .proto_cgroup = tcp_proto_cgroup,
2825#endif
1da177e4 2826};
4bc2f18b 2827EXPORT_SYMBOL(tcp_prot);
1da177e4 2828
046ee902
DL
2829static int __net_init tcp_sk_init(struct net *net)
2830{
5d134f1c 2831 net->ipv4.sysctl_tcp_ecn = 2;
be9f4a44 2832 return 0;
046ee902
DL
2833}
2834
2835static void __net_exit tcp_sk_exit(struct net *net)
2836{
b099ce26
EB
2837}
2838
2839static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2840{
2841 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2842}
2843
2844static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2845 .init = tcp_sk_init,
2846 .exit = tcp_sk_exit,
2847 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2848};
2849
9b0f976f 2850void __init tcp_v4_init(void)
1da177e4 2851{
5caea4ea 2852 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2853 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2854 panic("Failed to create the TCP control socket.\n");
1da177e4 2855}