ipv4: Sanitize and simplify ip_route_{connect,newports}()
[deliverable/linux.git] / net / ipv4 / tcp_ipv4.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
1da177e4
LT
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
1da177e4
LT
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
1da177e4
LT
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
1da177e4 53
eb4dea58 54#include <linux/bottom_half.h>
1da177e4
LT
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
5a0e3ad6 63#include <linux/slab.h>
1da177e4 64
457c4cbc 65#include <net/net_namespace.h>
1da177e4 66#include <net/icmp.h>
304a1618 67#include <net/inet_hashtables.h>
1da177e4 68#include <net/tcp.h>
20380731 69#include <net/transp_v6.h>
1da177e4
LT
70#include <net/ipv6.h>
71#include <net/inet_common.h>
6d6ee43e 72#include <net/timewait_sock.h>
1da177e4 73#include <net/xfrm.h>
1a2449a8 74#include <net/netdma.h>
1da177e4
LT
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
cfb6eeb4
YH
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
ab32ea5d
BH
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
4bc2f18b 87EXPORT_SYMBOL(sysctl_tcp_low_latency);
1da177e4 88
1da177e4 89
cfb6eeb4 90#ifdef CONFIG_TCP_MD5SIG
7174259e
ACM
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 __be32 addr);
49a72dfb
AL
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, struct tcphdr *th);
9501f972
YH
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99 return NULL;
100}
cfb6eeb4
YH
101#endif
102
5caea4ea 103struct inet_hashinfo tcp_hashinfo;
4bc2f18b 104EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 105
a94f723d 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
1da177e4 107{
eddc9ec5
ACM
108 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 ip_hdr(skb)->saddr,
aa8223c7
ACM
110 tcp_hdr(skb)->dest,
111 tcp_hdr(skb)->source);
1da177e4
LT
112}
113
6d6ee43e
ACM
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 struct tcp_sock *tp = tcp_sk(sk);
118
119 /* With PAWS, it is safe from the viewpoint
120 of data integrity. Even without PAWS it is safe provided sequence
121 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123 Actually, the idea is close to VJ's one, only timestamp cache is
124 held not per host, but per port pair and TW bucket is used as state
125 holder.
126
127 If TW bucket has been already destroyed we fall back to VJ's scheme
128 and use initial timestamp retrieved from peer table.
129 */
130 if (tcptw->tw_ts_recent_stamp &&
131 (twp == NULL || (sysctl_tcp_tw_reuse &&
9d729f72 132 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
6d6ee43e
ACM
133 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 if (tp->write_seq == 0)
135 tp->write_seq = 1;
136 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
137 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 sock_hold(sktw);
139 return 1;
140 }
141
142 return 0;
143}
6d6ee43e
ACM
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
1da177e4
LT
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
2d7192d6 149 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
150 struct inet_sock *inet = inet_sk(sk);
151 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 152 __be16 orig_sport, orig_dport;
bada8adc 153 __be32 daddr, nexthop;
2d7192d6
DM
154 struct flowi4 fl4;
155 struct rtable *rt;
1da177e4
LT
156 int err;
157
158 if (addr_len < sizeof(struct sockaddr_in))
159 return -EINVAL;
160
161 if (usin->sin_family != AF_INET)
162 return -EAFNOSUPPORT;
163
164 nexthop = daddr = usin->sin_addr.s_addr;
165 if (inet->opt && inet->opt->srr) {
166 if (!daddr)
167 return -EINVAL;
168 nexthop = inet->opt->faddr;
169 }
170
dca8b089
DM
171 orig_sport = inet->inet_sport;
172 orig_dport = usin->sin_port;
2d7192d6 173 rt = ip_route_connect(&fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
174 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 IPPROTO_TCP,
176 orig_sport, orig_dport, sk, true);
177 if (IS_ERR(rt)) {
178 err = PTR_ERR(rt);
179 if (err == -ENETUNREACH)
7c73a6fa 180 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 181 return err;
584bdf8c 182 }
1da177e4
LT
183
184 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 ip_rt_put(rt);
186 return -ENETUNREACH;
187 }
188
189 if (!inet->opt || !inet->opt->srr)
190 daddr = rt->rt_dst;
191
c720c7e8
ED
192 if (!inet->inet_saddr)
193 inet->inet_saddr = rt->rt_src;
194 inet->inet_rcv_saddr = inet->inet_saddr;
1da177e4 195
c720c7e8 196 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
197 /* Reset inherited state */
198 tp->rx_opt.ts_recent = 0;
199 tp->rx_opt.ts_recent_stamp = 0;
200 tp->write_seq = 0;
201 }
202
295ff7ed 203 if (tcp_death_row.sysctl_tw_recycle &&
1da177e4
LT
204 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
205 struct inet_peer *peer = rt_get_peer(rt);
7174259e
ACM
206 /*
207 * VJ's idea. We save last timestamp seen from
208 * the destination in peer table, when entering state
209 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
210 * when trying new connection.
1da177e4 211 */
317fe0e6
ED
212 if (peer) {
213 inet_peer_refcheck(peer);
214 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
215 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
216 tp->rx_opt.ts_recent = peer->tcp_ts;
217 }
1da177e4
LT
218 }
219 }
220
c720c7e8
ED
221 inet->inet_dport = usin->sin_port;
222 inet->inet_daddr = daddr;
1da177e4 223
d83d8461 224 inet_csk(sk)->icsk_ext_hdr_len = 0;
1da177e4 225 if (inet->opt)
d83d8461 226 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
1da177e4 227
bee7ca9e 228 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
229
230 /* Socket identity is still unknown (sport may be zero).
231 * However we set state to SYN-SENT and not releasing socket
232 * lock select source port, enter ourselves into the hash tables and
233 * complete initialization after this.
234 */
235 tcp_set_state(sk, TCP_SYN_SENT);
a7f5e7f1 236 err = inet_hash_connect(&tcp_death_row, sk);
1da177e4
LT
237 if (err)
238 goto failure;
239
2d7192d6 240 rt = ip_route_newports(&fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
241 inet->inet_sport, inet->inet_dport, sk);
242 if (IS_ERR(rt)) {
243 err = PTR_ERR(rt);
244 rt = NULL;
1da177e4 245 goto failure;
b23dd4fe 246 }
1da177e4 247 /* OK, now commit destination to socket. */
bcd76111 248 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 249 sk_setup_caps(sk, &rt->dst);
1da177e4
LT
250
251 if (!tp->write_seq)
c720c7e8
ED
252 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
253 inet->inet_daddr,
254 inet->inet_sport,
1da177e4
LT
255 usin->sin_port);
256
c720c7e8 257 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4
LT
258
259 err = tcp_connect(sk);
260 rt = NULL;
261 if (err)
262 goto failure;
263
264 return 0;
265
266failure:
7174259e
ACM
267 /*
268 * This unhashes the socket and releases the local port,
269 * if necessary.
270 */
1da177e4
LT
271 tcp_set_state(sk, TCP_CLOSE);
272 ip_rt_put(rt);
273 sk->sk_route_caps = 0;
c720c7e8 274 inet->inet_dport = 0;
1da177e4
LT
275 return err;
276}
4bc2f18b 277EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 278
1da177e4
LT
279/*
280 * This routine does path mtu discovery as defined in RFC1191.
281 */
b71d1d42 282static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
1da177e4
LT
283{
284 struct dst_entry *dst;
285 struct inet_sock *inet = inet_sk(sk);
1da177e4
LT
286
287 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
288 * send out by Linux are always <576bytes so they should go through
289 * unfragmented).
290 */
291 if (sk->sk_state == TCP_LISTEN)
292 return;
293
294 /* We don't check in the destentry if pmtu discovery is forbidden
295 * on this route. We just assume that no packet_to_big packets
296 * are send back when pmtu discovery is not active.
e905a9ed 297 * There is a small race when the user changes this flag in the
1da177e4
LT
298 * route, but I think that's acceptable.
299 */
300 if ((dst = __sk_dst_check(sk, 0)) == NULL)
301 return;
302
303 dst->ops->update_pmtu(dst, mtu);
304
305 /* Something is about to be wrong... Remember soft error
306 * for the case, if this connection will not able to recover.
307 */
308 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
309 sk->sk_err_soft = EMSGSIZE;
310
311 mtu = dst_mtu(dst);
312
313 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
d83d8461 314 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
315 tcp_sync_mss(sk, mtu);
316
317 /* Resend the TCP packet because it's
318 * clear that the old packet has been
319 * dropped. This is the new "fast" path mtu
320 * discovery.
321 */
322 tcp_simple_retransmit(sk);
323 } /* else let the usual retransmit timer handle it */
324}
325
326/*
327 * This routine is called by the ICMP module when it gets some
328 * sort of error condition. If err < 0 then the socket should
329 * be closed and the error returned to the user. If err > 0
330 * it's just the icmp type << 8 | icmp code. After adjustment
331 * header points to the first 8 bytes of the tcp header. We need
332 * to find the appropriate port.
333 *
334 * The locking strategy used here is very "optimistic". When
335 * someone else accesses the socket the ICMP is just dropped
336 * and for some paths there is no check at all.
337 * A more general error queue to queue errors for later handling
338 * is probably better.
339 *
340 */
341
4d1a2d9e 342void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 343{
b71d1d42 344 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 345 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 346 struct inet_connection_sock *icsk;
1da177e4
LT
347 struct tcp_sock *tp;
348 struct inet_sock *inet;
4d1a2d9e
DL
349 const int type = icmp_hdr(icmp_skb)->type;
350 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 351 struct sock *sk;
f1ecd5d9 352 struct sk_buff *skb;
1da177e4 353 __u32 seq;
f1ecd5d9 354 __u32 remaining;
1da177e4 355 int err;
4d1a2d9e 356 struct net *net = dev_net(icmp_skb->dev);
1da177e4 357
4d1a2d9e 358 if (icmp_skb->len < (iph->ihl << 2) + 8) {
dcfc23ca 359 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
360 return;
361 }
362
fd54d716 363 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
4d1a2d9e 364 iph->saddr, th->source, inet_iif(icmp_skb));
1da177e4 365 if (!sk) {
dcfc23ca 366 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
367 return;
368 }
369 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 370 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
371 return;
372 }
373
374 bh_lock_sock(sk);
375 /* If too many ICMPs get dropped on busy
376 * servers this needs to be solved differently.
377 */
378 if (sock_owned_by_user(sk))
de0744af 379 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
1da177e4
LT
380
381 if (sk->sk_state == TCP_CLOSE)
382 goto out;
383
97e3ecd1 384 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
385 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
386 goto out;
387 }
388
f1ecd5d9 389 icsk = inet_csk(sk);
1da177e4
LT
390 tp = tcp_sk(sk);
391 seq = ntohl(th->seq);
392 if (sk->sk_state != TCP_LISTEN &&
393 !between(seq, tp->snd_una, tp->snd_nxt)) {
de0744af 394 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
395 goto out;
396 }
397
398 switch (type) {
399 case ICMP_SOURCE_QUENCH:
400 /* Just silently ignore these. */
401 goto out;
402 case ICMP_PARAMETERPROB:
403 err = EPROTO;
404 break;
405 case ICMP_DEST_UNREACH:
406 if (code > NR_ICMP_UNREACH)
407 goto out;
408
409 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
410 if (!sock_owned_by_user(sk))
411 do_pmtu_discovery(sk, iph, info);
412 goto out;
413 }
414
415 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
416 /* check if icmp_skb allows revert of backoff
417 * (see draft-zimmermann-tcp-lcd) */
418 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
419 break;
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
421 !icsk->icsk_backoff)
422 break;
423
8f49c270
DM
424 if (sock_owned_by_user(sk))
425 break;
426
f1ecd5d9
DL
427 icsk->icsk_backoff--;
428 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
429 icsk->icsk_backoff;
430 tcp_bound_rto(sk);
431
432 skb = tcp_write_queue_head(sk);
433 BUG_ON(!skb);
434
435 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
436 tcp_time_stamp - TCP_SKB_CB(skb)->when);
437
438 if (remaining) {
439 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
440 remaining, TCP_RTO_MAX);
f1ecd5d9
DL
441 } else {
442 /* RTO revert clocked out retransmission.
443 * Will retransmit now */
444 tcp_retransmit_timer(sk);
445 }
446
1da177e4
LT
447 break;
448 case ICMP_TIME_EXCEEDED:
449 err = EHOSTUNREACH;
450 break;
451 default:
452 goto out;
453 }
454
455 switch (sk->sk_state) {
60236fdd 456 struct request_sock *req, **prev;
1da177e4
LT
457 case TCP_LISTEN:
458 if (sock_owned_by_user(sk))
459 goto out;
460
463c84b9
ACM
461 req = inet_csk_search_req(sk, &prev, th->dest,
462 iph->daddr, iph->saddr);
1da177e4
LT
463 if (!req)
464 goto out;
465
466 /* ICMPs are not backlogged, hence we cannot get
467 an established socket here.
468 */
547b792c 469 WARN_ON(req->sk);
1da177e4 470
2e6599cb 471 if (seq != tcp_rsk(req)->snt_isn) {
de0744af 472 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
473 goto out;
474 }
475
476 /*
477 * Still in SYN_RECV, just remove it silently.
478 * There is no good way to pass the error to the newly
479 * created socket, and POSIX does not want network
480 * errors returned from accept().
481 */
463c84b9 482 inet_csk_reqsk_queue_drop(sk, req, prev);
1da177e4
LT
483 goto out;
484
485 case TCP_SYN_SENT:
486 case TCP_SYN_RECV: /* Cannot happen.
487 It can f.e. if SYNs crossed.
488 */
489 if (!sock_owned_by_user(sk)) {
1da177e4
LT
490 sk->sk_err = err;
491
492 sk->sk_error_report(sk);
493
494 tcp_done(sk);
495 } else {
496 sk->sk_err_soft = err;
497 }
498 goto out;
499 }
500
501 /* If we've already connected we will keep trying
502 * until we time out, or the user gives up.
503 *
504 * rfc1122 4.2.3.9 allows to consider as hard errors
505 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
506 * but it is obsoleted by pmtu discovery).
507 *
508 * Note, that in modern internet, where routing is unreliable
509 * and in each dark corner broken firewalls sit, sending random
510 * errors ordered by their masters even this two messages finally lose
511 * their original sense (even Linux sends invalid PORT_UNREACHs)
512 *
513 * Now we are in compliance with RFCs.
514 * --ANK (980905)
515 */
516
517 inet = inet_sk(sk);
518 if (!sock_owned_by_user(sk) && inet->recverr) {
519 sk->sk_err = err;
520 sk->sk_error_report(sk);
521 } else { /* Only an error on timeout */
522 sk->sk_err_soft = err;
523 }
524
525out:
526 bh_unlock_sock(sk);
527 sock_put(sk);
528}
529
419f9f89
HX
530static void __tcp_v4_send_check(struct sk_buff *skb,
531 __be32 saddr, __be32 daddr)
1da177e4 532{
aa8223c7 533 struct tcphdr *th = tcp_hdr(skb);
1da177e4 534
84fa7933 535 if (skb->ip_summed == CHECKSUM_PARTIAL) {
419f9f89 536 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
663ead3b 537 skb->csum_start = skb_transport_header(skb) - skb->head;
ff1dcadb 538 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4 539 } else {
419f9f89 540 th->check = tcp_v4_check(skb->len, saddr, daddr,
07f0757a 541 csum_partial(th,
1da177e4
LT
542 th->doff << 2,
543 skb->csum));
544 }
545}
546
419f9f89 547/* This routine computes an IPv4 TCP checksum. */
bb296246 548void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89
HX
549{
550 struct inet_sock *inet = inet_sk(sk);
551
552 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
553}
4bc2f18b 554EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 555
a430a43d
HX
556int tcp_v4_gso_send_check(struct sk_buff *skb)
557{
eddc9ec5 558 const struct iphdr *iph;
a430a43d
HX
559 struct tcphdr *th;
560
561 if (!pskb_may_pull(skb, sizeof(*th)))
562 return -EINVAL;
563
eddc9ec5 564 iph = ip_hdr(skb);
aa8223c7 565 th = tcp_hdr(skb);
a430a43d
HX
566
567 th->check = 0;
84fa7933 568 skb->ip_summed = CHECKSUM_PARTIAL;
419f9f89 569 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
a430a43d
HX
570 return 0;
571}
572
1da177e4
LT
573/*
574 * This routine will send an RST to the other tcp.
575 *
576 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
577 * for reset.
578 * Answer: if a packet caused RST, it is not for a socket
579 * existing in our system, if it is matched to a socket,
580 * it is just duplicate segment or bug in other side's TCP.
581 * So that we build reply only basing on parameters
582 * arrived with segment.
583 * Exception: precedence violation. We do not implement it in any case.
584 */
585
cfb6eeb4 586static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
1da177e4 587{
aa8223c7 588 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
589 struct {
590 struct tcphdr th;
591#ifdef CONFIG_TCP_MD5SIG
714e85be 592 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
593#endif
594 } rep;
1da177e4 595 struct ip_reply_arg arg;
cfb6eeb4
YH
596#ifdef CONFIG_TCP_MD5SIG
597 struct tcp_md5sig_key *key;
598#endif
a86b1e30 599 struct net *net;
1da177e4
LT
600
601 /* Never send a reset in response to a reset. */
602 if (th->rst)
603 return;
604
511c3f92 605 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
606 return;
607
608 /* Swap the send and the receive. */
cfb6eeb4
YH
609 memset(&rep, 0, sizeof(rep));
610 rep.th.dest = th->source;
611 rep.th.source = th->dest;
612 rep.th.doff = sizeof(struct tcphdr) / 4;
613 rep.th.rst = 1;
1da177e4
LT
614
615 if (th->ack) {
cfb6eeb4 616 rep.th.seq = th->ack_seq;
1da177e4 617 } else {
cfb6eeb4
YH
618 rep.th.ack = 1;
619 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620 skb->len - (th->doff << 2));
1da177e4
LT
621 }
622
7174259e 623 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
624 arg.iov[0].iov_base = (unsigned char *)&rep;
625 arg.iov[0].iov_len = sizeof(rep.th);
626
627#ifdef CONFIG_TCP_MD5SIG
eddc9ec5 628 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
cfb6eeb4
YH
629 if (key) {
630 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
631 (TCPOPT_NOP << 16) |
632 (TCPOPT_MD5SIG << 8) |
633 TCPOLEN_MD5SIG);
634 /* Update length and the length the header thinks exists */
635 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
636 rep.th.doff = arg.iov[0].iov_len / 4;
637
49a72dfb 638 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
639 key, ip_hdr(skb)->saddr,
640 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
641 }
642#endif
eddc9ec5
ACM
643 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
644 ip_hdr(skb)->saddr, /* XXX */
52cd5750 645 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 646 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
88ef4a5a 647 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
1da177e4 648
adf30907 649 net = dev_net(skb_dst(skb)->dev);
a86b1e30 650 ip_send_reply(net->ipv4.tcp_sock, skb,
7feb49c8 651 &arg, arg.iov[0].iov_len);
1da177e4 652
63231bdd
PE
653 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
654 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
1da177e4
LT
655}
656
657/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
658 outside socket context is ugly, certainly. What can I do?
659 */
660
9501f972
YH
661static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
662 u32 win, u32 ts, int oif,
88ef4a5a
KK
663 struct tcp_md5sig_key *key,
664 int reply_flags)
1da177e4 665{
aa8223c7 666 struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
667 struct {
668 struct tcphdr th;
714e85be 669 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 670#ifdef CONFIG_TCP_MD5SIG
714e85be 671 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
672#endif
673 ];
1da177e4
LT
674 } rep;
675 struct ip_reply_arg arg;
adf30907 676 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4
LT
677
678 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 679 memset(&arg, 0, sizeof(arg));
1da177e4
LT
680
681 arg.iov[0].iov_base = (unsigned char *)&rep;
682 arg.iov[0].iov_len = sizeof(rep.th);
683 if (ts) {
cfb6eeb4
YH
684 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
685 (TCPOPT_TIMESTAMP << 8) |
686 TCPOLEN_TIMESTAMP);
687 rep.opt[1] = htonl(tcp_time_stamp);
688 rep.opt[2] = htonl(ts);
cb48cfe8 689 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
690 }
691
692 /* Swap the send and the receive. */
693 rep.th.dest = th->source;
694 rep.th.source = th->dest;
695 rep.th.doff = arg.iov[0].iov_len / 4;
696 rep.th.seq = htonl(seq);
697 rep.th.ack_seq = htonl(ack);
698 rep.th.ack = 1;
699 rep.th.window = htons(win);
700
cfb6eeb4 701#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
702 if (key) {
703 int offset = (ts) ? 3 : 0;
704
705 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
706 (TCPOPT_NOP << 16) |
707 (TCPOPT_MD5SIG << 8) |
708 TCPOLEN_MD5SIG);
709 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
710 rep.th.doff = arg.iov[0].iov_len/4;
711
49a72dfb 712 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
713 key, ip_hdr(skb)->saddr,
714 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
715 }
716#endif
88ef4a5a 717 arg.flags = reply_flags;
eddc9ec5
ACM
718 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
719 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
720 arg.iov[0].iov_len, IPPROTO_TCP, 0);
721 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
722 if (oif)
723 arg.bound_dev_if = oif;
1da177e4 724
a86b1e30 725 ip_send_reply(net->ipv4.tcp_sock, skb,
7feb49c8 726 &arg, arg.iov[0].iov_len);
1da177e4 727
63231bdd 728 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
1da177e4
LT
729}
730
731static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
732{
8feaf0c0 733 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 734 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 735
9501f972 736 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 737 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9501f972
YH
738 tcptw->tw_ts_recent,
739 tw->tw_bound_dev_if,
88ef4a5a
KK
740 tcp_twsk_md5_key(tcptw),
741 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
9501f972 742 );
1da177e4 743
8feaf0c0 744 inet_twsk_put(tw);
1da177e4
LT
745}
746
6edafaaf 747static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
7174259e 748 struct request_sock *req)
1da177e4 749{
9501f972 750 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
cfb6eeb4 751 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
9501f972
YH
752 req->ts_recent,
753 0,
88ef4a5a
KK
754 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
755 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
1da177e4
LT
756}
757
1da177e4 758/*
9bf1d83e 759 * Send a SYN-ACK after having received a SYN.
60236fdd 760 * This still operates on a request_sock only, not on a big
1da177e4
LT
761 * socket.
762 */
72659ecc
OP
763static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
764 struct request_sock *req,
765 struct request_values *rvp)
1da177e4 766{
2e6599cb 767 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
768 int err = -1;
769 struct sk_buff * skb;
770
771 /* First, grab a route. */
463c84b9 772 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
fd80eb94 773 return -1;
1da177e4 774
e6b4d113 775 skb = tcp_make_synack(sk, dst, req, rvp);
1da177e4
LT
776
777 if (skb) {
419f9f89 778 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
1da177e4 779
2e6599cb
ACM
780 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
781 ireq->rmt_addr,
782 ireq->opt);
b9df3cb8 783 err = net_xmit_eval(err);
1da177e4
LT
784 }
785
1da177e4
LT
786 dst_release(dst);
787 return err;
788}
789
72659ecc 790static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
e6b4d113 791 struct request_values *rvp)
fd80eb94 792{
72659ecc
OP
793 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
794 return tcp_v4_send_synack(sk, NULL, req, rvp);
fd80eb94
DL
795}
796
1da177e4 797/*
60236fdd 798 * IPv4 request_sock destructor.
1da177e4 799 */
60236fdd 800static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 801{
a51482bd 802 kfree(inet_rsk(req)->opt);
1da177e4
LT
803}
804
2a1d4bd4 805static void syn_flood_warning(const struct sk_buff *skb)
1da177e4 806{
2a1d4bd4 807 const char *msg;
1da177e4 808
2a1d4bd4
FW
809#ifdef CONFIG_SYN_COOKIES
810 if (sysctl_tcp_syncookies)
811 msg = "Sending cookies";
812 else
80e40daa 813#endif
2a1d4bd4
FW
814 msg = "Dropping request";
815
816 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
817 ntohs(tcp_hdr(skb)->dest), msg);
818}
1da177e4
LT
819
820/*
60236fdd 821 * Save and compile IPv4 options into the request_sock if needed.
1da177e4 822 */
40efc6fa
SH
823static struct ip_options *tcp_v4_save_options(struct sock *sk,
824 struct sk_buff *skb)
1da177e4
LT
825{
826 struct ip_options *opt = &(IPCB(skb)->opt);
827 struct ip_options *dopt = NULL;
828
829 if (opt && opt->optlen) {
830 int opt_size = optlength(opt);
831 dopt = kmalloc(opt_size, GFP_ATOMIC);
832 if (dopt) {
833 if (ip_options_echo(dopt, skb)) {
834 kfree(dopt);
835 dopt = NULL;
836 }
837 }
838 }
839 return dopt;
840}
841
cfb6eeb4
YH
842#ifdef CONFIG_TCP_MD5SIG
843/*
844 * RFC2385 MD5 checksumming requires a mapping of
845 * IP address->MD5 Key.
846 * We need to maintain these in the sk structure.
847 */
848
849/* Find the Key structure for an address. */
7174259e
ACM
850static struct tcp_md5sig_key *
851 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
cfb6eeb4
YH
852{
853 struct tcp_sock *tp = tcp_sk(sk);
854 int i;
855
856 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
857 return NULL;
858 for (i = 0; i < tp->md5sig_info->entries4; i++) {
859 if (tp->md5sig_info->keys4[i].addr == addr)
f8ab18d2 860 return &tp->md5sig_info->keys4[i].base;
cfb6eeb4
YH
861 }
862 return NULL;
863}
864
865struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
866 struct sock *addr_sk)
867{
c720c7e8 868 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
cfb6eeb4 869}
cfb6eeb4
YH
870EXPORT_SYMBOL(tcp_v4_md5_lookup);
871
f5b99bcd
AB
872static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
873 struct request_sock *req)
cfb6eeb4
YH
874{
875 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
876}
877
878/* This can be called on a newly created socket, from other files */
879int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
880 u8 *newkey, u8 newkeylen)
881{
882 /* Add Key to the list */
b0a713e9 883 struct tcp_md5sig_key *key;
cfb6eeb4
YH
884 struct tcp_sock *tp = tcp_sk(sk);
885 struct tcp4_md5sig_key *keys;
886
b0a713e9 887 key = tcp_v4_md5_do_lookup(sk, addr);
cfb6eeb4
YH
888 if (key) {
889 /* Pre-existing entry - just update that one. */
b0a713e9
MD
890 kfree(key->key);
891 key->key = newkey;
892 key->keylen = newkeylen;
cfb6eeb4 893 } else {
f6685938
ACM
894 struct tcp_md5sig_info *md5sig;
895
cfb6eeb4 896 if (!tp->md5sig_info) {
f6685938
ACM
897 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
898 GFP_ATOMIC);
cfb6eeb4
YH
899 if (!tp->md5sig_info) {
900 kfree(newkey);
901 return -ENOMEM;
902 }
a465419b 903 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
cfb6eeb4 904 }
aa133076 905 if (tcp_alloc_md5sig_pool(sk) == NULL) {
cfb6eeb4
YH
906 kfree(newkey);
907 return -ENOMEM;
908 }
f6685938
ACM
909 md5sig = tp->md5sig_info;
910
911 if (md5sig->alloced4 == md5sig->entries4) {
912 keys = kmalloc((sizeof(*keys) *
e905a9ed 913 (md5sig->entries4 + 1)), GFP_ATOMIC);
cfb6eeb4
YH
914 if (!keys) {
915 kfree(newkey);
916 tcp_free_md5sig_pool();
917 return -ENOMEM;
918 }
919
f6685938
ACM
920 if (md5sig->entries4)
921 memcpy(keys, md5sig->keys4,
922 sizeof(*keys) * md5sig->entries4);
cfb6eeb4
YH
923
924 /* Free old key list, and reference new one */
a80cc20d 925 kfree(md5sig->keys4);
f6685938
ACM
926 md5sig->keys4 = keys;
927 md5sig->alloced4++;
cfb6eeb4 928 }
f6685938 929 md5sig->entries4++;
f8ab18d2
DM
930 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
931 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
932 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
cfb6eeb4
YH
933 }
934 return 0;
935}
cfb6eeb4
YH
936EXPORT_SYMBOL(tcp_v4_md5_do_add);
937
938static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
939 u8 *newkey, u8 newkeylen)
940{
c720c7e8 941 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
cfb6eeb4
YH
942 newkey, newkeylen);
943}
944
945int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
946{
947 struct tcp_sock *tp = tcp_sk(sk);
948 int i;
949
950 for (i = 0; i < tp->md5sig_info->entries4; i++) {
951 if (tp->md5sig_info->keys4[i].addr == addr) {
952 /* Free the key */
f8ab18d2 953 kfree(tp->md5sig_info->keys4[i].base.key);
cfb6eeb4
YH
954 tp->md5sig_info->entries4--;
955
956 if (tp->md5sig_info->entries4 == 0) {
957 kfree(tp->md5sig_info->keys4);
958 tp->md5sig_info->keys4 = NULL;
8228a18d 959 tp->md5sig_info->alloced4 = 0;
7174259e 960 } else if (tp->md5sig_info->entries4 != i) {
cfb6eeb4 961 /* Need to do some manipulation */
354faf09
YH
962 memmove(&tp->md5sig_info->keys4[i],
963 &tp->md5sig_info->keys4[i+1],
964 (tp->md5sig_info->entries4 - i) *
965 sizeof(struct tcp4_md5sig_key));
cfb6eeb4
YH
966 }
967 tcp_free_md5sig_pool();
968 return 0;
969 }
970 }
971 return -ENOENT;
972}
cfb6eeb4
YH
973EXPORT_SYMBOL(tcp_v4_md5_do_del);
974
7174259e 975static void tcp_v4_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
976{
977 struct tcp_sock *tp = tcp_sk(sk);
978
979 /* Free each key, then the set of key keys,
980 * the crypto element, and then decrement our
981 * hold on the last resort crypto.
982 */
983 if (tp->md5sig_info->entries4) {
984 int i;
985 for (i = 0; i < tp->md5sig_info->entries4; i++)
f8ab18d2 986 kfree(tp->md5sig_info->keys4[i].base.key);
cfb6eeb4
YH
987 tp->md5sig_info->entries4 = 0;
988 tcp_free_md5sig_pool();
989 }
990 if (tp->md5sig_info->keys4) {
991 kfree(tp->md5sig_info->keys4);
992 tp->md5sig_info->keys4 = NULL;
993 tp->md5sig_info->alloced4 = 0;
994 }
995}
996
7174259e
ACM
997static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
998 int optlen)
cfb6eeb4
YH
999{
1000 struct tcp_md5sig cmd;
1001 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1002 u8 *newkey;
1003
1004 if (optlen < sizeof(cmd))
1005 return -EINVAL;
1006
7174259e 1007 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1008 return -EFAULT;
1009
1010 if (sin->sin_family != AF_INET)
1011 return -EINVAL;
1012
1013 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1014 if (!tcp_sk(sk)->md5sig_info)
1015 return -ENOENT;
1016 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1017 }
1018
1019 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020 return -EINVAL;
1021
1022 if (!tcp_sk(sk)->md5sig_info) {
1023 struct tcp_sock *tp = tcp_sk(sk);
aa133076 1024 struct tcp_md5sig_info *p;
cfb6eeb4 1025
aa133076 1026 p = kzalloc(sizeof(*p), sk->sk_allocation);
cfb6eeb4
YH
1027 if (!p)
1028 return -EINVAL;
1029
1030 tp->md5sig_info = p;
a465419b 1031 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1032 }
1033
aa133076 1034 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
cfb6eeb4
YH
1035 if (!newkey)
1036 return -ENOMEM;
cfb6eeb4
YH
1037 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1038 newkey, cmd.tcpm_keylen);
1039}
1040
49a72dfb
AL
1041static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1042 __be32 daddr, __be32 saddr, int nbytes)
cfb6eeb4 1043{
cfb6eeb4 1044 struct tcp4_pseudohdr *bp;
49a72dfb 1045 struct scatterlist sg;
cfb6eeb4
YH
1046
1047 bp = &hp->md5_blk.ip4;
cfb6eeb4
YH
1048
1049 /*
49a72dfb 1050 * 1. the TCP pseudo-header (in the order: source IP address,
cfb6eeb4
YH
1051 * destination IP address, zero-padded protocol number, and
1052 * segment length)
1053 */
1054 bp->saddr = saddr;
1055 bp->daddr = daddr;
1056 bp->pad = 0;
076fb722 1057 bp->protocol = IPPROTO_TCP;
49a72dfb 1058 bp->len = cpu_to_be16(nbytes);
c7da57a1 1059
49a72dfb
AL
1060 sg_init_one(&sg, bp, sizeof(*bp));
1061 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1062}
1063
1064static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1065 __be32 daddr, __be32 saddr, struct tcphdr *th)
1066{
1067 struct tcp_md5sig_pool *hp;
1068 struct hash_desc *desc;
1069
1070 hp = tcp_get_md5sig_pool();
1071 if (!hp)
1072 goto clear_hash_noput;
1073 desc = &hp->md5_desc;
1074
1075 if (crypto_hash_init(desc))
1076 goto clear_hash;
1077 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1078 goto clear_hash;
1079 if (tcp_md5_hash_header(hp, th))
1080 goto clear_hash;
1081 if (tcp_md5_hash_key(hp, key))
1082 goto clear_hash;
1083 if (crypto_hash_final(desc, md5_hash))
cfb6eeb4
YH
1084 goto clear_hash;
1085
cfb6eeb4 1086 tcp_put_md5sig_pool();
cfb6eeb4 1087 return 0;
49a72dfb 1088
cfb6eeb4
YH
1089clear_hash:
1090 tcp_put_md5sig_pool();
1091clear_hash_noput:
1092 memset(md5_hash, 0, 16);
49a72dfb 1093 return 1;
cfb6eeb4
YH
1094}
1095
49a72dfb
AL
1096int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1097 struct sock *sk, struct request_sock *req,
1098 struct sk_buff *skb)
cfb6eeb4 1099{
49a72dfb
AL
1100 struct tcp_md5sig_pool *hp;
1101 struct hash_desc *desc;
1102 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1103 __be32 saddr, daddr;
1104
1105 if (sk) {
c720c7e8
ED
1106 saddr = inet_sk(sk)->inet_saddr;
1107 daddr = inet_sk(sk)->inet_daddr;
49a72dfb
AL
1108 } else if (req) {
1109 saddr = inet_rsk(req)->loc_addr;
1110 daddr = inet_rsk(req)->rmt_addr;
cfb6eeb4 1111 } else {
49a72dfb
AL
1112 const struct iphdr *iph = ip_hdr(skb);
1113 saddr = iph->saddr;
1114 daddr = iph->daddr;
cfb6eeb4 1115 }
49a72dfb
AL
1116
1117 hp = tcp_get_md5sig_pool();
1118 if (!hp)
1119 goto clear_hash_noput;
1120 desc = &hp->md5_desc;
1121
1122 if (crypto_hash_init(desc))
1123 goto clear_hash;
1124
1125 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1126 goto clear_hash;
1127 if (tcp_md5_hash_header(hp, th))
1128 goto clear_hash;
1129 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1130 goto clear_hash;
1131 if (tcp_md5_hash_key(hp, key))
1132 goto clear_hash;
1133 if (crypto_hash_final(desc, md5_hash))
1134 goto clear_hash;
1135
1136 tcp_put_md5sig_pool();
1137 return 0;
1138
1139clear_hash:
1140 tcp_put_md5sig_pool();
1141clear_hash_noput:
1142 memset(md5_hash, 0, 16);
1143 return 1;
cfb6eeb4 1144}
49a72dfb 1145EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1146
7174259e 1147static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
cfb6eeb4
YH
1148{
1149 /*
1150 * This gets called for each TCP segment that arrives
1151 * so we want to be efficient.
1152 * We have 3 drop cases:
1153 * o No MD5 hash and one expected.
1154 * o MD5 hash and we're not expecting one.
1155 * o MD5 hash and its wrong.
1156 */
1157 __u8 *hash_location = NULL;
1158 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1159 const struct iphdr *iph = ip_hdr(skb);
aa8223c7 1160 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1161 int genhash;
cfb6eeb4
YH
1162 unsigned char newhash[16];
1163
1164 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
7d5d5525 1165 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1166
cfb6eeb4
YH
1167 /* We've parsed the options - do we have a hash? */
1168 if (!hash_expected && !hash_location)
1169 return 0;
1170
1171 if (hash_expected && !hash_location) {
785957d3 1172 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
cfb6eeb4
YH
1173 return 1;
1174 }
1175
1176 if (!hash_expected && hash_location) {
785957d3 1177 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
cfb6eeb4
YH
1178 return 1;
1179 }
1180
1181 /* Okay, so this is hash_expected and hash_location -
1182 * so we need to calculate the checksum.
1183 */
49a72dfb
AL
1184 genhash = tcp_v4_md5_hash_skb(newhash,
1185 hash_expected,
1186 NULL, NULL, skb);
cfb6eeb4
YH
1187
1188 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1189 if (net_ratelimit()) {
673d57e7
HH
1190 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1191 &iph->saddr, ntohs(th->source),
1192 &iph->daddr, ntohs(th->dest),
cfb6eeb4 1193 genhash ? " tcp_v4_calc_md5_hash failed" : "");
cfb6eeb4
YH
1194 }
1195 return 1;
1196 }
1197 return 0;
1198}
1199
1200#endif
1201
/*
 * Request-sock operations for IPv4 TCP.  tcp_v4_conn_request() passes
 * this table to inet_reqsk_alloc() when it allocates a request sock.
 */
72a3effa 1202struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1203 .family = PF_INET,
2e6599cb 1204 .obj_size = sizeof(struct tcp_request_sock),
72659ecc 1205 .rtx_syn_ack = tcp_v4_rtx_synack,
60236fdd
ACM
1206 .send_ack = tcp_v4_reqsk_send_ack,
1207 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1208 .send_reset = tcp_v4_send_reset,
72659ecc 1209 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1210};
1211
cfb6eeb4 1212#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 hooks for request socks; tcp_v4_conn_request() stores a
 * pointer to this table in tcp_rsk(req)->af_specific.
 */
b2e4b3de 1213static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
cfb6eeb4 1214 .md5_lookup = tcp_v4_reqsk_md5_lookup,
e3afe7b7 1215 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1216};
b6332e6c 1217#endif
cfb6eeb4 1218
1da177e4
LT
/*
 * Handle a connection request (SYN) received on listening socket @sk.
 *
 * Allocates a request sock, parses the TCP options of @skb, picks an
 * initial sequence number (syncookie, inherited from a timewait bucket,
 * or freshly generated), sends the SYN-ACK and, unless a syncookie was
 * used, queues the request on the listener.
 *
 * Always returns 0: on every failure path the request is dropped and
 * any request sock allocated here is freed.
 */
1219int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220{
4957faad 1221 struct tcp_extend_values tmp_ext;
1da177e4 1222 struct tcp_options_received tmp_opt;
4957faad 1223 u8 *hash_location;
60236fdd 1224 struct request_sock *req;
e6b4d113 1225 struct inet_request_sock *ireq;
4957faad 1226 struct tcp_sock *tp = tcp_sk(sk);
e6b4d113 1227 struct dst_entry *dst = NULL;
eddc9ec5
ACM
1228 __be32 saddr = ip_hdr(skb)->saddr;
1229 __be32 daddr = ip_hdr(skb)->daddr;
1da177e4 1230 __u32 isn = TCP_SKB_CB(skb)->when;
1da177e4
LT
1231#ifdef CONFIG_SYN_COOKIES
1232 int want_cookie = 0;
1233#else
/* NOTE(review): this macro is not #undef'd within the visible code. */
1234#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1235#endif
1236
1237 /* Never answer to SYNs sent to broadcast or multicast */
511c3f92 1238 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1239 goto drop;
1240
1241 /* TW buckets are converted to open requests without
1242 * limitations, they conserve resources and peer is
1243 * evidently real one.
1244 */
463c84b9 1245 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
2a1d4bd4
FW
1246 if (net_ratelimit())
1247 syn_flood_warning(skb);
1da177e4
LT
1248#ifdef CONFIG_SYN_COOKIES
1249 if (sysctl_tcp_syncookies) {
1250 want_cookie = 1;
1251 } else
1252#endif
1253 goto drop;
1254 }
1255
1256 /* Accept backlog is full. If we have already queued enough
1257 * of warm entries in syn queue, drop request. It is better than
1258 * clogging syn queue with openreqs with exponentially increasing
1259 * timeout.
1260 */
463c84b9 1261 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1da177e4
LT
1262 goto drop;
1263
ce4a7d0d 1264 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1da177e4
LT
1265 if (!req)
1266 goto drop;
1267
cfb6eeb4
YH
1268#ifdef CONFIG_TCP_MD5SIG
1269 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1270#endif
1271
1da177e4 1272 tcp_clear_options(&tmp_opt);
bee7ca9e 1273 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4957faad 1274 tmp_opt.user_mss = tp->rx_opt.user_mss;
bb5b7c11 1275 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
4957faad
WAS
1276
/* TCP Cookie Transactions: mix addresses and the initiator's cookie
 * (read through hash_location) into the bakery, or record that no
 * cookie is to be sent.
 */
1277 if (tmp_opt.cookie_plus > 0 &&
1278 tmp_opt.saw_tstamp &&
1279 !tp->rx_opt.cookie_out_never &&
1280 (sysctl_tcp_cookie_size > 0 ||
1281 (tp->cookie_values != NULL &&
1282 tp->cookie_values->cookie_desired > 0))) {
1283 u8 *c;
1284 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1285 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1286
1287 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1288 goto drop_and_release;
1289
1290 /* Secret recipe starts with IP addresses */
0eae88f3
ED
1291 *mess++ ^= (__force u32)daddr;
1292 *mess++ ^= (__force u32)saddr;
1da177e4 1293
4957faad
WAS
1294 /* plus variable length Initiator Cookie */
1295 c = (u8 *)mess;
1296 while (l-- > 0)
1297 *c++ ^= *hash_location++;
1298
1299#ifdef CONFIG_SYN_COOKIES
1300 want_cookie = 0; /* not our kind of cookie */
1301#endif
1302 tmp_ext.cookie_out_never = 0; /* false */
1303 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1304 } else if (!tp->rx_opt.cookie_in_always) {
1305 /* redundant indications, but ensure initialization. */
1306 tmp_ext.cookie_out_never = 1; /* true */
1307 tmp_ext.cookie_plus = 0;
1308 } else {
/* cookie_in_always is set but no usable cookie arrived: drop. */
1309 goto drop_and_release;
1310 }
1311 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1da177e4 1312
/* Syncookie without a timestamp: forget the parsed options. */
4dfc2817 1313 if (want_cookie && !tmp_opt.saw_tstamp)
1da177e4 1314 tcp_clear_options(&tmp_opt);
1da177e4 1315
1da177e4 1316 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1da177e4
LT
1317 tcp_openreq_init(req, &tmp_opt, skb);
1318
bb5b7c11
DM
1319 ireq = inet_rsk(req);
1320 ireq->loc_addr = daddr;
1321 ireq->rmt_addr = saddr;
1322 ireq->no_srccheck = inet_sk(sk)->transparent;
1323 ireq->opt = tcp_v4_save_options(sk, skb);
1324
284904aa 1325 if (security_inet_conn_request(sk, skb, req))
bb5b7c11 1326 goto drop_and_free;
284904aa 1327
172d69e6 1328 if (!want_cookie || tmp_opt.tstamp_ok)
aa8223c7 1329 TCP_ECN_create_request(req, tcp_hdr(skb));
1da177e4
LT
1330
1331 if (want_cookie) {
1da177e4 1332 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
172d69e6 1333 req->cookie_ts = tmp_opt.tstamp_ok;
1da177e4
LT
1334 } else if (!isn) {
1335 struct inet_peer *peer = NULL;
1336
1337 /* VJ's idea. We save last timestamp seen
1338 * from the destination in peer table, when entering
1339 * state TIME-WAIT, and check against it before
1340 * accepting new connection request.
1341 *
1342 * If "isn" is not zero, this request hit alive
1343 * timewait bucket, so that all the necessary checks
1344 * are made in the function processing timewait state.
1345 */
1346 if (tmp_opt.saw_tstamp &&
295ff7ed 1347 tcp_death_row.sysctl_tw_recycle &&
bb5b7c11 1348 (dst = inet_csk_route_req(sk, req)) != NULL &&
1da177e4 1349 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
7a71ed89 1350 peer->daddr.addr.a4 == saddr) {
317fe0e6 1351 inet_peer_refcheck(peer);
2c1409a0 1352 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1da177e4
LT
1353 (s32)(peer->tcp_ts - req->ts_recent) >
1354 TCP_PAWS_WINDOW) {
de0744af 1355 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
7cd04fa7 1356 goto drop_and_release;
1da177e4
LT
1357 }
1358 }
1359 /* Kill the following clause, if you dislike this way. */
1360 else if (!sysctl_tcp_syncookies &&
463c84b9 1361 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1da177e4
LT
1362 (sysctl_max_syn_backlog >> 2)) &&
1363 (!peer || !peer->tcp_ts_stamp) &&
1364 (!dst || !dst_metric(dst, RTAX_RTT))) {
1365 /* Without syncookies last quarter of
1366 * backlog is filled with destinations,
1367 * proven to be alive.
1368 * It means that we continue to communicate
1369 * to destinations, already remembered
1370 * to the moment of synflood.
1371 */
673d57e7
HH
1372 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1373 &saddr, ntohs(tcp_hdr(skb)->source));
7cd04fa7 1374 goto drop_and_release;
1da177e4
LT
1375 }
1376
a94f723d 1377 isn = tcp_v4_init_sequence(skb);
1da177e4 1378 }
2e6599cb 1379 tcp_rsk(req)->snt_isn = isn;
1da177e4 1380
/* A failed SYN-ACK or a syncookie reply both end with the request
 * being freed instead of queued.
 */
72659ecc
OP
1381 if (tcp_v4_send_synack(sk, dst, req,
1382 (struct request_values *)&tmp_ext) ||
4957faad 1383 want_cookie)
1da177e4
LT
1384 goto drop_and_free;
1385
7cd04fa7 1386 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1da177e4
LT
1387 return 0;
1388
/* Error unwinding: each label falls through to the next one. */
7cd04fa7
DL
1389drop_and_release:
1390 dst_release(dst);
1da177e4 1391drop_and_free:
60236fdd 1392 reqsk_free(req);
1da177e4 1393drop:
1da177e4
LT
1394 return 0;
1395}
4bc2f18b 1396EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1397
1398
1399/*
1400 * The three way handshake has completed - we got a valid synack -
1401 * now create the new socket.
 *
 * @sk:  the listening socket
 * @skb: the segment being processed
 * @req: the request sock describing the pending connection
 * @dst: route to use, or NULL to have one looked up here
 *
 * Returns the new child socket, or NULL if the accept queue is full,
 * no route is available, the child cannot be created or the local
 * port cannot be inherited.
1402 */
1403struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
60236fdd 1404 struct request_sock *req,
1da177e4
LT
1405 struct dst_entry *dst)
1406{
2e6599cb 1407 struct inet_request_sock *ireq;
1da177e4
LT
1408 struct inet_sock *newinet;
1409 struct tcp_sock *newtp;
1410 struct sock *newsk;
cfb6eeb4
YH
1411#ifdef CONFIG_TCP_MD5SIG
1412 struct tcp_md5sig_key *key;
1413#endif
1da177e4
LT
1414
1415 if (sk_acceptq_is_full(sk))
1416 goto exit_overflow;
1417
463c84b9 1418 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1da177e4
LT
1419 goto exit;
1420
1421 newsk = tcp_create_openreq_child(sk, req, skb);
1422 if (!newsk)
093d2823 1423 goto exit_nonewsk;
1da177e4 1424
bcd76111 1425 newsk->sk_gso_type = SKB_GSO_TCPV4;
6cbb0df7 1426 sk_setup_caps(newsk, dst);
1da177e4
LT
1427
1428 newtp = tcp_sk(newsk);
1429 newinet = inet_sk(newsk);
2e6599cb 1430 ireq = inet_rsk(req);
c720c7e8
ED
1431 newinet->inet_daddr = ireq->rmt_addr;
1432 newinet->inet_rcv_saddr = ireq->loc_addr;
1433 newinet->inet_saddr = ireq->loc_addr;
/* The saved IP options move from the request to the child. */
2e6599cb
ACM
1434 newinet->opt = ireq->opt;
1435 ireq->opt = NULL;
463c84b9 1436 newinet->mc_index = inet_iif(skb);
eddc9ec5 1437 newinet->mc_ttl = ip_hdr(skb)->ttl;
d83d8461 1438 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1da177e4 1439 if (newinet->opt)
d83d8461 1440 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
c720c7e8 1441 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1442
5d424d5a 1443 tcp_mtup_init(newsk);
1da177e4 1444 tcp_sync_mss(newsk, dst_mtu(dst));
0dbaee3b 1445 newtp->advmss = dst_metric_advmss(dst);
/* A smaller user-configured MSS on the listener caps advmss. */
f5fff5dc
TQ
1446 if (tcp_sk(sk)->rx_opt.user_mss &&
1447 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1448 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1449
1da177e4
LT
1450 tcp_initialize_rcv_mss(newsk);
1451
cfb6eeb4
YH
1452#ifdef CONFIG_TCP_MD5SIG
1453 /* Copy over the MD5 key from the original socket */
c720c7e8
ED
1454 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1455 if (key != NULL) {
cfb6eeb4
YH
1456 /*
1457 * We're using one, so create a matching key
1458 * on the newsk structure. If we fail to get
1459 * memory, then we end up not copying the key
1460 * across. Shucks.
1461 */
f6685938
ACM
1462 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463 if (newkey != NULL)
c720c7e8 1464 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
cfb6eeb4 1465 newkey, key->keylen);
/* Runs whether or not the key copy above succeeded. */
a465419b 1466 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1467 }
1468#endif
1469
093d2823
BS
1470 if (__inet_inherit_port(sk, newsk) < 0) {
1471 sock_put(newsk);
1472 goto exit;
1473 }
9327f705 1474 __inet_hash_nolisten(newsk, NULL);
1da177e4
LT
1475
1476 return newsk;
1477
/* Error exits fall through: overflow and no-child both release dst. */
1478exit_overflow:
de0744af 1479 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1480exit_nonewsk:
1481 dst_release(dst);
1da177e4 1482exit:
de0744af 1483 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4
LT
1484 return NULL;
1485}
4bc2f18b 1486EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4
LT
1487
1488static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1489{
aa8223c7 1490 struct tcphdr *th = tcp_hdr(skb);
eddc9ec5 1491 const struct iphdr *iph = ip_hdr(skb);
1da177e4 1492 struct sock *nsk;
60236fdd 1493 struct request_sock **prev;
1da177e4 1494 /* Find possible connection requests. */
463c84b9
ACM
1495 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1496 iph->saddr, iph->daddr);
1da177e4
LT
1497 if (req)
1498 return tcp_check_req(sk, skb, req, prev);
1499
3b1e0a65 1500 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
c67499c0 1501 th->source, iph->daddr, th->dest, inet_iif(skb));
1da177e4
LT
1502
1503 if (nsk) {
1504 if (nsk->sk_state != TCP_TIME_WAIT) {
1505 bh_lock_sock(nsk);
1506 return nsk;
1507 }
9469c7b4 1508 inet_twsk_put(inet_twsk(nsk));
1da177e4
LT
1509 return NULL;
1510 }
1511
1512#ifdef CONFIG_SYN_COOKIES
af9b4738 1513 if (!th->syn)
1da177e4
LT
1514 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1515#endif
1516 return sk;
1517}
1518
b51655b9 1519static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1da177e4 1520{
eddc9ec5
ACM
1521 const struct iphdr *iph = ip_hdr(skb);
1522
84fa7933 1523 if (skb->ip_summed == CHECKSUM_COMPLETE) {
eddc9ec5
ACM
1524 if (!tcp_v4_check(skb->len, iph->saddr,
1525 iph->daddr, skb->csum)) {
fb286bb2 1526 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4 1527 return 0;
fb286bb2 1528 }
1da177e4 1529 }
fb286bb2 1530
eddc9ec5 1531 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
fb286bb2
HX
1532 skb->len, IPPROTO_TCP, 0);
1533
1da177e4 1534 if (skb->len <= 76) {
fb286bb2 1535 return __skb_checksum_complete(skb);
1da177e4
LT
1536 }
1537 return 0;
1538}
1539
1540
1541/* The socket must have its spinlock held when we get
1542 * here.
1543 *
1544 * We have a potential double-lock case here, so even when
1545 * doing backlog processing we use the BH locking scheme.
1546 * This is because we cannot sleep with the original spinlock
1547 * held.
 *
 * Processes one segment @skb for socket @sk according to the socket
 * state.  Always returns 0; on the reset, discard and checksum-error
 * paths the skb is freed here.
1548 */
1549int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1550{
/* rsk is the socket a reset is sent on behalf of. */
cfb6eeb4
YH
1551 struct sock *rsk;
1552#ifdef CONFIG_TCP_MD5SIG
1553 /*
1554 * We really want to reject the packet as early as possible
1555 * if:
1556 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1557 * o There is an MD5 option and we're not expecting one
1558 */
7174259e 1559 if (tcp_v4_inbound_md5_hash(sk, skb))
cfb6eeb4
YH
1560 goto discard;
1561#endif
1562
1da177e4 1563 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
ca55158c 1564 sock_rps_save_rxhash(sk, skb->rxhash);
aa8223c7 1565 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1566 rsk = sk;
1da177e4 1567 goto reset;
cfb6eeb4 1568 }
1da177e4
LT
1569 return 0;
1570 }
1571
ab6a5bb6 1572 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1da177e4
LT
1573 goto csum_err;
1574
1575 if (sk->sk_state == TCP_LISTEN) {
1576 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577 if (!nsk)
1578 goto discard;
1579
/* A different socket was found: hand the segment to it. */
1580 if (nsk != sk) {
cfb6eeb4
YH
1581 if (tcp_child_process(sk, nsk, skb)) {
1582 rsk = nsk;
1da177e4 1583 goto reset;
cfb6eeb4 1584 }
1da177e4
LT
1585 return 0;
1586 }
ca55158c
ED
1587 } else
1588 sock_rps_save_rxhash(sk, skb->rxhash);
1589
aa8223c7 1590 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1591 rsk = sk;
1da177e4 1592 goto reset;
cfb6eeb4 1593 }
1da177e4
LT
1594 return 0;
1595
1596reset:
cfb6eeb4 1597 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1598discard:
1599 kfree_skb(skb);
1600 /* Be careful here. If this function gets more complicated and
1601 * gcc suffers from register pressure on the x86, sk (in %ebx)
1602 * might be destroyed here. This current version compiles correctly,
1603 * but you have been warned.
1604 */
1605 return 0;
1606
1607csum_err:
63231bdd 1608 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1609 goto discard;
1610}
4bc2f18b 1611EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4
LT
1612
1613/*
1614 * From tcp_input.c
1615 */
1616
/*
 * Receive one IPv4 TCP segment.
 *
 * Validates the TCP header and checksum, fills in the TCP control
 * block of @skb, looks up the owning socket and then either processes
 * the segment via tcp_v4_do_rcv(), prequeues it, or appends it to the
 * socket backlog when the socket is owned by user context.  Segments
 * for TIME_WAIT sockets and segments with no socket are handled at the
 * labels near the end.
 *
 * Returns the result of tcp_v4_do_rcv() when it was called, else 0.
 */
1617int tcp_v4_rcv(struct sk_buff *skb)
1618{
eddc9ec5 1619 const struct iphdr *iph;
1da177e4
LT
1620 struct tcphdr *th;
1621 struct sock *sk;
1622 int ret;
a86b1e30 1623 struct net *net = dev_net(skb->dev);
1da177e4
LT
1624
1625 if (skb->pkt_type != PACKET_HOST)
1626 goto discard_it;
1627
1628 /* Count it even if it's bad */
63231bdd 1629 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1630
1631 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632 goto discard_it;
1633
aa8223c7 1634 th = tcp_hdr(skb);
1da177e4
LT
1635
/* doff counts 32-bit words; it must cover at least the base header. */
1636 if (th->doff < sizeof(struct tcphdr) / 4)
1637 goto bad_packet;
1638 if (!pskb_may_pull(skb, th->doff * 4))
1639 goto discard_it;
1640
1641 /* An explanation is required here, I think.
1642 * Packet length and doff are validated by header prediction,
caa20d9a 1643 * provided case of th->doff==0 is eliminated.
1da177e4 1644 * So, we defer the checks. */
60476372 1645 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1da177e4
LT
1646 goto bad_packet;
1647
/* Header pointers are fetched again after the pskb_may_pull() calls. */
aa8223c7 1648 th = tcp_hdr(skb);
eddc9ec5 1649 iph = ip_hdr(skb);
1da177e4
LT
1650 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1651 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1652 skb->len - th->doff * 4);
1653 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1654 TCP_SKB_CB(skb)->when = 0;
eddc9ec5 1655 TCP_SKB_CB(skb)->flags = iph->tos;
1da177e4
LT
1656 TCP_SKB_CB(skb)->sacked = 0;
1657
9a1f27c4 1658 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1659 if (!sk)
1660 goto no_tcp_socket;
1661
/* Also re-entered from the TCP_TW_SYN case below with a listener. */
bb134d5d
ED
1662process:
1663 if (sk->sk_state == TCP_TIME_WAIT)
1664 goto do_time_wait;
1665
6cce09f8
ED
1666 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1667 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1668 goto discard_and_relse;
6cce09f8 1669 }
d218d111 1670
1da177e4
LT
1671 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1672 goto discard_and_relse;
b59c2701 1673 nf_reset(skb);
1da177e4 1674
fda9ef5d 1675 if (sk_filter(sk, skb))
1da177e4
LT
1676 goto discard_and_relse;
1677
1678 skb->dev = NULL;
1679
c6366184 1680 bh_lock_sock_nested(sk);
1da177e4
LT
1681 ret = 0;
1682 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
1683#ifdef CONFIG_NET_DMA
1684 struct tcp_sock *tp = tcp_sk(sk);
1685 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
f67b4599 1686 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1a2449a8 1687 if (tp->ucopy.dma_chan)
1da177e4 1688 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
1689 else
1690#endif
1691 {
1692 if (!tcp_prequeue(sk, skb))
ae8d7f88 1693 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 1694 }
6cce09f8 1695 } else if (unlikely(sk_add_backlog(sk, skb))) {
/* Backlog add failed: unlock here, then drop the segment. */
6b03a53a 1696 bh_unlock_sock(sk);
6cce09f8 1697 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1698 goto discard_and_relse;
1699 }
1da177e4
LT
1700 bh_unlock_sock(sk);
1701
1702 sock_put(sk);
1703
1704 return ret;
1705
1706no_tcp_socket:
1707 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1708 goto discard_it;
1709
1710 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
/* bad_packet is also the target of the header and checksum checks
 * near the top of the function.
 */
1711bad_packet:
63231bdd 1712 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1713 } else {
cfb6eeb4 1714 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1715 }
1716
1717discard_it:
1718 /* Discard frame. */
1719 kfree_skb(skb);
e905a9ed 1720 return 0;
1da177e4
LT
1721
1722discard_and_relse:
1723 sock_put(sk);
1724 goto discard_it;
1725
1726do_time_wait:
1727 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1728 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1729 goto discard_it;
1730 }
1731
1732 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
63231bdd 1733 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
9469c7b4 1734 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1735 goto discard_it;
1736 }
9469c7b4 1737 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1738 case TCP_TW_SYN: {
c346dca1 1739 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1740 &tcp_hashinfo,
eddc9ec5 1741 iph->daddr, th->dest,
463c84b9 1742 inet_iif(skb));
1da177e4 1743 if (sk2) {
/* A listener exists: retire the timewait sock and restart
 * processing with the listener.
 */
9469c7b4
YH
1744 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1745 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1746 sk = sk2;
1747 goto process;
1748 }
1749 /* Fall through to ACK */
1750 }
1751 case TCP_TW_ACK:
1752 tcp_v4_timewait_ack(sk, skb);
1753 break;
1754 case TCP_TW_RST:
1755 goto no_tcp_socket;
1756 case TCP_TW_SUCCESS:;
1757 }
1758 goto discard_it;
1759}
1760
3f419d2d 1761struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1da177e4 1762{
3f419d2d 1763 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1da177e4 1764 struct inet_sock *inet = inet_sk(sk);
3f419d2d 1765 struct inet_peer *peer;
1da177e4 1766
c720c7e8 1767 if (!rt || rt->rt_dst != inet->inet_daddr) {
b534ecf1 1768 peer = inet_getpeer_v4(inet->inet_daddr, 1);
3f419d2d 1769 *release_it = true;
1da177e4
LT
1770 } else {
1771 if (!rt->peer)
1772 rt_bind_peer(rt, 1);
1773 peer = rt->peer;
3f419d2d 1774 *release_it = false;
1da177e4
LT
1775 }
1776
3f419d2d 1777 return peer;
1da177e4 1778}
3f419d2d 1779EXPORT_SYMBOL(tcp_v4_get_peer);
1da177e4 1780
ccb7c410 1781void *tcp_v4_tw_get_peer(struct sock *sk)
1da177e4 1782{
ccb7c410 1783 struct inet_timewait_sock *tw = inet_twsk(sk);
1da177e4 1784
ccb7c410 1785 return inet_getpeer_v4(tw->tw_daddr, 1);
1da177e4 1786}
ccb7c410
DM
1787EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1788
/* Timewait-sock operations table for TCP. */
1789static struct timewait_sock_ops tcp_timewait_sock_ops = {
1790 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1791 .twsk_unique = tcp_twsk_unique,
1792 .twsk_destructor= tcp_twsk_destructor,
1793 .twsk_getpeer = tcp_v4_tw_get_peer,
1794};
1da177e4 1795
/*
 * Address-family operations for TCP over IPv4; tcp_v4_init_sock()
 * installs this table as icsk->icsk_af_ops.
 */
3b401a81 1796const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1797 .queue_xmit = ip_queue_xmit,
1798 .send_check = tcp_v4_send_check,
1799 .rebuild_header = inet_sk_rebuild_header,
1800 .conn_request = tcp_v4_conn_request,
1801 .syn_recv_sock = tcp_v4_syn_recv_sock,
3f419d2d 1802 .get_peer = tcp_v4_get_peer,
543d9cfe
ACM
1803 .net_header_len = sizeof(struct iphdr),
1804 .setsockopt = ip_setsockopt,
1805 .getsockopt = ip_getsockopt,
1806 .addr2sockaddr = inet_csk_addr2sockaddr,
1807 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1808 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1809#ifdef CONFIG_COMPAT
543d9cfe
ACM
1810 .compat_setsockopt = compat_ip_setsockopt,
1811 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1812#endif
1da177e4 1813};
4bc2f18b 1814EXPORT_SYMBOL(ipv4_specific);
1da177e4 1815
cfb6eeb4 1816#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature hooks for full sockets; tcp_v4_init_sock()
 * installs this table as tp->af_specific.
 */
b2e4b3de 1817static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1818 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1819 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4
YH
1820 .md5_add = tcp_v4_md5_add_func,
1821 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1822};
b6332e6c 1823#endif
cfb6eeb4 1824
1da177e4
LT
1825/* NOTE: A lot of things set to zero explicitly by call to
1826 * sk_alloc() so need not be done here.
1827 */
1828static int tcp_v4_init_sock(struct sock *sk)
1829{
6687e988 1830 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4
LT
1831 struct tcp_sock *tp = tcp_sk(sk);
1832
1833 skb_queue_head_init(&tp->out_of_order_queue);
1834 tcp_init_xmit_timers(sk);
1835 tcp_prequeue_init(tp);
1836
6687e988 1837 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1da177e4
LT
1838 tp->mdev = TCP_TIMEOUT_INIT;
1839
1840 /* So many TCP implementations out there (incorrectly) count the
1841 * initial SYN frame in their delayed-ACK and congestion control
1842 * algorithms that we must have the following bandaid to talk
1843 * efficiently to them. -DaveM
1844 */
1845 tp->snd_cwnd = 2;
1846
1847 /* See draft-stevens-tcpca-spec-01 for discussion of the
1848 * initialization of these values.
1849 */
0b6a05c1 1850 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1da177e4 1851 tp->snd_cwnd_clamp = ~0;
bee7ca9e 1852 tp->mss_cache = TCP_MSS_DEFAULT;
1da177e4
LT
1853
1854 tp->reordering = sysctl_tcp_reordering;
6687e988 1855 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1da177e4
LT
1856
1857 sk->sk_state = TCP_CLOSE;
1858
1859 sk->sk_write_space = sk_stream_write_space;
1860 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1861
8292a17a 1862 icsk->icsk_af_ops = &ipv4_specific;
d83d8461 1863 icsk->icsk_sync_mss = tcp_sync_mss;
cfb6eeb4
YH
1864#ifdef CONFIG_TCP_MD5SIG
1865 tp->af_specific = &tcp_sock_ipv4_specific;
1866#endif
1da177e4 1867
435cf559
WAS
1868 /* TCP Cookie Transactions */
1869 if (sysctl_tcp_cookie_size > 0) {
1870 /* Default, cookies without s_data_payload. */
1871 tp->cookie_values =
1872 kzalloc(sizeof(*tp->cookie_values),
1873 sk->sk_allocation);
1874 if (tp->cookie_values != NULL)
1875 kref_init(&tp->cookie_values->kref);
1876 }
1877 /* Presumed zeroed, in order of appearance:
1878 * cookie_in_always, cookie_out_never,
1879 * s_data_constant, s_data_in, s_data_out
1880 */
1da177e4
LT
1881 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1882 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1883
eb4dea58 1884 local_bh_disable();
1748376b 1885 percpu_counter_inc(&tcp_sockets_allocated);
eb4dea58 1886 local_bh_enable();
1da177e4
LT
1887
1888 return 0;
1889}
1890
7d06b2e0 1891void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1892{
1893 struct tcp_sock *tp = tcp_sk(sk);
1894
1895 tcp_clear_xmit_timers(sk);
1896
6687e988 1897 tcp_cleanup_congestion_control(sk);
317a76f9 1898
1da177e4 1899 /* Cleanup up the write buffer. */
fe067e8a 1900 tcp_write_queue_purge(sk);
1da177e4
LT
1901
1902 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1903 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1904
cfb6eeb4
YH
1905#ifdef CONFIG_TCP_MD5SIG
1906 /* Clean up the MD5 key list, if any */
1907 if (tp->md5sig_info) {
1908 tcp_v4_clear_md5_list(sk);
1909 kfree(tp->md5sig_info);
1910 tp->md5sig_info = NULL;
1911 }
1912#endif
1913
1a2449a8
CL
1914#ifdef CONFIG_NET_DMA
1915 /* Cleans up our sk_async_wait_queue */
e905a9ed 1916 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
1917#endif
1918
1da177e4
LT
1919 /* Clean prequeue, it must be empty really */
1920 __skb_queue_purge(&tp->ucopy.prequeue);
1921
1922 /* Clean up a referenced TCP bind bucket. */
463c84b9 1923 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1924 inet_put_port(sk);
1da177e4
LT
1925
1926 /*
1927 * If sendmsg cached page exists, toss it.
1928 */
1929 if (sk->sk_sndmsg_page) {
1930 __free_page(sk->sk_sndmsg_page);
1931 sk->sk_sndmsg_page = NULL;
1932 }
1933
435cf559
WAS
1934 /* TCP Cookie Transactions */
1935 if (tp->cookie_values != NULL) {
1936 kref_put(&tp->cookie_values->kref,
1937 tcp_cookie_values_release);
1938 tp->cookie_values = NULL;
1939 }
1940
1748376b 1941 percpu_counter_dec(&tcp_sockets_allocated);
1da177e4 1942}
1da177e4
LT
1943EXPORT_SYMBOL(tcp_v4_destroy_sock);
1944
1945#ifdef CONFIG_PROC_FS
1946/* Proc filesystem TCP sock list dumping. */
1947
3ab5aee7 1948static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1da177e4 1949{
3ab5aee7 1950 return hlist_nulls_empty(head) ? NULL :
8feaf0c0 1951 list_entry(head->first, struct inet_timewait_sock, tw_node);
1da177e4
LT
1952}
1953
8feaf0c0 1954static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1da177e4 1955{
3ab5aee7
ED
1956 return !is_a_nulls(tw->tw_node.next) ?
1957 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1da177e4
LT
1958}
1959
a8b690f9
TH
1960/*
1961 * Get the next listener socket following cur. If cur is NULL, get the first socket
1962 * starting from bucket given in st->bucket; when st->bucket is zero the
1963 * very first socket in the hash table is returned.
1964 */
1da177e4
LT
1965static void *listening_get_next(struct seq_file *seq, void *cur)
1966{
463c84b9 1967 struct inet_connection_sock *icsk;
c25eb3bf 1968 struct hlist_nulls_node *node;
1da177e4 1969 struct sock *sk = cur;
5caea4ea 1970 struct inet_listen_hashbucket *ilb;
5799de0b 1971 struct tcp_iter_state *st = seq->private;
a4146b1b 1972 struct net *net = seq_file_net(seq);
1da177e4
LT
1973
1974 if (!sk) {
a8b690f9 1975 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1976 spin_lock_bh(&ilb->lock);
c25eb3bf 1977 sk = sk_nulls_head(&ilb->head);
a8b690f9 1978 st->offset = 0;
1da177e4
LT
1979 goto get_sk;
1980 }
5caea4ea 1981 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1982 ++st->num;
a8b690f9 1983 ++st->offset;
1da177e4
LT
1984
1985 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1986 struct request_sock *req = cur;
1da177e4 1987
72a3effa 1988 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
1989 req = req->dl_next;
1990 while (1) {
1991 while (req) {
bdccc4ca 1992 if (req->rsk_ops->family == st->family) {
1da177e4
LT
1993 cur = req;
1994 goto out;
1995 }
1996 req = req->dl_next;
1997 }
72a3effa 1998 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
1999 break;
2000get_req:
463c84b9 2001 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 2002 }
1bde5ac4 2003 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2004 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2005 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2006 } else {
e905a9ed 2007 icsk = inet_csk(sk);
463c84b9
ACM
2008 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2010 goto start_req;
463c84b9 2011 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2012 sk = sk_nulls_next(sk);
1da177e4
LT
2013 }
2014get_sk:
c25eb3bf 2015 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2016 if (!net_eq(sock_net(sk), net))
2017 continue;
2018 if (sk->sk_family == st->family) {
1da177e4
LT
2019 cur = sk;
2020 goto out;
2021 }
e905a9ed 2022 icsk = inet_csk(sk);
463c84b9
ACM
2023 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2025start_req:
2026 st->uid = sock_i_uid(sk);
2027 st->syn_wait_sk = sk;
2028 st->state = TCP_SEQ_STATE_OPENREQ;
2029 st->sbucket = 0;
2030 goto get_req;
2031 }
463c84b9 2032 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2033 }
5caea4ea 2034 spin_unlock_bh(&ilb->lock);
a8b690f9 2035 st->offset = 0;
0f7ff927 2036 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2037 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038 spin_lock_bh(&ilb->lock);
c25eb3bf 2039 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2040 goto get_sk;
2041 }
2042 cur = NULL;
2043out:
2044 return cur;
2045}
2046
2047static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2048{
a8b690f9
TH
2049 struct tcp_iter_state *st = seq->private;
2050 void *rc;
2051
2052 st->bucket = 0;
2053 st->offset = 0;
2054 rc = listening_get_next(seq, NULL);
1da177e4
LT
2055
2056 while (rc && *pos) {
2057 rc = listening_get_next(seq, rc);
2058 --*pos;
2059 }
2060 return rc;
2061}
2062
6eac5604
AK
2063static inline int empty_bucket(struct tcp_iter_state *st)
2064{
3ab5aee7
ED
2065 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2066 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
6eac5604
AK
2067}
2068
a8b690f9
TH
2069/*
2070 * Get first established socket starting from bucket given in st->bucket.
2071 * If st->bucket is zero, the very first socket in the hash is returned.
2072 */
1da177e4
LT
2073static void *established_get_first(struct seq_file *seq)
2074{
5799de0b 2075 struct tcp_iter_state *st = seq->private;
a4146b1b 2076 struct net *net = seq_file_net(seq);
1da177e4
LT
2077 void *rc = NULL;
2078
a8b690f9
TH
2079 st->offset = 0;
2080 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2081 struct sock *sk;
3ab5aee7 2082 struct hlist_nulls_node *node;
8feaf0c0 2083 struct inet_timewait_sock *tw;
9db66bdc 2084 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2085
6eac5604
AK
2086 /* Lockless fast path for the common case of empty buckets */
2087 if (empty_bucket(st))
2088 continue;
2089
9db66bdc 2090 spin_lock_bh(lock);
3ab5aee7 2091 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2092 if (sk->sk_family != st->family ||
878628fb 2093 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2094 continue;
2095 }
2096 rc = sk;
2097 goto out;
2098 }
2099 st->state = TCP_SEQ_STATE_TIME_WAIT;
8feaf0c0 2100 inet_twsk_for_each(tw, node,
dbca9b27 2101 &tcp_hashinfo.ehash[st->bucket].twchain) {
28518fc1 2102 if (tw->tw_family != st->family ||
878628fb 2103 !net_eq(twsk_net(tw), net)) {
1da177e4
LT
2104 continue;
2105 }
2106 rc = tw;
2107 goto out;
2108 }
9db66bdc 2109 spin_unlock_bh(lock);
1da177e4
LT
2110 st->state = TCP_SEQ_STATE_ESTABLISHED;
2111 }
2112out:
2113 return rc;
2114}
2115
2116static void *established_get_next(struct seq_file *seq, void *cur)
2117{
2118 struct sock *sk = cur;
8feaf0c0 2119 struct inet_timewait_sock *tw;
3ab5aee7 2120 struct hlist_nulls_node *node;
5799de0b 2121 struct tcp_iter_state *st = seq->private;
a4146b1b 2122 struct net *net = seq_file_net(seq);
1da177e4
LT
2123
2124 ++st->num;
a8b690f9 2125 ++st->offset;
1da177e4
LT
2126
2127 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128 tw = cur;
2129 tw = tw_next(tw);
2130get_tw:
878628fb 2131 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1da177e4
LT
2132 tw = tw_next(tw);
2133 }
2134 if (tw) {
2135 cur = tw;
2136 goto out;
2137 }
9db66bdc 2138 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2139 st->state = TCP_SEQ_STATE_ESTABLISHED;
2140
6eac5604 2141 /* Look for next non empty bucket */
a8b690f9 2142 st->offset = 0;
f373b53b 2143 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
6eac5604
AK
2144 empty_bucket(st))
2145 ;
f373b53b 2146 if (st->bucket > tcp_hashinfo.ehash_mask)
6eac5604
AK
2147 return NULL;
2148
9db66bdc 2149 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
3ab5aee7 2150 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
1da177e4 2151 } else
3ab5aee7 2152 sk = sk_nulls_next(sk);
1da177e4 2153
3ab5aee7 2154 sk_nulls_for_each_from(sk, node) {
878628fb 2155 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1da177e4
LT
2156 goto found;
2157 }
2158
2159 st->state = TCP_SEQ_STATE_TIME_WAIT;
dbca9b27 2160 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
1da177e4
LT
2161 goto get_tw;
2162found:
2163 cur = sk;
2164out:
2165 return cur;
2166}
2167
2168static void *established_get_idx(struct seq_file *seq, loff_t pos)
2169{
a8b690f9
TH
2170 struct tcp_iter_state *st = seq->private;
2171 void *rc;
2172
2173 st->bucket = 0;
2174 rc = established_get_first(seq);
1da177e4
LT
2175
2176 while (rc && pos) {
2177 rc = established_get_next(seq, rc);
2178 --pos;
7174259e 2179 }
1da177e4
LT
2180 return rc;
2181}
2182
2183static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2184{
2185 void *rc;
5799de0b 2186 struct tcp_iter_state *st = seq->private;
1da177e4 2187
1da177e4
LT
2188 st->state = TCP_SEQ_STATE_LISTENING;
2189 rc = listening_get_idx(seq, &pos);
2190
2191 if (!rc) {
1da177e4
LT
2192 st->state = TCP_SEQ_STATE_ESTABLISHED;
2193 rc = established_get_idx(seq, pos);
2194 }
2195
2196 return rc;
2197}
2198
a8b690f9
TH
2199static void *tcp_seek_last_pos(struct seq_file *seq)
2200{
2201 struct tcp_iter_state *st = seq->private;
2202 int offset = st->offset;
2203 int orig_num = st->num;
2204 void *rc = NULL;
2205
2206 switch (st->state) {
2207 case TCP_SEQ_STATE_OPENREQ:
2208 case TCP_SEQ_STATE_LISTENING:
2209 if (st->bucket >= INET_LHTABLE_SIZE)
2210 break;
2211 st->state = TCP_SEQ_STATE_LISTENING;
2212 rc = listening_get_next(seq, NULL);
2213 while (offset-- && rc)
2214 rc = listening_get_next(seq, rc);
2215 if (rc)
2216 break;
2217 st->bucket = 0;
2218 /* Fallthrough */
2219 case TCP_SEQ_STATE_ESTABLISHED:
2220 case TCP_SEQ_STATE_TIME_WAIT:
2221 st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 if (st->bucket > tcp_hashinfo.ehash_mask)
2223 break;
2224 rc = established_get_first(seq);
2225 while (offset-- && rc)
2226 rc = established_get_next(seq, rc);
2227 }
2228
2229 st->num = orig_num;
2230
2231 return rc;
2232}
2233
1da177e4
LT
2234static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2235{
5799de0b 2236 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2237 void *rc;
2238
2239 if (*pos && *pos == st->last_pos) {
2240 rc = tcp_seek_last_pos(seq);
2241 if (rc)
2242 goto out;
2243 }
2244
1da177e4
LT
2245 st->state = TCP_SEQ_STATE_LISTENING;
2246 st->num = 0;
a8b690f9
TH
2247 st->bucket = 0;
2248 st->offset = 0;
2249 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2250
2251out:
2252 st->last_pos = *pos;
2253 return rc;
1da177e4
LT
2254}
2255
2256static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257{
a8b690f9 2258 struct tcp_iter_state *st = seq->private;
1da177e4 2259 void *rc = NULL;
1da177e4
LT
2260
2261 if (v == SEQ_START_TOKEN) {
2262 rc = tcp_get_idx(seq, 0);
2263 goto out;
2264 }
1da177e4
LT
2265
2266 switch (st->state) {
2267 case TCP_SEQ_STATE_OPENREQ:
2268 case TCP_SEQ_STATE_LISTENING:
2269 rc = listening_get_next(seq, v);
2270 if (!rc) {
1da177e4 2271 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2272 st->bucket = 0;
2273 st->offset = 0;
1da177e4
LT
2274 rc = established_get_first(seq);
2275 }
2276 break;
2277 case TCP_SEQ_STATE_ESTABLISHED:
2278 case TCP_SEQ_STATE_TIME_WAIT:
2279 rc = established_get_next(seq, v);
2280 break;
2281 }
2282out:
2283 ++*pos;
a8b690f9 2284 st->last_pos = *pos;
1da177e4
LT
2285 return rc;
2286}
2287
2288static void tcp_seq_stop(struct seq_file *seq, void *v)
2289{
5799de0b 2290 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2291
2292 switch (st->state) {
2293 case TCP_SEQ_STATE_OPENREQ:
2294 if (v) {
463c84b9
ACM
2295 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2296 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2297 }
2298 case TCP_SEQ_STATE_LISTENING:
2299 if (v != SEQ_START_TOKEN)
5caea4ea 2300 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4
LT
2301 break;
2302 case TCP_SEQ_STATE_TIME_WAIT:
2303 case TCP_SEQ_STATE_ESTABLISHED:
2304 if (v)
9db66bdc 2305 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2306 break;
2307 }
2308}
2309
2310static int tcp_seq_open(struct inode *inode, struct file *file)
2311{
2312 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1da177e4 2313 struct tcp_iter_state *s;
52d6f3f1 2314 int err;
1da177e4 2315
52d6f3f1
DL
2316 err = seq_open_net(inode, file, &afinfo->seq_ops,
2317 sizeof(struct tcp_iter_state));
2318 if (err < 0)
2319 return err;
f40c8174 2320
52d6f3f1 2321 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2322 s->family = afinfo->family;
a8b690f9 2323 s->last_pos = 0;
f40c8174
DL
2324 return 0;
2325}
2326
6f8b13bc 2327int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2328{
2329 int rc = 0;
2330 struct proc_dir_entry *p;
2331
68fcadd1
DL
2332 afinfo->seq_fops.open = tcp_seq_open;
2333 afinfo->seq_fops.read = seq_read;
2334 afinfo->seq_fops.llseek = seq_lseek;
2335 afinfo->seq_fops.release = seq_release_net;
7174259e 2336
9427c4b3
DL
2337 afinfo->seq_ops.start = tcp_seq_start;
2338 afinfo->seq_ops.next = tcp_seq_next;
2339 afinfo->seq_ops.stop = tcp_seq_stop;
2340
84841c3c
DL
2341 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2342 &afinfo->seq_fops, afinfo);
2343 if (!p)
1da177e4
LT
2344 rc = -ENOMEM;
2345 return rc;
2346}
4bc2f18b 2347EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2348
6f8b13bc 2349void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2350{
6f8b13bc 2351 proc_net_remove(net, afinfo->name);
1da177e4 2352}
4bc2f18b 2353EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2354
60236fdd 2355static void get_openreq4(struct sock *sk, struct request_sock *req,
5e659e4c 2356 struct seq_file *f, int i, int uid, int *len)
1da177e4 2357{
2e6599cb 2358 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
2359 int ttd = req->expires - jiffies;
2360
5e659e4c
PE
2361 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2362 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
1da177e4 2363 i,
2e6599cb 2364 ireq->loc_addr,
c720c7e8 2365 ntohs(inet_sk(sk)->inet_sport),
2e6599cb
ACM
2366 ireq->rmt_addr,
2367 ntohs(ireq->rmt_port),
1da177e4
LT
2368 TCP_SYN_RECV,
2369 0, 0, /* could print option size, but that is af dependent. */
2370 1, /* timers active (only the expire timer) */
2371 jiffies_to_clock_t(ttd),
2372 req->retrans,
2373 uid,
2374 0, /* non standard timer */
2375 0, /* open_requests have no inode */
2376 atomic_read(&sk->sk_refcnt),
5e659e4c
PE
2377 req,
2378 len);
1da177e4
LT
2379}
2380
5e659e4c 2381static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
1da177e4
LT
2382{
2383 int timer_active;
2384 unsigned long timer_expires;
cf4c6bf8
IJ
2385 struct tcp_sock *tp = tcp_sk(sk);
2386 const struct inet_connection_sock *icsk = inet_csk(sk);
2387 struct inet_sock *inet = inet_sk(sk);
c720c7e8
ED
2388 __be32 dest = inet->inet_daddr;
2389 __be32 src = inet->inet_rcv_saddr;
2390 __u16 destp = ntohs(inet->inet_dport);
2391 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2392 int rx_queue;
1da177e4 2393
463c84b9 2394 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1da177e4 2395 timer_active = 1;
463c84b9
ACM
2396 timer_expires = icsk->icsk_timeout;
2397 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2398 timer_active = 4;
463c84b9 2399 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2400 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2401 timer_active = 2;
cf4c6bf8 2402 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2403 } else {
2404 timer_active = 0;
2405 timer_expires = jiffies;
2406 }
2407
49d09007
ED
2408 if (sk->sk_state == TCP_LISTEN)
2409 rx_queue = sk->sk_ack_backlog;
2410 else
2411 /*
2412 * because we dont lock socket, we might find a transient negative value
2413 */
2414 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2415
5e659e4c 2416 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
7be87351 2417 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
cf4c6bf8 2418 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2419 tp->write_seq - tp->snd_una,
49d09007 2420 rx_queue,
1da177e4
LT
2421 timer_active,
2422 jiffies_to_clock_t(timer_expires - jiffies),
463c84b9 2423 icsk->icsk_retransmits,
cf4c6bf8 2424 sock_i_uid(sk),
6687e988 2425 icsk->icsk_probes_out,
cf4c6bf8
IJ
2426 sock_i_ino(sk),
2427 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2428 jiffies_to_clock_t(icsk->icsk_rto),
2429 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2430 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2431 tp->snd_cwnd,
0b6a05c1 2432 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
5e659e4c 2433 len);
1da177e4
LT
2434}
2435
7174259e 2436static void get_timewait4_sock(struct inet_timewait_sock *tw,
5e659e4c 2437 struct seq_file *f, int i, int *len)
1da177e4 2438{
23f33c2d 2439 __be32 dest, src;
1da177e4
LT
2440 __u16 destp, srcp;
2441 int ttd = tw->tw_ttd - jiffies;
2442
2443 if (ttd < 0)
2444 ttd = 0;
2445
2446 dest = tw->tw_daddr;
2447 src = tw->tw_rcv_saddr;
2448 destp = ntohs(tw->tw_dport);
2449 srcp = ntohs(tw->tw_sport);
2450
5e659e4c
PE
2451 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2452 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
1da177e4
LT
2453 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2454 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
5e659e4c 2455 atomic_read(&tw->tw_refcnt), tw, len);
1da177e4
LT
2456}
2457
2458#define TMPSZ 150
2459
2460static int tcp4_seq_show(struct seq_file *seq, void *v)
2461{
5799de0b 2462 struct tcp_iter_state *st;
5e659e4c 2463 int len;
1da177e4
LT
2464
2465 if (v == SEQ_START_TOKEN) {
2466 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2467 " sl local_address rem_address st tx_queue "
2468 "rx_queue tr tm->when retrnsmt uid timeout "
2469 "inode");
2470 goto out;
2471 }
2472 st = seq->private;
2473
2474 switch (st->state) {
2475 case TCP_SEQ_STATE_LISTENING:
2476 case TCP_SEQ_STATE_ESTABLISHED:
5e659e4c 2477 get_tcp4_sock(v, seq, st->num, &len);
1da177e4
LT
2478 break;
2479 case TCP_SEQ_STATE_OPENREQ:
5e659e4c 2480 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
1da177e4
LT
2481 break;
2482 case TCP_SEQ_STATE_TIME_WAIT:
5e659e4c 2483 get_timewait4_sock(v, seq, st->num, &len);
1da177e4
LT
2484 break;
2485 }
5e659e4c 2486 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
1da177e4
LT
2487out:
2488 return 0;
2489}
2490
1da177e4 2491static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2492 .name = "tcp",
2493 .family = AF_INET,
5f4472c5
DL
2494 .seq_fops = {
2495 .owner = THIS_MODULE,
2496 },
9427c4b3
DL
2497 .seq_ops = {
2498 .show = tcp4_seq_show,
2499 },
1da177e4
LT
2500};
2501
2c8c1e72 2502static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2503{
2504 return tcp_proc_register(net, &tcp4_seq_afinfo);
2505}
2506
2c8c1e72 2507static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2508{
2509 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2510}
2511
2512static struct pernet_operations tcp4_net_ops = {
2513 .init = tcp4_proc_init_net,
2514 .exit = tcp4_proc_exit_net,
2515};
2516
1da177e4
LT
2517int __init tcp4_proc_init(void)
2518{
757764f6 2519 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2520}
2521
2522void tcp4_proc_exit(void)
2523{
757764f6 2524 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2525}
2526#endif /* CONFIG_PROC_FS */
2527
bf296b12
HX
2528struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2529{
b71d1d42 2530 const struct iphdr *iph = skb_gro_network_header(skb);
bf296b12
HX
2531
2532 switch (skb->ip_summed) {
2533 case CHECKSUM_COMPLETE:
86911732 2534 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
bf296b12
HX
2535 skb->csum)) {
2536 skb->ip_summed = CHECKSUM_UNNECESSARY;
2537 break;
2538 }
2539
2540 /* fall through */
2541 case CHECKSUM_NONE:
2542 NAPI_GRO_CB(skb)->flush = 1;
2543 return NULL;
2544 }
2545
2546 return tcp_gro_receive(head, skb);
2547}
bf296b12
HX
2548
2549int tcp4_gro_complete(struct sk_buff *skb)
2550{
b71d1d42 2551 const struct iphdr *iph = ip_hdr(skb);
bf296b12
HX
2552 struct tcphdr *th = tcp_hdr(skb);
2553
2554 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2555 iph->saddr, iph->daddr, 0);
2556 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2557
2558 return tcp_gro_complete(skb);
2559}
bf296b12 2560
1da177e4
LT
2561struct proto tcp_prot = {
2562 .name = "TCP",
2563 .owner = THIS_MODULE,
2564 .close = tcp_close,
2565 .connect = tcp_v4_connect,
2566 .disconnect = tcp_disconnect,
463c84b9 2567 .accept = inet_csk_accept,
1da177e4
LT
2568 .ioctl = tcp_ioctl,
2569 .init = tcp_v4_init_sock,
2570 .destroy = tcp_v4_destroy_sock,
2571 .shutdown = tcp_shutdown,
2572 .setsockopt = tcp_setsockopt,
2573 .getsockopt = tcp_getsockopt,
1da177e4 2574 .recvmsg = tcp_recvmsg,
7ba42910
CG
2575 .sendmsg = tcp_sendmsg,
2576 .sendpage = tcp_sendpage,
1da177e4 2577 .backlog_rcv = tcp_v4_do_rcv,
ab1e0a13
ACM
2578 .hash = inet_hash,
2579 .unhash = inet_unhash,
2580 .get_port = inet_csk_get_port,
1da177e4
LT
2581 .enter_memory_pressure = tcp_enter_memory_pressure,
2582 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2583 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2584 .memory_allocated = &tcp_memory_allocated,
2585 .memory_pressure = &tcp_memory_pressure,
2586 .sysctl_mem = sysctl_tcp_mem,
2587 .sysctl_wmem = sysctl_tcp_wmem,
2588 .sysctl_rmem = sysctl_tcp_rmem,
2589 .max_header = MAX_TCP_HEADER,
2590 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2591 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2592 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2593 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2594 .h.hashinfo = &tcp_hashinfo,
7ba42910 2595 .no_autobind = true,
543d9cfe
ACM
2596#ifdef CONFIG_COMPAT
2597 .compat_setsockopt = compat_tcp_setsockopt,
2598 .compat_getsockopt = compat_tcp_getsockopt,
2599#endif
1da177e4 2600};
4bc2f18b 2601EXPORT_SYMBOL(tcp_prot);
1da177e4 2602
046ee902
DL
2603
2604static int __net_init tcp_sk_init(struct net *net)
2605{
2606 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2607 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2608}
2609
2610static void __net_exit tcp_sk_exit(struct net *net)
2611{
2612 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
b099ce26
EB
2613}
2614
2615static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616{
2617 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2618}
2619
2620static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2621 .init = tcp_sk_init,
2622 .exit = tcp_sk_exit,
2623 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2624};
2625
9b0f976f 2626void __init tcp_v4_init(void)
1da177e4 2627{
5caea4ea 2628 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2629 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2630 panic("Failed to create the TCP control socket.\n");
1da177e4 2631}
This page took 0.817107 seconds and 5 git commands to generate.