ipv4: Use inet_csk_route_child_sock() in DCCP and TCP.
[deliverable/linux.git] / net / ipv4 / tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

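/*
 * A note on the write_seq arithmetic above (an illustrative sketch, not
 * extra logic): the first sequence number of the reused connection must
 * land strictly above anything the old TIME-WAIT peer could still
 * accept, so stale duplicates cannot be mistaken for new data:
 *
 *	new_isn = tw_snd_nxt + 65535 + 2;
 *	          old next-to-send + maximum unscaled window + margin
 */
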
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
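
/*
 * Rough shape of an active open as driven by tcp_v4_connect() above
 * (an illustrative summary, not additional code):
 *
 *	connect(fd, ...)		userspace
 *	  tcp_v4_connect()		route lookup, move to SYN-SENT,
 *					inet_hash_connect() picks a port
 *	    tcp_connect()		builds and transmits the SYN
 */
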
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always <576bytes so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
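
/*
 * For concreteness, the relationship tcp_sync_mss() enforces for plain
 * IPv4 with no IP or TCP options (an illustrative sketch):
 *
 *	mss = mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 *
 * e.g. a 1500-byte path MTU yields a 1460-byte MSS.
 */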

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
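
/*
 * The backoff revert in the ICMP_DEST_UNREACH branch above undoes one
 * doubling of the retransmission timer when the unreachable message
 * suggests the earlier losses were a routing outage rather than
 * congestion (draft-zimmermann-tcp-lcd).  Restated as a sketch:
 *
 *	rto = base_rto << icsk_backoff;		after one fewer backoff
 *	remaining = rto - time_since_first_unacked;
 *	if (remaining)	re-arm the timer for "remaining"
 *	else		retransmit immediately
 */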

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}
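
/*
 * With CHECKSUM_PARTIAL above, the stack only seeds th->check with the
 * pseudo-header sum and records where the final checksum must be
 * written; the NIC (or a software fallback) finishes the job.  A
 * conceptual sketch, with hypothetical helper names:
 *
 *	sum = pseudo_hdr_sum(saddr, daddr, IPPROTO_TCP, len);
 *	sum = add_ones_complement(sum, tcp_header_and_payload);
 *	th->check = ~fold_to_16bit(sum);
 */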

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So we build the reply based only on parameters that arrived
 *	with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
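
/*
 * Sequence numbers chosen for the RST above follow RFC 793: if the
 * offending segment carried an ACK, the reset reuses that ACK value as
 * its own sequence number; otherwise the reset ACKs exactly the
 * sequence space the segment consumed:
 *
 *	rst.seq     = th->ack_seq;			(th->ack set)
 *	rst.ack_seq = seq + syn + fin + payload_len;	(otherwise)
 */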

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
	const char *msg;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies)
		msg = "Sending cookies";
	else
#endif
		msg = "Dropping request";

	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
		ntohs(tcp_hdr(skb)->dest), msg);
}

/*
 *	Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}
		if (tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}
EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of keys,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
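
/*
 * Example userspace use of the TCP_MD5SIG option parsed above (a
 * hedged sketch; error handling omitted, "peer" and "fd" are
 * placeholders supplied by the caller):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr = peer.sin_addr;
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */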

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->daddr.addr.a4 == saddr) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
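
/*
 * Passive-open control flow handled above, summarized (illustrative,
 * not additional code):
 *
 *	SYN arrives
 *	  tcp_v4_conn_request()
 *	    queue full?  -> syncookie (if enabled) or drop
 *	    else         -> allocate a request_sock, send a SYN-ACK,
 *	                    hash the request with a SYN-ACK timer
 *	final ACK arrives
 *	  tcp_v4_hnd_req() finds the pending request_sock
 *	  tcp_v4_syn_recv_sock() creates the full socket (below)
 */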

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
		goto put_and_exit;

	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
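
/*
 * Note: inet_csk_route_child_sock() above is the helper this commit
 * switches the child-socket path to (per the commit subject): the
 * route for the newly created socket is looked up with the child's
 * own addresses rather than re-derived from the parent listener, so
 * the dst attached by sk_setup_caps() matches the committed 4-tuple.
 */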

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb->rxhash);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb->rxhash);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4
LT
1621
1622/*
1623 * From tcp_input.c
1624 */
1625
1626int tcp_v4_rcv(struct sk_buff *skb)
1627{
eddc9ec5 1628 const struct iphdr *iph;
1da177e4
LT
1629 struct tcphdr *th;
1630 struct sock *sk;
1631 int ret;
a86b1e30 1632 struct net *net = dev_net(skb->dev);
1da177e4
LT
1633
1634 if (skb->pkt_type != PACKET_HOST)
1635 goto discard_it;
1636
1637 /* Count it even if it's bad */
63231bdd 1638 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1639
1640 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1641 goto discard_it;
1642
aa8223c7 1643 th = tcp_hdr(skb);
1da177e4
LT
1644
1645 if (th->doff < sizeof(struct tcphdr) / 4)
1646 goto bad_packet;
1647 if (!pskb_may_pull(skb, th->doff * 4))
1648 goto discard_it;
1649
1650 /* An explanation is required here, I think.
1651 * Packet length and doff are validated by header prediction,
caa20d9a 1652 * provided case of th->doff==0 is eliminated.
1da177e4 1653 * So, we defer the checks. */
60476372 1654 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1da177e4
LT
1655 goto bad_packet;
1656
aa8223c7 1657 th = tcp_hdr(skb);
eddc9ec5 1658 iph = ip_hdr(skb);
1da177e4
LT
1659 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1660 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1661 skb->len - th->doff * 4);
1662 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1663 TCP_SKB_CB(skb)->when = 0;
eddc9ec5 1664 TCP_SKB_CB(skb)->flags = iph->tos;
1da177e4
LT
1665 TCP_SKB_CB(skb)->sacked = 0;
1666
9a1f27c4 1667 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1668 if (!sk)
1669 goto no_tcp_socket;
1670
bb134d5d
ED
1671process:
1672 if (sk->sk_state == TCP_TIME_WAIT)
1673 goto do_time_wait;
1674
6cce09f8
ED
1675 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1676 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1677 goto discard_and_relse;
6cce09f8 1678 }
d218d111 1679
1da177e4
LT
1680 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1681 goto discard_and_relse;
b59c2701 1682 nf_reset(skb);
1da177e4 1683
fda9ef5d 1684 if (sk_filter(sk, skb))
1da177e4
LT
1685 goto discard_and_relse;
1686
1687 skb->dev = NULL;
1688
c6366184 1689 bh_lock_sock_nested(sk);
1da177e4
LT
1690 ret = 0;
1691 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
1692#ifdef CONFIG_NET_DMA
1693 struct tcp_sock *tp = tcp_sk(sk);
1694 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
f67b4599 1695 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1a2449a8 1696 if (tp->ucopy.dma_chan)
1da177e4 1697 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
1698 else
1699#endif
1700 {
1701 if (!tcp_prequeue(sk, skb))
ae8d7f88 1702 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 1703 }
6cce09f8 1704 } else if (unlikely(sk_add_backlog(sk, skb))) {
6b03a53a 1705 bh_unlock_sock(sk);
6cce09f8 1706 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1707 goto discard_and_relse;
1708 }
1da177e4
LT
1709 bh_unlock_sock(sk);
1710
1711 sock_put(sk);
1712
1713 return ret;
1714
1715no_tcp_socket:
1716 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1717 goto discard_it;
1718
1719 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1720bad_packet:
63231bdd 1721 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1722 } else {
cfb6eeb4 1723 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1724 }
1725
1726discard_it:
1727 /* Discard frame. */
1728 kfree_skb(skb);
e905a9ed 1729 return 0;
1da177e4
LT
1730
1731discard_and_relse:
1732 sock_put(sk);
1733 goto discard_it;
1734
1735do_time_wait:
1736 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1737 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1738 goto discard_it;
1739 }
1740
1741 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
63231bdd 1742 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
9469c7b4 1743 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1744 goto discard_it;
1745 }
9469c7b4 1746 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1747 case TCP_TW_SYN: {
c346dca1 1748 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1749 &tcp_hashinfo,
eddc9ec5 1750 iph->daddr, th->dest,
463c84b9 1751 inet_iif(skb));
1da177e4 1752 if (sk2) {
9469c7b4
YH
1753 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1754 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1755 sk = sk2;
1756 goto process;
1757 }
1758 /* Fall through to ACK */
1759 }
1760 case TCP_TW_ACK:
1761 tcp_v4_timewait_ack(sk, skb);
1762 break;
1763 case TCP_TW_RST:
1764 goto no_tcp_socket;
1765 case TCP_TW_SUCCESS:;
1766 }
1767 goto discard_it;
1768}
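
The min_ttl check early in tcp_v4_rcv() drops segments whose IP TTL has fallen below a per-socket floor, counting them as LINUX_MIB_TCPMINTTLDROP. Userspace opts into that floor with the IP_MINTTL socket option (RFC 5082 GTSM). A minimal sketch, assuming a Linux host whose kernel carries the check above; error handling is reduced to a perror():

/* Hedged sketch: opt into the min-TTL floor enforced by tcp_v4_rcv(). */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef IP_MINTTL
#define IP_MINTTL 21			/* uapi value on Linux */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int min_ttl = 255;		/* GTSM: accept only one-hop peers */

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_IP, IP_MINTTL, &min_ttl, sizeof(min_ttl)))
		perror("setsockopt(IP_MINTTL)");	/* kernel may lack support */
	return 0;
}
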
1769
3f419d2d 1770struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1da177e4 1771{
3f419d2d 1772 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1da177e4 1773 struct inet_sock *inet = inet_sk(sk);
3f419d2d 1774 struct inet_peer *peer;
1da177e4 1775
c720c7e8 1776 if (!rt || rt->rt_dst != inet->inet_daddr) {
b534ecf1 1777 peer = inet_getpeer_v4(inet->inet_daddr, 1);
3f419d2d 1778 *release_it = true;
1da177e4
LT
1779 } else {
1780 if (!rt->peer)
1781 rt_bind_peer(rt, 1);
1782 peer = rt->peer;
3f419d2d 1783 *release_it = false;
1da177e4
LT
1784 }
1785
3f419d2d 1786 return peer;
1da177e4 1787}
3f419d2d 1788EXPORT_SYMBOL(tcp_v4_get_peer);
1da177e4 1789
ccb7c410 1790void *tcp_v4_tw_get_peer(struct sock *sk)
1da177e4 1791{
ccb7c410 1792 struct inet_timewait_sock *tw = inet_twsk(sk);
1da177e4 1793
ccb7c410 1794 return inet_getpeer_v4(tw->tw_daddr, 1);
1da177e4 1795}
ccb7c410
DM
1796EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1797
1798static struct timewait_sock_ops tcp_timewait_sock_ops = {
1799 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1800 .twsk_unique = tcp_twsk_unique,
1801 .twsk_destructor= tcp_twsk_destructor,
1802 .twsk_getpeer = tcp_v4_tw_get_peer,
1803};
1da177e4 1804
3b401a81 1805const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1806 .queue_xmit = ip_queue_xmit,
1807 .send_check = tcp_v4_send_check,
1808 .rebuild_header = inet_sk_rebuild_header,
1809 .conn_request = tcp_v4_conn_request,
1810 .syn_recv_sock = tcp_v4_syn_recv_sock,
3f419d2d 1811 .get_peer = tcp_v4_get_peer,
543d9cfe
ACM
1812 .net_header_len = sizeof(struct iphdr),
1813 .setsockopt = ip_setsockopt,
1814 .getsockopt = ip_getsockopt,
1815 .addr2sockaddr = inet_csk_addr2sockaddr,
1816 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1817 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1818#ifdef CONFIG_COMPAT
543d9cfe
ACM
1819 .compat_setsockopt = compat_ip_setsockopt,
1820 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1821#endif
1da177e4 1822};
4bc2f18b 1823EXPORT_SYMBOL(ipv4_specific);
1da177e4 1824
cfb6eeb4 1825#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1826static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1827 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1828 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4
YH
1829 .md5_add = tcp_v4_md5_add_func,
1830 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1831};
b6332e6c 1832#endif
cfb6eeb4 1833
1da177e4
LT
1834/* NOTE: A lot of things set to zero explicitly by call to
1835 * sk_alloc() so need not be done here.
1836 */
1837static int tcp_v4_init_sock(struct sock *sk)
1838{
6687e988 1839 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4
LT
1840 struct tcp_sock *tp = tcp_sk(sk);
1841
1842 skb_queue_head_init(&tp->out_of_order_queue);
1843 tcp_init_xmit_timers(sk);
1844 tcp_prequeue_init(tp);
1845
6687e988 1846 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1da177e4
LT
1847 tp->mdev = TCP_TIMEOUT_INIT;
1848
1849 /* So many TCP implementations out there (incorrectly) count the
1850 * initial SYN frame in their delayed-ACK and congestion control
1851 * algorithms that we must have the following bandaid to talk
1852 * efficiently to them. -DaveM
1853 */
1854 tp->snd_cwnd = 2;
1855
1856 /* See draft-stevens-tcpca-spec-01 for discussion of the
1857 * initialization of these values.
1858 */
0b6a05c1 1859 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1da177e4 1860 tp->snd_cwnd_clamp = ~0;
bee7ca9e 1861 tp->mss_cache = TCP_MSS_DEFAULT;
1da177e4
LT
1862
1863 tp->reordering = sysctl_tcp_reordering;
6687e988 1864 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1da177e4
LT
1865
1866 sk->sk_state = TCP_CLOSE;
1867
1868 sk->sk_write_space = sk_stream_write_space;
1869 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1870
8292a17a 1871 icsk->icsk_af_ops = &ipv4_specific;
d83d8461 1872 icsk->icsk_sync_mss = tcp_sync_mss;
cfb6eeb4
YH
1873#ifdef CONFIG_TCP_MD5SIG
1874 tp->af_specific = &tcp_sock_ipv4_specific;
1875#endif
1da177e4 1876
435cf559
WAS
1877 /* TCP Cookie Transactions */
1878 if (sysctl_tcp_cookie_size > 0) {
1879 /* Default, cookies without s_data_payload. */
1880 tp->cookie_values =
1881 kzalloc(sizeof(*tp->cookie_values),
1882 sk->sk_allocation);
1883 if (tp->cookie_values != NULL)
1884 kref_init(&tp->cookie_values->kref);
1885 }
1886 /* Presumed zeroed, in order of appearance:
1887 * cookie_in_always, cookie_out_never,
1888 * s_data_constant, s_data_in, s_data_out
1889 */
1da177e4
LT
1890 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1891 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1892
eb4dea58 1893 local_bh_disable();
1748376b 1894 percpu_counter_inc(&tcp_sockets_allocated);
eb4dea58 1895 local_bh_enable();
1da177e4
LT
1896
1897 return 0;
1898}
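
tcp_v4_init_sock() starts snd_cwnd at 2 segments and leaves snd_ssthresh at TCP_INFINITE_SSTHRESH, so a fresh connection stays in slow start, roughly doubling cwnd each RTT, until loss installs a real ssthresh. A back-of-the-envelope sketch of that growth, assuming one cwnd increment per ACK and no loss:

/* Hedged sketch: exponential cwnd growth during initial slow start. */
#include <stdio.h>

int main(void)
{
	unsigned int cwnd = 2;		/* initial snd_cwnd, as above */

	for (int rtt = 0; rtt < 8; rtt++) {
		printf("rtt %d: cwnd = %u segments\n", rtt, cwnd);
		cwnd *= 2;	/* ~one increment per ACK => doubling per RTT */
	}
	return 0;
}
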
1899
7d06b2e0 1900void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1901{
1902 struct tcp_sock *tp = tcp_sk(sk);
1903
1904 tcp_clear_xmit_timers(sk);
1905
6687e988 1906 tcp_cleanup_congestion_control(sk);
317a76f9 1907
1da177e4 1908 /* Clean up the write buffer. */
fe067e8a 1909 tcp_write_queue_purge(sk);
1da177e4
LT
1910
1911 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1912 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1913
cfb6eeb4
YH
1914#ifdef CONFIG_TCP_MD5SIG
1915 /* Clean up the MD5 key list, if any */
1916 if (tp->md5sig_info) {
1917 tcp_v4_clear_md5_list(sk);
1918 kfree(tp->md5sig_info);
1919 tp->md5sig_info = NULL;
1920 }
1921#endif
1922
1a2449a8
CL
1923#ifdef CONFIG_NET_DMA
1924 /* Cleans up our sk_async_wait_queue */
e905a9ed 1925 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
1926#endif
1927
1da177e4
LT
 1928 /* Clean up the prequeue; it really must be empty. */
1929 __skb_queue_purge(&tp->ucopy.prequeue);
1930
1931 /* Clean up a referenced TCP bind bucket. */
463c84b9 1932 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1933 inet_put_port(sk);
1da177e4
LT
1934
1935 /*
 1936 * If a sendmsg cached page exists, toss it.
1937 */
1938 if (sk->sk_sndmsg_page) {
1939 __free_page(sk->sk_sndmsg_page);
1940 sk->sk_sndmsg_page = NULL;
1941 }
1942
435cf559
WAS
1943 /* TCP Cookie Transactions */
1944 if (tp->cookie_values != NULL) {
1945 kref_put(&tp->cookie_values->kref,
1946 tcp_cookie_values_release);
1947 tp->cookie_values = NULL;
1948 }
1949
1748376b 1950 percpu_counter_dec(&tcp_sockets_allocated);
1da177e4 1951}
1da177e4
LT
1952EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953
1954#ifdef CONFIG_PROC_FS
1955/* Proc filesystem TCP sock list dumping. */
1956
3ab5aee7 1957static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1da177e4 1958{
3ab5aee7 1959 return hlist_nulls_empty(head) ? NULL :
8feaf0c0 1960 list_entry(head->first, struct inet_timewait_sock, tw_node);
1da177e4
LT
1961}
1962
8feaf0c0 1963static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1da177e4 1964{
3ab5aee7
ED
1965 return !is_a_nulls(tw->tw_node.next) ?
1966 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1da177e4
LT
1967}
1968
a8b690f9
TH
1969/*
 1970 * Get the next listener socket following cur. If cur is NULL, get the first socket
1971 * starting from bucket given in st->bucket; when st->bucket is zero the
1972 * very first socket in the hash table is returned.
1973 */
1da177e4
LT
1974static void *listening_get_next(struct seq_file *seq, void *cur)
1975{
463c84b9 1976 struct inet_connection_sock *icsk;
c25eb3bf 1977 struct hlist_nulls_node *node;
1da177e4 1978 struct sock *sk = cur;
5caea4ea 1979 struct inet_listen_hashbucket *ilb;
5799de0b 1980 struct tcp_iter_state *st = seq->private;
a4146b1b 1981 struct net *net = seq_file_net(seq);
1da177e4
LT
1982
1983 if (!sk) {
a8b690f9 1984 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1985 spin_lock_bh(&ilb->lock);
c25eb3bf 1986 sk = sk_nulls_head(&ilb->head);
a8b690f9 1987 st->offset = 0;
1da177e4
LT
1988 goto get_sk;
1989 }
5caea4ea 1990 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1991 ++st->num;
a8b690f9 1992 ++st->offset;
1da177e4
LT
1993
1994 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1995 struct request_sock *req = cur;
1da177e4 1996
72a3effa 1997 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
1998 req = req->dl_next;
1999 while (1) {
2000 while (req) {
bdccc4ca 2001 if (req->rsk_ops->family == st->family) {
1da177e4
LT
2002 cur = req;
2003 goto out;
2004 }
2005 req = req->dl_next;
2006 }
72a3effa 2007 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
2008 break;
2009get_req:
463c84b9 2010 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 2011 }
1bde5ac4 2012 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2013 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2014 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2015 } else {
e905a9ed 2016 icsk = inet_csk(sk);
463c84b9
ACM
2017 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2018 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2019 goto start_req;
463c84b9 2020 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2021 sk = sk_nulls_next(sk);
1da177e4
LT
2022 }
2023get_sk:
c25eb3bf 2024 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2025 if (!net_eq(sock_net(sk), net))
2026 continue;
2027 if (sk->sk_family == st->family) {
1da177e4
LT
2028 cur = sk;
2029 goto out;
2030 }
e905a9ed 2031 icsk = inet_csk(sk);
463c84b9
ACM
2032 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2034start_req:
2035 st->uid = sock_i_uid(sk);
2036 st->syn_wait_sk = sk;
2037 st->state = TCP_SEQ_STATE_OPENREQ;
2038 st->sbucket = 0;
2039 goto get_req;
2040 }
463c84b9 2041 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2042 }
5caea4ea 2043 spin_unlock_bh(&ilb->lock);
a8b690f9 2044 st->offset = 0;
0f7ff927 2045 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2046 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2047 spin_lock_bh(&ilb->lock);
c25eb3bf 2048 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2049 goto get_sk;
2050 }
2051 cur = NULL;
2052out:
2053 return cur;
2054}
2055
2056static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2057{
a8b690f9
TH
2058 struct tcp_iter_state *st = seq->private;
2059 void *rc;
2060
2061 st->bucket = 0;
2062 st->offset = 0;
2063 rc = listening_get_next(seq, NULL);
1da177e4
LT
2064
2065 while (rc && *pos) {
2066 rc = listening_get_next(seq, rc);
2067 --*pos;
2068 }
2069 return rc;
2070}
2071
6eac5604
AK
2072static inline int empty_bucket(struct tcp_iter_state *st)
2073{
3ab5aee7
ED
2074 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2075 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
6eac5604
AK
2076}
2077
a8b690f9
TH
2078/*
 2079 * Get the first established socket, starting from the bucket given in st->bucket.
2080 * If st->bucket is zero, the very first socket in the hash is returned.
2081 */
1da177e4
LT
2082static void *established_get_first(struct seq_file *seq)
2083{
5799de0b 2084 struct tcp_iter_state *st = seq->private;
a4146b1b 2085 struct net *net = seq_file_net(seq);
1da177e4
LT
2086 void *rc = NULL;
2087
a8b690f9
TH
2088 st->offset = 0;
2089 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2090 struct sock *sk;
3ab5aee7 2091 struct hlist_nulls_node *node;
8feaf0c0 2092 struct inet_timewait_sock *tw;
9db66bdc 2093 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2094
6eac5604
AK
2095 /* Lockless fast path for the common case of empty buckets */
2096 if (empty_bucket(st))
2097 continue;
2098
9db66bdc 2099 spin_lock_bh(lock);
3ab5aee7 2100 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2101 if (sk->sk_family != st->family ||
878628fb 2102 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2103 continue;
2104 }
2105 rc = sk;
2106 goto out;
2107 }
2108 st->state = TCP_SEQ_STATE_TIME_WAIT;
8feaf0c0 2109 inet_twsk_for_each(tw, node,
dbca9b27 2110 &tcp_hashinfo.ehash[st->bucket].twchain) {
28518fc1 2111 if (tw->tw_family != st->family ||
878628fb 2112 !net_eq(twsk_net(tw), net)) {
1da177e4
LT
2113 continue;
2114 }
2115 rc = tw;
2116 goto out;
2117 }
9db66bdc 2118 spin_unlock_bh(lock);
1da177e4
LT
2119 st->state = TCP_SEQ_STATE_ESTABLISHED;
2120 }
2121out:
2122 return rc;
2123}
2124
2125static void *established_get_next(struct seq_file *seq, void *cur)
2126{
2127 struct sock *sk = cur;
8feaf0c0 2128 struct inet_timewait_sock *tw;
3ab5aee7 2129 struct hlist_nulls_node *node;
5799de0b 2130 struct tcp_iter_state *st = seq->private;
a4146b1b 2131 struct net *net = seq_file_net(seq);
1da177e4
LT
2132
2133 ++st->num;
a8b690f9 2134 ++st->offset;
1da177e4
LT
2135
2136 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2137 tw = cur;
2138 tw = tw_next(tw);
2139get_tw:
878628fb 2140 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1da177e4
LT
2141 tw = tw_next(tw);
2142 }
2143 if (tw) {
2144 cur = tw;
2145 goto out;
2146 }
9db66bdc 2147 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2148 st->state = TCP_SEQ_STATE_ESTABLISHED;
2149
6eac5604 2150 /* Look for the next non-empty bucket */
a8b690f9 2151 st->offset = 0;
f373b53b 2152 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
6eac5604
AK
2153 empty_bucket(st))
2154 ;
f373b53b 2155 if (st->bucket > tcp_hashinfo.ehash_mask)
6eac5604
AK
2156 return NULL;
2157
9db66bdc 2158 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
3ab5aee7 2159 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
1da177e4 2160 } else
3ab5aee7 2161 sk = sk_nulls_next(sk);
1da177e4 2162
3ab5aee7 2163 sk_nulls_for_each_from(sk, node) {
878628fb 2164 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1da177e4
LT
2165 goto found;
2166 }
2167
2168 st->state = TCP_SEQ_STATE_TIME_WAIT;
dbca9b27 2169 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
1da177e4
LT
2170 goto get_tw;
2171found:
2172 cur = sk;
2173out:
2174 return cur;
2175}
2176
2177static void *established_get_idx(struct seq_file *seq, loff_t pos)
2178{
a8b690f9
TH
2179 struct tcp_iter_state *st = seq->private;
2180 void *rc;
2181
2182 st->bucket = 0;
2183 rc = established_get_first(seq);
1da177e4
LT
2184
2185 while (rc && pos) {
2186 rc = established_get_next(seq, rc);
2187 --pos;
7174259e 2188 }
1da177e4
LT
2189 return rc;
2190}
2191
2192static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2193{
2194 void *rc;
5799de0b 2195 struct tcp_iter_state *st = seq->private;
1da177e4 2196
1da177e4
LT
2197 st->state = TCP_SEQ_STATE_LISTENING;
2198 rc = listening_get_idx(seq, &pos);
2199
2200 if (!rc) {
1da177e4
LT
2201 st->state = TCP_SEQ_STATE_ESTABLISHED;
2202 rc = established_get_idx(seq, pos);
2203 }
2204
2205 return rc;
2206}
2207
a8b690f9
TH
2208static void *tcp_seek_last_pos(struct seq_file *seq)
2209{
2210 struct tcp_iter_state *st = seq->private;
2211 int offset = st->offset;
2212 int orig_num = st->num;
2213 void *rc = NULL;
2214
2215 switch (st->state) {
2216 case TCP_SEQ_STATE_OPENREQ:
2217 case TCP_SEQ_STATE_LISTENING:
2218 if (st->bucket >= INET_LHTABLE_SIZE)
2219 break;
2220 st->state = TCP_SEQ_STATE_LISTENING;
2221 rc = listening_get_next(seq, NULL);
2222 while (offset-- && rc)
2223 rc = listening_get_next(seq, rc);
2224 if (rc)
2225 break;
2226 st->bucket = 0;
2227 /* Fallthrough */
2228 case TCP_SEQ_STATE_ESTABLISHED:
2229 case TCP_SEQ_STATE_TIME_WAIT:
2230 st->state = TCP_SEQ_STATE_ESTABLISHED;
2231 if (st->bucket > tcp_hashinfo.ehash_mask)
2232 break;
2233 rc = established_get_first(seq);
2234 while (offset-- && rc)
2235 rc = established_get_next(seq, rc);
2236 }
2237
2238 st->num = orig_num;
2239
2240 return rc;
2241}
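
tcp_seek_last_pos() lets a reopened seq_file resume where the last read stopped: st->bucket names the hash chain and st->offset the position inside it, so at most one chain is re-walked instead of the whole table. A small userspace sketch of the same (bucket, offset) resume scheme; the data and names are illustrative only:

/* Hedged sketch of (bucket, offset) resumable iteration, as in
 * tcp_seek_last_pos() above. Data and names are illustrative. */
#include <stdio.h>

#define NBUCKETS 4

static const char *buckets[NBUCKETS][3] = {
	{ "sk1", NULL }, { NULL }, { "sk2", "sk3", NULL }, { "sk4", NULL },
};

/* Return the entry at (bucket, offset) and advance the cursor. */
static const char *get_next(int *bucket, int *offset)
{
	while (*bucket < NBUCKETS) {
		const char *e = buckets[*bucket][*offset];

		if (e) {
			(*offset)++;
			return e;
		}
		(*bucket)++;	/* chain exhausted: next bucket, offset 0 */
		*offset = 0;
	}
	return NULL;
}

int main(void)
{
	int bucket = 0, offset = 0;
	const char *e;

	e = get_next(&bucket, &offset);		/* "sk1" */
	e = get_next(&bucket, &offset);		/* "sk2" */
	/* A later reader resumes from (bucket, offset) without rescanning. */
	while ((e = get_next(&bucket, &offset)))
		printf("%s\n", e);
	return 0;
}
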
2242
1da177e4
LT
2243static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2244{
5799de0b 2245 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2246 void *rc;
2247
2248 if (*pos && *pos == st->last_pos) {
2249 rc = tcp_seek_last_pos(seq);
2250 if (rc)
2251 goto out;
2252 }
2253
1da177e4
LT
2254 st->state = TCP_SEQ_STATE_LISTENING;
2255 st->num = 0;
a8b690f9
TH
2256 st->bucket = 0;
2257 st->offset = 0;
2258 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2259
2260out:
2261 st->last_pos = *pos;
2262 return rc;
1da177e4
LT
2263}
2264
2265static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2266{
a8b690f9 2267 struct tcp_iter_state *st = seq->private;
1da177e4 2268 void *rc = NULL;
1da177e4
LT
2269
2270 if (v == SEQ_START_TOKEN) {
2271 rc = tcp_get_idx(seq, 0);
2272 goto out;
2273 }
1da177e4
LT
2274
2275 switch (st->state) {
2276 case TCP_SEQ_STATE_OPENREQ:
2277 case TCP_SEQ_STATE_LISTENING:
2278 rc = listening_get_next(seq, v);
2279 if (!rc) {
1da177e4 2280 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2281 st->bucket = 0;
2282 st->offset = 0;
1da177e4
LT
2283 rc = established_get_first(seq);
2284 }
2285 break;
2286 case TCP_SEQ_STATE_ESTABLISHED:
2287 case TCP_SEQ_STATE_TIME_WAIT:
2288 rc = established_get_next(seq, v);
2289 break;
2290 }
2291out:
2292 ++*pos;
a8b690f9 2293 st->last_pos = *pos;
1da177e4
LT
2294 return rc;
2295}
2296
2297static void tcp_seq_stop(struct seq_file *seq, void *v)
2298{
5799de0b 2299 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2300
2301 switch (st->state) {
2302 case TCP_SEQ_STATE_OPENREQ:
2303 if (v) {
463c84b9
ACM
2304 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2305 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2306 }
2307 case TCP_SEQ_STATE_LISTENING:
2308 if (v != SEQ_START_TOKEN)
5caea4ea 2309 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4
LT
2310 break;
2311 case TCP_SEQ_STATE_TIME_WAIT:
2312 case TCP_SEQ_STATE_ESTABLISHED:
2313 if (v)
9db66bdc 2314 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2315 break;
2316 }
2317}
2318
2319static int tcp_seq_open(struct inode *inode, struct file *file)
2320{
2321 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1da177e4 2322 struct tcp_iter_state *s;
52d6f3f1 2323 int err;
1da177e4 2324
52d6f3f1
DL
2325 err = seq_open_net(inode, file, &afinfo->seq_ops,
2326 sizeof(struct tcp_iter_state));
2327 if (err < 0)
2328 return err;
f40c8174 2329
52d6f3f1 2330 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2331 s->family = afinfo->family;
a8b690f9 2332 s->last_pos = 0;
f40c8174
DL
2333 return 0;
2334}
2335
6f8b13bc 2336int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2337{
2338 int rc = 0;
2339 struct proc_dir_entry *p;
2340
68fcadd1
DL
2341 afinfo->seq_fops.open = tcp_seq_open;
2342 afinfo->seq_fops.read = seq_read;
2343 afinfo->seq_fops.llseek = seq_lseek;
2344 afinfo->seq_fops.release = seq_release_net;
7174259e 2345
9427c4b3
DL
2346 afinfo->seq_ops.start = tcp_seq_start;
2347 afinfo->seq_ops.next = tcp_seq_next;
2348 afinfo->seq_ops.stop = tcp_seq_stop;
2349
84841c3c
DL
2350 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2351 &afinfo->seq_fops, afinfo);
2352 if (!p)
1da177e4
LT
2353 rc = -ENOMEM;
2354 return rc;
2355}
4bc2f18b 2356EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2357
6f8b13bc 2358void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2359{
6f8b13bc 2360 proc_net_remove(net, afinfo->name);
1da177e4 2361}
4bc2f18b 2362EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2363
60236fdd 2364static void get_openreq4(struct sock *sk, struct request_sock *req,
5e659e4c 2365 struct seq_file *f, int i, int uid, int *len)
1da177e4 2366{
2e6599cb 2367 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
2368 int ttd = req->expires - jiffies;
2369
5e659e4c
PE
2370 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
1da177e4 2372 i,
2e6599cb 2373 ireq->loc_addr,
c720c7e8 2374 ntohs(inet_sk(sk)->inet_sport),
2e6599cb
ACM
2375 ireq->rmt_addr,
2376 ntohs(ireq->rmt_port),
1da177e4
LT
2377 TCP_SYN_RECV,
2378 0, 0, /* could print option size, but that is af dependent. */
2379 1, /* timers active (only the expire timer) */
2380 jiffies_to_clock_t(ttd),
2381 req->retrans,
2382 uid,
2383 0, /* non standard timer */
2384 0, /* open_requests have no inode */
2385 atomic_read(&sk->sk_refcnt),
5e659e4c
PE
2386 req,
2387 len);
1da177e4
LT
2388}
2389
5e659e4c 2390static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
1da177e4
LT
2391{
2392 int timer_active;
2393 unsigned long timer_expires;
cf4c6bf8
IJ
2394 struct tcp_sock *tp = tcp_sk(sk);
2395 const struct inet_connection_sock *icsk = inet_csk(sk);
2396 struct inet_sock *inet = inet_sk(sk);
c720c7e8
ED
2397 __be32 dest = inet->inet_daddr;
2398 __be32 src = inet->inet_rcv_saddr;
2399 __u16 destp = ntohs(inet->inet_dport);
2400 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2401 int rx_queue;
1da177e4 2402
463c84b9 2403 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1da177e4 2404 timer_active = 1;
463c84b9
ACM
2405 timer_expires = icsk->icsk_timeout;
2406 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2407 timer_active = 4;
463c84b9 2408 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2409 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2410 timer_active = 2;
cf4c6bf8 2411 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2412 } else {
2413 timer_active = 0;
2414 timer_expires = jiffies;
2415 }
2416
49d09007
ED
2417 if (sk->sk_state == TCP_LISTEN)
2418 rx_queue = sk->sk_ack_backlog;
2419 else
2420 /*
 2421 * Because we don't lock the socket, we might find a transient negative value.
2422 */
2423 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2424
5e659e4c 2425 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
7be87351 2426 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
cf4c6bf8 2427 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2428 tp->write_seq - tp->snd_una,
49d09007 2429 rx_queue,
1da177e4
LT
2430 timer_active,
2431 jiffies_to_clock_t(timer_expires - jiffies),
463c84b9 2432 icsk->icsk_retransmits,
cf4c6bf8 2433 sock_i_uid(sk),
6687e988 2434 icsk->icsk_probes_out,
cf4c6bf8
IJ
2435 sock_i_ino(sk),
2436 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2437 jiffies_to_clock_t(icsk->icsk_rto),
2438 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2439 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2440 tp->snd_cwnd,
0b6a05c1 2441 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
5e659e4c 2442 len);
1da177e4
LT
2443}
2444
7174259e 2445static void get_timewait4_sock(struct inet_timewait_sock *tw,
5e659e4c 2446 struct seq_file *f, int i, int *len)
1da177e4 2447{
23f33c2d 2448 __be32 dest, src;
1da177e4
LT
2449 __u16 destp, srcp;
2450 int ttd = tw->tw_ttd - jiffies;
2451
2452 if (ttd < 0)
2453 ttd = 0;
2454
2455 dest = tw->tw_daddr;
2456 src = tw->tw_rcv_saddr;
2457 destp = ntohs(tw->tw_dport);
2458 srcp = ntohs(tw->tw_sport);
2459
5e659e4c
PE
2460 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2461 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
1da177e4
LT
2462 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2463 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
5e659e4c 2464 atomic_read(&tw->tw_refcnt), tw, len);
1da177e4
LT
2465}
2466
2467#define TMPSZ 150
2468
2469static int tcp4_seq_show(struct seq_file *seq, void *v)
2470{
5799de0b 2471 struct tcp_iter_state *st;
5e659e4c 2472 int len;
1da177e4
LT
2473
2474 if (v == SEQ_START_TOKEN) {
2475 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2476 " sl local_address rem_address st tx_queue "
2477 "rx_queue tr tm->when retrnsmt uid timeout "
2478 "inode");
2479 goto out;
2480 }
2481 st = seq->private;
2482
2483 switch (st->state) {
2484 case TCP_SEQ_STATE_LISTENING:
2485 case TCP_SEQ_STATE_ESTABLISHED:
5e659e4c 2486 get_tcp4_sock(v, seq, st->num, &len);
1da177e4
LT
2487 break;
2488 case TCP_SEQ_STATE_OPENREQ:
5e659e4c 2489 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
1da177e4
LT
2490 break;
2491 case TCP_SEQ_STATE_TIME_WAIT:
5e659e4c 2492 get_timewait4_sock(v, seq, st->num, &len);
1da177e4
LT
2493 break;
2494 }
5e659e4c 2495 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
1da177e4
LT
2496out:
2497 return 0;
2498}
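
tcp4_seq_show() is the backend of /proc/net/tcp: one header line, then a fixed-width line per socket with addresses and ports printed as raw hex (see the format strings in get_tcp4_sock() and friends). A hedged sketch that parses the leading fields back out; it assumes it runs on the host that wrote the file, so the raw address bytes are already in the kernel's byte order:

/* Hedged sketch: read back the /proc/net/tcp format emitted above. */
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/tcp", "r");
	char line[256];
	unsigned int laddr, lport, raddr, rport, state;

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		/* "%4d: %08X:%04X %08X:%04X %02X ..." per get_tcp4_sock() */
		if (sscanf(line, "%*d: %X:%X %X:%X %X",
			   &laddr, &lport, &raddr, &rport, &state) == 5) {
			/* the hex dump is the raw __be32; same-host parse
			 * restores network byte order directly */
			struct in_addr a = { .s_addr = laddr };

			printf("%s:%u state %02X\n", inet_ntoa(a), lport, state);
		}
	}
	fclose(f);
	return 0;
}
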
2499
1da177e4 2500static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2501 .name = "tcp",
2502 .family = AF_INET,
5f4472c5
DL
2503 .seq_fops = {
2504 .owner = THIS_MODULE,
2505 },
9427c4b3
DL
2506 .seq_ops = {
2507 .show = tcp4_seq_show,
2508 },
1da177e4
LT
2509};
2510
2c8c1e72 2511static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2512{
2513 return tcp_proc_register(net, &tcp4_seq_afinfo);
2514}
2515
2c8c1e72 2516static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2517{
2518 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2519}
2520
2521static struct pernet_operations tcp4_net_ops = {
2522 .init = tcp4_proc_init_net,
2523 .exit = tcp4_proc_exit_net,
2524};
2525
1da177e4
LT
2526int __init tcp4_proc_init(void)
2527{
757764f6 2528 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2529}
2530
2531void tcp4_proc_exit(void)
2532{
757764f6 2533 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2534}
2535#endif /* CONFIG_PROC_FS */
2536
bf296b12
HX
2537struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2538{
b71d1d42 2539 const struct iphdr *iph = skb_gro_network_header(skb);
bf296b12
HX
2540
2541 switch (skb->ip_summed) {
2542 case CHECKSUM_COMPLETE:
86911732 2543 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
bf296b12
HX
2544 skb->csum)) {
2545 skb->ip_summed = CHECKSUM_UNNECESSARY;
2546 break;
2547 }
2548
2549 /* fall through */
2550 case CHECKSUM_NONE:
2551 NAPI_GRO_CB(skb)->flush = 1;
2552 return NULL;
2553 }
2554
2555 return tcp_gro_receive(head, skb);
2556}
bf296b12
HX
2557
2558int tcp4_gro_complete(struct sk_buff *skb)
2559{
b71d1d42 2560 const struct iphdr *iph = ip_hdr(skb);
bf296b12
HX
2561 struct tcphdr *th = tcp_hdr(skb);
2562
2563 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2564 iph->saddr, iph->daddr, 0);
2565 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2566
2567 return tcp_gro_complete(skb);
2568}
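
tcp4_gro_complete() reseeds th->check with ~tcp_v4_check(len, saddr, daddr, 0), i.e. the complement of the TCP/IPv4 pseudo-header sum, so later GSO or hardware code can finish the checksum over the payload. The pseudo-header sum itself is plain RFC 1071 one's-complement arithmetic over source, destination, protocol, and length; a self-contained sketch, with addresses given as illustrative host-order values:

/* Hedged sketch: RFC 1071-style TCP/IPv4 pseudo-header checksum seed,
 * mirroring what tcp_v4_check(len, saddr, daddr, 0) produces above. */
#include <stdint.h>
#include <stdio.h>

static uint16_t pseudo_hdr_csum(uint32_t saddr, uint32_t daddr, uint16_t len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);	/* source address */
	sum += (daddr >> 16) + (daddr & 0xffff);	/* destination address */
	sum += 6;					/* IPPROTO_TCP */
	sum += len;					/* TCP header + payload */
	while (sum >> 16)				/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;				/* one's complement */
}

int main(void)
{
	/* 192.0.2.1 -> 192.0.2.2, 40 bytes of TCP */
	printf("csum seed: 0x%04x\n",
	       pseudo_hdr_csum(0xc0000201, 0xc0000202, 40));
	return 0;
}
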
bf296b12 2569
1da177e4
LT
2570struct proto tcp_prot = {
2571 .name = "TCP",
2572 .owner = THIS_MODULE,
2573 .close = tcp_close,
2574 .connect = tcp_v4_connect,
2575 .disconnect = tcp_disconnect,
463c84b9 2576 .accept = inet_csk_accept,
1da177e4
LT
2577 .ioctl = tcp_ioctl,
2578 .init = tcp_v4_init_sock,
2579 .destroy = tcp_v4_destroy_sock,
2580 .shutdown = tcp_shutdown,
2581 .setsockopt = tcp_setsockopt,
2582 .getsockopt = tcp_getsockopt,
1da177e4 2583 .recvmsg = tcp_recvmsg,
7ba42910
CG
2584 .sendmsg = tcp_sendmsg,
2585 .sendpage = tcp_sendpage,
1da177e4 2586 .backlog_rcv = tcp_v4_do_rcv,
ab1e0a13
ACM
2587 .hash = inet_hash,
2588 .unhash = inet_unhash,
2589 .get_port = inet_csk_get_port,
1da177e4
LT
2590 .enter_memory_pressure = tcp_enter_memory_pressure,
2591 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2592 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2593 .memory_allocated = &tcp_memory_allocated,
2594 .memory_pressure = &tcp_memory_pressure,
2595 .sysctl_mem = sysctl_tcp_mem,
2596 .sysctl_wmem = sysctl_tcp_wmem,
2597 .sysctl_rmem = sysctl_tcp_rmem,
2598 .max_header = MAX_TCP_HEADER,
2599 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2600 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2601 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2602 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2603 .h.hashinfo = &tcp_hashinfo,
7ba42910 2604 .no_autobind = true,
543d9cfe
ACM
2605#ifdef CONFIG_COMPAT
2606 .compat_setsockopt = compat_tcp_setsockopt,
2607 .compat_getsockopt = compat_tcp_getsockopt,
2608#endif
1da177e4 2609};
4bc2f18b 2610EXPORT_SYMBOL(tcp_prot);
1da177e4 2611
046ee902
DL
2612
2613static int __net_init tcp_sk_init(struct net *net)
2614{
2615 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2616 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2617}
2618
2619static void __net_exit tcp_sk_exit(struct net *net)
2620{
2621 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
b099ce26
EB
2622}
2623
2624static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2625{
2626 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2627}
2628
2629static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2630 .init = tcp_sk_init,
2631 .exit = tcp_sk_exit,
2632 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2633};
2634
9b0f976f 2635void __init tcp_v4_init(void)
1da177e4 2636{
5caea4ea 2637 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2638 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2639 panic("Failed to create the TCP control socket.\n");
1da177e4 2640}