/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
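
/* Illustrative note, not part of the upstream file: the reuse path above
 * restarts the sequence space safely past the old connection. A minimal
 * sketch of the same arithmetic, with a hypothetical tw_snd_nxt:
 *
 *	u32 tw_snd_nxt = 0xfffe0000;			// old connection's next seq
 *	u32 write_seq  = tw_snd_nxt + 65535 + 2;	// beyond any 64KB window, plus SYN/FIN
 *	if (write_seq == 0)				// 0 is treated as "unset" here,
 *		write_seq = 1;				// so avoid it after wraparound
 */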
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
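
/* Illustrative sketch, not part of the upstream file: the userspace call
 * chain that reaches tcp_v4_connect(). Address and port are hypothetical.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(8080),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * connect(2) on a TCP socket dispatches through tcp_prot.connect, which is
 * this function.
 */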
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq);

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
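
/* Illustrative note, not part of the upstream file: the "err > 0" encoding
 * mentioned in the comment above packs ICMP type and code into one value:
 *
 *	int info = (ICMP_DEST_UNREACH << 8) | ICMP_FRAG_NEEDED;
 *	int type = info >> 8;	// ICMP_DEST_UNREACH (3)
 *	int code = info & 0xff;	// ICMP_FRAG_NEEDED (4)
 */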
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
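
/* Illustrative note, not part of the upstream file: with CHECKSUM_PARTIAL,
 * __tcp_v4_send_check() seeds th->check with only the folded pseudo-header
 * sum. The device then checksums from skb->csum_start to the end of the
 * packet and stores the result csum_offset bytes further in, i.e. at
 *
 *	skb->head + skb->csum_start + skb->csum_offset
 *
 * which is exactly &tcp_hdr(skb)->check.
 */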
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
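
/* Illustrative note, not part of the upstream file: for a segment without
 * the ACK bit, the RST built above acknowledges everything the offending
 * segment consumed. With hypothetical numbers:
 *
 *	seq = 1000, syn = 1, fin = 0, payload = len - (doff << 2) = 100
 *	rep.th.ack_seq = htonl(1000 + 1 + 0 + 100);	// = htonl(1101)
 */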
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      bool attach_req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, attach_req);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk) ||
					   lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif
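
/* Illustrative note, not part of the upstream file: per RFC 2385, the digest
 * computed above covers, in order: the IPv4 pseudo-header, the TCP header
 * with its checksum field zeroed (tcp_md5_hash_header), the segment payload
 * (tcp_md5_hash_skb_data), and finally the shared key (tcp_md5_hash_key).
 */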
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
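
/* Illustrative note, not part of the upstream file: when the first packet
 * lands on the prequeue, the code above also arms a shortened delayed-ACK
 * timer so the ACK is not stalled while the reader drains the queue. With a
 * hypothetical rto_min of 200 ms, (3 * 200) / 4 = 150 ms.
 */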
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk = NULL;

		sk = req->rsk_listener;
		if (tcp_v4_inbound_md5_hash(sk, skb))
			goto discard_and_relse;
		if (likely(sk->sk_state == TCP_LISTEN)) {
			nsk = tcp_check_req(sk, skb, req, false);
		} else {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		if (!nsk) {
			reqsk_put(req);
			goto discard_it;
		}
		if (nsk == sk) {
			sock_hold(sk);
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_it;
		} else {
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
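
/* Illustrative note, not part of the upstream file: TCP_SKB_CB(skb)->end_seq
 * as set in tcp_v4_rcv() counts SYN and FIN as one unit of sequence space
 * each, on top of the payload. With hypothetical numbers:
 *
 *	seq = 5000, syn = 0, fin = 1, payload = skb->len - th->doff * 4 = 1448
 *	end_seq = 5000 + 0 + 1 + 1448 = 6449
 */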
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
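
/* Illustrative note, not part of the upstream file: a hypothetical
 * /proc/net/tcp entry in the format printed above. Addresses and ports are
 * network-order hex; st 0A is TCP_LISTEN (here, a listener on *:22):
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff880012345678 100 0 0 10 0
 */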
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}