net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
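/* Derive the initial sequence number for this connection from the
 * addresses and ports of the received segment.
 */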
100 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 ip_hdr(skb)->saddr,
104 tcp_hdr(skb)->dest,
105 tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112
113 	/* With PAWS, this is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided the sequence
115 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's: only the timestamp cache is
118 	   held not per host but per port pair, and the TW bucket is used as the
119 	   state holder.
120 
121 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
122 	   and use the initial timestamp retrieved from the peer table.
123 	 */
124 if (tcptw->tw_ts_recent_stamp &&
125 (!twp || (sysctl_tcp_tw_reuse &&
126 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 if (tp->write_seq == 0)
129 tp->write_seq = 1;
130 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
131 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 sock_hold(sktw);
133 return 1;
134 }
135
136 return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 struct inet_sock *inet = inet_sk(sk);
145 struct tcp_sock *tp = tcp_sk(sk);
146 __be16 orig_sport, orig_dport;
147 __be32 daddr, nexthop;
148 struct flowi4 *fl4;
149 struct rtable *rt;
150 int err;
151 struct ip_options_rcu *inet_opt;
152
153 if (addr_len < sizeof(struct sockaddr_in))
154 return -EINVAL;
155
156 if (usin->sin_family != AF_INET)
157 return -EAFNOSUPPORT;
158
159 nexthop = daddr = usin->sin_addr.s_addr;
160 inet_opt = rcu_dereference_protected(inet->inet_opt,
161 sock_owned_by_user(sk));
162 if (inet_opt && inet_opt->opt.srr) {
163 if (!daddr)
164 return -EINVAL;
165 nexthop = inet_opt->opt.faddr;
166 }
167
168 orig_sport = inet->inet_sport;
169 orig_dport = usin->sin_port;
170 fl4 = &inet->cork.fl.u.ip4;
171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 IPPROTO_TCP,
174 orig_sport, orig_dport, sk);
175 if (IS_ERR(rt)) {
176 err = PTR_ERR(rt);
177 if (err == -ENETUNREACH)
178 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179 return err;
180 }
181
182 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183 ip_rt_put(rt);
184 return -ENETUNREACH;
185 }
186
187 if (!inet_opt || !inet_opt->opt.srr)
188 daddr = fl4->daddr;
189
190 if (!inet->inet_saddr)
191 inet->inet_saddr = fl4->saddr;
192 sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 /* Reset inherited state */
196 tp->rx_opt.ts_recent = 0;
197 tp->rx_opt.ts_recent_stamp = 0;
198 if (likely(!tp->repair))
199 tp->write_seq = 0;
200 }
201
202 if (tcp_death_row.sysctl_tw_recycle &&
203 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206 inet->inet_dport = usin->sin_port;
207 sk_daddr_set(sk, daddr);
208
209 inet_csk(sk)->icsk_ext_hdr_len = 0;
210 if (inet_opt)
211 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set state to SYN-SENT and, without releasing the socket
217 	 * lock, select a source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 tcp_set_state(sk, TCP_SYN_SENT);
221 err = inet_hash_connect(&tcp_death_row, sk);
222 if (err)
223 goto failure;
224
225 sk_set_txhash(sk);
226
227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 inet->inet_sport, inet->inet_dport, sk);
229 if (IS_ERR(rt)) {
230 err = PTR_ERR(rt);
231 rt = NULL;
232 goto failure;
233 }
234 /* OK, now commit destination to socket. */
235 sk->sk_gso_type = SKB_GSO_TCPV4;
236 sk_setup_caps(sk, &rt->dst);
237
238 if (!tp->write_seq && likely(!tp->repair))
239 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 inet->inet_daddr,
241 inet->inet_sport,
242 usin->sin_port);
243
244 inet->inet_id = tp->write_seq ^ jiffies;
245
246 err = tcp_connect(sk);
247
248 rt = NULL;
249 if (err)
250 goto failure;
251
252 return 0;
253
254 failure:
255 /*
256 * This unhashes the socket and releases the local port,
257 * if necessary.
258 */
259 tcp_set_state(sk, TCP_CLOSE);
260 ip_rt_put(rt);
261 sk->sk_route_caps = 0;
262 inet->inet_dport = 0;
263 return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266
267 /*
268 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269 * It can be called through tcp_release_cb() if the socket was owned by the user
270 * at the time tcp_v4_err() was called to handle the ICMP message.
271 */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 struct dst_entry *dst;
275 struct inet_sock *inet = inet_sk(sk);
276 u32 mtu = tcp_sk(sk)->mtu_info;
277
278 dst = inet_csk_update_pmtu(sk, mtu);
279 if (!dst)
280 return;
281
282 /* Something is about to go wrong... Remember the soft error
283 * in case this connection is not able to recover.
284 */
285 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 sk->sk_err_soft = EMSGSIZE;
287
288 mtu = dst_mtu(dst);
289
290 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 ip_sk_accept_pmtu(sk) &&
292 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 tcp_sync_mss(sk, mtu);
294
295 /* Resend the TCP packet because it's
296 * clear that the old packet has been
297 * dropped. This is the new "fast" path mtu
298 * discovery.
299 */
300 tcp_simple_retransmit(sk);
301 } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304
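/* Propagate an ICMP redirect to the dst cached on the socket, if any. */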
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309 if (dst)
310 dst->ops->redirect(dst, sk, skb);
311 }
312
313
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq)
316 {
317 struct request_sock *req = inet_reqsk(sk);
318 struct net *net = sock_net(sk);
319
320 /* ICMPs are not backlogged, hence we cannot get
321 * an established socket here.
322 */
323 WARN_ON(req->sk);
324
325 if (seq != tcp_rsk(req)->snt_isn) {
326 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327 } else {
328 /*
329 * Still in SYN_RECV, just remove it silently.
330 * There is no good way to pass the error to the newly
331 * created socket, and POSIX does not want network
332 * errors returned from accept().
333 */
334 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
335 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
336 }
337 reqsk_put(req);
338 }
339 EXPORT_SYMBOL(tcp_req_err);
340
341 /*
342 * This routine is called by the ICMP module when it gets some
343 * sort of error condition. If err < 0 then the socket should
344 * be closed and the error returned to the user. If err > 0
345 * it's just the icmp type << 8 | icmp code. After adjustment
346 * header points to the first 8 bytes of the tcp header. We need
347 * to find the appropriate port.
348 *
349 * The locking strategy used here is very "optimistic". When
350 * someone else accesses the socket the ICMP is just dropped
351 * and for some paths there is no check at all.
352 * A more general error queue to queue errors for later handling
353 * is probably better.
354 *
355 */
356
357 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
358 {
359 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
360 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
361 struct inet_connection_sock *icsk;
362 struct tcp_sock *tp;
363 struct inet_sock *inet;
364 const int type = icmp_hdr(icmp_skb)->type;
365 const int code = icmp_hdr(icmp_skb)->code;
366 struct sock *sk;
367 struct sk_buff *skb;
368 struct request_sock *fastopen;
369 __u32 seq, snd_una;
370 __u32 remaining;
371 int err;
372 struct net *net = dev_net(icmp_skb->dev);
373
374 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
375 th->dest, iph->saddr, ntohs(th->source),
376 inet_iif(icmp_skb));
377 if (!sk) {
378 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379 return;
380 }
381 if (sk->sk_state == TCP_TIME_WAIT) {
382 inet_twsk_put(inet_twsk(sk));
383 return;
384 }
385 seq = ntohl(th->seq);
386 if (sk->sk_state == TCP_NEW_SYN_RECV)
387 return tcp_req_err(sk, seq);
388
389 bh_lock_sock(sk);
390 /* If too many ICMPs get dropped on busy
391 * servers this needs to be solved differently.
392 * We do take care of the PMTU discovery (RFC1191) special case:
393 * we can receive locally generated ICMP messages while the socket is held.
394 */
395 if (sock_owned_by_user(sk)) {
396 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
397 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
398 }
399 if (sk->sk_state == TCP_CLOSE)
400 goto out;
401
402 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
403 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
404 goto out;
405 }
406
407 icsk = inet_csk(sk);
408 tp = tcp_sk(sk);
409 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
410 fastopen = tp->fastopen_rsk;
411 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
412 if (sk->sk_state != TCP_LISTEN &&
413 !between(seq, snd_una, tp->snd_nxt)) {
414 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
415 goto out;
416 }
417
418 switch (type) {
419 case ICMP_REDIRECT:
420 do_redirect(icmp_skb, sk);
421 goto out;
422 case ICMP_SOURCE_QUENCH:
423 /* Just silently ignore these. */
424 goto out;
425 case ICMP_PARAMETERPROB:
426 err = EPROTO;
427 break;
428 case ICMP_DEST_UNREACH:
429 if (code > NR_ICMP_UNREACH)
430 goto out;
431
432 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
433 /* We are not interested in TCP_LISTEN and open_requests
434 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
435 * they should go through unfragmented).
436 */
437 if (sk->sk_state == TCP_LISTEN)
438 goto out;
439
440 tp->mtu_info = info;
441 if (!sock_owned_by_user(sk)) {
442 tcp_v4_mtu_reduced(sk);
443 } else {
444 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
445 sock_hold(sk);
446 }
447 goto out;
448 }
449
450 err = icmp_err_convert[code].errno;
451 /* check if icmp_skb allows revert of backoff
452 * (see draft-zimmermann-tcp-lcd) */
453 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
454 break;
455 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
456 !icsk->icsk_backoff || fastopen)
457 break;
458
459 if (sock_owned_by_user(sk))
460 break;
461
462 icsk->icsk_backoff--;
463 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
464 TCP_TIMEOUT_INIT;
465 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
466
467 skb = tcp_write_queue_head(sk);
468 BUG_ON(!skb);
469
470 remaining = icsk->icsk_rto -
471 min(icsk->icsk_rto,
472 tcp_time_stamp - tcp_skb_timestamp(skb));
473
474 if (remaining) {
475 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
476 remaining, TCP_RTO_MAX);
477 } else {
478 /* RTO revert clocked out retransmission.
479 * Will retransmit now */
480 tcp_retransmit_timer(sk);
481 }
482
483 break;
484 case ICMP_TIME_EXCEEDED:
485 err = EHOSTUNREACH;
486 break;
487 default:
488 goto out;
489 }
490
491 switch (sk->sk_state) {
492 case TCP_SYN_SENT:
493 case TCP_SYN_RECV:
494 /* Only in fast or simultaneous open. If a fast open socket
495 * is already accepted it is treated as a connected one below.
496 */
497 if (fastopen && !fastopen->sk)
498 break;
499
500 if (!sock_owned_by_user(sk)) {
501 sk->sk_err = err;
502
503 sk->sk_error_report(sk);
504
505 tcp_done(sk);
506 } else {
507 sk->sk_err_soft = err;
508 }
509 goto out;
510 }
511
512 /* If we've already connected we will keep trying
513 * until we time out, or the user gives up.
514 *
515 * rfc1122 4.2.3.9 allows us to consider as hard errors
516 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
517 * but it is obsoleted by pmtu discovery).
518 *
519 * Note that in the modern internet, where routing is unreliable
520 * and broken firewalls sit in every dark corner, sending random
521 * errors ordered by their masters, even these two messages have finally
522 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
523 *
524 * Now we are in compliance with RFCs.
525 * --ANK (980905)
526 */
527
528 inet = inet_sk(sk);
529 if (!sock_owned_by_user(sk) && inet->recverr) {
530 sk->sk_err = err;
531 sk->sk_error_report(sk);
532 } else { /* Only an error on timeout */
533 sk->sk_err_soft = err;
534 }
535
536 out:
537 bh_unlock_sock(sk);
538 sock_put(sk);
539 }
540
541 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
542 {
543 struct tcphdr *th = tcp_hdr(skb);
544
545 if (skb->ip_summed == CHECKSUM_PARTIAL) {
546 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
547 skb->csum_start = skb_transport_header(skb) - skb->head;
548 skb->csum_offset = offsetof(struct tcphdr, check);
549 } else {
550 th->check = tcp_v4_check(skb->len, saddr, daddr,
551 csum_partial(th,
552 th->doff << 2,
553 skb->csum));
554 }
555 }
556
557 /* This routine computes an IPv4 TCP checksum. */
558 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
559 {
560 const struct inet_sock *inet = inet_sk(sk);
561
562 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
563 }
564 EXPORT_SYMBOL(tcp_v4_send_check);
565
566 /*
567 * This routine will send an RST to the other tcp.
568 *
569 * Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
570 * for the reset?
571 * Answer: if a packet caused an RST, it is not for a socket
572 * existing in our system; if it is matched to a socket,
573 * it is just a duplicate segment or a bug in the other side's TCP.
574 * So we build the reply based only on the parameters
575 * that arrived with the segment.
576 * Exception: precedence violation. We do not implement it in any case.
577 */
578
579 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
580 {
581 const struct tcphdr *th = tcp_hdr(skb);
582 struct {
583 struct tcphdr th;
584 #ifdef CONFIG_TCP_MD5SIG
585 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
586 #endif
587 } rep;
588 struct ip_reply_arg arg;
589 #ifdef CONFIG_TCP_MD5SIG
590 struct tcp_md5sig_key *key = NULL;
591 const __u8 *hash_location = NULL;
592 unsigned char newhash[16];
593 int genhash;
594 struct sock *sk1 = NULL;
595 #endif
596 struct net *net;
597
598 /* Never send a reset in response to a reset. */
599 if (th->rst)
600 return;
601
602 /* If sk is not NULL, it means we did a successful lookup and the incoming
603 * route had to be correct. prequeue might have dropped our dst.
604 */
605 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
606 return;
607
608 /* Swap the send and the receive. */
609 memset(&rep, 0, sizeof(rep));
610 rep.th.dest = th->source;
611 rep.th.source = th->dest;
612 rep.th.doff = sizeof(struct tcphdr) / 4;
613 rep.th.rst = 1;
614
615 if (th->ack) {
616 rep.th.seq = th->ack_seq;
617 } else {
618 rep.th.ack = 1;
619 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620 skb->len - (th->doff << 2));
621 }
622
623 memset(&arg, 0, sizeof(arg));
624 arg.iov[0].iov_base = (unsigned char *)&rep;
625 arg.iov[0].iov_len = sizeof(rep.th);
626
627 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
628 #ifdef CONFIG_TCP_MD5SIG
629 hash_location = tcp_parse_md5sig_option(th);
630 if (sk && sk_fullsock(sk)) {
631 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
632 &ip_hdr(skb)->saddr, AF_INET);
633 } else if (hash_location) {
634 /*
635 * The active side is lost. Try to find the listening socket through
636 * the source port, and then find the md5 key through the listening socket.
637 * We do not lose security here:
638 * the incoming packet is checked against the md5 hash of the found key;
639 * no RST is generated if the md5 hash doesn't match.
640 */
641 sk1 = __inet_lookup_listener(net,
642 &tcp_hashinfo, ip_hdr(skb)->saddr,
643 th->source, ip_hdr(skb)->daddr,
644 ntohs(th->source), inet_iif(skb));
645 /* don't send an RST if we can't find the key */
646 if (!sk1)
647 return;
648 rcu_read_lock();
649 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650 &ip_hdr(skb)->saddr, AF_INET);
651 if (!key)
652 goto release_sk1;
653
654 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655 if (genhash || memcmp(hash_location, newhash, 16) != 0)
656 goto release_sk1;
657 }
658
659 if (key) {
660 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
661 (TCPOPT_NOP << 16) |
662 (TCPOPT_MD5SIG << 8) |
663 TCPOLEN_MD5SIG);
664 /* Update length and the length the header thinks exists */
665 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
666 rep.th.doff = arg.iov[0].iov_len / 4;
667
668 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
669 key, ip_hdr(skb)->saddr,
670 ip_hdr(skb)->daddr, &rep.th);
671 }
672 #endif
673 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
674 ip_hdr(skb)->saddr, /* XXX */
675 arg.iov[0].iov_len, IPPROTO_TCP, 0);
676 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
677 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
678
679 /* When the socket is gone, all binding information is lost and
680 * routing might fail in this case. No choice here: if we force the
681 * input interface, we will misroute in case of an asymmetric route.
682 */
683 if (sk)
684 arg.bound_dev_if = sk->sk_bound_dev_if;
685
686 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
687 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
688
689 arg.tos = ip_hdr(skb)->tos;
690 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
691 skb, &TCP_SKB_CB(skb)->header.h4.opt,
692 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
693 &arg, arg.iov[0].iov_len);
694
695 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700 if (sk1) {
701 rcu_read_unlock();
702 sock_put(sk1);
703 }
704 #endif
705 }
706
707 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
708 outside of socket context, is certainly ugly. What can I do?
709 */
710
711 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
712 u32 win, u32 tsval, u32 tsecr, int oif,
713 struct tcp_md5sig_key *key,
714 int reply_flags, u8 tos)
715 {
716 const struct tcphdr *th = tcp_hdr(skb);
717 struct {
718 struct tcphdr th;
719 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
720 #ifdef CONFIG_TCP_MD5SIG
721 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
722 #endif
723 ];
724 } rep;
725 struct ip_reply_arg arg;
726 struct net *net = dev_net(skb_dst(skb)->dev);
727
728 memset(&rep.th, 0, sizeof(struct tcphdr));
729 memset(&arg, 0, sizeof(arg));
730
731 arg.iov[0].iov_base = (unsigned char *)&rep;
732 arg.iov[0].iov_len = sizeof(rep.th);
733 if (tsecr) {
734 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735 (TCPOPT_TIMESTAMP << 8) |
736 TCPOLEN_TIMESTAMP);
737 rep.opt[1] = htonl(tsval);
738 rep.opt[2] = htonl(tsecr);
739 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740 }
741
742 /* Swap the send and the receive. */
743 rep.th.dest = th->source;
744 rep.th.source = th->dest;
745 rep.th.doff = arg.iov[0].iov_len / 4;
746 rep.th.seq = htonl(seq);
747 rep.th.ack_seq = htonl(ack);
748 rep.th.ack = 1;
749 rep.th.window = htons(win);
750
751 #ifdef CONFIG_TCP_MD5SIG
752 if (key) {
753 int offset = (tsecr) ? 3 : 0;
754
755 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756 (TCPOPT_NOP << 16) |
757 (TCPOPT_MD5SIG << 8) |
758 TCPOLEN_MD5SIG);
759 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760 rep.th.doff = arg.iov[0].iov_len/4;
761
762 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763 key, ip_hdr(skb)->saddr,
764 ip_hdr(skb)->daddr, &rep.th);
765 }
766 #endif
767 arg.flags = reply_flags;
768 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769 ip_hdr(skb)->saddr, /* XXX */
770 arg.iov[0].iov_len, IPPROTO_TCP, 0);
771 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772 if (oif)
773 arg.bound_dev_if = oif;
774 arg.tos = tos;
775 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
776 skb, &TCP_SKB_CB(skb)->header.h4.opt,
777 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778 &arg, arg.iov[0].iov_len);
779
780 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
781 }
782
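/* Answer a segment aimed at a TIME-WAIT socket with an ACK built from the
 * sequence numbers, window and timestamps recorded in the timewait sock.
 */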
783 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
784 {
785 struct inet_timewait_sock *tw = inet_twsk(sk);
786 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
787
788 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
789 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
790 tcp_time_stamp + tcptw->tw_ts_offset,
791 tcptw->tw_ts_recent,
792 tw->tw_bound_dev_if,
793 tcp_twsk_md5_key(tcptw),
794 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
795 tw->tw_tos
796 );
797
798 inet_twsk_put(tw);
799 }
800
801 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
802 struct request_sock *req)
803 {
804 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
805 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
806 */
807 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
808 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
809 tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
810 tcp_time_stamp,
811 req->ts_recent,
812 0,
813 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
814 AF_INET),
815 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
816 ip_hdr(skb)->tos);
817 }
818
819 /*
820 * Send a SYN-ACK after having received a SYN.
821 * This still operates on a request_sock only, not on a big
822 * socket.
823 */
824 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
825 struct flowi *fl,
826 struct request_sock *req,
827 struct tcp_fastopen_cookie *foc,
828 bool attach_req)
829 {
830 const struct inet_request_sock *ireq = inet_rsk(req);
831 struct flowi4 fl4;
832 int err = -1;
833 struct sk_buff *skb;
834
835 /* First, grab a route. */
836 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
837 return -1;
838
839 skb = tcp_make_synack(sk, dst, req, foc, attach_req);
840
841 if (skb) {
842 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
843
844 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
845 ireq->ir_rmt_addr,
846 ireq->opt);
847 err = net_xmit_eval(err);
848 }
849
850 return err;
851 }
852
853 /*
854 * IPv4 request_sock destructor.
855 */
856 static void tcp_v4_reqsk_destructor(struct request_sock *req)
857 {
858 kfree(inet_rsk(req)->opt);
859 }
860
861
862 #ifdef CONFIG_TCP_MD5SIG
863 /*
864 * RFC2385 MD5 checksumming requires a mapping of
865 * IP address->MD5 Key.
866 * We need to maintain these in the sk structure.
867 */
868
869 /* Find the Key structure for an address. */
870 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
871 const union tcp_md5_addr *addr,
872 int family)
873 {
874 const struct tcp_sock *tp = tcp_sk(sk);
875 struct tcp_md5sig_key *key;
876 unsigned int size = sizeof(struct in_addr);
877 const struct tcp_md5sig_info *md5sig;
878
879 /* caller either holds rcu_read_lock() or socket lock */
880 md5sig = rcu_dereference_check(tp->md5sig_info,
881 sock_owned_by_user(sk) ||
882 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
883 if (!md5sig)
884 return NULL;
885 #if IS_ENABLED(CONFIG_IPV6)
886 if (family == AF_INET6)
887 size = sizeof(struct in6_addr);
888 #endif
889 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
890 if (key->family != family)
891 continue;
892 if (!memcmp(&key->addr, addr, size))
893 return key;
894 }
895 return NULL;
896 }
897 EXPORT_SYMBOL(tcp_md5_do_lookup);
898
899 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
900 const struct sock *addr_sk)
901 {
902 const union tcp_md5_addr *addr;
903
904 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
905 return tcp_md5_do_lookup(sk, addr, AF_INET);
906 }
907 EXPORT_SYMBOL(tcp_v4_md5_lookup);
908
909 /* This can be called on a newly created socket, from other files */
910 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
911 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
912 {
913 /* Add Key to the list */
914 struct tcp_md5sig_key *key;
915 struct tcp_sock *tp = tcp_sk(sk);
916 struct tcp_md5sig_info *md5sig;
917
918 key = tcp_md5_do_lookup(sk, addr, family);
919 if (key) {
920 /* Pre-existing entry - just update that one. */
921 memcpy(key->key, newkey, newkeylen);
922 key->keylen = newkeylen;
923 return 0;
924 }
925
926 md5sig = rcu_dereference_protected(tp->md5sig_info,
927 sock_owned_by_user(sk) ||
928 lockdep_is_held(&sk->sk_lock.slock));
929 if (!md5sig) {
930 md5sig = kmalloc(sizeof(*md5sig), gfp);
931 if (!md5sig)
932 return -ENOMEM;
933
934 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
935 INIT_HLIST_HEAD(&md5sig->head);
936 rcu_assign_pointer(tp->md5sig_info, md5sig);
937 }
938
939 key = sock_kmalloc(sk, sizeof(*key), gfp);
940 if (!key)
941 return -ENOMEM;
942 if (!tcp_alloc_md5sig_pool()) {
943 sock_kfree_s(sk, key, sizeof(*key));
944 return -ENOMEM;
945 }
946
947 memcpy(key->key, newkey, newkeylen);
948 key->keylen = newkeylen;
949 key->family = family;
950 memcpy(&key->addr, addr,
951 (family == AF_INET6) ? sizeof(struct in6_addr) :
952 sizeof(struct in_addr));
953 hlist_add_head_rcu(&key->node, &md5sig->head);
954 return 0;
955 }
956 EXPORT_SYMBOL(tcp_md5_do_add);
957
958 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
959 {
960 struct tcp_md5sig_key *key;
961
962 key = tcp_md5_do_lookup(sk, addr, family);
963 if (!key)
964 return -ENOENT;
965 hlist_del_rcu(&key->node);
966 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
967 kfree_rcu(key, rcu);
968 return 0;
969 }
970 EXPORT_SYMBOL(tcp_md5_do_del);
971
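/* Free every MD5 key attached to the socket; used when the socket is
 * being destroyed.
 */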
972 static void tcp_clear_md5_list(struct sock *sk)
973 {
974 struct tcp_sock *tp = tcp_sk(sk);
975 struct tcp_md5sig_key *key;
976 struct hlist_node *n;
977 struct tcp_md5sig_info *md5sig;
978
979 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
980
981 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
982 hlist_del_rcu(&key->node);
983 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
984 kfree_rcu(key, rcu);
985 }
986 }
987
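/* TCP_MD5SIG setsockopt() handler: copy the key description from user space
 * and add the per-peer key, or delete it when a zero key length is passed.
 */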
988 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
989 int optlen)
990 {
991 struct tcp_md5sig cmd;
992 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
993
994 if (optlen < sizeof(cmd))
995 return -EINVAL;
996
997 if (copy_from_user(&cmd, optval, sizeof(cmd)))
998 return -EFAULT;
999
1000 if (sin->sin_family != AF_INET)
1001 return -EINVAL;
1002
1003 if (!cmd.tcpm_keylen)
1004 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1005 AF_INET);
1006
1007 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1008 return -EINVAL;
1009
1010 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1012 GFP_KERNEL);
1013 }
1014
1015 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1016 __be32 daddr, __be32 saddr, int nbytes)
1017 {
1018 struct tcp4_pseudohdr *bp;
1019 struct scatterlist sg;
1020
1021 bp = &hp->md5_blk.ip4;
1022
1023 /*
1024 * 1. the TCP pseudo-header (in the order: source IP address,
1025 * destination IP address, zero-padded protocol number, and
1026 * segment length)
1027 */
1028 bp->saddr = saddr;
1029 bp->daddr = daddr;
1030 bp->pad = 0;
1031 bp->protocol = IPPROTO_TCP;
1032 bp->len = cpu_to_be16(nbytes);
1033
1034 sg_init_one(&sg, bp, sizeof(*bp));
1035 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1036 }
1037
1038 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1039 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1040 {
1041 struct tcp_md5sig_pool *hp;
1042 struct hash_desc *desc;
1043
1044 hp = tcp_get_md5sig_pool();
1045 if (!hp)
1046 goto clear_hash_noput;
1047 desc = &hp->md5_desc;
1048
1049 if (crypto_hash_init(desc))
1050 goto clear_hash;
1051 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1052 goto clear_hash;
1053 if (tcp_md5_hash_header(hp, th))
1054 goto clear_hash;
1055 if (tcp_md5_hash_key(hp, key))
1056 goto clear_hash;
1057 if (crypto_hash_final(desc, md5_hash))
1058 goto clear_hash;
1059
1060 tcp_put_md5sig_pool();
1061 return 0;
1062
1063 clear_hash:
1064 tcp_put_md5sig_pool();
1065 clear_hash_noput:
1066 memset(md5_hash, 0, 16);
1067 return 1;
1068 }
1069
1070 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1071 const struct sock *sk,
1072 const struct sk_buff *skb)
1073 {
1074 struct tcp_md5sig_pool *hp;
1075 struct hash_desc *desc;
1076 const struct tcphdr *th = tcp_hdr(skb);
1077 __be32 saddr, daddr;
1078
1079 if (sk) { /* valid for establish/request sockets */
1080 saddr = sk->sk_rcv_saddr;
1081 daddr = sk->sk_daddr;
1082 } else {
1083 const struct iphdr *iph = ip_hdr(skb);
1084 saddr = iph->saddr;
1085 daddr = iph->daddr;
1086 }
1087
1088 hp = tcp_get_md5sig_pool();
1089 if (!hp)
1090 goto clear_hash_noput;
1091 desc = &hp->md5_desc;
1092
1093 if (crypto_hash_init(desc))
1094 goto clear_hash;
1095
1096 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1097 goto clear_hash;
1098 if (tcp_md5_hash_header(hp, th))
1099 goto clear_hash;
1100 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1101 goto clear_hash;
1102 if (tcp_md5_hash_key(hp, key))
1103 goto clear_hash;
1104 if (crypto_hash_final(desc, md5_hash))
1105 goto clear_hash;
1106
1107 tcp_put_md5sig_pool();
1108 return 0;
1109
1110 clear_hash:
1111 tcp_put_md5sig_pool();
1112 clear_hash_noput:
1113 memset(md5_hash, 0, 16);
1114 return 1;
1115 }
1116 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1117
1118 #endif
1119
1120 /* Called with rcu_read_lock() */
1121 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1122 const struct sk_buff *skb)
1123 {
1124 #ifdef CONFIG_TCP_MD5SIG
1125 /*
1126 * This gets called for each TCP segment that arrives
1127 * so we want to be efficient.
1128 * We have 3 drop cases:
1129 * o No MD5 hash and one expected.
1130 * o MD5 hash and we're not expecting one.
1131 * o MD5 hash and it's wrong.
1132 */
1133 const __u8 *hash_location = NULL;
1134 struct tcp_md5sig_key *hash_expected;
1135 const struct iphdr *iph = ip_hdr(skb);
1136 const struct tcphdr *th = tcp_hdr(skb);
1137 int genhash;
1138 unsigned char newhash[16];
1139
1140 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1141 AF_INET);
1142 hash_location = tcp_parse_md5sig_option(th);
1143
1144 /* We've parsed the options - do we have a hash? */
1145 if (!hash_expected && !hash_location)
1146 return false;
1147
1148 if (hash_expected && !hash_location) {
1149 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1150 return true;
1151 }
1152
1153 if (!hash_expected && hash_location) {
1154 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1155 return true;
1156 }
1157
1158 /* Okay, so this is hash_expected and hash_location -
1159 * so we need to calculate the checksum.
1160 */
1161 genhash = tcp_v4_md5_hash_skb(newhash,
1162 hash_expected,
1163 NULL, skb);
1164
1165 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1166 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1167 &iph->saddr, ntohs(th->source),
1168 &iph->daddr, ntohs(th->dest),
1169 genhash ? " tcp_v4_calc_md5_hash failed"
1170 : "");
1171 return true;
1172 }
1173 return false;
1174 #endif
1175 return false;
1176 }
1177
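/* Fill in the IPv4-specific fields of a new request sock from the incoming
 * SYN: peer and local addresses, transparent-proxy flag and saved IP options.
 */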
1178 static void tcp_v4_init_req(struct request_sock *req,
1179 const struct sock *sk_listener,
1180 struct sk_buff *skb)
1181 {
1182 struct inet_request_sock *ireq = inet_rsk(req);
1183
1184 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1185 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1186 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1187 ireq->opt = tcp_v4_save_options(skb);
1188 }
1189
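/* Route the SYN-ACK for a request sock; when asked, report via *strict
 * whether the chosen destination matches the peer address exactly.
 */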
1190 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1191 struct flowi *fl,
1192 const struct request_sock *req,
1193 bool *strict)
1194 {
1195 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1196
1197 if (strict) {
1198 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1199 *strict = true;
1200 else
1201 *strict = false;
1202 }
1203
1204 return dst;
1205 }
1206
1207 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1208 .family = PF_INET,
1209 .obj_size = sizeof(struct tcp_request_sock),
1210 .rtx_syn_ack = tcp_rtx_synack,
1211 .send_ack = tcp_v4_reqsk_send_ack,
1212 .destructor = tcp_v4_reqsk_destructor,
1213 .send_reset = tcp_v4_send_reset,
1214 .syn_ack_timeout = tcp_syn_ack_timeout,
1215 };
1216
1217 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1218 .mss_clamp = TCP_MSS_DEFAULT,
1219 #ifdef CONFIG_TCP_MD5SIG
1220 .req_md5_lookup = tcp_v4_md5_lookup,
1221 .calc_md5_hash = tcp_v4_md5_hash_skb,
1222 #endif
1223 .init_req = tcp_v4_init_req,
1224 #ifdef CONFIG_SYN_COOKIES
1225 .cookie_init_seq = cookie_v4_init_sequence,
1226 #endif
1227 .route_req = tcp_v4_route_req,
1228 .init_seq = tcp_v4_init_sequence,
1229 .send_synack = tcp_v4_send_synack,
1230 };
1231
1232 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1233 {
1234 /* Never answer SYNs sent to broadcast or multicast */
1235 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1236 goto drop;
1237
1238 return tcp_conn_request(&tcp_request_sock_ops,
1239 &tcp_request_sock_ipv4_ops, sk, skb);
1240
1241 drop:
1242 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1243 return 0;
1244 }
1245 EXPORT_SYMBOL(tcp_v4_conn_request);
1246
1247
1248 /*
1249 * The three way handshake has completed - we got a valid synack -
1250 * now create the new socket.
1251 */
1252 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1253 struct request_sock *req,
1254 struct dst_entry *dst,
1255 struct request_sock *req_unhash,
1256 bool *own_req)
1257 {
1258 struct inet_request_sock *ireq;
1259 struct inet_sock *newinet;
1260 struct tcp_sock *newtp;
1261 struct sock *newsk;
1262 #ifdef CONFIG_TCP_MD5SIG
1263 struct tcp_md5sig_key *key;
1264 #endif
1265 struct ip_options_rcu *inet_opt;
1266
1267 if (sk_acceptq_is_full(sk))
1268 goto exit_overflow;
1269
1270 newsk = tcp_create_openreq_child(sk, req, skb);
1271 if (!newsk)
1272 goto exit_nonewsk;
1273
1274 newsk->sk_gso_type = SKB_GSO_TCPV4;
1275 inet_sk_rx_dst_set(newsk, skb);
1276
1277 newtp = tcp_sk(newsk);
1278 newinet = inet_sk(newsk);
1279 ireq = inet_rsk(req);
1280 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1281 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1282 newsk->sk_bound_dev_if = ireq->ir_iif;
1283 newinet->inet_saddr = ireq->ir_loc_addr;
1284 inet_opt = ireq->opt;
1285 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1286 ireq->opt = NULL;
1287 newinet->mc_index = inet_iif(skb);
1288 newinet->mc_ttl = ip_hdr(skb)->ttl;
1289 newinet->rcv_tos = ip_hdr(skb)->tos;
1290 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1291 if (inet_opt)
1292 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1293 newinet->inet_id = newtp->write_seq ^ jiffies;
1294
1295 if (!dst) {
1296 dst = inet_csk_route_child_sock(sk, newsk, req);
1297 if (!dst)
1298 goto put_and_exit;
1299 } else {
1300 /* syncookie case : see end of cookie_v4_check() */
1301 }
1302 sk_setup_caps(newsk, dst);
1303
1304 tcp_ca_openreq_child(newsk, dst);
1305
1306 tcp_sync_mss(newsk, dst_mtu(dst));
1307 newtp->advmss = dst_metric_advmss(dst);
1308 if (tcp_sk(sk)->rx_opt.user_mss &&
1309 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1310 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1311
1312 tcp_initialize_rcv_mss(newsk);
1313
1314 #ifdef CONFIG_TCP_MD5SIG
1315 /* Copy over the MD5 key from the original socket */
1316 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1317 AF_INET);
1318 if (key) {
1319 /*
1320 * We're using one, so create a matching key
1321 * on the newsk structure. If we fail to get
1322 * memory, then we end up not copying the key
1323 * across. Shucks.
1324 */
1325 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1326 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1327 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1328 }
1329 #endif
1330
1331 if (__inet_inherit_port(sk, newsk) < 0)
1332 goto put_and_exit;
1333 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1334 if (*own_req)
1335 tcp_move_syn(newtp, req);
1336
1337 return newsk;
1338
1339 exit_overflow:
1340 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1341 exit_nonewsk:
1342 dst_release(dst);
1343 exit:
1344 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1345 return NULL;
1346 put_and_exit:
1347 inet_csk_prepare_forced_close(newsk);
1348 tcp_done(newsk);
1349 goto exit;
1350 }
1351 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1352
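/* For a non-SYN segment hitting a listener, try to recover a connection from
 * a SYN cookie; without CONFIG_SYN_COOKIES this is a no-op.
 */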
1353 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1354 {
1355 #ifdef CONFIG_SYN_COOKIES
1356 const struct tcphdr *th = tcp_hdr(skb);
1357
1358 if (!th->syn)
1359 sk = cookie_v4_check(sk, skb);
1360 #endif
1361 return sk;
1362 }
1363
1364 /* The socket must have its spinlock held when we get
1365 * here, unless it is a TCP_LISTEN socket.
1366 *
1367 * We have a potential double-lock case here, so even when
1368 * doing backlog processing we use the BH locking scheme.
1369 * This is because we cannot sleep with the original spinlock
1370 * held.
1371 */
1372 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1373 {
1374 struct sock *rsk;
1375
1376 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1377 struct dst_entry *dst = sk->sk_rx_dst;
1378
1379 sock_rps_save_rxhash(sk, skb);
1380 sk_mark_napi_id(sk, skb);
1381 if (dst) {
1382 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1383 !dst->ops->check(dst, 0)) {
1384 dst_release(dst);
1385 sk->sk_rx_dst = NULL;
1386 }
1387 }
1388 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1389 return 0;
1390 }
1391
1392 if (tcp_checksum_complete(skb))
1393 goto csum_err;
1394
1395 if (sk->sk_state == TCP_LISTEN) {
1396 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1397
1398 if (!nsk)
1399 goto discard;
1400 if (nsk != sk) {
1401 sock_rps_save_rxhash(nsk, skb);
1402 sk_mark_napi_id(nsk, skb);
1403 if (tcp_child_process(sk, nsk, skb)) {
1404 rsk = nsk;
1405 goto reset;
1406 }
1407 return 0;
1408 }
1409 } else
1410 sock_rps_save_rxhash(sk, skb);
1411
1412 if (tcp_rcv_state_process(sk, skb)) {
1413 rsk = sk;
1414 goto reset;
1415 }
1416 return 0;
1417
1418 reset:
1419 tcp_v4_send_reset(rsk, skb);
1420 discard:
1421 kfree_skb(skb);
1422 /* Be careful here. If this function gets more complicated and
1423 * gcc suffers from register pressure on the x86, sk (in %ebx)
1424 * might be destroyed here. This current version compiles correctly,
1425 * but you have been warned.
1426 */
1427 return 0;
1428
1429 csum_err:
1430 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1431 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1432 goto discard;
1433 }
1434 EXPORT_SYMBOL(tcp_v4_do_rcv);
1435
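/* Early demux: look up an established socket for the incoming segment before
 * routing, and reuse its cached rx dst when it is still valid for this device.
 */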
1436 void tcp_v4_early_demux(struct sk_buff *skb)
1437 {
1438 const struct iphdr *iph;
1439 const struct tcphdr *th;
1440 struct sock *sk;
1441
1442 if (skb->pkt_type != PACKET_HOST)
1443 return;
1444
1445 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1446 return;
1447
1448 iph = ip_hdr(skb);
1449 th = tcp_hdr(skb);
1450
1451 if (th->doff < sizeof(struct tcphdr) / 4)
1452 return;
1453
1454 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1455 iph->saddr, th->source,
1456 iph->daddr, ntohs(th->dest),
1457 skb->skb_iif);
1458 if (sk) {
1459 skb->sk = sk;
1460 skb->destructor = sock_edemux;
1461 if (sk_fullsock(sk)) {
1462 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1463
1464 if (dst)
1465 dst = dst_check(dst, 0);
1466 if (dst &&
1467 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1468 skb_dst_set_noref(skb, dst);
1469 }
1470 }
1471 }
1472
1473 /* Packet is added to VJ-style prequeue for processing in process
1474 * context, if a reader task is waiting. Apparently, this exciting
1475 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1476 * failed somewhere. Latency? Burstiness? Well, at least now we will
1477 * see why it failed. 8)8) --ANK
1478 *
1479 */
1480 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1481 {
1482 struct tcp_sock *tp = tcp_sk(sk);
1483
1484 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1485 return false;
1486
1487 if (skb->len <= tcp_hdrlen(skb) &&
1488 skb_queue_len(&tp->ucopy.prequeue) == 0)
1489 return false;
1490
1491 /* Before escaping the RCU protected region, we need to take care of the
1492 * skb dst. Prequeue is only enabled for established sockets.
1493 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1494 * Instead of doing a full sk_rx_dst validity check here, let's perform
1495 * an optimistic check.
1496 */
1497 if (likely(sk->sk_rx_dst))
1498 skb_dst_drop(skb);
1499 else
1500 skb_dst_force_safe(skb);
1501
1502 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1503 tp->ucopy.memory += skb->truesize;
1504 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1505 struct sk_buff *skb1;
1506
1507 BUG_ON(sock_owned_by_user(sk));
1508
1509 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1510 sk_backlog_rcv(sk, skb1);
1511 NET_INC_STATS_BH(sock_net(sk),
1512 LINUX_MIB_TCPPREQUEUEDROPPED);
1513 }
1514
1515 tp->ucopy.memory = 0;
1516 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1517 wake_up_interruptible_sync_poll(sk_sleep(sk),
1518 POLLIN | POLLRDNORM | POLLRDBAND);
1519 if (!inet_csk_ack_scheduled(sk))
1520 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1521 (3 * tcp_rto_min(sk)) / 4,
1522 TCP_RTO_MAX);
1523 }
1524 return true;
1525 }
1526 EXPORT_SYMBOL(tcp_prequeue);
1527
1528 /*
1529 * From tcp_input.c
1530 */
1531
1532 int tcp_v4_rcv(struct sk_buff *skb)
1533 {
1534 const struct iphdr *iph;
1535 const struct tcphdr *th;
1536 struct sock *sk;
1537 int ret;
1538 struct net *net = dev_net(skb->dev);
1539
1540 if (skb->pkt_type != PACKET_HOST)
1541 goto discard_it;
1542
1543 /* Count it even if it's bad */
1544 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1545
1546 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1547 goto discard_it;
1548
1549 th = tcp_hdr(skb);
1550
1551 if (th->doff < sizeof(struct tcphdr) / 4)
1552 goto bad_packet;
1553 if (!pskb_may_pull(skb, th->doff * 4))
1554 goto discard_it;
1555
1556 /* An explanation is required here, I think.
1557 * Packet length and doff are validated by header prediction,
1558 * provided case of th->doff==0 is eliminated.
1559 * So, we defer the checks. */
1560
1561 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1562 goto csum_error;
1563
1564 th = tcp_hdr(skb);
1565 iph = ip_hdr(skb);
1566 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1567 * barrier() makes sure the compiler won't play fool^Waliasing games.
1568 */
1569 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1570 sizeof(struct inet_skb_parm));
1571 barrier();
1572
1573 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1574 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1575 skb->len - th->doff * 4);
1576 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1577 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1578 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1579 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1580 TCP_SKB_CB(skb)->sacked = 0;
1581
1582 lookup:
1583 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1584 if (!sk)
1585 goto no_tcp_socket;
1586
1587 process:
1588 if (sk->sk_state == TCP_TIME_WAIT)
1589 goto do_time_wait;
1590
1591 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1592 struct request_sock *req = inet_reqsk(sk);
1593 struct sock *nsk = NULL;
1594
1595 sk = req->rsk_listener;
1596 if (tcp_v4_inbound_md5_hash(sk, skb))
1597 goto discard_and_relse;
1598 if (likely(sk->sk_state == TCP_LISTEN)) {
1599 nsk = tcp_check_req(sk, skb, req, false);
1600 } else {
1601 inet_csk_reqsk_queue_drop_and_put(sk, req);
1602 goto lookup;
1603 }
1604 if (!nsk) {
1605 reqsk_put(req);
1606 goto discard_it;
1607 }
1608 if (nsk == sk) {
1609 sock_hold(sk);
1610 reqsk_put(req);
1611 } else if (tcp_child_process(sk, nsk, skb)) {
1612 tcp_v4_send_reset(nsk, skb);
1613 goto discard_it;
1614 } else {
1615 return 0;
1616 }
1617 }
1618 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1619 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1620 goto discard_and_relse;
1621 }
1622
1623 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1624 goto discard_and_relse;
1625
1626 if (tcp_v4_inbound_md5_hash(sk, skb))
1627 goto discard_and_relse;
1628
1629 nf_reset(skb);
1630
1631 if (sk_filter(sk, skb))
1632 goto discard_and_relse;
1633
1634 skb->dev = NULL;
1635
1636 if (sk->sk_state == TCP_LISTEN) {
1637 ret = tcp_v4_do_rcv(sk, skb);
1638 goto put_and_return;
1639 }
1640
1641 sk_incoming_cpu_update(sk);
1642
1643 bh_lock_sock_nested(sk);
1644 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1645 ret = 0;
1646 if (!sock_owned_by_user(sk)) {
1647 if (!tcp_prequeue(sk, skb))
1648 ret = tcp_v4_do_rcv(sk, skb);
1649 } else if (unlikely(sk_add_backlog(sk, skb,
1650 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1651 bh_unlock_sock(sk);
1652 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1653 goto discard_and_relse;
1654 }
1655 bh_unlock_sock(sk);
1656
1657 put_and_return:
1658 sock_put(sk);
1659
1660 return ret;
1661
1662 no_tcp_socket:
1663 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1664 goto discard_it;
1665
1666 if (tcp_checksum_complete(skb)) {
1667 csum_error:
1668 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1669 bad_packet:
1670 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1671 } else {
1672 tcp_v4_send_reset(NULL, skb);
1673 }
1674
1675 discard_it:
1676 /* Discard frame. */
1677 kfree_skb(skb);
1678 return 0;
1679
1680 discard_and_relse:
1681 sock_put(sk);
1682 goto discard_it;
1683
1684 do_time_wait:
1685 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1686 inet_twsk_put(inet_twsk(sk));
1687 goto discard_it;
1688 }
1689
1690 if (tcp_checksum_complete(skb)) {
1691 inet_twsk_put(inet_twsk(sk));
1692 goto csum_error;
1693 }
1694 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1695 case TCP_TW_SYN: {
1696 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1697 &tcp_hashinfo,
1698 iph->saddr, th->source,
1699 iph->daddr, th->dest,
1700 inet_iif(skb));
1701 if (sk2) {
1702 inet_twsk_deschedule_put(inet_twsk(sk));
1703 sk = sk2;
1704 goto process;
1705 }
1706 /* Fall through to ACK */
1707 }
1708 case TCP_TW_ACK:
1709 tcp_v4_timewait_ack(sk, skb);
1710 break;
1711 case TCP_TW_RST:
1712 tcp_v4_send_reset(sk, skb);
1713 inet_twsk_deschedule_put(inet_twsk(sk));
1714 goto discard_it;
1715 case TCP_TW_SUCCESS:;
1716 }
1717 goto discard_it;
1718 }
1719
1720 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1721 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1722 .twsk_unique = tcp_twsk_unique,
1723 .twsk_destructor= tcp_twsk_destructor,
1724 };
1725
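/* Cache the skb's dst and input interface on the socket so that later
 * segments can skip the routing lookup.
 */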
1726 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1727 {
1728 struct dst_entry *dst = skb_dst(skb);
1729
1730 if (dst && dst_hold_safe(dst)) {
1731 sk->sk_rx_dst = dst;
1732 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1733 }
1734 }
1735 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1736
1737 const struct inet_connection_sock_af_ops ipv4_specific = {
1738 .queue_xmit = ip_queue_xmit,
1739 .send_check = tcp_v4_send_check,
1740 .rebuild_header = inet_sk_rebuild_header,
1741 .sk_rx_dst_set = inet_sk_rx_dst_set,
1742 .conn_request = tcp_v4_conn_request,
1743 .syn_recv_sock = tcp_v4_syn_recv_sock,
1744 .net_header_len = sizeof(struct iphdr),
1745 .setsockopt = ip_setsockopt,
1746 .getsockopt = ip_getsockopt,
1747 .addr2sockaddr = inet_csk_addr2sockaddr,
1748 .sockaddr_len = sizeof(struct sockaddr_in),
1749 .bind_conflict = inet_csk_bind_conflict,
1750 #ifdef CONFIG_COMPAT
1751 .compat_setsockopt = compat_ip_setsockopt,
1752 .compat_getsockopt = compat_ip_getsockopt,
1753 #endif
1754 .mtu_reduced = tcp_v4_mtu_reduced,
1755 };
1756 EXPORT_SYMBOL(ipv4_specific);
1757
1758 #ifdef CONFIG_TCP_MD5SIG
1759 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1760 .md5_lookup = tcp_v4_md5_lookup,
1761 .calc_md5_hash = tcp_v4_md5_hash_skb,
1762 .md5_parse = tcp_v4_parse_md5_keys,
1763 };
1764 #endif
1765
1766 /* NOTE: A lot of things are set to zero explicitly by the call to
1767 * sk_alloc(), so they need not be done here.
1768 */
1769 static int tcp_v4_init_sock(struct sock *sk)
1770 {
1771 struct inet_connection_sock *icsk = inet_csk(sk);
1772
1773 tcp_init_sock(sk);
1774
1775 icsk->icsk_af_ops = &ipv4_specific;
1776
1777 #ifdef CONFIG_TCP_MD5SIG
1778 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1779 #endif
1780
1781 return 0;
1782 }
1783
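/* Final per-socket cleanup: stop timers, purge queues, drop MD5 keys and
 * release the bound port.
 */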
1784 void tcp_v4_destroy_sock(struct sock *sk)
1785 {
1786 struct tcp_sock *tp = tcp_sk(sk);
1787
1788 tcp_clear_xmit_timers(sk);
1789
1790 tcp_cleanup_congestion_control(sk);
1791
1792 /* Clean up the write buffer. */
1793 tcp_write_queue_purge(sk);
1794
1795 /* Cleans up our, hopefully empty, out_of_order_queue. */
1796 __skb_queue_purge(&tp->out_of_order_queue);
1797
1798 #ifdef CONFIG_TCP_MD5SIG
1799 /* Clean up the MD5 key list, if any */
1800 if (tp->md5sig_info) {
1801 tcp_clear_md5_list(sk);
1802 kfree_rcu(tp->md5sig_info, rcu);
1803 tp->md5sig_info = NULL;
1804 }
1805 #endif
1806
1807 /* Clean the prequeue; it really must be empty */
1808 __skb_queue_purge(&tp->ucopy.prequeue);
1809
1810 /* Clean up a referenced TCP bind bucket. */
1811 if (inet_csk(sk)->icsk_bind_hash)
1812 inet_put_port(sk);
1813
1814 BUG_ON(tp->fastopen_rsk);
1815
1816 /* If socket is aborted during connect operation */
1817 tcp_free_fastopen_req(tp);
1818 tcp_saved_syn_free(tp);
1819
1820 sk_sockets_allocated_dec(sk);
1821 sock_release_memcg(sk);
1822 }
1823 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1824
1825 #ifdef CONFIG_PROC_FS
1826 /* Proc filesystem TCP sock list dumping. */
1827
1828 /*
1829 * Get the next listener socket following cur. If cur is NULL, get the first
1830 * socket starting from the bucket given in st->bucket; when st->bucket is
1831 * zero the very first socket in the hash table is returned.
1832 */
1833 static void *listening_get_next(struct seq_file *seq, void *cur)
1834 {
1835 struct inet_connection_sock *icsk;
1836 struct hlist_nulls_node *node;
1837 struct sock *sk = cur;
1838 struct inet_listen_hashbucket *ilb;
1839 struct tcp_iter_state *st = seq->private;
1840 struct net *net = seq_file_net(seq);
1841
1842 if (!sk) {
1843 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1844 spin_lock_bh(&ilb->lock);
1845 sk = sk_nulls_head(&ilb->head);
1846 st->offset = 0;
1847 goto get_sk;
1848 }
1849 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1850 ++st->num;
1851 ++st->offset;
1852
1853 sk = sk_nulls_next(sk);
1854 get_sk:
1855 sk_nulls_for_each_from(sk, node) {
1856 if (!net_eq(sock_net(sk), net))
1857 continue;
1858 if (sk->sk_family == st->family) {
1859 cur = sk;
1860 goto out;
1861 }
1862 icsk = inet_csk(sk);
1863 }
1864 spin_unlock_bh(&ilb->lock);
1865 st->offset = 0;
1866 if (++st->bucket < INET_LHTABLE_SIZE) {
1867 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1868 spin_lock_bh(&ilb->lock);
1869 sk = sk_nulls_head(&ilb->head);
1870 goto get_sk;
1871 }
1872 cur = NULL;
1873 out:
1874 return cur;
1875 }
1876
1877 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1878 {
1879 struct tcp_iter_state *st = seq->private;
1880 void *rc;
1881
1882 st->bucket = 0;
1883 st->offset = 0;
1884 rc = listening_get_next(seq, NULL);
1885
1886 while (rc && *pos) {
1887 rc = listening_get_next(seq, rc);
1888 --*pos;
1889 }
1890 return rc;
1891 }
1892
1893 static inline bool empty_bucket(const struct tcp_iter_state *st)
1894 {
1895 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1896 }
1897
1898 /*
1899 * Get first established socket starting from bucket given in st->bucket.
1900 * If st->bucket is zero, the very first socket in the hash is returned.
1901 */
1902 static void *established_get_first(struct seq_file *seq)
1903 {
1904 struct tcp_iter_state *st = seq->private;
1905 struct net *net = seq_file_net(seq);
1906 void *rc = NULL;
1907
1908 st->offset = 0;
1909 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1910 struct sock *sk;
1911 struct hlist_nulls_node *node;
1912 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1913
1914 /* Lockless fast path for the common case of empty buckets */
1915 if (empty_bucket(st))
1916 continue;
1917
1918 spin_lock_bh(lock);
1919 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1920 if (sk->sk_family != st->family ||
1921 !net_eq(sock_net(sk), net)) {
1922 continue;
1923 }
1924 rc = sk;
1925 goto out;
1926 }
1927 spin_unlock_bh(lock);
1928 }
1929 out:
1930 return rc;
1931 }
1932
1933 static void *established_get_next(struct seq_file *seq, void *cur)
1934 {
1935 struct sock *sk = cur;
1936 struct hlist_nulls_node *node;
1937 struct tcp_iter_state *st = seq->private;
1938 struct net *net = seq_file_net(seq);
1939
1940 ++st->num;
1941 ++st->offset;
1942
1943 sk = sk_nulls_next(sk);
1944
1945 sk_nulls_for_each_from(sk, node) {
1946 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1947 return sk;
1948 }
1949
1950 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1951 ++st->bucket;
1952 return established_get_first(seq);
1953 }
1954
1955 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1956 {
1957 struct tcp_iter_state *st = seq->private;
1958 void *rc;
1959
1960 st->bucket = 0;
1961 rc = established_get_first(seq);
1962
1963 while (rc && pos) {
1964 rc = established_get_next(seq, rc);
1965 --pos;
1966 }
1967 return rc;
1968 }
1969
1970 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1971 {
1972 void *rc;
1973 struct tcp_iter_state *st = seq->private;
1974
1975 st->state = TCP_SEQ_STATE_LISTENING;
1976 rc = listening_get_idx(seq, &pos);
1977
1978 if (!rc) {
1979 st->state = TCP_SEQ_STATE_ESTABLISHED;
1980 rc = established_get_idx(seq, pos);
1981 }
1982
1983 return rc;
1984 }
1985
1986 static void *tcp_seek_last_pos(struct seq_file *seq)
1987 {
1988 struct tcp_iter_state *st = seq->private;
1989 int offset = st->offset;
1990 int orig_num = st->num;
1991 void *rc = NULL;
1992
1993 switch (st->state) {
1994 case TCP_SEQ_STATE_LISTENING:
1995 if (st->bucket >= INET_LHTABLE_SIZE)
1996 break;
1997 st->state = TCP_SEQ_STATE_LISTENING;
1998 rc = listening_get_next(seq, NULL);
1999 while (offset-- && rc)
2000 rc = listening_get_next(seq, rc);
2001 if (rc)
2002 break;
2003 st->bucket = 0;
2004 st->state = TCP_SEQ_STATE_ESTABLISHED;
2005 /* Fallthrough */
2006 case TCP_SEQ_STATE_ESTABLISHED:
2007 if (st->bucket > tcp_hashinfo.ehash_mask)
2008 break;
2009 rc = established_get_first(seq);
2010 while (offset-- && rc)
2011 rc = established_get_next(seq, rc);
2012 }
2013
2014 st->num = orig_num;
2015
2016 return rc;
2017 }
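/*
 * tcp_seek_last_pos() is an optimization for large tables: when a read()
 * resumes at the position where the previous chunk stopped (st->last_pos),
 * the walk restarts from the remembered bucket and in-bucket offset instead
 * of rescanning every bucket from zero.  st->num is restored afterwards
 * because only the offset within the current bucket was consumed, not the
 * global entry count.
 */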
2018
2019 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2020 {
2021 struct tcp_iter_state *st = seq->private;
2022 void *rc;
2023
2024 if (*pos && *pos == st->last_pos) {
2025 rc = tcp_seek_last_pos(seq);
2026 if (rc)
2027 goto out;
2028 }
2029
2030 st->state = TCP_SEQ_STATE_LISTENING;
2031 st->num = 0;
2032 st->bucket = 0;
2033 st->offset = 0;
2034 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2035
2036 out:
2037 st->last_pos = *pos;
2038 return rc;
2039 }
2040
2041 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2042 {
2043 struct tcp_iter_state *st = seq->private;
2044 void *rc = NULL;
2045
2046 if (v == SEQ_START_TOKEN) {
2047 rc = tcp_get_idx(seq, 0);
2048 goto out;
2049 }
2050
2051 switch (st->state) {
2052 case TCP_SEQ_STATE_LISTENING:
2053 rc = listening_get_next(seq, v);
2054 if (!rc) {
2055 st->state = TCP_SEQ_STATE_ESTABLISHED;
2056 st->bucket = 0;
2057 st->offset = 0;
2058 rc = established_get_first(seq);
2059 }
2060 break;
2061 case TCP_SEQ_STATE_ESTABLISHED:
2062 rc = established_get_next(seq, v);
2063 break;
2064 }
2065 out:
2066 ++*pos;
2067 st->last_pos = *pos;
2068 return rc;
2069 }
2070
2071 static void tcp_seq_stop(struct seq_file *seq, void *v)
2072 {
2073 struct tcp_iter_state *st = seq->private;
2074
2075 switch (st->state) {
2076 case TCP_SEQ_STATE_LISTENING:
2077 if (v != SEQ_START_TOKEN)
2078 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2079 break;
2080 case TCP_SEQ_STATE_ESTABLISHED:
2081 if (v)
2082 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2083 break;
2084 }
2085 }
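/*
 * tcp_seq_start()/tcp_seq_next()/tcp_seq_stop() implement the standard
 * seq_file iterator contract for /proc/net/tcp (and the IPv6 variant, which
 * reuses the same hooks).  Whatever bucket lock ->start or ->next left held
 * for the entry being shown is released here, keyed off st->state and
 * st->bucket, so no lock is held between chunks of a single read().
 */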
2086
2087 int tcp_seq_open(struct inode *inode, struct file *file)
2088 {
2089 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2090 struct tcp_iter_state *s;
2091 int err;
2092
2093 err = seq_open_net(inode, file, &afinfo->seq_ops,
2094 sizeof(struct tcp_iter_state));
2095 if (err < 0)
2096 return err;
2097
2098 s = ((struct seq_file *)file->private_data)->private;
2099 s->family = afinfo->family;
2100 s->last_pos = 0;
2101 return 0;
2102 }
2103 EXPORT_SYMBOL(tcp_seq_open);
2104
2105 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2106 {
2107 int rc = 0;
2108 struct proc_dir_entry *p;
2109
2110 afinfo->seq_ops.start = tcp_seq_start;
2111 afinfo->seq_ops.next = tcp_seq_next;
2112 afinfo->seq_ops.stop = tcp_seq_stop;
2113
2114 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2115 afinfo->seq_fops, afinfo);
2116 if (!p)
2117 rc = -ENOMEM;
2118 return rc;
2119 }
2120 EXPORT_SYMBOL(tcp_proc_register);
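/*
 * tcp_proc_register()/tcp_proc_unregister() are shared with the IPv6 side:
 * tcp_ipv6.c registers its own tcp_seq_afinfo (name "tcp6", family
 * AF_INET6) through the same pair, which is why the seq_ops start/next/stop
 * hooks are filled in here rather than in the per-family afinfo
 * definitions.
 */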
2121
2122 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2123 {
2124 remove_proc_entry(afinfo->name, net->proc_net);
2125 }
2126 EXPORT_SYMBOL(tcp_proc_unregister);
2127
2128 static void get_openreq4(const struct request_sock *req,
2129 struct seq_file *f, int i)
2130 {
2131 const struct inet_request_sock *ireq = inet_rsk(req);
2132 long delta = req->rsk_timer.expires - jiffies;
2133
2134 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2135 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2136 i,
2137 ireq->ir_loc_addr,
2138 ireq->ir_num,
2139 ireq->ir_rmt_addr,
2140 ntohs(ireq->ir_rmt_port),
2141 TCP_SYN_RECV,
2142 0, 0, /* could print option size, but that is af dependent. */
2143 1, /* timers active (only the expire timer) */
2144 jiffies_delta_to_clock_t(delta),
2145 req->num_timeout,
2146 from_kuid_munged(seq_user_ns(f),
2147 sock_i_uid(req->rsk_listener)),
2148 0, /* non standard timer */
2149 0, /* open_requests have no inode */
2150 0,
2151 req);
2152 }
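/*
 * get_openreq4() prints one row per embryonic connection (a request_sock
 * still in TCP_NEW_SYN_RECV): the state column is reported as TCP_SYN_RECV
 * for user-space compatibility, the queues are shown as 0:0, and the only
 * timer is the SYN-ACK retransmit/expire timer, hence the hard-coded
 * "timers active" value of 1.
 */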
2153
2154 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2155 {
2156 int timer_active;
2157 unsigned long timer_expires;
2158 const struct tcp_sock *tp = tcp_sk(sk);
2159 const struct inet_connection_sock *icsk = inet_csk(sk);
2160 const struct inet_sock *inet = inet_sk(sk);
2161 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2162 __be32 dest = inet->inet_daddr;
2163 __be32 src = inet->inet_rcv_saddr;
2164 __u16 destp = ntohs(inet->inet_dport);
2165 __u16 srcp = ntohs(inet->inet_sport);
2166 int rx_queue;
2167 int state;
2168
2169 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2170 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2171 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2172 timer_active = 1;
2173 timer_expires = icsk->icsk_timeout;
2174 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2175 timer_active = 4;
2176 timer_expires = icsk->icsk_timeout;
2177 } else if (timer_pending(&sk->sk_timer)) {
2178 timer_active = 2;
2179 timer_expires = sk->sk_timer.expires;
2180 } else {
2181 timer_active = 0;
2182 timer_expires = jiffies;
2183 }
2184
2185 state = sk_state_load(sk);
2186 if (state == TCP_LISTEN)
2187 rx_queue = sk->sk_ack_backlog;
2188 else
2189 /* Because we don't lock the socket,
2190 * we might find a transient negative value.
2191 */
2192 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2193
2194 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2195 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2196 i, src, srcp, dest, destp, state,
2197 tp->write_seq - tp->snd_una,
2198 rx_queue,
2199 timer_active,
2200 jiffies_delta_to_clock_t(timer_expires - jiffies),
2201 icsk->icsk_retransmits,
2202 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2203 icsk->icsk_probes_out,
2204 sock_i_ino(sk),
2205 atomic_read(&sk->sk_refcnt), sk,
2206 jiffies_to_clock_t(icsk->icsk_rto),
2207 jiffies_to_clock_t(icsk->icsk_ack.ato),
2208 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2209 tp->snd_cwnd,
2210 state == TCP_LISTEN ?
2211 fastopenq->max_qlen :
2212 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2213 }
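/*
 * The address columns above are raw %08X/%04X dumps of the __be32 address
 * and the host-order port, so on a little-endian box 127.0.0.1:80 shows up
 * as "0100007F:0050".  A minimal user-space decoder (not part of this file;
 * the function name is illustrative) could look like:
 *
 *	#include <stdio.h>
 *	#include <arpa/inet.h>
 *
 *	static void print_tcp4_addr(const char *hex)	// e.g. "0100007F:0050"
 *	{
 *		unsigned int addr, port;
 *		char buf[INET_ADDRSTRLEN];
 *
 *		if (sscanf(hex, "%x:%x", &addr, &port) != 2)
 *			return;
 *		// The address was printed as the raw __be32, so reading it
 *		// back as a native u32 on the same host reproduces the
 *		// network-byte-order value; the port is already host order.
 *		struct in_addr in = { .s_addr = addr };
 *		printf("%s:%u\n", inet_ntop(AF_INET, &in, buf, sizeof(buf)), port);
 *	}
 *
 *	int main(void)
 *	{
 *		print_tcp4_addr("0100007F:0050");	// prints 127.0.0.1:80
 *		return 0;
 *	}
 */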
2214
2215 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2216 struct seq_file *f, int i)
2217 {
2218 long delta = tw->tw_timer.expires - jiffies;
2219 __be32 dest, src;
2220 __u16 destp, srcp;
2221
2222 dest = tw->tw_daddr;
2223 src = tw->tw_rcv_saddr;
2224 destp = ntohs(tw->tw_dport);
2225 srcp = ntohs(tw->tw_sport);
2226
2227 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2228 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2229 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2230 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2231 atomic_read(&tw->tw_refcnt), tw);
2232 }
2233
2234 #define TMPSZ 150
2235
2236 static int tcp4_seq_show(struct seq_file *seq, void *v)
2237 {
2238 struct tcp_iter_state *st;
2239 struct sock *sk = v;
2240
2241 seq_setwidth(seq, TMPSZ - 1);
2242 if (v == SEQ_START_TOKEN) {
2243 seq_puts(seq, " sl local_address rem_address st tx_queue "
2244 "rx_queue tr tm->when retrnsmt uid timeout "
2245 "inode");
2246 goto out;
2247 }
2248 st = seq->private;
2249
2250 if (sk->sk_state == TCP_TIME_WAIT)
2251 get_timewait4_sock(v, seq, st->num);
2252 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2253 get_openreq4(v, seq, st->num);
2254 else
2255 get_tcp4_sock(v, seq, st->num);
2256 out:
2257 seq_pad(seq, '\n');
2258 return 0;
2259 }
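/*
 * Each line produced above is padded to TMPSZ - 1 characters
 * (seq_setwidth()/seq_pad()), so /proc/net/tcp is a fixed-width table that
 * is easy to consume line by line.  Illustrative user-space reader (not
 * part of this file):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *fp = fopen("/proc/net/tcp", "r");
 *
 *		if (!fp)
 *			return 1;
 *		while (fgets(line, sizeof(line), fp))
 *			fputs(line, stdout);	// header first, then one socket per line
 *		fclose(fp);
 *		return 0;
 *	}
 */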
2260
2261 static const struct file_operations tcp_afinfo_seq_fops = {
2262 .owner = THIS_MODULE,
2263 .open = tcp_seq_open,
2264 .read = seq_read,
2265 .llseek = seq_lseek,
2266 .release = seq_release_net
2267 };
2268
2269 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2270 .name = "tcp",
2271 .family = AF_INET,
2272 .seq_fops = &tcp_afinfo_seq_fops,
2273 .seq_ops = {
2274 .show = tcp4_seq_show,
2275 },
2276 };
2277
2278 static int __net_init tcp4_proc_init_net(struct net *net)
2279 {
2280 return tcp_proc_register(net, &tcp4_seq_afinfo);
2281 }
2282
2283 static void __net_exit tcp4_proc_exit_net(struct net *net)
2284 {
2285 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2286 }
2287
2288 static struct pernet_operations tcp4_net_ops = {
2289 .init = tcp4_proc_init_net,
2290 .exit = tcp4_proc_exit_net,
2291 };
2292
2293 int __init tcp4_proc_init(void)
2294 {
2295 return register_pernet_subsys(&tcp4_net_ops);
2296 }
2297
2298 void tcp4_proc_exit(void)
2299 {
2300 unregister_pernet_subsys(&tcp4_net_ops);
2301 }
2302 #endif /* CONFIG_PROC_FS */
2303
2304 struct proto tcp_prot = {
2305 .name = "TCP",
2306 .owner = THIS_MODULE,
2307 .close = tcp_close,
2308 .connect = tcp_v4_connect,
2309 .disconnect = tcp_disconnect,
2310 .accept = inet_csk_accept,
2311 .ioctl = tcp_ioctl,
2312 .init = tcp_v4_init_sock,
2313 .destroy = tcp_v4_destroy_sock,
2314 .shutdown = tcp_shutdown,
2315 .setsockopt = tcp_setsockopt,
2316 .getsockopt = tcp_getsockopt,
2317 .recvmsg = tcp_recvmsg,
2318 .sendmsg = tcp_sendmsg,
2319 .sendpage = tcp_sendpage,
2320 .backlog_rcv = tcp_v4_do_rcv,
2321 .release_cb = tcp_release_cb,
2322 .hash = inet_hash,
2323 .unhash = inet_unhash,
2324 .get_port = inet_csk_get_port,
2325 .enter_memory_pressure = tcp_enter_memory_pressure,
2326 .stream_memory_free = tcp_stream_memory_free,
2327 .sockets_allocated = &tcp_sockets_allocated,
2328 .orphan_count = &tcp_orphan_count,
2329 .memory_allocated = &tcp_memory_allocated,
2330 .memory_pressure = &tcp_memory_pressure,
2331 .sysctl_mem = sysctl_tcp_mem,
2332 .sysctl_wmem = sysctl_tcp_wmem,
2333 .sysctl_rmem = sysctl_tcp_rmem,
2334 .max_header = MAX_TCP_HEADER,
2335 .obj_size = sizeof(struct tcp_sock),
2336 .slab_flags = SLAB_DESTROY_BY_RCU,
2337 .twsk_prot = &tcp_timewait_sock_ops,
2338 .rsk_prot = &tcp_request_sock_ops,
2339 .h.hashinfo = &tcp_hashinfo,
2340 .no_autobind = true,
2341 #ifdef CONFIG_COMPAT
2342 .compat_setsockopt = compat_tcp_setsockopt,
2343 .compat_getsockopt = compat_tcp_getsockopt,
2344 #endif
2345 #ifdef CONFIG_MEMCG_KMEM
2346 .init_cgroup = tcp_init_cgroup,
2347 .destroy_cgroup = tcp_destroy_cgroup,
2348 .proto_cgroup = tcp_proto_cgroup,
2349 #endif
2350 .diag_destroy = tcp_abort,
2351 };
2352 EXPORT_SYMBOL(tcp_prot);
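/*
 * tcp_prot is not registered here: inet_init() in af_inet.c calls
 * proto_register(&tcp_prot, 1) and wires it into the inetsw[] table as the
 * SOCK_STREAM/IPPROTO_TCP entry, which is what routes
 * socket(AF_INET, SOCK_STREAM, 0) to the handlers above.
 */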
2353
2354 static void __net_exit tcp_sk_exit(struct net *net)
2355 {
2356 int cpu;
2357
2358 for_each_possible_cpu(cpu)
2359 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2360 free_percpu(net->ipv4.tcp_sk);
2361 }
2362
2363 static int __net_init tcp_sk_init(struct net *net)
2364 {
2365 int res, cpu;
2366
2367 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2368 if (!net->ipv4.tcp_sk)
2369 return -ENOMEM;
2370
2371 for_each_possible_cpu(cpu) {
2372 struct sock *sk;
2373
2374 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2375 IPPROTO_TCP, net);
2376 if (res)
2377 goto fail;
2378 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2379 }
2380
2381 net->ipv4.sysctl_tcp_ecn = 2;
2382 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2383
2384 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2385 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2386 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2387
2388 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2389 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2390 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2391
2392 return 0;
2393 fail:
2394 tcp_sk_exit(net);
2395
2396 return res;
2397 }
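/*
 * The per-cpu control sockets allocated above are kernel-internal raw
 * sockets used for replies that are not tied to a full socket, e.g. the RST
 * and ACK packets built in tcp_v4_send_reset()/tcp_v4_send_ack().  The
 * defaults initialized here back the per-namespace knobs exposed under
 * /proc/sys/net/ipv4/ (tcp_ecn, tcp_base_mss, tcp_keepalive_time, ...).
 */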
2398
2399 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2400 {
2401 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2402 }
2403
2404 static struct pernet_operations __net_initdata tcp_sk_ops = {
2405 .init = tcp_sk_init,
2406 .exit = tcp_sk_exit,
2407 .exit_batch = tcp_sk_exit_batch,
2408 };
2409
2410 void __init tcp_v4_init(void)
2411 {
2412 inet_hashinfo_init(&tcp_hashinfo);
2413 if (register_pernet_subsys(&tcp_sk_ops))
2414 panic("Failed to create the TCP control socket.\n");
2415 }
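/*
 * tcp_v4_init() runs once at boot from inet_init(); failure to set up the
 * per-namespace control sockets is treated as fatal because TCP cannot
 * operate without them, hence the panic() rather than an error return.
 */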