net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
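/* Generate the initial sequence number for a connection, keyed on the
 * address/port 4-tuple taken from the incoming skb. */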
100 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 ip_hdr(skb)->saddr,
104 tcp_hdr(skb)->dest,
105 tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112
113 /* With PAWS, it is safe from the viewpoint
114 of data integrity. Even without PAWS it is safe provided sequence
115 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117 Actually, the idea is close to VJ's, only the timestamp cache is
118 held not per host but per port pair, and the TW bucket is used as the
119 state holder.
120 
121 If the TW bucket has already been destroyed we fall back to VJ's scheme
122 and use the initial timestamp retrieved from the peer table.
123 */
124 if (tcptw->tw_ts_recent_stamp &&
125 (!twp || (sysctl_tcp_tw_reuse &&
126 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 if (tp->write_seq == 0)
129 tp->write_seq = 1;
130 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
131 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 sock_hold(sktw);
133 return 1;
134 }
135
136 return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 struct inet_sock *inet = inet_sk(sk);
145 struct tcp_sock *tp = tcp_sk(sk);
146 __be16 orig_sport, orig_dport;
147 __be32 daddr, nexthop;
148 struct flowi4 *fl4;
149 struct rtable *rt;
150 int err;
151 struct ip_options_rcu *inet_opt;
152
153 if (addr_len < sizeof(struct sockaddr_in))
154 return -EINVAL;
155
156 if (usin->sin_family != AF_INET)
157 return -EAFNOSUPPORT;
158
159 nexthop = daddr = usin->sin_addr.s_addr;
160 inet_opt = rcu_dereference_protected(inet->inet_opt,
161 sock_owned_by_user(sk));
162 if (inet_opt && inet_opt->opt.srr) {
163 if (!daddr)
164 return -EINVAL;
165 nexthop = inet_opt->opt.faddr;
166 }
167
168 orig_sport = inet->inet_sport;
169 orig_dport = usin->sin_port;
170 fl4 = &inet->cork.fl.u.ip4;
171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 IPPROTO_TCP,
174 orig_sport, orig_dport, sk);
175 if (IS_ERR(rt)) {
176 err = PTR_ERR(rt);
177 if (err == -ENETUNREACH)
178 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179 return err;
180 }
181
182 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183 ip_rt_put(rt);
184 return -ENETUNREACH;
185 }
186
187 if (!inet_opt || !inet_opt->opt.srr)
188 daddr = fl4->daddr;
189
190 if (!inet->inet_saddr)
191 inet->inet_saddr = fl4->saddr;
192 sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 /* Reset inherited state */
196 tp->rx_opt.ts_recent = 0;
197 tp->rx_opt.ts_recent_stamp = 0;
198 if (likely(!tp->repair))
199 tp->write_seq = 0;
200 }
201
202 if (tcp_death_row.sysctl_tw_recycle &&
203 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206 inet->inet_dport = usin->sin_port;
207 sk_daddr_set(sk, daddr);
208
209 inet_csk(sk)->icsk_ext_hdr_len = 0;
210 if (inet_opt)
211 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215 /* Socket identity is still unknown (sport may be zero).
216 * However we set the state to SYN-SENT and, without releasing the
217 * socket lock, select a source port, enter ourselves into the hash
218 * tables and complete initialization after this.
219 */
220 tcp_set_state(sk, TCP_SYN_SENT);
221 err = inet_hash_connect(&tcp_death_row, sk);
222 if (err)
223 goto failure;
224
225 sk_set_txhash(sk);
226
227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 inet->inet_sport, inet->inet_dport, sk);
229 if (IS_ERR(rt)) {
230 err = PTR_ERR(rt);
231 rt = NULL;
232 goto failure;
233 }
234 /* OK, now commit destination to socket. */
235 sk->sk_gso_type = SKB_GSO_TCPV4;
236 sk_setup_caps(sk, &rt->dst);
237
238 if (!tp->write_seq && likely(!tp->repair))
239 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 inet->inet_daddr,
241 inet->inet_sport,
242 usin->sin_port);
243
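/* Seed the IP identification counter from the initial sequence number,
 * mixed with jiffies. */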
244 inet->inet_id = tp->write_seq ^ jiffies;
245
246 err = tcp_connect(sk);
247
248 rt = NULL;
249 if (err)
250 goto failure;
251
252 return 0;
253
254 failure:
255 /*
256 * This unhashes the socket and releases the local port,
257 * if necessary.
258 */
259 tcp_set_state(sk, TCP_CLOSE);
260 ip_rt_put(rt);
261 sk->sk_route_caps = 0;
262 inet->inet_dport = 0;
263 return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266
267 /*
268 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269 * It can be called through tcp_release_cb() if socket was owned by user
270 * at the time tcp_v4_err() was called to handle ICMP message.
271 */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 struct dst_entry *dst;
275 struct inet_sock *inet = inet_sk(sk);
276 u32 mtu = tcp_sk(sk)->mtu_info;
277
278 dst = inet_csk_update_pmtu(sk, mtu);
279 if (!dst)
280 return;
281
282 /* Something is about to go wrong... Remember the soft error
283 * in case this connection is not able to recover.
284 */
285 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 sk->sk_err_soft = EMSGSIZE;
287
288 mtu = dst_mtu(dst);
289
290 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 ip_sk_accept_pmtu(sk) &&
292 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 tcp_sync_mss(sk, mtu);
294
295 /* Resend the TCP packet because it's
296 * clear that the old packet has been
297 * dropped. This is the new "fast" path mtu
298 * discovery.
299 */
300 tcp_simple_retransmit(sk);
301 } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304
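/* Handle an ICMP redirect: if we still hold a cached route for this
 * socket, let it update itself. */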
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309 if (dst)
310 dst->ops->redirect(dst, sk, skb);
311 }
312
313
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq)
316 {
317 struct request_sock *req = inet_reqsk(sk);
318 struct net *net = sock_net(sk);
319
320 /* ICMPs are not backlogged, hence we cannot get
321 * an established socket here.
322 */
323 WARN_ON(req->sk);
324
325 if (seq != tcp_rsk(req)->snt_isn) {
326 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327 } else {
328 /*
329 * Still in SYN_RECV, just remove it silently.
330 * There is no good way to pass the error to the newly
331 * created socket, and POSIX does not want network
332 * errors returned from accept().
333 */
334 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
335 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
336 }
337 reqsk_put(req);
338 }
339 EXPORT_SYMBOL(tcp_req_err);
340
341 /*
342 * This routine is called by the ICMP module when it gets some
343 * sort of error condition. If err < 0 then the socket should
344 * be closed and the error returned to the user. If err > 0
345 * it's just the icmp type << 8 | icmp code. After adjustment the
346 * header points to the first 8 bytes of the tcp header. We need
347 * to find the appropriate port.
348 *
349 * The locking strategy used here is very "optimistic". When
350 * someone else accesses the socket the ICMP is just dropped
351 * and for some paths there is no check at all.
352 * A more general error queue to queue errors for later handling
353 * is probably better.
354 *
355 */
356
357 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
358 {
359 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
360 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
361 struct inet_connection_sock *icsk;
362 struct tcp_sock *tp;
363 struct inet_sock *inet;
364 const int type = icmp_hdr(icmp_skb)->type;
365 const int code = icmp_hdr(icmp_skb)->code;
366 struct sock *sk;
367 struct sk_buff *skb;
368 struct request_sock *fastopen;
369 __u32 seq, snd_una;
370 __u32 remaining;
371 int err;
372 struct net *net = dev_net(icmp_skb->dev);
373
374 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
375 th->dest, iph->saddr, ntohs(th->source),
376 inet_iif(icmp_skb));
377 if (!sk) {
378 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379 return;
380 }
381 if (sk->sk_state == TCP_TIME_WAIT) {
382 inet_twsk_put(inet_twsk(sk));
383 return;
384 }
385 seq = ntohl(th->seq);
386 if (sk->sk_state == TCP_NEW_SYN_RECV)
387 return tcp_req_err(sk, seq);
388
389 bh_lock_sock(sk);
390 /* If too many ICMPs get dropped on busy
391 * servers this needs to be solved differently.
392 * We do take care of the PMTU discovery (RFC1191) special case:
393 * we can receive locally generated ICMP messages while the socket is held.
394 */
395 if (sock_owned_by_user(sk)) {
396 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
397 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
398 }
399 if (sk->sk_state == TCP_CLOSE)
400 goto out;
401
402 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
403 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
404 goto out;
405 }
406
407 icsk = inet_csk(sk);
408 tp = tcp_sk(sk);
409 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
410 fastopen = tp->fastopen_rsk;
411 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
412 if (sk->sk_state != TCP_LISTEN &&
413 !between(seq, snd_una, tp->snd_nxt)) {
414 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
415 goto out;
416 }
417
418 switch (type) {
419 case ICMP_REDIRECT:
420 do_redirect(icmp_skb, sk);
421 goto out;
422 case ICMP_SOURCE_QUENCH:
423 /* Just silently ignore these. */
424 goto out;
425 case ICMP_PARAMETERPROB:
426 err = EPROTO;
427 break;
428 case ICMP_DEST_UNREACH:
429 if (code > NR_ICMP_UNREACH)
430 goto out;
431
432 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
433 /* We are not interested in TCP_LISTEN and open_requests
434 * (SYN-ACKs sent out by Linux are always < 576 bytes so
435 * they should go through unfragmented).
436 */
437 if (sk->sk_state == TCP_LISTEN)
438 goto out;
439
440 tp->mtu_info = info;
441 if (!sock_owned_by_user(sk)) {
442 tcp_v4_mtu_reduced(sk);
443 } else {
444 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
445 sock_hold(sk);
446 }
447 goto out;
448 }
449
450 err = icmp_err_convert[code].errno;
451 /* check if icmp_skb allows revert of backoff
452 * (see draft-zimmermann-tcp-lcd) */
453 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
454 break;
455 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
456 !icsk->icsk_backoff || fastopen)
457 break;
458
459 if (sock_owned_by_user(sk))
460 break;
461
462 icsk->icsk_backoff--;
463 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
464 TCP_TIMEOUT_INIT;
465 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
466
467 skb = tcp_write_queue_head(sk);
468 BUG_ON(!skb);
469
470 remaining = icsk->icsk_rto -
471 min(icsk->icsk_rto,
472 tcp_time_stamp - tcp_skb_timestamp(skb));
473
474 if (remaining) {
475 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
476 remaining, TCP_RTO_MAX);
477 } else {
478 /* RTO revert clocked out retransmission.
479 * Will retransmit now */
480 tcp_retransmit_timer(sk);
481 }
482
483 break;
484 case ICMP_TIME_EXCEEDED:
485 err = EHOSTUNREACH;
486 break;
487 default:
488 goto out;
489 }
490
491 switch (sk->sk_state) {
492 case TCP_SYN_SENT:
493 case TCP_SYN_RECV:
494 /* Only in fast or simultaneous open. If a fast open socket is
495 * already accepted it is treated as a connected one below.
496 */
497 if (fastopen && !fastopen->sk)
498 break;
499
500 if (!sock_owned_by_user(sk)) {
501 sk->sk_err = err;
502
503 sk->sk_error_report(sk);
504
505 tcp_done(sk);
506 } else {
507 sk->sk_err_soft = err;
508 }
509 goto out;
510 }
511
512 /* If we've already connected we will keep trying
513 * until we time out, or the user gives up.
514 *
515 * rfc1122 4.2.3.9 allows us to consider as hard errors
516 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
517 * but it is obsoleted by pmtu discovery).
518 *
519 * Note that in the modern internet, where routing is unreliable
520 * and broken firewalls sit in every dark corner, sending random
521 * errors ordered by their masters, even these two messages finally lose
522 * their original sense (even Linux sends invalid PORT_UNREACHs).
523 *
524 * Now we are in compliance with RFCs.
525 * --ANK (980905)
526 */
527
528 inet = inet_sk(sk);
529 if (!sock_owned_by_user(sk) && inet->recverr) {
530 sk->sk_err = err;
531 sk->sk_error_report(sk);
532 } else { /* Only an error on timeout */
533 sk->sk_err_soft = err;
534 }
535
536 out:
537 bh_unlock_sock(sk);
538 sock_put(sk);
539 }
540
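/* Fill in the TCP checksum of an outgoing skb: either set up the partial
 * (pseudo-header) checksum for hardware offload, or compute the full
 * checksum in software. */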
541 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
542 {
543 struct tcphdr *th = tcp_hdr(skb);
544
545 if (skb->ip_summed == CHECKSUM_PARTIAL) {
546 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
547 skb->csum_start = skb_transport_header(skb) - skb->head;
548 skb->csum_offset = offsetof(struct tcphdr, check);
549 } else {
550 th->check = tcp_v4_check(skb->len, saddr, daddr,
551 csum_partial(th,
552 th->doff << 2,
553 skb->csum));
554 }
555 }
556
557 /* This routine computes an IPv4 TCP checksum. */
558 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
559 {
560 const struct inet_sock *inet = inet_sk(sk);
561
562 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
563 }
564 EXPORT_SYMBOL(tcp_v4_send_check);
565
566 /*
567 * This routine will send an RST to the other tcp.
568 *
569 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
570 * for the reset?
571 * Answer: if a packet caused an RST, it is not for a socket
572 * existing in our system; if it is matched to a socket,
573 * it is just a duplicate segment or a bug in the other side's TCP.
574 * So we build the reply based only on the parameters
575 * that arrived with the segment.
576 * Exception: precedence violation. We do not implement it in any case.
577 */
578
579 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
580 {
581 const struct tcphdr *th = tcp_hdr(skb);
582 struct {
583 struct tcphdr th;
584 #ifdef CONFIG_TCP_MD5SIG
585 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
586 #endif
587 } rep;
588 struct ip_reply_arg arg;
589 #ifdef CONFIG_TCP_MD5SIG
590 struct tcp_md5sig_key *key = NULL;
591 const __u8 *hash_location = NULL;
592 unsigned char newhash[16];
593 int genhash;
594 struct sock *sk1 = NULL;
595 #endif
596 struct net *net;
597
598 /* Never send a reset in response to a reset. */
599 if (th->rst)
600 return;
601
602 /* If sk is not NULL, it means we did a successful lookup and the incoming
603 * route had to be correct. prequeue might have dropped our dst.
604 */
605 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
606 return;
607
608 /* Swap the send and the receive. */
609 memset(&rep, 0, sizeof(rep));
610 rep.th.dest = th->source;
611 rep.th.source = th->dest;
612 rep.th.doff = sizeof(struct tcphdr) / 4;
613 rep.th.rst = 1;
614
615 if (th->ack) {
616 rep.th.seq = th->ack_seq;
617 } else {
618 rep.th.ack = 1;
619 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620 skb->len - (th->doff << 2));
621 }
622
623 memset(&arg, 0, sizeof(arg));
624 arg.iov[0].iov_base = (unsigned char *)&rep;
625 arg.iov[0].iov_len = sizeof(rep.th);
626
627 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
628 #ifdef CONFIG_TCP_MD5SIG
629 hash_location = tcp_parse_md5sig_option(th);
630 if (sk && sk_fullsock(sk)) {
631 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
632 &ip_hdr(skb)->saddr, AF_INET);
633 } else if (hash_location) {
634 /*
635 * The active side is lost. Try to find the listening socket through the
636 * source port, and then find the md5 key through the listening socket.
637 * We do not lose security here:
638 * the incoming packet is checked against the md5 hash of the key we find,
639 * and no RST is generated if the md5 hash doesn't match.
640 */
641 sk1 = __inet_lookup_listener(net,
642 &tcp_hashinfo, ip_hdr(skb)->saddr,
643 th->source, ip_hdr(skb)->daddr,
644 ntohs(th->source), inet_iif(skb));
645 /* don't send rst if it can't find key */
646 if (!sk1)
647 return;
648 rcu_read_lock();
649 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650 &ip_hdr(skb)->saddr, AF_INET);
651 if (!key)
652 goto release_sk1;
653
654 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655 if (genhash || memcmp(hash_location, newhash, 16) != 0)
656 goto release_sk1;
657 }
658
659 if (key) {
660 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
661 (TCPOPT_NOP << 16) |
662 (TCPOPT_MD5SIG << 8) |
663 TCPOLEN_MD5SIG);
664 /* Update length and the length the header thinks exists */
665 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
666 rep.th.doff = arg.iov[0].iov_len / 4;
667
668 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
669 key, ip_hdr(skb)->saddr,
670 ip_hdr(skb)->daddr, &rep.th);
671 }
672 #endif
673 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
674 ip_hdr(skb)->saddr, /* XXX */
675 arg.iov[0].iov_len, IPPROTO_TCP, 0);
676 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
677 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
678
679 /* When the socket is gone, all binding information is lost, so
680 * routing might fail in this case. No choice here: if we choose to force the
681 * input interface, we will misroute in the case of an asymmetric route.
682 */
683 if (sk)
684 arg.bound_dev_if = sk->sk_bound_dev_if;
685
686 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
687 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
688
689 arg.tos = ip_hdr(skb)->tos;
690 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
691 skb, &TCP_SKB_CB(skb)->header.h4.opt,
692 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
693 &arg, arg.iov[0].iov_len);
694
695 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700 if (sk1) {
701 rcu_read_unlock();
702 sock_put(sk1);
703 }
704 #endif
705 }
706
707 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
708 outside of socket context, is certainly ugly. What can I do?
709 */
710
711 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
712 u32 win, u32 tsval, u32 tsecr, int oif,
713 struct tcp_md5sig_key *key,
714 int reply_flags, u8 tos)
715 {
716 const struct tcphdr *th = tcp_hdr(skb);
717 struct {
718 struct tcphdr th;
719 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
720 #ifdef CONFIG_TCP_MD5SIG
721 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
722 #endif
723 ];
724 } rep;
725 struct ip_reply_arg arg;
726 struct net *net = dev_net(skb_dst(skb)->dev);
727
728 memset(&rep.th, 0, sizeof(struct tcphdr));
729 memset(&arg, 0, sizeof(arg));
730
731 arg.iov[0].iov_base = (unsigned char *)&rep;
732 arg.iov[0].iov_len = sizeof(rep.th);
733 if (tsecr) {
734 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735 (TCPOPT_TIMESTAMP << 8) |
736 TCPOLEN_TIMESTAMP);
737 rep.opt[1] = htonl(tsval);
738 rep.opt[2] = htonl(tsecr);
739 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740 }
741
742 /* Swap the send and the receive. */
743 rep.th.dest = th->source;
744 rep.th.source = th->dest;
745 rep.th.doff = arg.iov[0].iov_len / 4;
746 rep.th.seq = htonl(seq);
747 rep.th.ack_seq = htonl(ack);
748 rep.th.ack = 1;
749 rep.th.window = htons(win);
750
751 #ifdef CONFIG_TCP_MD5SIG
752 if (key) {
753 int offset = (tsecr) ? 3 : 0;
754
755 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756 (TCPOPT_NOP << 16) |
757 (TCPOPT_MD5SIG << 8) |
758 TCPOLEN_MD5SIG);
759 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760 rep.th.doff = arg.iov[0].iov_len/4;
761
762 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763 key, ip_hdr(skb)->saddr,
764 ip_hdr(skb)->daddr, &rep.th);
765 }
766 #endif
767 arg.flags = reply_flags;
768 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769 ip_hdr(skb)->saddr, /* XXX */
770 arg.iov[0].iov_len, IPPROTO_TCP, 0);
771 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772 if (oif)
773 arg.bound_dev_if = oif;
774 arg.tos = tos;
775 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
776 skb, &TCP_SKB_CB(skb)->header.h4.opt,
777 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778 &arg, arg.iov[0].iov_len);
779
780 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
781 }
782
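/* Send an ACK on behalf of a TIME-WAIT socket, using the window, sequence
 * numbers and timestamps saved in the timewait bucket. */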
783 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
784 {
785 struct inet_timewait_sock *tw = inet_twsk(sk);
786 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
787
788 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
789 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
790 tcp_time_stamp + tcptw->tw_ts_offset,
791 tcptw->tw_ts_recent,
792 tw->tw_bound_dev_if,
793 tcp_twsk_md5_key(tcptw),
794 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
795 tw->tw_tos
796 );
797
798 inet_twsk_put(tw);
799 }
800
801 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
802 struct request_sock *req)
803 {
804 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
805 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
806 */
807 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
808 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
809 tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
810 tcp_time_stamp,
811 req->ts_recent,
812 0,
813 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
814 AF_INET),
815 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
816 ip_hdr(skb)->tos);
817 }
818
819 /*
820 * Send a SYN-ACK after having received a SYN.
821 * This still operates on a request_sock only, not on a big
822 * socket.
823 */
824 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
825 struct flowi *fl,
826 struct request_sock *req,
827 struct tcp_fastopen_cookie *foc,
828 bool attach_req)
829 {
830 const struct inet_request_sock *ireq = inet_rsk(req);
831 struct flowi4 fl4;
832 int err = -1;
833 struct sk_buff *skb;
834
835 /* First, grab a route. */
836 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
837 return -1;
838
839 skb = tcp_make_synack(sk, dst, req, foc, attach_req);
840
841 if (skb) {
842 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
843
844 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
845 ireq->ir_rmt_addr,
846 ireq->opt);
847 err = net_xmit_eval(err);
848 }
849
850 return err;
851 }
852
853 /*
854 * IPv4 request_sock destructor.
855 */
856 static void tcp_v4_reqsk_destructor(struct request_sock *req)
857 {
858 kfree(inet_rsk(req)->opt);
859 }
860
861
862 #ifdef CONFIG_TCP_MD5SIG
863 /*
864 * RFC2385 MD5 checksumming requires a mapping of
865 * IP address->MD5 Key.
866 * We need to maintain these in the sk structure.
867 */
868
869 /* Find the Key structure for an address. */
870 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
871 const union tcp_md5_addr *addr,
872 int family)
873 {
874 const struct tcp_sock *tp = tcp_sk(sk);
875 struct tcp_md5sig_key *key;
876 unsigned int size = sizeof(struct in_addr);
877 const struct tcp_md5sig_info *md5sig;
878
879 /* caller either holds rcu_read_lock() or socket lock */
880 md5sig = rcu_dereference_check(tp->md5sig_info,
881 sock_owned_by_user(sk) ||
882 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
883 if (!md5sig)
884 return NULL;
885 #if IS_ENABLED(CONFIG_IPV6)
886 if (family == AF_INET6)
887 size = sizeof(struct in6_addr);
888 #endif
889 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
890 if (key->family != family)
891 continue;
892 if (!memcmp(&key->addr, addr, size))
893 return key;
894 }
895 return NULL;
896 }
897 EXPORT_SYMBOL(tcp_md5_do_lookup);
898
899 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
900 const struct sock *addr_sk)
901 {
902 const union tcp_md5_addr *addr;
903
904 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
905 return tcp_md5_do_lookup(sk, addr, AF_INET);
906 }
907 EXPORT_SYMBOL(tcp_v4_md5_lookup);
908
909 /* This can be called on a newly created socket, from other files */
910 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
911 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
912 {
913 /* Add Key to the list */
914 struct tcp_md5sig_key *key;
915 struct tcp_sock *tp = tcp_sk(sk);
916 struct tcp_md5sig_info *md5sig;
917
918 key = tcp_md5_do_lookup(sk, addr, family);
919 if (key) {
920 /* Pre-existing entry - just update that one. */
921 memcpy(key->key, newkey, newkeylen);
922 key->keylen = newkeylen;
923 return 0;
924 }
925
926 md5sig = rcu_dereference_protected(tp->md5sig_info,
927 sock_owned_by_user(sk) ||
928 lockdep_is_held(&sk->sk_lock.slock));
929 if (!md5sig) {
930 md5sig = kmalloc(sizeof(*md5sig), gfp);
931 if (!md5sig)
932 return -ENOMEM;
933
934 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
935 INIT_HLIST_HEAD(&md5sig->head);
936 rcu_assign_pointer(tp->md5sig_info, md5sig);
937 }
938
939 key = sock_kmalloc(sk, sizeof(*key), gfp);
940 if (!key)
941 return -ENOMEM;
942 if (!tcp_alloc_md5sig_pool()) {
943 sock_kfree_s(sk, key, sizeof(*key));
944 return -ENOMEM;
945 }
946
947 memcpy(key->key, newkey, newkeylen);
948 key->keylen = newkeylen;
949 key->family = family;
950 memcpy(&key->addr, addr,
951 (family == AF_INET6) ? sizeof(struct in6_addr) :
952 sizeof(struct in_addr));
953 hlist_add_head_rcu(&key->node, &md5sig->head);
954 return 0;
955 }
956 EXPORT_SYMBOL(tcp_md5_do_add);
957
958 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
959 {
960 struct tcp_md5sig_key *key;
961
962 key = tcp_md5_do_lookup(sk, addr, family);
963 if (!key)
964 return -ENOENT;
965 hlist_del_rcu(&key->node);
966 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
967 kfree_rcu(key, rcu);
968 return 0;
969 }
970 EXPORT_SYMBOL(tcp_md5_do_del);
971
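/* Free every MD5 key attached to the socket; used when the socket is
 * destroyed. */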
972 static void tcp_clear_md5_list(struct sock *sk)
973 {
974 struct tcp_sock *tp = tcp_sk(sk);
975 struct tcp_md5sig_key *key;
976 struct hlist_node *n;
977 struct tcp_md5sig_info *md5sig;
978
979 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
980
981 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
982 hlist_del_rcu(&key->node);
983 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
984 kfree_rcu(key, rcu);
985 }
986 }
987
988 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
989 int optlen)
990 {
991 struct tcp_md5sig cmd;
992 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
993
994 if (optlen < sizeof(cmd))
995 return -EINVAL;
996
997 if (copy_from_user(&cmd, optval, sizeof(cmd)))
998 return -EFAULT;
999
1000 if (sin->sin_family != AF_INET)
1001 return -EINVAL;
1002
1003 if (!cmd.tcpm_keylen)
1004 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1005 AF_INET);
1006
1007 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1008 return -EINVAL;
1009
1010 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1012 GFP_KERNEL);
1013 }
1014
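/* Mix the IPv4 pseudo-header into the MD5 hash, as RFC 2385 requires. */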
1015 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1016 __be32 daddr, __be32 saddr, int nbytes)
1017 {
1018 struct tcp4_pseudohdr *bp;
1019 struct scatterlist sg;
1020
1021 bp = &hp->md5_blk.ip4;
1022
1023 /*
1024 * 1. the TCP pseudo-header (in the order: source IP address,
1025 * destination IP address, zero-padded protocol number, and
1026 * segment length)
1027 */
1028 bp->saddr = saddr;
1029 bp->daddr = daddr;
1030 bp->pad = 0;
1031 bp->protocol = IPPROTO_TCP;
1032 bp->len = cpu_to_be16(nbytes);
1033
1034 sg_init_one(&sg, bp, sizeof(*bp));
1035 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1036 }
1037
1038 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1039 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1040 {
1041 struct tcp_md5sig_pool *hp;
1042 struct hash_desc *desc;
1043
1044 hp = tcp_get_md5sig_pool();
1045 if (!hp)
1046 goto clear_hash_noput;
1047 desc = &hp->md5_desc;
1048
1049 if (crypto_hash_init(desc))
1050 goto clear_hash;
1051 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1052 goto clear_hash;
1053 if (tcp_md5_hash_header(hp, th))
1054 goto clear_hash;
1055 if (tcp_md5_hash_key(hp, key))
1056 goto clear_hash;
1057 if (crypto_hash_final(desc, md5_hash))
1058 goto clear_hash;
1059
1060 tcp_put_md5sig_pool();
1061 return 0;
1062
1063 clear_hash:
1064 tcp_put_md5sig_pool();
1065 clear_hash_noput:
1066 memset(md5_hash, 0, 16);
1067 return 1;
1068 }
1069
1070 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1071 const struct sock *sk,
1072 const struct sk_buff *skb)
1073 {
1074 struct tcp_md5sig_pool *hp;
1075 struct hash_desc *desc;
1076 const struct tcphdr *th = tcp_hdr(skb);
1077 __be32 saddr, daddr;
1078
1079 if (sk) { /* valid for establish/request sockets */
1080 saddr = sk->sk_rcv_saddr;
1081 daddr = sk->sk_daddr;
1082 } else {
1083 const struct iphdr *iph = ip_hdr(skb);
1084 saddr = iph->saddr;
1085 daddr = iph->daddr;
1086 }
1087
1088 hp = tcp_get_md5sig_pool();
1089 if (!hp)
1090 goto clear_hash_noput;
1091 desc = &hp->md5_desc;
1092
1093 if (crypto_hash_init(desc))
1094 goto clear_hash;
1095
1096 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1097 goto clear_hash;
1098 if (tcp_md5_hash_header(hp, th))
1099 goto clear_hash;
1100 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1101 goto clear_hash;
1102 if (tcp_md5_hash_key(hp, key))
1103 goto clear_hash;
1104 if (crypto_hash_final(desc, md5_hash))
1105 goto clear_hash;
1106
1107 tcp_put_md5sig_pool();
1108 return 0;
1109
1110 clear_hash:
1111 tcp_put_md5sig_pool();
1112 clear_hash_noput:
1113 memset(md5_hash, 0, 16);
1114 return 1;
1115 }
1116 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1117
1118 #endif
1119
1120 /* Called with rcu_read_lock() */
1121 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1122 const struct sk_buff *skb)
1123 {
1124 #ifdef CONFIG_TCP_MD5SIG
1125 /*
1126 * This gets called for each TCP segment that arrives
1127 * so we want to be efficient.
1128 * We have 3 drop cases:
1129 * o No MD5 hash and one expected.
1130 * o MD5 hash and we're not expecting one.
1131 * o MD5 hash and it's wrong.
1132 */
1133 const __u8 *hash_location = NULL;
1134 struct tcp_md5sig_key *hash_expected;
1135 const struct iphdr *iph = ip_hdr(skb);
1136 const struct tcphdr *th = tcp_hdr(skb);
1137 int genhash;
1138 unsigned char newhash[16];
1139
1140 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1141 AF_INET);
1142 hash_location = tcp_parse_md5sig_option(th);
1143
1144 /* We've parsed the options - do we have a hash? */
1145 if (!hash_expected && !hash_location)
1146 return false;
1147
1148 if (hash_expected && !hash_location) {
1149 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1150 return true;
1151 }
1152
1153 if (!hash_expected && hash_location) {
1154 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1155 return true;
1156 }
1157
1158 /* Okay, so this is hash_expected and hash_location -
1159 * so we need to calculate the checksum.
1160 */
1161 genhash = tcp_v4_md5_hash_skb(newhash,
1162 hash_expected,
1163 NULL, skb);
1164
1165 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1166 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1167 &iph->saddr, ntohs(th->source),
1168 &iph->daddr, ntohs(th->dest),
1169 genhash ? " tcp_v4_calc_md5_hash failed"
1170 : "");
1171 return true;
1172 }
1173 return false;
1174 #endif
1175 return false;
1176 }
1177
1178 static void tcp_v4_init_req(struct request_sock *req,
1179 const struct sock *sk_listener,
1180 struct sk_buff *skb)
1181 {
1182 struct inet_request_sock *ireq = inet_rsk(req);
1183
1184 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1185 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1186 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1187 ireq->opt = tcp_v4_save_options(skb);
1188 }
1189
1190 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1191 struct flowi *fl,
1192 const struct request_sock *req,
1193 bool *strict)
1194 {
1195 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1196
1197 if (strict) {
1198 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1199 *strict = true;
1200 else
1201 *strict = false;
1202 }
1203
1204 return dst;
1205 }
1206
1207 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1208 .family = PF_INET,
1209 .obj_size = sizeof(struct tcp_request_sock),
1210 .rtx_syn_ack = tcp_rtx_synack,
1211 .send_ack = tcp_v4_reqsk_send_ack,
1212 .destructor = tcp_v4_reqsk_destructor,
1213 .send_reset = tcp_v4_send_reset,
1214 .syn_ack_timeout = tcp_syn_ack_timeout,
1215 };
1216
1217 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1218 .mss_clamp = TCP_MSS_DEFAULT,
1219 #ifdef CONFIG_TCP_MD5SIG
1220 .req_md5_lookup = tcp_v4_md5_lookup,
1221 .calc_md5_hash = tcp_v4_md5_hash_skb,
1222 #endif
1223 .init_req = tcp_v4_init_req,
1224 #ifdef CONFIG_SYN_COOKIES
1225 .cookie_init_seq = cookie_v4_init_sequence,
1226 #endif
1227 .route_req = tcp_v4_route_req,
1228 .init_seq = tcp_v4_init_sequence,
1229 .send_synack = tcp_v4_send_synack,
1230 };
1231
1232 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1233 {
1234 /* Never answer SYNs sent to broadcast or multicast */
1235 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1236 goto drop;
1237
1238 return tcp_conn_request(&tcp_request_sock_ops,
1239 &tcp_request_sock_ipv4_ops, sk, skb);
1240
1241 drop:
1242 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1243 return 0;
1244 }
1245 EXPORT_SYMBOL(tcp_v4_conn_request);
1246
1247
1248 /*
1249 * The three way handshake has completed - we got a valid synack -
1250 * now create the new socket.
1251 */
1252 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1253 struct request_sock *req,
1254 struct dst_entry *dst,
1255 struct request_sock *req_unhash,
1256 bool *own_req)
1257 {
1258 struct inet_request_sock *ireq;
1259 struct inet_sock *newinet;
1260 struct tcp_sock *newtp;
1261 struct sock *newsk;
1262 #ifdef CONFIG_TCP_MD5SIG
1263 struct tcp_md5sig_key *key;
1264 #endif
1265 struct ip_options_rcu *inet_opt;
1266
1267 if (sk_acceptq_is_full(sk))
1268 goto exit_overflow;
1269
1270 newsk = tcp_create_openreq_child(sk, req, skb);
1271 if (!newsk)
1272 goto exit_nonewsk;
1273
1274 newsk->sk_gso_type = SKB_GSO_TCPV4;
1275 inet_sk_rx_dst_set(newsk, skb);
1276
1277 newtp = tcp_sk(newsk);
1278 newinet = inet_sk(newsk);
1279 ireq = inet_rsk(req);
1280 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1281 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1282 newsk->sk_bound_dev_if = ireq->ir_iif;
1283 newinet->inet_saddr = ireq->ir_loc_addr;
1284 inet_opt = ireq->opt;
1285 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1286 ireq->opt = NULL;
1287 newinet->mc_index = inet_iif(skb);
1288 newinet->mc_ttl = ip_hdr(skb)->ttl;
1289 newinet->rcv_tos = ip_hdr(skb)->tos;
1290 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1291 if (inet_opt)
1292 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1293 newinet->inet_id = newtp->write_seq ^ jiffies;
1294
1295 if (!dst) {
1296 dst = inet_csk_route_child_sock(sk, newsk, req);
1297 if (!dst)
1298 goto put_and_exit;
1299 } else {
1300 /* syncookie case : see end of cookie_v4_check() */
1301 }
1302 sk_setup_caps(newsk, dst);
1303
1304 tcp_ca_openreq_child(newsk, dst);
1305
1306 tcp_sync_mss(newsk, dst_mtu(dst));
1307 newtp->advmss = dst_metric_advmss(dst);
1308 if (tcp_sk(sk)->rx_opt.user_mss &&
1309 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1310 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1311
1312 tcp_initialize_rcv_mss(newsk);
1313
1314 #ifdef CONFIG_TCP_MD5SIG
1315 /* Copy over the MD5 key from the original socket */
1316 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1317 AF_INET);
1318 if (key) {
1319 /*
1320 * We're using one, so create a matching key
1321 * on the newsk structure. If we fail to get
1322 * memory, then we end up not copying the key
1323 * across. Shucks.
1324 */
1325 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1326 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1327 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1328 }
1329 #endif
1330
1331 if (__inet_inherit_port(sk, newsk) < 0)
1332 goto put_and_exit;
1333 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1334 if (*own_req)
1335 tcp_move_syn(newtp, req);
1336
1337 return newsk;
1338
1339 exit_overflow:
1340 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1341 exit_nonewsk:
1342 dst_release(dst);
1343 exit:
1344 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1345 return NULL;
1346 put_and_exit:
1347 inet_csk_prepare_forced_close(newsk);
1348 tcp_done(newsk);
1349 goto exit;
1350 }
1351 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1352
1353 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1354 {
1355 #ifdef CONFIG_SYN_COOKIES
1356 const struct tcphdr *th = tcp_hdr(skb);
1357
1358 if (!th->syn)
1359 sk = cookie_v4_check(sk, skb);
1360 #endif
1361 return sk;
1362 }
1363
1364 /* The socket must have its spinlock held when we get
1365 * here, unless it is a TCP_LISTEN socket.
1366 *
1367 * We have a potential double-lock case here, so even when
1368 * doing backlog processing we use the BH locking scheme.
1369 * This is because we cannot sleep with the original spinlock
1370 * held.
1371 */
1372 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1373 {
1374 struct sock *rsk;
1375
1376 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1377 struct dst_entry *dst = sk->sk_rx_dst;
1378
1379 sock_rps_save_rxhash(sk, skb);
1380 sk_mark_napi_id(sk, skb);
1381 if (dst) {
1382 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1383 !dst->ops->check(dst, 0)) {
1384 dst_release(dst);
1385 sk->sk_rx_dst = NULL;
1386 }
1387 }
1388 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1389 return 0;
1390 }
1391
1392 if (tcp_checksum_complete(skb))
1393 goto csum_err;
1394
1395 if (sk->sk_state == TCP_LISTEN) {
1396 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1397
1398 if (!nsk)
1399 goto discard;
1400 if (nsk != sk) {
1401 sock_rps_save_rxhash(nsk, skb);
1402 sk_mark_napi_id(nsk, skb);
1403 if (tcp_child_process(sk, nsk, skb)) {
1404 rsk = nsk;
1405 goto reset;
1406 }
1407 return 0;
1408 }
1409 } else
1410 sock_rps_save_rxhash(sk, skb);
1411
1412 if (tcp_rcv_state_process(sk, skb)) {
1413 rsk = sk;
1414 goto reset;
1415 }
1416 return 0;
1417
1418 reset:
1419 tcp_v4_send_reset(rsk, skb);
1420 discard:
1421 kfree_skb(skb);
1422 /* Be careful here. If this function gets more complicated and
1423 * gcc suffers from register pressure on the x86, sk (in %ebx)
1424 * might be destroyed here. This current version compiles correctly,
1425 * but you have been warned.
1426 */
1427 return 0;
1428
1429 csum_err:
1430 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1431 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1432 goto discard;
1433 }
1434 EXPORT_SYMBOL(tcp_v4_do_rcv);
1435
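/* Early demux: look up an established socket for the incoming skb before
 * the routing decision, so its cached rx dst can be reused. */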
1436 void tcp_v4_early_demux(struct sk_buff *skb)
1437 {
1438 const struct iphdr *iph;
1439 const struct tcphdr *th;
1440 struct sock *sk;
1441
1442 if (skb->pkt_type != PACKET_HOST)
1443 return;
1444
1445 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1446 return;
1447
1448 iph = ip_hdr(skb);
1449 th = tcp_hdr(skb);
1450
1451 if (th->doff < sizeof(struct tcphdr) / 4)
1452 return;
1453
1454 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1455 iph->saddr, th->source,
1456 iph->daddr, ntohs(th->dest),
1457 skb->skb_iif);
1458 if (sk) {
1459 skb->sk = sk;
1460 skb->destructor = sock_edemux;
1461 if (sk_fullsock(sk)) {
1462 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1463
1464 if (dst)
1465 dst = dst_check(dst, 0);
1466 if (dst &&
1467 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1468 skb_dst_set_noref(skb, dst);
1469 }
1470 }
1471 }
1472
1473 /* Packet is added to VJ-style prequeue for processing in process
1474 * context, if a reader task is waiting. Apparently, this exciting
1475 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1476 * failed somewhere. Latency? Burstiness? Well, at least now we will
1477 * see why it failed. 8)8) --ANK
1478 *
1479 */
1480 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1481 {
1482 struct tcp_sock *tp = tcp_sk(sk);
1483
1484 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1485 return false;
1486
1487 if (skb->len <= tcp_hdrlen(skb) &&
1488 skb_queue_len(&tp->ucopy.prequeue) == 0)
1489 return false;
1490
1491 /* Before escaping RCU protected region, we need to take care of skb
1492 * dst. Prequeue is only enabled for established sockets.
1493 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1494 * Instead of doing a full sk_rx_dst validity check here, let's perform
1495 * an optimistic check.
1496 */
1497 if (likely(sk->sk_rx_dst))
1498 skb_dst_drop(skb);
1499 else
1500 skb_dst_force_safe(skb);
1501
1502 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1503 tp->ucopy.memory += skb->truesize;
1504 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1505 struct sk_buff *skb1;
1506
1507 BUG_ON(sock_owned_by_user(sk));
1508
1509 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1510 sk_backlog_rcv(sk, skb1);
1511 NET_INC_STATS_BH(sock_net(sk),
1512 LINUX_MIB_TCPPREQUEUEDROPPED);
1513 }
1514
1515 tp->ucopy.memory = 0;
1516 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1517 wake_up_interruptible_sync_poll(sk_sleep(sk),
1518 POLLIN | POLLRDNORM | POLLRDBAND);
1519 if (!inet_csk_ack_scheduled(sk))
1520 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1521 (3 * tcp_rto_min(sk)) / 4,
1522 TCP_RTO_MAX);
1523 }
1524 return true;
1525 }
1526 EXPORT_SYMBOL(tcp_prequeue);
1527
1528 /*
1529 * From tcp_input.c
1530 */
1531
1532 int tcp_v4_rcv(struct sk_buff *skb)
1533 {
1534 const struct iphdr *iph;
1535 const struct tcphdr *th;
1536 struct sock *sk;
1537 int ret;
1538 struct net *net = dev_net(skb->dev);
1539
1540 if (skb->pkt_type != PACKET_HOST)
1541 goto discard_it;
1542
1543 /* Count it even if it's bad */
1544 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1545
1546 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1547 goto discard_it;
1548
1549 th = tcp_hdr(skb);
1550
1551 if (th->doff < sizeof(struct tcphdr) / 4)
1552 goto bad_packet;
1553 if (!pskb_may_pull(skb, th->doff * 4))
1554 goto discard_it;
1555
1556 /* An explanation is required here, I think.
1557 * Packet length and doff are validated by header prediction,
1558 * provided the case of th->doff==0 is eliminated.
1559 * So, we defer the checks. */
1560
1561 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1562 goto csum_error;
1563
1564 th = tcp_hdr(skb);
1565 iph = ip_hdr(skb);
1566 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1567 * barrier() makes sure the compiler won't play fool^Waliasing games.
1568 */
1569 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1570 sizeof(struct inet_skb_parm));
1571 barrier();
1572
1573 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1574 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1575 skb->len - th->doff * 4);
1576 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1577 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1578 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1579 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1580 TCP_SKB_CB(skb)->sacked = 0;
1581
1582 lookup:
1583 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1584 if (!sk)
1585 goto no_tcp_socket;
1586
1587 process:
1588 if (sk->sk_state == TCP_TIME_WAIT)
1589 goto do_time_wait;
1590
1591 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1592 struct request_sock *req = inet_reqsk(sk);
1593 struct sock *nsk = NULL;
1594
1595 sk = req->rsk_listener;
1596 if (tcp_v4_inbound_md5_hash(sk, skb))
1597 goto discard_and_relse;
1598 if (likely(sk->sk_state == TCP_LISTEN)) {
1599 nsk = tcp_check_req(sk, skb, req, false);
1600 } else {
1601 inet_csk_reqsk_queue_drop_and_put(sk, req);
1602 goto lookup;
1603 }
1604 if (!nsk) {
1605 reqsk_put(req);
1606 goto discard_it;
1607 }
1608 if (nsk == sk) {
1609 sock_hold(sk);
1610 reqsk_put(req);
1611 } else if (tcp_child_process(sk, nsk, skb)) {
1612 tcp_v4_send_reset(nsk, skb);
1613 goto discard_it;
1614 } else {
1615 return 0;
1616 }
1617 }
1618 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1619 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1620 goto discard_and_relse;
1621 }
1622
1623 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1624 goto discard_and_relse;
1625
1626 if (tcp_v4_inbound_md5_hash(sk, skb))
1627 goto discard_and_relse;
1628
1629 nf_reset(skb);
1630
1631 if (sk_filter(sk, skb))
1632 goto discard_and_relse;
1633
1634 skb->dev = NULL;
1635
1636 if (sk->sk_state == TCP_LISTEN) {
1637 ret = tcp_v4_do_rcv(sk, skb);
1638 goto put_and_return;
1639 }
1640
1641 sk_incoming_cpu_update(sk);
1642
1643 bh_lock_sock_nested(sk);
1644 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1645 ret = 0;
1646 if (!sock_owned_by_user(sk)) {
1647 if (!tcp_prequeue(sk, skb))
1648 ret = tcp_v4_do_rcv(sk, skb);
1649 } else if (unlikely(sk_add_backlog(sk, skb,
1650 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1651 bh_unlock_sock(sk);
1652 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1653 goto discard_and_relse;
1654 }
1655 bh_unlock_sock(sk);
1656
1657 put_and_return:
1658 sock_put(sk);
1659
1660 return ret;
1661
1662 no_tcp_socket:
1663 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1664 goto discard_it;
1665
1666 if (tcp_checksum_complete(skb)) {
1667 csum_error:
1668 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1669 bad_packet:
1670 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1671 } else {
1672 tcp_v4_send_reset(NULL, skb);
1673 }
1674
1675 discard_it:
1676 /* Discard frame. */
1677 kfree_skb(skb);
1678 return 0;
1679
1680 discard_and_relse:
1681 sock_put(sk);
1682 goto discard_it;
1683
1684 do_time_wait:
1685 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1686 inet_twsk_put(inet_twsk(sk));
1687 goto discard_it;
1688 }
1689
1690 if (tcp_checksum_complete(skb)) {
1691 inet_twsk_put(inet_twsk(sk));
1692 goto csum_error;
1693 }
1694 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1695 case TCP_TW_SYN: {
1696 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1697 &tcp_hashinfo,
1698 iph->saddr, th->source,
1699 iph->daddr, th->dest,
1700 inet_iif(skb));
1701 if (sk2) {
1702 inet_twsk_deschedule_put(inet_twsk(sk));
1703 sk = sk2;
1704 goto process;
1705 }
1706 /* Fall through to ACK */
1707 }
1708 case TCP_TW_ACK:
1709 tcp_v4_timewait_ack(sk, skb);
1710 break;
1711 case TCP_TW_RST:
1712 tcp_v4_send_reset(sk, skb);
1713 inet_twsk_deschedule_put(inet_twsk(sk));
1714 goto discard_it;
1715 case TCP_TW_SUCCESS:;
1716 }
1717 goto discard_it;
1718 }
1719
1720 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1721 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1722 .twsk_unique = tcp_twsk_unique,
1723 .twsk_destructor= tcp_twsk_destructor,
1724 };
1725
1726 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1727 {
1728 struct dst_entry *dst = skb_dst(skb);
1729
1730 if (dst && dst_hold_safe(dst)) {
1731 sk->sk_rx_dst = dst;
1732 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1733 }
1734 }
1735 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1736
1737 const struct inet_connection_sock_af_ops ipv4_specific = {
1738 .queue_xmit = ip_queue_xmit,
1739 .send_check = tcp_v4_send_check,
1740 .rebuild_header = inet_sk_rebuild_header,
1741 .sk_rx_dst_set = inet_sk_rx_dst_set,
1742 .conn_request = tcp_v4_conn_request,
1743 .syn_recv_sock = tcp_v4_syn_recv_sock,
1744 .net_header_len = sizeof(struct iphdr),
1745 .setsockopt = ip_setsockopt,
1746 .getsockopt = ip_getsockopt,
1747 .addr2sockaddr = inet_csk_addr2sockaddr,
1748 .sockaddr_len = sizeof(struct sockaddr_in),
1749 .bind_conflict = inet_csk_bind_conflict,
1750 #ifdef CONFIG_COMPAT
1751 .compat_setsockopt = compat_ip_setsockopt,
1752 .compat_getsockopt = compat_ip_getsockopt,
1753 #endif
1754 .mtu_reduced = tcp_v4_mtu_reduced,
1755 };
1756 EXPORT_SYMBOL(ipv4_specific);
1757
1758 #ifdef CONFIG_TCP_MD5SIG
1759 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1760 .md5_lookup = tcp_v4_md5_lookup,
1761 .calc_md5_hash = tcp_v4_md5_hash_skb,
1762 .md5_parse = tcp_v4_parse_md5_keys,
1763 };
1764 #endif
1765
1766 /* NOTE: A lot of things are set to zero explicitly by the call to
1767 * sk_alloc(), so they need not be done here.
1768 */
1769 static int tcp_v4_init_sock(struct sock *sk)
1770 {
1771 struct inet_connection_sock *icsk = inet_csk(sk);
1772
1773 tcp_init_sock(sk);
1774
1775 icsk->icsk_af_ops = &ipv4_specific;
1776
1777 #ifdef CONFIG_TCP_MD5SIG
1778 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1779 #endif
1780
1781 return 0;
1782 }
1783
1784 void tcp_v4_destroy_sock(struct sock *sk)
1785 {
1786 struct tcp_sock *tp = tcp_sk(sk);
1787
1788 tcp_clear_xmit_timers(sk);
1789
1790 tcp_cleanup_congestion_control(sk);
1791
1792 /* Clean up the write buffer. */
1793 tcp_write_queue_purge(sk);
1794
1795 /* Cleans up our, hopefully empty, out_of_order_queue. */
1796 __skb_queue_purge(&tp->out_of_order_queue);
1797
1798 #ifdef CONFIG_TCP_MD5SIG
1799 /* Clean up the MD5 key list, if any */
1800 if (tp->md5sig_info) {
1801 tcp_clear_md5_list(sk);
1802 kfree_rcu(tp->md5sig_info, rcu);
1803 tp->md5sig_info = NULL;
1804 }
1805 #endif
1806
1807 /* Clean prequeue, it must be empty really */
1808 __skb_queue_purge(&tp->ucopy.prequeue);
1809
1810 /* Clean up a referenced TCP bind bucket. */
1811 if (inet_csk(sk)->icsk_bind_hash)
1812 inet_put_port(sk);
1813
1814 BUG_ON(tp->fastopen_rsk);
1815
1816 /* If socket is aborted during connect operation */
1817 tcp_free_fastopen_req(tp);
1818 tcp_saved_syn_free(tp);
1819
1820 sk_sockets_allocated_dec(sk);
1821
1822 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1823 sock_release_memcg(sk);
1824 }
1825 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1826
1827 #ifdef CONFIG_PROC_FS
1828 /* Proc filesystem TCP sock list dumping. */
1829
1830 /*
1831 * Get the next listener socket following cur. If cur is NULL, get the first
1832 * socket starting from the bucket given in st->bucket; when st->bucket is zero
1833 * the very first socket in the hash table is returned.
1834 */
1835 static void *listening_get_next(struct seq_file *seq, void *cur)
1836 {
1837 struct inet_connection_sock *icsk;
1838 struct hlist_nulls_node *node;
1839 struct sock *sk = cur;
1840 struct inet_listen_hashbucket *ilb;
1841 struct tcp_iter_state *st = seq->private;
1842 struct net *net = seq_file_net(seq);
1843
1844 if (!sk) {
1845 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1846 spin_lock_bh(&ilb->lock);
1847 sk = sk_nulls_head(&ilb->head);
1848 st->offset = 0;
1849 goto get_sk;
1850 }
1851 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1852 ++st->num;
1853 ++st->offset;
1854
1855 sk = sk_nulls_next(sk);
1856 get_sk:
1857 sk_nulls_for_each_from(sk, node) {
1858 if (!net_eq(sock_net(sk), net))
1859 continue;
1860 if (sk->sk_family == st->family) {
1861 cur = sk;
1862 goto out;
1863 }
1864 icsk = inet_csk(sk);
1865 }
1866 spin_unlock_bh(&ilb->lock);
1867 st->offset = 0;
1868 if (++st->bucket < INET_LHTABLE_SIZE) {
1869 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870 spin_lock_bh(&ilb->lock);
1871 sk = sk_nulls_head(&ilb->head);
1872 goto get_sk;
1873 }
1874 cur = NULL;
1875 out:
1876 return cur;
1877 }
1878
1879 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1880 {
1881 struct tcp_iter_state *st = seq->private;
1882 void *rc;
1883
1884 st->bucket = 0;
1885 st->offset = 0;
1886 rc = listening_get_next(seq, NULL);
1887
1888 while (rc && *pos) {
1889 rc = listening_get_next(seq, rc);
1890 --*pos;
1891 }
1892 return rc;
1893 }
1894
1895 static inline bool empty_bucket(const struct tcp_iter_state *st)
1896 {
1897 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1898 }
1899
1900 /*
1901 * Get first established socket starting from bucket given in st->bucket.
1902 * If st->bucket is zero, the very first socket in the hash is returned.
1903 */
1904 static void *established_get_first(struct seq_file *seq)
1905 {
1906 struct tcp_iter_state *st = seq->private;
1907 struct net *net = seq_file_net(seq);
1908 void *rc = NULL;
1909
1910 st->offset = 0;
1911 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1912 struct sock *sk;
1913 struct hlist_nulls_node *node;
1914 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1915
1916 /* Lockless fast path for the common case of empty buckets */
1917 if (empty_bucket(st))
1918 continue;
1919
1920 spin_lock_bh(lock);
1921 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1922 if (sk->sk_family != st->family ||
1923 !net_eq(sock_net(sk), net)) {
1924 continue;
1925 }
1926 rc = sk;
1927 goto out;
1928 }
1929 spin_unlock_bh(lock);
1930 }
1931 out:
1932 return rc;
1933 }
1934
1935 static void *established_get_next(struct seq_file *seq, void *cur)
1936 {
1937 struct sock *sk = cur;
1938 struct hlist_nulls_node *node;
1939 struct tcp_iter_state *st = seq->private;
1940 struct net *net = seq_file_net(seq);
1941
1942 ++st->num;
1943 ++st->offset;
1944
1945 sk = sk_nulls_next(sk);
1946
1947 sk_nulls_for_each_from(sk, node) {
1948 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1949 return sk;
1950 }
1951
1952 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1953 ++st->bucket;
1954 return established_get_first(seq);
1955 }
1956
1957 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1958 {
1959 struct tcp_iter_state *st = seq->private;
1960 void *rc;
1961
1962 st->bucket = 0;
1963 rc = established_get_first(seq);
1964
1965 while (rc && pos) {
1966 rc = established_get_next(seq, rc);
1967 --pos;
1968 }
1969 return rc;
1970 }
1971
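/*
 * Position the iterator at entry 'pos' of the combined dump: listening
 * sockets come first, established-side entries (full, TIME_WAIT and
 * SYN_RECV sockets) follow.
 */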
1972 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1973 {
1974 void *rc;
1975 struct tcp_iter_state *st = seq->private;
1976
1977 st->state = TCP_SEQ_STATE_LISTENING;
1978 rc = listening_get_idx(seq, &pos);
1979
1980 if (!rc) {
1981 st->state = TCP_SEQ_STATE_ESTABLISHED;
1982 rc = established_get_idx(seq, pos);
1983 }
1984
1985 return rc;
1986 }
1987
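/*
 * Resume the walk at the bucket/offset recorded when the previous read()
 * stopped instead of rescanning from entry 0.  st->num is saved and
 * restored so the "sl" column keeps counting monotonically.
 */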
1988 static void *tcp_seek_last_pos(struct seq_file *seq)
1989 {
1990 struct tcp_iter_state *st = seq->private;
1991 int offset = st->offset;
1992 int orig_num = st->num;
1993 void *rc = NULL;
1994
1995 switch (st->state) {
1996 case TCP_SEQ_STATE_LISTENING:
1997 if (st->bucket >= INET_LHTABLE_SIZE)
1998 break;
1999 st->state = TCP_SEQ_STATE_LISTENING;
2000 rc = listening_get_next(seq, NULL);
2001 while (offset-- && rc)
2002 rc = listening_get_next(seq, rc);
2003 if (rc)
2004 break;
2005 st->bucket = 0;
2006 st->state = TCP_SEQ_STATE_ESTABLISHED;
2007 /* Fallthrough */
2008 case TCP_SEQ_STATE_ESTABLISHED:
2009 if (st->bucket > tcp_hashinfo.ehash_mask)
2010 break;
2011 rc = established_get_first(seq);
2012 while (offset-- && rc)
2013 rc = established_get_next(seq, rc);
2014 }
2015
2016 st->num = orig_num;
2017
2018 return rc;
2019 }
2020
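/*
 * seq_file ->start(): if *pos matches the position we stopped at last
 * time, take the fast resume path above; otherwise reposition from
 * scratch.  *pos == 0 returns SEQ_START_TOKEN so that ->show() emits
 * the header line.
 */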
2021 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2022 {
2023 struct tcp_iter_state *st = seq->private;
2024 void *rc;
2025
2026 if (*pos && *pos == st->last_pos) {
2027 rc = tcp_seek_last_pos(seq);
2028 if (rc)
2029 goto out;
2030 }
2031
2032 st->state = TCP_SEQ_STATE_LISTENING;
2033 st->num = 0;
2034 st->bucket = 0;
2035 st->offset = 0;
2036 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2037
2038 out:
2039 st->last_pos = *pos;
2040 return rc;
2041 }
2042
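/*
 * seq_file ->next(): step to the following entry, switching from the
 * listening hash to the established hash once the former is exhausted,
 * and remember the new position in st->last_pos.
 */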
2043 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2044 {
2045 struct tcp_iter_state *st = seq->private;
2046 void *rc = NULL;
2047
2048 if (v == SEQ_START_TOKEN) {
2049 rc = tcp_get_idx(seq, 0);
2050 goto out;
2051 }
2052
2053 switch (st->state) {
2054 case TCP_SEQ_STATE_LISTENING:
2055 rc = listening_get_next(seq, v);
2056 if (!rc) {
2057 st->state = TCP_SEQ_STATE_ESTABLISHED;
2058 st->bucket = 0;
2059 st->offset = 0;
2060 rc = established_get_first(seq);
2061 }
2062 break;
2063 case TCP_SEQ_STATE_ESTABLISHED:
2064 rc = established_get_next(seq, v);
2065 break;
2066 }
2067 out:
2068 ++*pos;
2069 st->last_pos = *pos;
2070 return rc;
2071 }
2072
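/*
 * seq_file ->stop(): release whichever bucket lock ->start()/->next()
 * left held - the listening hash bucket lock in the LISTENING state,
 * the ehash bucket lock in the ESTABLISHED state.
 */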
2073 static void tcp_seq_stop(struct seq_file *seq, void *v)
2074 {
2075 struct tcp_iter_state *st = seq->private;
2076
2077 switch (st->state) {
2078 case TCP_SEQ_STATE_LISTENING:
2079 if (v != SEQ_START_TOKEN)
2080 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2081 break;
2082 case TCP_SEQ_STATE_ESTABLISHED:
2083 if (v)
2084 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2085 break;
2086 }
2087 }
2088
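/*
 * ->open() shared by the IPv4 and IPv6 /proc files: allocate the
 * per-reader tcp_iter_state and record which address family to dump.
 */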
2089 int tcp_seq_open(struct inode *inode, struct file *file)
2090 {
2091 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2092 struct tcp_iter_state *s;
2093 int err;
2094
2095 err = seq_open_net(inode, file, &afinfo->seq_ops,
2096 sizeof(struct tcp_iter_state));
2097 if (err < 0)
2098 return err;
2099
2100 s = ((struct seq_file *)file->private_data)->private;
2101 s->family = afinfo->family;
2102 s->last_pos = 0;
2103 return 0;
2104 }
2105 EXPORT_SYMBOL(tcp_seq_open);
2106
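/*
 * Register an address-family specific dump file (e.g. tcp4_seq_afinfo
 * below) under the per-namespace /proc/net.  The generic start/next/stop
 * iterators are filled in here; callers only supply ->show(), the file
 * name and the family.
 */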
2107 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2108 {
2109 int rc = 0;
2110 struct proc_dir_entry *p;
2111
2112 afinfo->seq_ops.start = tcp_seq_start;
2113 afinfo->seq_ops.next = tcp_seq_next;
2114 afinfo->seq_ops.stop = tcp_seq_stop;
2115
2116 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2117 afinfo->seq_fops, afinfo);
2118 if (!p)
2119 rc = -ENOMEM;
2120 return rc;
2121 }
2122 EXPORT_SYMBOL(tcp_proc_register);
2123
2124 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2125 {
2126 remove_proc_entry(afinfo->name, net->proc_net);
2127 }
2128 EXPORT_SYMBOL(tcp_proc_unregister);
2129
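/*
 * Print one SYN_RECV request socket in the same column layout as
 * get_tcp4_sock(); columns that have no meaning for a request socket
 * (inode, refcount, ...) are emitted as 0.
 */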
2130 static void get_openreq4(const struct request_sock *req,
2131 struct seq_file *f, int i)
2132 {
2133 const struct inet_request_sock *ireq = inet_rsk(req);
2134 long delta = req->rsk_timer.expires - jiffies;
2135
2136 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2137 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2138 i,
2139 ireq->ir_loc_addr,
2140 ireq->ir_num,
2141 ireq->ir_rmt_addr,
2142 ntohs(ireq->ir_rmt_port),
2143 TCP_SYN_RECV,
2144 0, 0, /* could print option size, but that is af dependent. */
2145 1, /* timers active (only the expire timer) */
2146 jiffies_delta_to_clock_t(delta),
2147 req->num_timeout,
2148 from_kuid_munged(seq_user_ns(f),
2149 sock_i_uid(req->rsk_listener)),
2150 0, /* non standard timer */
2151 0, /* open_requests have no inode */
2152 0,
2153 req);
2154 }
2155
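/*
 * Print one full TCP socket.  An illustrative line (values made up):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 0000000000000000 100 0 0 10 0
 *
 * i.e. slot, local addr:port, remote addr:port, state, tx_queue:rx_queue,
 * timer:expiry, retransmits, uid, probes, inode, refcount, socket pointer,
 * rto, ato, quick/pingpong, cwnd and ssthresh (max_qlen for listeners).
 */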
2156 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2157 {
2158 int timer_active;
2159 unsigned long timer_expires;
2160 const struct tcp_sock *tp = tcp_sk(sk);
2161 const struct inet_connection_sock *icsk = inet_csk(sk);
2162 const struct inet_sock *inet = inet_sk(sk);
2163 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2164 __be32 dest = inet->inet_daddr;
2165 __be32 src = inet->inet_rcv_saddr;
2166 __u16 destp = ntohs(inet->inet_dport);
2167 __u16 srcp = ntohs(inet->inet_sport);
2168 int rx_queue;
2169 int state;
2170
2171 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2172 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2173 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2174 timer_active = 1;
2175 timer_expires = icsk->icsk_timeout;
2176 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2177 timer_active = 4;
2178 timer_expires = icsk->icsk_timeout;
2179 } else if (timer_pending(&sk->sk_timer)) {
2180 timer_active = 2;
2181 timer_expires = sk->sk_timer.expires;
2182 } else {
2183 timer_active = 0;
2184 timer_expires = jiffies;
2185 }
2186
2187 state = sk_state_load(sk);
2188 if (state == TCP_LISTEN)
2189 rx_queue = sk->sk_ack_backlog;
2190 else
2191 /* Because we don't lock the socket,
2192 * we might find a transient negative value.
2193 */
2194 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2195
2196 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2197 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2198 i, src, srcp, dest, destp, state,
2199 tp->write_seq - tp->snd_una,
2200 rx_queue,
2201 timer_active,
2202 jiffies_delta_to_clock_t(timer_expires - jiffies),
2203 icsk->icsk_retransmits,
2204 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2205 icsk->icsk_probes_out,
2206 sock_i_ino(sk),
2207 atomic_read(&sk->sk_refcnt), sk,
2208 jiffies_to_clock_t(icsk->icsk_rto),
2209 jiffies_to_clock_t(icsk->icsk_ack.ato),
2210 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2211 tp->snd_cwnd,
2212 state == TCP_LISTEN ?
2213 fastopenq->max_qlen :
2214 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2215 }
2216
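/*
 * Print one TIME_WAIT socket; only the addresses, substate, remaining
 * timer and refcount carry information, the other columns are 0.
 */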
2217 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2218 struct seq_file *f, int i)
2219 {
2220 long delta = tw->tw_timer.expires - jiffies;
2221 __be32 dest, src;
2222 __u16 destp, srcp;
2223
2224 dest = tw->tw_daddr;
2225 src = tw->tw_rcv_saddr;
2226 destp = ntohs(tw->tw_dport);
2227 srcp = ntohs(tw->tw_sport);
2228
2229 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2230 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2231 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2232 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2233 atomic_read(&tw->tw_refcnt), tw);
2234 }
2235
2236 #define TMPSZ 150
2237
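/*
 * ->show(): emit the header for SEQ_START_TOKEN, otherwise dispatch on
 * the socket state - TIME_WAIT and NEW_SYN_RECV entries share the ehash
 * with full sockets and therefore reach this function through the same
 * iterator.  Each line is padded to TMPSZ - 1 characters.
 */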
2238 static int tcp4_seq_show(struct seq_file *seq, void *v)
2239 {
2240 struct tcp_iter_state *st;
2241 struct sock *sk = v;
2242
2243 seq_setwidth(seq, TMPSZ - 1);
2244 if (v == SEQ_START_TOKEN) {
2245 seq_puts(seq, " sl local_address rem_address st tx_queue "
2246 "rx_queue tr tm->when retrnsmt uid timeout "
2247 "inode");
2248 goto out;
2249 }
2250 st = seq->private;
2251
2252 if (sk->sk_state == TCP_TIME_WAIT)
2253 get_timewait4_sock(v, seq, st->num);
2254 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2255 get_openreq4(v, seq, st->num);
2256 else
2257 get_tcp4_sock(v, seq, st->num);
2258 out:
2259 seq_pad(seq, '\n');
2260 return 0;
2261 }
2262
2263 static const struct file_operations tcp_afinfo_seq_fops = {
2264 .owner = THIS_MODULE,
2265 .open = tcp_seq_open,
2266 .read = seq_read,
2267 .llseek = seq_lseek,
2268 .release = seq_release_net
2269 };
2270
2271 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2272 .name = "tcp",
2273 .family = AF_INET,
2274 .seq_fops = &tcp_afinfo_seq_fops,
2275 .seq_ops = {
2276 .show = tcp4_seq_show,
2277 },
2278 };
2279
2280 static int __net_init tcp4_proc_init_net(struct net *net)
2281 {
2282 return tcp_proc_register(net, &tcp4_seq_afinfo);
2283 }
2284
2285 static void __net_exit tcp4_proc_exit_net(struct net *net)
2286 {
2287 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2288 }
2289
2290 static struct pernet_operations tcp4_net_ops = {
2291 .init = tcp4_proc_init_net,
2292 .exit = tcp4_proc_exit_net,
2293 };
2294
2295 int __init tcp4_proc_init(void)
2296 {
2297 return register_pernet_subsys(&tcp4_net_ops);
2298 }
2299
2300 void tcp4_proc_exit(void)
2301 {
2302 unregister_pernet_subsys(&tcp4_net_ops);
2303 }
2304 #endif /* CONFIG_PROC_FS */
2305
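/*
 * The IPv4 TCP protocol descriptor; it is registered with the socket
 * layer from af_inet.c, and every AF_INET stream socket dispatches its
 * operations through these hooks.
 */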
2306 struct proto tcp_prot = {
2307 .name = "TCP",
2308 .owner = THIS_MODULE,
2309 .close = tcp_close,
2310 .connect = tcp_v4_connect,
2311 .disconnect = tcp_disconnect,
2312 .accept = inet_csk_accept,
2313 .ioctl = tcp_ioctl,
2314 .init = tcp_v4_init_sock,
2315 .destroy = tcp_v4_destroy_sock,
2316 .shutdown = tcp_shutdown,
2317 .setsockopt = tcp_setsockopt,
2318 .getsockopt = tcp_getsockopt,
2319 .recvmsg = tcp_recvmsg,
2320 .sendmsg = tcp_sendmsg,
2321 .sendpage = tcp_sendpage,
2322 .backlog_rcv = tcp_v4_do_rcv,
2323 .release_cb = tcp_release_cb,
2324 .hash = inet_hash,
2325 .unhash = inet_unhash,
2326 .get_port = inet_csk_get_port,
2327 .enter_memory_pressure = tcp_enter_memory_pressure,
2328 .stream_memory_free = tcp_stream_memory_free,
2329 .sockets_allocated = &tcp_sockets_allocated,
2330 .orphan_count = &tcp_orphan_count,
2331 .memory_allocated = &tcp_memory_allocated,
2332 .memory_pressure = &tcp_memory_pressure,
2333 .sysctl_mem = sysctl_tcp_mem,
2334 .sysctl_wmem = sysctl_tcp_wmem,
2335 .sysctl_rmem = sysctl_tcp_rmem,
2336 .max_header = MAX_TCP_HEADER,
2337 .obj_size = sizeof(struct tcp_sock),
2338 .slab_flags = SLAB_DESTROY_BY_RCU,
2339 .twsk_prot = &tcp_timewait_sock_ops,
2340 .rsk_prot = &tcp_request_sock_ops,
2341 .h.hashinfo = &tcp_hashinfo,
2342 .no_autobind = true,
2343 #ifdef CONFIG_COMPAT
2344 .compat_setsockopt = compat_tcp_setsockopt,
2345 .compat_getsockopt = compat_tcp_getsockopt,
2346 #endif
2347 .diag_destroy = tcp_abort,
2348 };
2349 EXPORT_SYMBOL(tcp_prot);
2350
2351 static void __net_exit tcp_sk_exit(struct net *net)
2352 {
2353 int cpu;
2354
2355 for_each_possible_cpu(cpu)
2356 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2357 free_percpu(net->ipv4.tcp_sk);
2358 }
2359
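/*
 * Per-namespace setup: create one kernel control socket per possible CPU
 * (used to send RSTs/ACKs that are not associated with a local socket)
 * and initialise the namespace-local ECN, MTU-probing and keepalive
 * sysctl defaults.
 */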
2360 static int __net_init tcp_sk_init(struct net *net)
2361 {
2362 int res, cpu;
2363
2364 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2365 if (!net->ipv4.tcp_sk)
2366 return -ENOMEM;
2367
2368 for_each_possible_cpu(cpu) {
2369 struct sock *sk;
2370
2371 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2372 IPPROTO_TCP, net);
2373 if (res)
2374 goto fail;
2375 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2376 }
2377
2378 net->ipv4.sysctl_tcp_ecn = 2;
2379 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2380
2381 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2382 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2383 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2384
2385 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2386 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2387 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2388
2389 return 0;
2390 fail:
2391 tcp_sk_exit(net);
2392
2393 return res;
2394 }
2395
2396 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2397 {
2398 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2399 }
2400
2401 static struct pernet_operations __net_initdata tcp_sk_ops = {
2402 .init = tcp_sk_init,
2403 .exit = tcp_sk_exit,
2404 .exit_batch = tcp_sk_exit_batch,
2405 };
2406
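/*
 * Boot-time initialisation: set up the listening hash buckets and
 * register the per-namespace hooks above; failure is fatal because TCP
 * cannot work without its control sockets.
 */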
2407 void __init tcp_v4_init(void)
2408 {
2409 inet_hashinfo_init(&tcp_hashinfo);
2410 if (register_pernet_subsys(&tcp_sk_ops))
2411 panic("Failed to create the TCP control socket.\n");
2412 }