net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86
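/* Editor's note: the two flags below are exposed to user space as the
 * net.ipv4.tcp_tw_reuse and net.ipv4.tcp_low_latency sysctls
 * (registered in net/ipv4/sysctl_net_ipv4.c).
 */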
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
99 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 ip_hdr(skb)->saddr,
103 tcp_hdr(skb)->dest,
104 tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
111
112 /* With PAWS, this is safe from the viewpoint
113 of data integrity. Even without PAWS it is safe provided the sequence
114 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
115
116 The idea is close to VJ's, only the timestamp cache is held
117 not per host but per port pair, and the TW bucket is used as the state
118 holder.
119
120 If the TW bucket has already been destroyed we fall back to VJ's scheme
121 and use the initial timestamp retrieved from the peer table.
122 */
123 if (tcptw->tw_ts_recent_stamp &&
124 (!twp || (sysctl_tcp_tw_reuse &&
125 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 if (tp->write_seq == 0)
128 tp->write_seq = 1;
129 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
130 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 sock_hold(sktw);
132 return 1;
133 }
134
135 return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
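/*
 * Editor's sketch (not kernel code): the reuse test above boils down to the
 * predicate below; the helper name and its boolean parameters are invented
 * for illustration only.
 */
#if 0
static bool tw_may_reuse(bool have_cached_ts, bool strict_check,
			 bool tw_reuse_enabled, long tw_age_seconds)
{
	/* Reuse iff we have a cached peer timestamp and either the caller
	 * did not ask for a strict check (twp == NULL above) or
	 * tcp_tw_reuse is enabled and the TIME_WAIT entry is more than one
	 * second old.
	 */
	return have_cached_ts &&
	       (!strict_check ||
		(tw_reuse_enabled && tw_age_seconds > 1));
}
#endif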
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 struct inet_sock *inet = inet_sk(sk);
144 struct tcp_sock *tp = tcp_sk(sk);
145 __be16 orig_sport, orig_dport;
146 __be32 daddr, nexthop;
147 struct flowi4 *fl4;
148 struct rtable *rt;
149 int err;
150 struct ip_options_rcu *inet_opt;
151
152 if (addr_len < sizeof(struct sockaddr_in))
153 return -EINVAL;
154
155 if (usin->sin_family != AF_INET)
156 return -EAFNOSUPPORT;
157
158 nexthop = daddr = usin->sin_addr.s_addr;
159 inet_opt = rcu_dereference_protected(inet->inet_opt,
160 sock_owned_by_user(sk));
161 if (inet_opt && inet_opt->opt.srr) {
162 if (!daddr)
163 return -EINVAL;
164 nexthop = inet_opt->opt.faddr;
165 }
166
167 orig_sport = inet->inet_sport;
168 orig_dport = usin->sin_port;
169 fl4 = &inet->cork.fl.u.ip4;
170 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 IPPROTO_TCP,
173 orig_sport, orig_dport, sk);
174 if (IS_ERR(rt)) {
175 err = PTR_ERR(rt);
176 if (err == -ENETUNREACH)
177 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 return err;
179 }
180
181 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 ip_rt_put(rt);
183 return -ENETUNREACH;
184 }
185
186 if (!inet_opt || !inet_opt->opt.srr)
187 daddr = fl4->daddr;
188
189 if (!inet->inet_saddr)
190 inet->inet_saddr = fl4->saddr;
191 sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 /* Reset inherited state */
195 tp->rx_opt.ts_recent = 0;
196 tp->rx_opt.ts_recent_stamp = 0;
197 if (likely(!tp->repair))
198 tp->write_seq = 0;
199 }
200
201 if (tcp_death_row.sysctl_tw_recycle &&
202 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205 inet->inet_dport = usin->sin_port;
206 sk_daddr_set(sk, daddr);
207
208 inet_csk(sk)->icsk_ext_hdr_len = 0;
209 if (inet_opt)
210 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214 /* Socket identity is still unknown (sport may be zero).
215 * However we set state to SYN-SENT and, without releasing the socket
216 * lock, select a source port, enter ourselves into the hash tables and
217 * complete initialization after this.
218 */
219 tcp_set_state(sk, TCP_SYN_SENT);
220 err = inet_hash_connect(&tcp_death_row, sk);
221 if (err)
222 goto failure;
223
224 sk_set_txhash(sk);
225
226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 inet->inet_sport, inet->inet_dport, sk);
228 if (IS_ERR(rt)) {
229 err = PTR_ERR(rt);
230 rt = NULL;
231 goto failure;
232 }
233 /* OK, now commit destination to socket. */
234 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst);
236
237 if (!tp->write_seq && likely(!tp->repair))
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
241 usin->sin_port);
242
243 inet->inet_id = tp->write_seq ^ jiffies;
244
245 err = tcp_connect(sk);
246
247 rt = NULL;
248 if (err)
249 goto failure;
250
251 return 0;
252
253 failure:
254 /*
255 * This unhashes the socket and releases the local port,
256 * if necessary.
257 */
258 tcp_set_state(sk, TCP_CLOSE);
259 ip_rt_put(rt);
260 sk->sk_route_caps = 0;
261 inet->inet_dport = 0;
262 return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
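/*
 * Editor's note: tcp_v4_connect() is what ultimately services connect(2) on
 * an AF_INET SOCK_STREAM socket.  A minimal user-space sketch (example
 * address and port, error handling omitted):
 */
#if 0
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),			/* example port */
	};

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);		/* example address */
	connect(fd, (struct sockaddr *)&dst, sizeof(dst));	/* -> tcp_v4_connect() */
#endif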
265
266 /*
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
270 */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276
277 dst = inet_csk_update_pmtu(sk, mtu);
278 if (!dst)
279 return;
280
281 /* Something is about to go wrong... Remember the soft error
282 * in case this connection is not able to recover.
283 */
284 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 sk->sk_err_soft = EMSGSIZE;
286
287 mtu = dst_mtu(dst);
288
289 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 ip_sk_accept_pmtu(sk) &&
291 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 tcp_sync_mss(sk, mtu);
293
294 /* Resend the TCP packet because it's
295 * clear that the old packet has been
296 * dropped. This is the new "fast" path mtu
297 * discovery.
298 */
299 tcp_simple_retransmit(sk);
300 } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
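/*
 * Editor's worked example for the path above: if an ICMP_FRAG_NEEDED message
 * reports an MTU of 1400 bytes, tcp_sync_mss() ends up capping the MSS at
 * roughly 1400 - 20 (IP header) - 20 (TCP header) = 1360 bytes (less when IP
 * or TCP options are in use), and the dropped segment is retransmitted
 * immediately at the smaller size by tcp_simple_retransmit().
 */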
303
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308 if (dst)
309 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq)
315 {
316 struct request_sock *req = inet_reqsk(sk);
317 struct net *net = sock_net(sk);
318
319 /* ICMPs are not backlogged, hence we cannot get
320 * an established socket here.
321 */
322 WARN_ON(req->sk);
323
324 if (seq != tcp_rsk(req)->snt_isn) {
325 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326 } else {
327 /*
328 * Still in SYN_RECV, just remove it silently.
329 * There is no good way to pass the error to the newly
330 * created socket, and POSIX does not want network
331 * errors returned from accept().
332 */
333 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335 }
336 reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341 * This routine is called by the ICMP module when it gets some
342 * sort of error condition. If err < 0 then the socket should
343 * be closed and the error returned to the user. If err > 0
344 * it's just the icmp type << 8 | icmp code. After adjustment, the
345 * header points to the first 8 bytes of the tcp header. We need
346 * to find the appropriate port.
347 *
348 * The locking strategy used here is very "optimistic". When
349 * someone else accesses the socket the ICMP is just dropped
350 * and for some paths there is no check at all.
351 * A more general error queue to queue errors for later handling
352 * is probably better.
353 *
354 */
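/*
 * Editor's example of the encoding mentioned above: an ICMP "port
 * unreachable" carries type ICMP_DEST_UNREACH (3) and code
 * ICMP_PORT_UNREACH (3), i.e. (3 << 8) | 3 = 0x0303, which
 * icmp_err_convert[] later maps to ECONNREFUSED.
 */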
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360 struct inet_connection_sock *icsk;
361 struct tcp_sock *tp;
362 struct inet_sock *inet;
363 const int type = icmp_hdr(icmp_skb)->type;
364 const int code = icmp_hdr(icmp_skb)->code;
365 struct sock *sk;
366 struct sk_buff *skb;
367 struct request_sock *fastopen;
368 __u32 seq, snd_una;
369 __u32 remaining;
370 int err;
371 struct net *net = dev_net(icmp_skb->dev);
372
373 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374 th->dest, iph->saddr, ntohs(th->source),
375 inet_iif(icmp_skb));
376 if (!sk) {
377 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378 return;
379 }
380 if (sk->sk_state == TCP_TIME_WAIT) {
381 inet_twsk_put(inet_twsk(sk));
382 return;
383 }
384 seq = ntohl(th->seq);
385 if (sk->sk_state == TCP_NEW_SYN_RECV)
386 return tcp_req_err(sk, seq);
387
388 bh_lock_sock(sk);
389 /* If too many ICMPs get dropped on busy
390 * servers this needs to be solved differently.
391 * We do take care of the PMTU discovery (RFC1191) special case:
392 * we can receive locally generated ICMP messages while the socket is held.
393 */
394 if (sock_owned_by_user(sk)) {
395 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
396 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
397 }
398 if (sk->sk_state == TCP_CLOSE)
399 goto out;
400
401 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
402 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
403 goto out;
404 }
405
406 icsk = inet_csk(sk);
407 tp = tcp_sk(sk);
408 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
409 fastopen = tp->fastopen_rsk;
410 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
411 if (sk->sk_state != TCP_LISTEN &&
412 !between(seq, snd_una, tp->snd_nxt)) {
413 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
414 goto out;
415 }
416
417 switch (type) {
418 case ICMP_REDIRECT:
419 do_redirect(icmp_skb, sk);
420 goto out;
421 case ICMP_SOURCE_QUENCH:
422 /* Just silently ignore these. */
423 goto out;
424 case ICMP_PARAMETERPROB:
425 err = EPROTO;
426 break;
427 case ICMP_DEST_UNREACH:
428 if (code > NR_ICMP_UNREACH)
429 goto out;
430
431 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
432 /* We are not interested in TCP_LISTEN and open_requests
433 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
434 * they should go through unfragmented).
435 */
436 if (sk->sk_state == TCP_LISTEN)
437 goto out;
438
439 tp->mtu_info = info;
440 if (!sock_owned_by_user(sk)) {
441 tcp_v4_mtu_reduced(sk);
442 } else {
443 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
444 sock_hold(sk);
445 }
446 goto out;
447 }
448
449 err = icmp_err_convert[code].errno;
450 /* check if icmp_skb allows revert of backoff
451 * (see draft-zimmermann-tcp-lcd) */
452 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
453 break;
454 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
455 !icsk->icsk_backoff || fastopen)
456 break;
457
458 if (sock_owned_by_user(sk))
459 break;
460
461 icsk->icsk_backoff--;
462 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
463 TCP_TIMEOUT_INIT;
464 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
465
466 skb = tcp_write_queue_head(sk);
467 BUG_ON(!skb);
468
469 remaining = icsk->icsk_rto -
470 min(icsk->icsk_rto,
471 tcp_time_stamp - tcp_skb_timestamp(skb));
472
473 if (remaining) {
474 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
475 remaining, TCP_RTO_MAX);
476 } else {
477 /* RTO revert clocked out retransmission.
478 * Will retransmit now */
479 tcp_retransmit_timer(sk);
480 }
481
482 break;
483 case ICMP_TIME_EXCEEDED:
484 err = EHOSTUNREACH;
485 break;
486 default:
487 goto out;
488 }
489
490 switch (sk->sk_state) {
491 case TCP_SYN_SENT:
492 case TCP_SYN_RECV:
493 /* Only in fast or simultaneous open. If a fast open socket is
494 * already accepted it is treated as a connected one below.
495 */
496 if (fastopen && !fastopen->sk)
497 break;
498
499 if (!sock_owned_by_user(sk)) {
500 sk->sk_err = err;
501
502 sk->sk_error_report(sk);
503
504 tcp_done(sk);
505 } else {
506 sk->sk_err_soft = err;
507 }
508 goto out;
509 }
510
511 /* If we've already connected we will keep trying
512 * until we time out, or the user gives up.
513 *
514 * RFC 1122 4.2.3.9 allows us to treat as hard errors
515 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516 * but it is obsoleted by pmtu discovery).
517 *
518 * Note that in the modern internet, where routing is unreliable
519 * and broken firewalls sit in every dark corner sending random
520 * errors ordered by their masters, even these two messages finally lose
521 * their original sense (even Linux sends invalid PORT_UNREACHs)
522 *
523 * Now we are in compliance with RFCs.
524 * --ANK (980905)
525 */
526
527 inet = inet_sk(sk);
528 if (!sock_owned_by_user(sk) && inet->recverr) {
529 sk->sk_err = err;
530 sk->sk_error_report(sk);
531 } else { /* Only an error on timeout */
532 sk->sk_err_soft = err;
533 }
534
535 out:
536 bh_unlock_sock(sk);
537 sock_put(sk);
538 }
539
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 struct tcphdr *th = tcp_hdr(skb);
543
544 if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 skb->csum_start = skb_transport_header(skb) - skb->head;
547 skb->csum_offset = offsetof(struct tcphdr, check);
548 } else {
549 th->check = tcp_v4_check(skb->len, saddr, daddr,
550 csum_partial(th,
551 th->doff << 2,
552 skb->csum));
553 }
554 }
555
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 const struct inet_sock *inet = inet_sk(sk);
560
561 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
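/*
 * Editor's note: in the CHECKSUM_PARTIAL branch above only the pseudo-header
 * sum is stored in th->check and the device (or GSO path) finishes the job.
 * For reference, a sketch of the 16-bit ones'-complement fold that
 * tcp_v4_check()/csum_partial() are built on (illustrative helper only):
 */
#if 0
static u16 example_csum_fold(u32 sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries once       */
	sum = (sum & 0xffff) + (sum >> 16);	/* ...and any new carry    */
	return (u16)~sum;			/* ones' complement result */
}
#endif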
564
565 /*
566 * This routine will send an RST to the other tcp.
567 *
568 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
569 * for the reset?
570 * Answer: if a packet caused the RST, it is not for a socket
571 * existing in our system; if it is matched to a socket,
572 * it is just a duplicate segment or a bug in the other side's TCP.
573 * So we build the reply based only on the parameters
574 * that arrived with the segment.
575 * Exception: precedence violation. We do not implement it in any case.
576 */
577
578 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
579 {
580 const struct tcphdr *th = tcp_hdr(skb);
581 struct {
582 struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 } rep;
587 struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 struct tcp_md5sig_key *key = NULL;
590 const __u8 *hash_location = NULL;
591 unsigned char newhash[16];
592 int genhash;
593 struct sock *sk1 = NULL;
594 #endif
595 struct net *net;
596
597 /* Never send a reset in response to a reset. */
598 if (th->rst)
599 return;
600
601 /* If sk not NULL, it means we did a successful lookup and incoming
602 * route had to be correct. The prequeue might have dropped our dst.
603 */
604 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 return;
606
607 /* Swap the send and the receive. */
608 memset(&rep, 0, sizeof(rep));
609 rep.th.dest = th->source;
610 rep.th.source = th->dest;
611 rep.th.doff = sizeof(struct tcphdr) / 4;
612 rep.th.rst = 1;
613
614 if (th->ack) {
615 rep.th.seq = th->ack_seq;
616 } else {
617 rep.th.ack = 1;
618 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 skb->len - (th->doff << 2));
620 }
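/* Editor's example for the no-ACK branch above: a segment with seq S, no
 * SYN/FIN and 100 bytes of payload is answered with ack_seq = S + 100,
 * while a bare SYN with seq S gets ack_seq = S + 1.
 */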
621
622 memset(&arg, 0, sizeof(arg));
623 arg.iov[0].iov_base = (unsigned char *)&rep;
624 arg.iov[0].iov_len = sizeof(rep.th);
625
626 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628 hash_location = tcp_parse_md5sig_option(th);
629 if (sk && sk_fullsock(sk)) {
630 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
631 &ip_hdr(skb)->saddr, AF_INET);
632 } else if (hash_location) {
633 /*
634 * active side is lost. Try to find the listening socket through the
635 * source port, and then find the md5 key through the listening socket.
636 * We do not lose security here:
637 * the incoming packet is checked against the md5 hash computed with the
638 * key we find; no RST is generated if the md5 hash doesn't match.
639 */
640 sk1 = __inet_lookup_listener(net,
641 &tcp_hashinfo, ip_hdr(skb)->saddr,
642 th->source, ip_hdr(skb)->daddr,
643 ntohs(th->source), inet_iif(skb));
644 /* don't send rst if it can't find key */
645 if (!sk1)
646 return;
647 rcu_read_lock();
648 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
649 &ip_hdr(skb)->saddr, AF_INET);
650 if (!key)
651 goto release_sk1;
652
653 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
654 if (genhash || memcmp(hash_location, newhash, 16) != 0)
655 goto release_sk1;
656 }
657
658 if (key) {
659 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660 (TCPOPT_NOP << 16) |
661 (TCPOPT_MD5SIG << 8) |
662 TCPOLEN_MD5SIG);
663 /* Update length and the length the header thinks exists */
664 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665 rep.th.doff = arg.iov[0].iov_len / 4;
666
667 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668 key, ip_hdr(skb)->saddr,
669 ip_hdr(skb)->daddr, &rep.th);
670 }
671 #endif
672 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673 ip_hdr(skb)->saddr, /* XXX */
674 arg.iov[0].iov_len, IPPROTO_TCP, 0);
675 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677
678 /* When the socket is gone, all binding information is lost and
679 * routing might fail. No choice here: if we choose to force the
680 * input interface, we will misroute in the case of an asymmetric route.
681 */
682 if (sk)
683 arg.bound_dev_if = sk->sk_bound_dev_if;
684
685 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
686 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
687
688 arg.tos = ip_hdr(skb)->tos;
689 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
690 skb, &TCP_SKB_CB(skb)->header.h4.opt,
691 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
692 &arg, arg.iov[0].iov_len);
693
694 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
695 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
696
697 #ifdef CONFIG_TCP_MD5SIG
698 release_sk1:
699 if (sk1) {
700 rcu_read_unlock();
701 sock_put(sk1);
702 }
703 #endif
704 }
705
706 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
707 outside of socket context, is certainly ugly. What can I do?
708 */
709
710 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
711 u32 win, u32 tsval, u32 tsecr, int oif,
712 struct tcp_md5sig_key *key,
713 int reply_flags, u8 tos)
714 {
715 const struct tcphdr *th = tcp_hdr(skb);
716 struct {
717 struct tcphdr th;
718 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
719 #ifdef CONFIG_TCP_MD5SIG
720 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
721 #endif
722 ];
723 } rep;
724 struct ip_reply_arg arg;
725 struct net *net = dev_net(skb_dst(skb)->dev);
726
727 memset(&rep.th, 0, sizeof(struct tcphdr));
728 memset(&arg, 0, sizeof(arg));
729
730 arg.iov[0].iov_base = (unsigned char *)&rep;
731 arg.iov[0].iov_len = sizeof(rep.th);
732 if (tsecr) {
733 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
734 (TCPOPT_TIMESTAMP << 8) |
735 TCPOLEN_TIMESTAMP);
736 rep.opt[1] = htonl(tsval);
737 rep.opt[2] = htonl(tsecr);
738 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
739 }
740
741 /* Swap the send and the receive. */
742 rep.th.dest = th->source;
743 rep.th.source = th->dest;
744 rep.th.doff = arg.iov[0].iov_len / 4;
745 rep.th.seq = htonl(seq);
746 rep.th.ack_seq = htonl(ack);
747 rep.th.ack = 1;
748 rep.th.window = htons(win);
749
750 #ifdef CONFIG_TCP_MD5SIG
751 if (key) {
752 int offset = (tsecr) ? 3 : 0;
753
754 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
755 (TCPOPT_NOP << 16) |
756 (TCPOPT_MD5SIG << 8) |
757 TCPOLEN_MD5SIG);
758 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
759 rep.th.doff = arg.iov[0].iov_len/4;
760
761 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
762 key, ip_hdr(skb)->saddr,
763 ip_hdr(skb)->daddr, &rep.th);
764 }
765 #endif
766 arg.flags = reply_flags;
767 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
768 ip_hdr(skb)->saddr, /* XXX */
769 arg.iov[0].iov_len, IPPROTO_TCP, 0);
770 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
771 if (oif)
772 arg.bound_dev_if = oif;
773 arg.tos = tos;
774 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
775 skb, &TCP_SKB_CB(skb)->header.h4.opt,
776 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777 &arg, arg.iov[0].iov_len);
778
779 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
780 }
781
782 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
783 {
784 struct inet_timewait_sock *tw = inet_twsk(sk);
785 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
786
787 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
788 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
789 tcp_time_stamp + tcptw->tw_ts_offset,
790 tcptw->tw_ts_recent,
791 tw->tw_bound_dev_if,
792 tcp_twsk_md5_key(tcptw),
793 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
794 tw->tw_tos
795 );
796
797 inet_twsk_put(tw);
798 }
799
800 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
801 struct request_sock *req)
802 {
803 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
804 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
805 */
806 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
807 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
808 tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
809 tcp_time_stamp,
810 req->ts_recent,
811 0,
812 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
813 AF_INET),
814 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
815 ip_hdr(skb)->tos);
816 }
817
818 /*
819 * Send a SYN-ACK after having received a SYN.
820 * This still operates on a request_sock only, not on a big
821 * socket.
822 */
823 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
824 struct flowi *fl,
825 struct request_sock *req,
826 struct tcp_fastopen_cookie *foc,
827 bool attach_req)
828 {
829 const struct inet_request_sock *ireq = inet_rsk(req);
830 struct flowi4 fl4;
831 int err = -1;
832 struct sk_buff *skb;
833
834 /* First, grab a route. */
835 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
836 return -1;
837
838 skb = tcp_make_synack(sk, dst, req, foc, attach_req);
839
840 if (skb) {
841 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
842
843 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
844 ireq->ir_rmt_addr,
845 ireq->opt);
846 err = net_xmit_eval(err);
847 }
848
849 return err;
850 }
851
852 /*
853 * IPv4 request_sock destructor.
854 */
855 static void tcp_v4_reqsk_destructor(struct request_sock *req)
856 {
857 kfree(inet_rsk(req)->opt);
858 }
859
860
861 #ifdef CONFIG_TCP_MD5SIG
862 /*
863 * RFC2385 MD5 checksumming requires a mapping of
864 * IP address->MD5 Key.
865 * We need to maintain these in the sk structure.
866 */
867
868 /* Find the Key structure for an address. */
869 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
870 const union tcp_md5_addr *addr,
871 int family)
872 {
873 const struct tcp_sock *tp = tcp_sk(sk);
874 struct tcp_md5sig_key *key;
875 unsigned int size = sizeof(struct in_addr);
876 const struct tcp_md5sig_info *md5sig;
877
878 /* caller either holds rcu_read_lock() or socket lock */
879 md5sig = rcu_dereference_check(tp->md5sig_info,
880 sock_owned_by_user(sk) ||
881 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
882 if (!md5sig)
883 return NULL;
884 #if IS_ENABLED(CONFIG_IPV6)
885 if (family == AF_INET6)
886 size = sizeof(struct in6_addr);
887 #endif
888 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
889 if (key->family != family)
890 continue;
891 if (!memcmp(&key->addr, addr, size))
892 return key;
893 }
894 return NULL;
895 }
896 EXPORT_SYMBOL(tcp_md5_do_lookup);
897
898 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
899 const struct sock *addr_sk)
900 {
901 const union tcp_md5_addr *addr;
902
903 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
904 return tcp_md5_do_lookup(sk, addr, AF_INET);
905 }
906 EXPORT_SYMBOL(tcp_v4_md5_lookup);
907
908 /* This can be called on a newly created socket, from other files */
909 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
910 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
911 {
912 /* Add Key to the list */
913 struct tcp_md5sig_key *key;
914 struct tcp_sock *tp = tcp_sk(sk);
915 struct tcp_md5sig_info *md5sig;
916
917 key = tcp_md5_do_lookup(sk, addr, family);
918 if (key) {
919 /* Pre-existing entry - just update that one. */
920 memcpy(key->key, newkey, newkeylen);
921 key->keylen = newkeylen;
922 return 0;
923 }
924
925 md5sig = rcu_dereference_protected(tp->md5sig_info,
926 sock_owned_by_user(sk) ||
927 lockdep_is_held(&sk->sk_lock.slock));
928 if (!md5sig) {
929 md5sig = kmalloc(sizeof(*md5sig), gfp);
930 if (!md5sig)
931 return -ENOMEM;
932
933 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
934 INIT_HLIST_HEAD(&md5sig->head);
935 rcu_assign_pointer(tp->md5sig_info, md5sig);
936 }
937
938 key = sock_kmalloc(sk, sizeof(*key), gfp);
939 if (!key)
940 return -ENOMEM;
941 if (!tcp_alloc_md5sig_pool()) {
942 sock_kfree_s(sk, key, sizeof(*key));
943 return -ENOMEM;
944 }
945
946 memcpy(key->key, newkey, newkeylen);
947 key->keylen = newkeylen;
948 key->family = family;
949 memcpy(&key->addr, addr,
950 (family == AF_INET6) ? sizeof(struct in6_addr) :
951 sizeof(struct in_addr));
952 hlist_add_head_rcu(&key->node, &md5sig->head);
953 return 0;
954 }
955 EXPORT_SYMBOL(tcp_md5_do_add);
956
957 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
958 {
959 struct tcp_md5sig_key *key;
960
961 key = tcp_md5_do_lookup(sk, addr, family);
962 if (!key)
963 return -ENOENT;
964 hlist_del_rcu(&key->node);
965 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
966 kfree_rcu(key, rcu);
967 return 0;
968 }
969 EXPORT_SYMBOL(tcp_md5_do_del);
970
971 static void tcp_clear_md5_list(struct sock *sk)
972 {
973 struct tcp_sock *tp = tcp_sk(sk);
974 struct tcp_md5sig_key *key;
975 struct hlist_node *n;
976 struct tcp_md5sig_info *md5sig;
977
978 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
979
980 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
981 hlist_del_rcu(&key->node);
982 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
983 kfree_rcu(key, rcu);
984 }
985 }
986
987 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
988 int optlen)
989 {
990 struct tcp_md5sig cmd;
991 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
992
993 if (optlen < sizeof(cmd))
994 return -EINVAL;
995
996 if (copy_from_user(&cmd, optval, sizeof(cmd)))
997 return -EFAULT;
998
999 if (sin->sin_family != AF_INET)
1000 return -EINVAL;
1001
1002 if (!cmd.tcpm_keylen)
1003 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1004 AF_INET);
1005
1006 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1007 return -EINVAL;
1008
1009 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1010 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1011 GFP_KERNEL);
1012 }
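/*
 * Editor's sketch of the user-space side of the option parsed above
 * (setsockopt(TCP_MD5SIG); peer address, key and fd are examples):
 */
#if 0
	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;

	peer->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
	memcpy(md5.tcpm_key, "secret", 6);
	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
#endif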
1013
1014 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1015 __be32 daddr, __be32 saddr, int nbytes)
1016 {
1017 struct tcp4_pseudohdr *bp;
1018 struct scatterlist sg;
1019
1020 bp = &hp->md5_blk.ip4;
1021
1022 /*
1023 * 1. the TCP pseudo-header (in the order: source IP address,
1024 * destination IP address, zero-padded protocol number, and
1025 * segment length)
1026 */
1027 bp->saddr = saddr;
1028 bp->daddr = daddr;
1029 bp->pad = 0;
1030 bp->protocol = IPPROTO_TCP;
1031 bp->len = cpu_to_be16(nbytes);
1032
1033 sg_init_one(&sg, bp, sizeof(*bp));
1034 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1035 }
1036
1037 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1038 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1039 {
1040 struct tcp_md5sig_pool *hp;
1041 struct hash_desc *desc;
1042
1043 hp = tcp_get_md5sig_pool();
1044 if (!hp)
1045 goto clear_hash_noput;
1046 desc = &hp->md5_desc;
1047
1048 if (crypto_hash_init(desc))
1049 goto clear_hash;
1050 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1051 goto clear_hash;
1052 if (tcp_md5_hash_header(hp, th))
1053 goto clear_hash;
1054 if (tcp_md5_hash_key(hp, key))
1055 goto clear_hash;
1056 if (crypto_hash_final(desc, md5_hash))
1057 goto clear_hash;
1058
1059 tcp_put_md5sig_pool();
1060 return 0;
1061
1062 clear_hash:
1063 tcp_put_md5sig_pool();
1064 clear_hash_noput:
1065 memset(md5_hash, 0, 16);
1066 return 1;
1067 }
1068
1069 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1070 const struct sock *sk,
1071 const struct sk_buff *skb)
1072 {
1073 struct tcp_md5sig_pool *hp;
1074 struct hash_desc *desc;
1075 const struct tcphdr *th = tcp_hdr(skb);
1076 __be32 saddr, daddr;
1077
1078 if (sk) { /* valid for establish/request sockets */
1079 saddr = sk->sk_rcv_saddr;
1080 daddr = sk->sk_daddr;
1081 } else {
1082 const struct iphdr *iph = ip_hdr(skb);
1083 saddr = iph->saddr;
1084 daddr = iph->daddr;
1085 }
1086
1087 hp = tcp_get_md5sig_pool();
1088 if (!hp)
1089 goto clear_hash_noput;
1090 desc = &hp->md5_desc;
1091
1092 if (crypto_hash_init(desc))
1093 goto clear_hash;
1094
1095 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1096 goto clear_hash;
1097 if (tcp_md5_hash_header(hp, th))
1098 goto clear_hash;
1099 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1100 goto clear_hash;
1101 if (tcp_md5_hash_key(hp, key))
1102 goto clear_hash;
1103 if (crypto_hash_final(desc, md5_hash))
1104 goto clear_hash;
1105
1106 tcp_put_md5sig_pool();
1107 return 0;
1108
1109 clear_hash:
1110 tcp_put_md5sig_pool();
1111 clear_hash_noput:
1112 memset(md5_hash, 0, 16);
1113 return 1;
1114 }
1115 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1116
1117 #endif
1118
1119 /* Called with rcu_read_lock() */
1120 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1121 const struct sk_buff *skb)
1122 {
1123 #ifdef CONFIG_TCP_MD5SIG
1124 /*
1125 * This gets called for each TCP segment that arrives
1126 * so we want to be efficient.
1127 * We have 3 drop cases:
1128 * o No MD5 hash and one expected.
1129 * o MD5 hash and we're not expecting one.
1130 * o MD5 hash and it's wrong.
1131 */
1132 const __u8 *hash_location = NULL;
1133 struct tcp_md5sig_key *hash_expected;
1134 const struct iphdr *iph = ip_hdr(skb);
1135 const struct tcphdr *th = tcp_hdr(skb);
1136 int genhash;
1137 unsigned char newhash[16];
1138
1139 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1140 AF_INET);
1141 hash_location = tcp_parse_md5sig_option(th);
1142
1143 /* We've parsed the options - do we have a hash? */
1144 if (!hash_expected && !hash_location)
1145 return false;
1146
1147 if (hash_expected && !hash_location) {
1148 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1149 return true;
1150 }
1151
1152 if (!hash_expected && hash_location) {
1153 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1154 return true;
1155 }
1156
1157 /* Okay, so this is hash_expected and hash_location -
1158 * so we need to calculate the checksum.
1159 */
1160 genhash = tcp_v4_md5_hash_skb(newhash,
1161 hash_expected,
1162 NULL, skb);
1163
1164 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1165 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1166 &iph->saddr, ntohs(th->source),
1167 &iph->daddr, ntohs(th->dest),
1168 genhash ? " tcp_v4_calc_md5_hash failed"
1169 : "");
1170 return true;
1171 }
1172 return false;
1173 #endif
1174 return false;
1175 }
1176
1177 static void tcp_v4_init_req(struct request_sock *req,
1178 const struct sock *sk_listener,
1179 struct sk_buff *skb)
1180 {
1181 struct inet_request_sock *ireq = inet_rsk(req);
1182
1183 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1184 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1185 ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1186 ireq->opt = tcp_v4_save_options(skb);
1187 }
1188
1189 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1190 struct flowi *fl,
1191 const struct request_sock *req,
1192 bool *strict)
1193 {
1194 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1195
1196 if (strict) {
1197 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1198 *strict = true;
1199 else
1200 *strict = false;
1201 }
1202
1203 return dst;
1204 }
1205
1206 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1207 .family = PF_INET,
1208 .obj_size = sizeof(struct tcp_request_sock),
1209 .rtx_syn_ack = tcp_rtx_synack,
1210 .send_ack = tcp_v4_reqsk_send_ack,
1211 .destructor = tcp_v4_reqsk_destructor,
1212 .send_reset = tcp_v4_send_reset,
1213 .syn_ack_timeout = tcp_syn_ack_timeout,
1214 };
1215
1216 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1217 .mss_clamp = TCP_MSS_DEFAULT,
1218 #ifdef CONFIG_TCP_MD5SIG
1219 .req_md5_lookup = tcp_v4_md5_lookup,
1220 .calc_md5_hash = tcp_v4_md5_hash_skb,
1221 #endif
1222 .init_req = tcp_v4_init_req,
1223 #ifdef CONFIG_SYN_COOKIES
1224 .cookie_init_seq = cookie_v4_init_sequence,
1225 #endif
1226 .route_req = tcp_v4_route_req,
1227 .init_seq = tcp_v4_init_sequence,
1228 .send_synack = tcp_v4_send_synack,
1229 };
1230
1231 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1232 {
1233 /* Never answer SYNs sent to broadcast or multicast */
1234 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1235 goto drop;
1236
1237 return tcp_conn_request(&tcp_request_sock_ops,
1238 &tcp_request_sock_ipv4_ops, sk, skb);
1239
1240 drop:
1241 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1242 return 0;
1243 }
1244 EXPORT_SYMBOL(tcp_v4_conn_request);
1245
1246
1247 /*
1248 * The three way handshake has completed - we got a valid synack -
1249 * now create the new socket.
1250 */
1251 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1252 struct request_sock *req,
1253 struct dst_entry *dst,
1254 struct request_sock *req_unhash,
1255 bool *own_req)
1256 {
1257 struct inet_request_sock *ireq;
1258 struct inet_sock *newinet;
1259 struct tcp_sock *newtp;
1260 struct sock *newsk;
1261 #ifdef CONFIG_TCP_MD5SIG
1262 struct tcp_md5sig_key *key;
1263 #endif
1264 struct ip_options_rcu *inet_opt;
1265
1266 if (sk_acceptq_is_full(sk))
1267 goto exit_overflow;
1268
1269 newsk = tcp_create_openreq_child(sk, req, skb);
1270 if (!newsk)
1271 goto exit_nonewsk;
1272
1273 newsk->sk_gso_type = SKB_GSO_TCPV4;
1274 inet_sk_rx_dst_set(newsk, skb);
1275
1276 newtp = tcp_sk(newsk);
1277 newinet = inet_sk(newsk);
1278 ireq = inet_rsk(req);
1279 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1280 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1281 newsk->sk_bound_dev_if = ireq->ir_iif;
1282 newinet->inet_saddr = ireq->ir_loc_addr;
1283 inet_opt = ireq->opt;
1284 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1285 ireq->opt = NULL;
1286 newinet->mc_index = inet_iif(skb);
1287 newinet->mc_ttl = ip_hdr(skb)->ttl;
1288 newinet->rcv_tos = ip_hdr(skb)->tos;
1289 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1290 if (inet_opt)
1291 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1292 newinet->inet_id = newtp->write_seq ^ jiffies;
1293
1294 if (!dst) {
1295 dst = inet_csk_route_child_sock(sk, newsk, req);
1296 if (!dst)
1297 goto put_and_exit;
1298 } else {
1299 /* syncookie case : see end of cookie_v4_check() */
1300 }
1301 sk_setup_caps(newsk, dst);
1302
1303 tcp_ca_openreq_child(newsk, dst);
1304
1305 tcp_sync_mss(newsk, dst_mtu(dst));
1306 newtp->advmss = dst_metric_advmss(dst);
1307 if (tcp_sk(sk)->rx_opt.user_mss &&
1308 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1309 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1310
1311 tcp_initialize_rcv_mss(newsk);
1312
1313 #ifdef CONFIG_TCP_MD5SIG
1314 /* Copy over the MD5 key from the original socket */
1315 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1316 AF_INET);
1317 if (key) {
1318 /*
1319 * We're using one, so create a matching key
1320 * on the newsk structure. If we fail to get
1321 * memory, then we end up not copying the key
1322 * across. Shucks.
1323 */
1324 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1325 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1326 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1327 }
1328 #endif
1329
1330 if (__inet_inherit_port(sk, newsk) < 0)
1331 goto put_and_exit;
1332 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1333 if (*own_req)
1334 tcp_move_syn(newtp, req);
1335
1336 return newsk;
1337
1338 exit_overflow:
1339 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1340 exit_nonewsk:
1341 dst_release(dst);
1342 exit:
1343 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1344 return NULL;
1345 put_and_exit:
1346 inet_csk_prepare_forced_close(newsk);
1347 tcp_done(newsk);
1348 goto exit;
1349 }
1350 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1351
1352 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1353 {
1354 #ifdef CONFIG_SYN_COOKIES
1355 const struct tcphdr *th = tcp_hdr(skb);
1356
1357 if (!th->syn)
1358 sk = cookie_v4_check(sk, skb);
1359 #endif
1360 return sk;
1361 }
1362
1363 /* The socket must have its spinlock held when we get
1364 * here, unless it is a TCP_LISTEN socket.
1365 *
1366 * We have a potential double-lock case here, so even when
1367 * doing backlog processing we use the BH locking scheme.
1368 * This is because we cannot sleep with the original spinlock
1369 * held.
1370 */
1371 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1372 {
1373 struct sock *rsk;
1374
1375 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1376 struct dst_entry *dst = sk->sk_rx_dst;
1377
1378 sock_rps_save_rxhash(sk, skb);
1379 sk_mark_napi_id(sk, skb);
1380 if (dst) {
1381 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1382 !dst->ops->check(dst, 0)) {
1383 dst_release(dst);
1384 sk->sk_rx_dst = NULL;
1385 }
1386 }
1387 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1388 return 0;
1389 }
1390
1391 if (tcp_checksum_complete(skb))
1392 goto csum_err;
1393
1394 if (sk->sk_state == TCP_LISTEN) {
1395 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1396
1397 if (!nsk)
1398 goto discard;
1399 if (nsk != sk) {
1400 sock_rps_save_rxhash(nsk, skb);
1401 sk_mark_napi_id(nsk, skb);
1402 if (tcp_child_process(sk, nsk, skb)) {
1403 rsk = nsk;
1404 goto reset;
1405 }
1406 return 0;
1407 }
1408 } else
1409 sock_rps_save_rxhash(sk, skb);
1410
1411 if (tcp_rcv_state_process(sk, skb)) {
1412 rsk = sk;
1413 goto reset;
1414 }
1415 return 0;
1416
1417 reset:
1418 tcp_v4_send_reset(rsk, skb);
1419 discard:
1420 kfree_skb(skb);
1421 /* Be careful here. If this function gets more complicated and
1422 * gcc suffers from register pressure on the x86, sk (in %ebx)
1423 * might be destroyed here. This current version compiles correctly,
1424 * but you have been warned.
1425 */
1426 return 0;
1427
1428 csum_err:
1429 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1430 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1431 goto discard;
1432 }
1433 EXPORT_SYMBOL(tcp_v4_do_rcv);
1434
1435 void tcp_v4_early_demux(struct sk_buff *skb)
1436 {
1437 const struct iphdr *iph;
1438 const struct tcphdr *th;
1439 struct sock *sk;
1440
1441 if (skb->pkt_type != PACKET_HOST)
1442 return;
1443
1444 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1445 return;
1446
1447 iph = ip_hdr(skb);
1448 th = tcp_hdr(skb);
1449
1450 if (th->doff < sizeof(struct tcphdr) / 4)
1451 return;
1452
1453 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1454 iph->saddr, th->source,
1455 iph->daddr, ntohs(th->dest),
1456 skb->skb_iif);
1457 if (sk) {
1458 skb->sk = sk;
1459 skb->destructor = sock_edemux;
1460 if (sk_fullsock(sk)) {
1461 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1462
1463 if (dst)
1464 dst = dst_check(dst, 0);
1465 if (dst &&
1466 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1467 skb_dst_set_noref(skb, dst);
1468 }
1469 }
1470 }
1471
1472 /* Packet is added to VJ-style prequeue for processing in process
1473 * context, if a reader task is waiting. Apparently, this exciting
1474 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1475 * failed somewhere. Latency? Burstiness? Well, at least now we will
1476 * see why it failed. 8)8) --ANK
1477 *
1478 */
1479 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1480 {
1481 struct tcp_sock *tp = tcp_sk(sk);
1482
1483 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1484 return false;
1485
1486 if (skb->len <= tcp_hdrlen(skb) &&
1487 skb_queue_len(&tp->ucopy.prequeue) == 0)
1488 return false;
1489
1490 /* Before escaping RCU protected region, we need to take care of skb
1491 * dst. Prequeue is only enabled for established sockets.
1492 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1493 * Instead of doing a full sk_rx_dst validity check here, let's perform
1494 * an optimistic check.
1495 */
1496 if (likely(sk->sk_rx_dst))
1497 skb_dst_drop(skb);
1498 else
1499 skb_dst_force_safe(skb);
1500
1501 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1502 tp->ucopy.memory += skb->truesize;
1503 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1504 struct sk_buff *skb1;
1505
1506 BUG_ON(sock_owned_by_user(sk));
1507
1508 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1509 sk_backlog_rcv(sk, skb1);
1510 NET_INC_STATS_BH(sock_net(sk),
1511 LINUX_MIB_TCPPREQUEUEDROPPED);
1512 }
1513
1514 tp->ucopy.memory = 0;
1515 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1516 wake_up_interruptible_sync_poll(sk_sleep(sk),
1517 POLLIN | POLLRDNORM | POLLRDBAND);
1518 if (!inet_csk_ack_scheduled(sk))
1519 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1520 (3 * tcp_rto_min(sk)) / 4,
1521 TCP_RTO_MAX);
1522 }
1523 return true;
1524 }
1525 EXPORT_SYMBOL(tcp_prequeue);
1526
1527 /*
1528 * From tcp_input.c
1529 */
1530
1531 int tcp_v4_rcv(struct sk_buff *skb)
1532 {
1533 const struct iphdr *iph;
1534 const struct tcphdr *th;
1535 struct sock *sk;
1536 int ret;
1537 struct net *net = dev_net(skb->dev);
1538
1539 if (skb->pkt_type != PACKET_HOST)
1540 goto discard_it;
1541
1542 /* Count it even if it's bad */
1543 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1544
1545 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1546 goto discard_it;
1547
1548 th = tcp_hdr(skb);
1549
1550 if (th->doff < sizeof(struct tcphdr) / 4)
1551 goto bad_packet;
1552 if (!pskb_may_pull(skb, th->doff * 4))
1553 goto discard_it;
1554
1555 /* An explanation is required here, I think.
1556 * Packet length and doff are validated by header prediction,
1557 * provided the case of th->doff == 0 is eliminated.
1558 * So, we defer the checks. */
1559
1560 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1561 goto csum_error;
1562
1563 th = tcp_hdr(skb);
1564 iph = ip_hdr(skb);
1565 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1566 * barrier() makes sure the compiler won't play aliasing games.
1567 */
1568 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1569 sizeof(struct inet_skb_parm));
1570 barrier();
1571
1572 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1573 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1574 skb->len - th->doff * 4);
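/* Editor's example: for a segment with seq S, 100 bytes of payload and FIN
 * set, end_seq works out to S + 100 + 1 (a FIN, like a SYN, occupies one
 * sequence number).
 */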
1575 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1576 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1577 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1578 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1579 TCP_SKB_CB(skb)->sacked = 0;
1580
1581 lookup:
1582 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1583 if (!sk)
1584 goto no_tcp_socket;
1585
1586 process:
1587 if (sk->sk_state == TCP_TIME_WAIT)
1588 goto do_time_wait;
1589
1590 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1591 struct request_sock *req = inet_reqsk(sk);
1592 struct sock *nsk = NULL;
1593
1594 sk = req->rsk_listener;
1595 if (tcp_v4_inbound_md5_hash(sk, skb))
1596 goto discard_and_relse;
1597 if (likely(sk->sk_state == TCP_LISTEN)) {
1598 nsk = tcp_check_req(sk, skb, req, false);
1599 } else {
1600 inet_csk_reqsk_queue_drop_and_put(sk, req);
1601 goto lookup;
1602 }
1603 if (!nsk) {
1604 reqsk_put(req);
1605 goto discard_it;
1606 }
1607 if (nsk == sk) {
1608 sock_hold(sk);
1609 reqsk_put(req);
1610 } else if (tcp_child_process(sk, nsk, skb)) {
1611 tcp_v4_send_reset(nsk, skb);
1612 goto discard_it;
1613 } else {
1614 return 0;
1615 }
1616 }
1617 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1618 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1619 goto discard_and_relse;
1620 }
1621
1622 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1623 goto discard_and_relse;
1624
1625 if (tcp_v4_inbound_md5_hash(sk, skb))
1626 goto discard_and_relse;
1627
1628 nf_reset(skb);
1629
1630 if (sk_filter(sk, skb))
1631 goto discard_and_relse;
1632
1633 skb->dev = NULL;
1634
1635 if (sk->sk_state == TCP_LISTEN) {
1636 ret = tcp_v4_do_rcv(sk, skb);
1637 goto put_and_return;
1638 }
1639
1640 sk_incoming_cpu_update(sk);
1641
1642 bh_lock_sock_nested(sk);
1643 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1644 ret = 0;
1645 if (!sock_owned_by_user(sk)) {
1646 if (!tcp_prequeue(sk, skb))
1647 ret = tcp_v4_do_rcv(sk, skb);
1648 } else if (unlikely(sk_add_backlog(sk, skb,
1649 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1650 bh_unlock_sock(sk);
1651 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1652 goto discard_and_relse;
1653 }
1654 bh_unlock_sock(sk);
1655
1656 put_and_return:
1657 sock_put(sk);
1658
1659 return ret;
1660
1661 no_tcp_socket:
1662 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1663 goto discard_it;
1664
1665 if (tcp_checksum_complete(skb)) {
1666 csum_error:
1667 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1668 bad_packet:
1669 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1670 } else {
1671 tcp_v4_send_reset(NULL, skb);
1672 }
1673
1674 discard_it:
1675 /* Discard frame. */
1676 kfree_skb(skb);
1677 return 0;
1678
1679 discard_and_relse:
1680 sock_put(sk);
1681 goto discard_it;
1682
1683 do_time_wait:
1684 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1685 inet_twsk_put(inet_twsk(sk));
1686 goto discard_it;
1687 }
1688
1689 if (tcp_checksum_complete(skb)) {
1690 inet_twsk_put(inet_twsk(sk));
1691 goto csum_error;
1692 }
1693 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1694 case TCP_TW_SYN: {
1695 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1696 &tcp_hashinfo,
1697 iph->saddr, th->source,
1698 iph->daddr, th->dest,
1699 inet_iif(skb));
1700 if (sk2) {
1701 inet_twsk_deschedule_put(inet_twsk(sk));
1702 sk = sk2;
1703 goto process;
1704 }
1705 /* Fall through to ACK */
1706 }
1707 case TCP_TW_ACK:
1708 tcp_v4_timewait_ack(sk, skb);
1709 break;
1710 case TCP_TW_RST:
1711 tcp_v4_send_reset(sk, skb);
1712 inet_twsk_deschedule_put(inet_twsk(sk));
1713 goto discard_it;
1714 case TCP_TW_SUCCESS:;
1715 }
1716 goto discard_it;
1717 }
1718
1719 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1720 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1721 .twsk_unique = tcp_twsk_unique,
1722 .twsk_destructor= tcp_twsk_destructor,
1723 };
1724
1725 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1726 {
1727 struct dst_entry *dst = skb_dst(skb);
1728
1729 if (dst && dst_hold_safe(dst)) {
1730 sk->sk_rx_dst = dst;
1731 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1732 }
1733 }
1734 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1735
1736 const struct inet_connection_sock_af_ops ipv4_specific = {
1737 .queue_xmit = ip_queue_xmit,
1738 .send_check = tcp_v4_send_check,
1739 .rebuild_header = inet_sk_rebuild_header,
1740 .sk_rx_dst_set = inet_sk_rx_dst_set,
1741 .conn_request = tcp_v4_conn_request,
1742 .syn_recv_sock = tcp_v4_syn_recv_sock,
1743 .net_header_len = sizeof(struct iphdr),
1744 .setsockopt = ip_setsockopt,
1745 .getsockopt = ip_getsockopt,
1746 .addr2sockaddr = inet_csk_addr2sockaddr,
1747 .sockaddr_len = sizeof(struct sockaddr_in),
1748 .bind_conflict = inet_csk_bind_conflict,
1749 #ifdef CONFIG_COMPAT
1750 .compat_setsockopt = compat_ip_setsockopt,
1751 .compat_getsockopt = compat_ip_getsockopt,
1752 #endif
1753 .mtu_reduced = tcp_v4_mtu_reduced,
1754 };
1755 EXPORT_SYMBOL(ipv4_specific);
1756
1757 #ifdef CONFIG_TCP_MD5SIG
1758 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1759 .md5_lookup = tcp_v4_md5_lookup,
1760 .calc_md5_hash = tcp_v4_md5_hash_skb,
1761 .md5_parse = tcp_v4_parse_md5_keys,
1762 };
1763 #endif
1764
1765 /* NOTE: A lot of things are set to zero explicitly by the call to
1766 * sk_alloc(), so they need not be done here.
1767 */
1768 static int tcp_v4_init_sock(struct sock *sk)
1769 {
1770 struct inet_connection_sock *icsk = inet_csk(sk);
1771
1772 tcp_init_sock(sk);
1773
1774 icsk->icsk_af_ops = &ipv4_specific;
1775
1776 #ifdef CONFIG_TCP_MD5SIG
1777 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1778 #endif
1779
1780 return 0;
1781 }
1782
1783 void tcp_v4_destroy_sock(struct sock *sk)
1784 {
1785 struct tcp_sock *tp = tcp_sk(sk);
1786
1787 tcp_clear_xmit_timers(sk);
1788
1789 tcp_cleanup_congestion_control(sk);
1790
1791 /* Clean up the write buffer. */
1792 tcp_write_queue_purge(sk);
1793
1794 /* Cleans up our, hopefully empty, out_of_order_queue. */
1795 __skb_queue_purge(&tp->out_of_order_queue);
1796
1797 #ifdef CONFIG_TCP_MD5SIG
1798 /* Clean up the MD5 key list, if any */
1799 if (tp->md5sig_info) {
1800 tcp_clear_md5_list(sk);
1801 kfree_rcu(tp->md5sig_info, rcu);
1802 tp->md5sig_info = NULL;
1803 }
1804 #endif
1805
1806 /* Clean prequeue, it must be empty really */
1807 __skb_queue_purge(&tp->ucopy.prequeue);
1808
1809 /* Clean up a referenced TCP bind bucket. */
1810 if (inet_csk(sk)->icsk_bind_hash)
1811 inet_put_port(sk);
1812
1813 BUG_ON(tp->fastopen_rsk);
1814
1815 /* If socket is aborted during connect operation */
1816 tcp_free_fastopen_req(tp);
1817 tcp_saved_syn_free(tp);
1818
1819 sk_sockets_allocated_dec(sk);
1820
1821 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1822 sock_release_memcg(sk);
1823 }
1824 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1825
1826 #ifdef CONFIG_PROC_FS
1827 /* Proc filesystem TCP sock list dumping. */
1828
1829 /*
1830 * Get the next listener socket following cur. If cur is NULL, get the first socket
1831 * starting from bucket given in st->bucket; when st->bucket is zero the
1832 * very first socket in the hash table is returned.
1833 */
1834 static void *listening_get_next(struct seq_file *seq, void *cur)
1835 {
1836 struct inet_connection_sock *icsk;
1837 struct hlist_nulls_node *node;
1838 struct sock *sk = cur;
1839 struct inet_listen_hashbucket *ilb;
1840 struct tcp_iter_state *st = seq->private;
1841 struct net *net = seq_file_net(seq);
1842
1843 if (!sk) {
1844 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1845 spin_lock_bh(&ilb->lock);
1846 sk = sk_nulls_head(&ilb->head);
1847 st->offset = 0;
1848 goto get_sk;
1849 }
1850 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1851 ++st->num;
1852 ++st->offset;
1853
1854 sk = sk_nulls_next(sk);
1855 get_sk:
1856 sk_nulls_for_each_from(sk, node) {
1857 if (!net_eq(sock_net(sk), net))
1858 continue;
1859 if (sk->sk_family == st->family) {
1860 cur = sk;
1861 goto out;
1862 }
1863 icsk = inet_csk(sk);
1864 }
1865 spin_unlock_bh(&ilb->lock);
1866 st->offset = 0;
1867 if (++st->bucket < INET_LHTABLE_SIZE) {
1868 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1869 spin_lock_bh(&ilb->lock);
1870 sk = sk_nulls_head(&ilb->head);
1871 goto get_sk;
1872 }
1873 cur = NULL;
1874 out:
1875 return cur;
1876 }
1877
1878 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1879 {
1880 struct tcp_iter_state *st = seq->private;
1881 void *rc;
1882
1883 st->bucket = 0;
1884 st->offset = 0;
1885 rc = listening_get_next(seq, NULL);
1886
1887 while (rc && *pos) {
1888 rc = listening_get_next(seq, rc);
1889 --*pos;
1890 }
1891 return rc;
1892 }
1893
1894 static inline bool empty_bucket(const struct tcp_iter_state *st)
1895 {
1896 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1897 }
1898
1899 /*
1900 * Get first established socket starting from bucket given in st->bucket.
1901 * If st->bucket is zero, the very first socket in the hash is returned.
1902 */
1903 static void *established_get_first(struct seq_file *seq)
1904 {
1905 struct tcp_iter_state *st = seq->private;
1906 struct net *net = seq_file_net(seq);
1907 void *rc = NULL;
1908
1909 st->offset = 0;
1910 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1911 struct sock *sk;
1912 struct hlist_nulls_node *node;
1913 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1914
1915 /* Lockless fast path for the common case of empty buckets */
1916 if (empty_bucket(st))
1917 continue;
1918
1919 spin_lock_bh(lock);
1920 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1921 if (sk->sk_family != st->family ||
1922 !net_eq(sock_net(sk), net)) {
1923 continue;
1924 }
1925 rc = sk;
1926 goto out;
1927 }
1928 spin_unlock_bh(lock);
1929 }
1930 out:
1931 return rc;
1932 }
1933
1934 static void *established_get_next(struct seq_file *seq, void *cur)
1935 {
1936 struct sock *sk = cur;
1937 struct hlist_nulls_node *node;
1938 struct tcp_iter_state *st = seq->private;
1939 struct net *net = seq_file_net(seq);
1940
1941 ++st->num;
1942 ++st->offset;
1943
1944 sk = sk_nulls_next(sk);
1945
1946 sk_nulls_for_each_from(sk, node) {
1947 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1948 return sk;
1949 }
1950
1951 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1952 ++st->bucket;
1953 return established_get_first(seq);
1954 }
1955
1956 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1957 {
1958 struct tcp_iter_state *st = seq->private;
1959 void *rc;
1960
1961 st->bucket = 0;
1962 rc = established_get_first(seq);
1963
1964 while (rc && pos) {
1965 rc = established_get_next(seq, rc);
1966 --pos;
1967 }
1968 return rc;
1969 }
1970
1971 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1972 {
1973 void *rc;
1974 struct tcp_iter_state *st = seq->private;
1975
1976 st->state = TCP_SEQ_STATE_LISTENING;
1977 rc = listening_get_idx(seq, &pos);
1978
1979 if (!rc) {
1980 st->state = TCP_SEQ_STATE_ESTABLISHED;
1981 rc = established_get_idx(seq, pos);
1982 }
1983
1984 return rc;
1985 }
1986
1987 static void *tcp_seek_last_pos(struct seq_file *seq)
1988 {
1989 struct tcp_iter_state *st = seq->private;
1990 int offset = st->offset;
1991 int orig_num = st->num;
1992 void *rc = NULL;
1993
1994 switch (st->state) {
1995 case TCP_SEQ_STATE_LISTENING:
1996 if (st->bucket >= INET_LHTABLE_SIZE)
1997 break;
1998 st->state = TCP_SEQ_STATE_LISTENING;
1999 rc = listening_get_next(seq, NULL);
2000 while (offset-- && rc)
2001 rc = listening_get_next(seq, rc);
2002 if (rc)
2003 break;
2004 st->bucket = 0;
2005 st->state = TCP_SEQ_STATE_ESTABLISHED;
2006 /* Fallthrough */
2007 case TCP_SEQ_STATE_ESTABLISHED:
2008 if (st->bucket > tcp_hashinfo.ehash_mask)
2009 break;
2010 rc = established_get_first(seq);
2011 while (offset-- && rc)
2012 rc = established_get_next(seq, rc);
2013 }
2014
2015 st->num = orig_num;
2016
2017 return rc;
2018 }
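/*
 * tcp_seek_last_pos() lets a multi-read dump of the proc file resume
 * where the previous read() stopped: it replays the remembered bucket
 * and in-bucket offset instead of rescanning from bucket 0, and then
 * restores st->num so the "sl" numbering stays continuous.
 * tcp_seq_start() below only takes this path when *pos matches the
 * position saved by the previous read.
 */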
2019
2020 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2021 {
2022 struct tcp_iter_state *st = seq->private;
2023 void *rc;
2024
2025 if (*pos && *pos == st->last_pos) {
2026 rc = tcp_seek_last_pos(seq);
2027 if (rc)
2028 goto out;
2029 }
2030
2031 st->state = TCP_SEQ_STATE_LISTENING;
2032 st->num = 0;
2033 st->bucket = 0;
2034 st->offset = 0;
2035 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2036
2037 out:
2038 st->last_pos = *pos;
2039 return rc;
2040 }
2041
2042 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2043 {
2044 struct tcp_iter_state *st = seq->private;
2045 void *rc = NULL;
2046
2047 if (v == SEQ_START_TOKEN) {
2048 rc = tcp_get_idx(seq, 0);
2049 goto out;
2050 }
2051
2052 switch (st->state) {
2053 case TCP_SEQ_STATE_LISTENING:
2054 rc = listening_get_next(seq, v);
2055 if (!rc) {
2056 st->state = TCP_SEQ_STATE_ESTABLISHED;
2057 st->bucket = 0;
2058 st->offset = 0;
2059 rc = established_get_first(seq);
2060 }
2061 break;
2062 case TCP_SEQ_STATE_ESTABLISHED:
2063 rc = established_get_next(seq, v);
2064 break;
2065 }
2066 out:
2067 ++*pos;
2068 st->last_pos = *pos;
2069 return rc;
2070 }
2071
2072 static void tcp_seq_stop(struct seq_file *seq, void *v)
2073 {
2074 struct tcp_iter_state *st = seq->private;
2075
2076 switch (st->state) {
2077 case TCP_SEQ_STATE_LISTENING:
2078 if (v != SEQ_START_TOKEN)
2079 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2080 break;
2081 case TCP_SEQ_STATE_ESTABLISHED:
2082 if (v)
2083 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2084 break;
2085 }
2086 }
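/*
 * tcp_seq_stop() pairs with whichever bucket lock ->start()/->next()
 * left held: the listening-hash bucket lock, unless only the header
 * token was emitted, or the ehash bucket lock once the walk has moved
 * on to established sockets.
 */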
2087
2088 int tcp_seq_open(struct inode *inode, struct file *file)
2089 {
2090 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2091 struct tcp_iter_state *s;
2092 int err;
2093
2094 err = seq_open_net(inode, file, &afinfo->seq_ops,
2095 sizeof(struct tcp_iter_state));
2096 if (err < 0)
2097 return err;
2098
2099 s = ((struct seq_file *)file->private_data)->private;
2100 s->family = afinfo->family;
2101 s->last_pos = 0;
2102 return 0;
2103 }
2104 EXPORT_SYMBOL(tcp_seq_open);
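/*
 * The iterator state allocated here records the address family of the
 * particular proc entry (AF_INET for the "tcp" file registered below,
 * AF_INET6 for its tcp_ipv6.c counterpart), which is what lets the
 * shared walkers above skip sockets of the other family.
 */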
2105
2106 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2107 {
2108 int rc = 0;
2109 struct proc_dir_entry *p;
2110
2111 afinfo->seq_ops.start = tcp_seq_start;
2112 afinfo->seq_ops.next = tcp_seq_next;
2113 afinfo->seq_ops.stop = tcp_seq_stop;
2114
2115 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2116 afinfo->seq_fops, afinfo);
2117 if (!p)
2118 rc = -ENOMEM;
2119 return rc;
2120 }
2121 EXPORT_SYMBOL(tcp_proc_register);
2122
2123 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2124 {
2125 remove_proc_entry(afinfo->name, net->proc_net);
2126 }
2127 EXPORT_SYMBOL(tcp_proc_unregister);
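/*
 * Illustrative sketch, not part of the kernel sources: once an afinfo is
 * registered (as tcp4_seq_afinfo is below, under the name "tcp"), the
 * iterator machinery above is driven simply by reading the proc file
 * from userspace, e.g.:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 *
 * Each line of output, including the header, is produced by one
 * tcp4_seq_show() call.
 */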
2128
2129 static void get_openreq4(const struct request_sock *req,
2130 struct seq_file *f, int i)
2131 {
2132 const struct inet_request_sock *ireq = inet_rsk(req);
2133 long delta = req->rsk_timer.expires - jiffies;
2134
2135 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2136 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2137 i,
2138 ireq->ir_loc_addr,
2139 ireq->ir_num,
2140 ireq->ir_rmt_addr,
2141 ntohs(ireq->ir_rmt_port),
2142 TCP_SYN_RECV,
2143 0, 0, /* could print option size, but that is af dependent. */
2144 1, /* timers active (only the expire timer) */
2145 jiffies_delta_to_clock_t(delta),
2146 req->num_timeout,
2147 from_kuid_munged(seq_user_ns(f),
2148 sock_i_uid(req->rsk_listener)),
2149 0, /* non standard timer */
2150 0, /* open_requests have no inode */
2151 0,
2152 req);
2153 }
2154
2155 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2156 {
2157 int timer_active;
2158 unsigned long timer_expires;
2159 const struct tcp_sock *tp = tcp_sk(sk);
2160 const struct inet_connection_sock *icsk = inet_csk(sk);
2161 const struct inet_sock *inet = inet_sk(sk);
2162 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2163 __be32 dest = inet->inet_daddr;
2164 __be32 src = inet->inet_rcv_saddr;
2165 __u16 destp = ntohs(inet->inet_dport);
2166 __u16 srcp = ntohs(inet->inet_sport);
2167 int rx_queue;
2168 int state;
2169
2170 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2171 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2172 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2173 timer_active = 1;
2174 timer_expires = icsk->icsk_timeout;
2175 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2176 timer_active = 4;
2177 timer_expires = icsk->icsk_timeout;
2178 } else if (timer_pending(&sk->sk_timer)) {
2179 timer_active = 2;
2180 timer_expires = sk->sk_timer.expires;
2181 } else {
2182 timer_active = 0;
2183 timer_expires = jiffies;
2184 }
2185
2186 state = sk_state_load(sk);
2187 if (state == TCP_LISTEN)
2188 rx_queue = sk->sk_ack_backlog;
2189 else
2190 /* Because we don't lock the socket,
2191 * we might find a transient negative value.
2192 */
2193 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2194
2195 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2196 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2197 i, src, srcp, dest, destp, state,
2198 tp->write_seq - tp->snd_una,
2199 rx_queue,
2200 timer_active,
2201 jiffies_delta_to_clock_t(timer_expires - jiffies),
2202 icsk->icsk_retransmits,
2203 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2204 icsk->icsk_probes_out,
2205 sock_i_ino(sk),
2206 atomic_read(&sk->sk_refcnt), sk,
2207 jiffies_to_clock_t(icsk->icsk_rto),
2208 jiffies_to_clock_t(icsk->icsk_ack.ato),
2209 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2210 tp->snd_cwnd,
2211 state == TCP_LISTEN ?
2212 fastopenq->max_qlen :
2213 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2214 }
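/*
 * The fields above line up with the header emitted by tcp4_seq_show():
 * sl, local_address, rem_address, st, tx_queue:rx_queue, tr:tm->when,
 * retrnsmt, uid, timeout and inode, followed by the untitled extras
 * (refcount, socket pointer, rto, ato, quick/pingpong, snd_cwnd and
 * either the slow-start threshold or, for listeners, the fastopen
 * queue limit).
 */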
2215
2216 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2217 struct seq_file *f, int i)
2218 {
2219 long delta = tw->tw_timer.expires - jiffies;
2220 __be32 dest, src;
2221 __u16 destp, srcp;
2222
2223 dest = tw->tw_daddr;
2224 src = tw->tw_rcv_saddr;
2225 destp = ntohs(tw->tw_dport);
2226 srcp = ntohs(tw->tw_sport);
2227
2228 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2229 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2230 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2231 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2232 atomic_read(&tw->tw_refcnt), tw);
2233 }
2234
2235 #define TMPSZ 150
2236
2237 static int tcp4_seq_show(struct seq_file *seq, void *v)
2238 {
2239 struct tcp_iter_state *st;
2240 struct sock *sk = v;
2241
2242 seq_setwidth(seq, TMPSZ - 1);
2243 if (v == SEQ_START_TOKEN) {
2244 seq_puts(seq, " sl local_address rem_address st tx_queue "
2245 "rx_queue tr tm->when retrnsmt uid timeout "
2246 "inode");
2247 goto out;
2248 }
2249 st = seq->private;
2250
2251 if (sk->sk_state == TCP_TIME_WAIT)
2252 get_timewait4_sock(v, seq, st->num);
2253 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2254 get_openreq4(v, seq, st->num);
2255 else
2256 get_tcp4_sock(v, seq, st->num);
2257 out:
2258 seq_pad(seq, '\n');
2259 return 0;
2260 }
2261
2262 static const struct file_operations tcp_afinfo_seq_fops = {
2263 .owner = THIS_MODULE,
2264 .open = tcp_seq_open,
2265 .read = seq_read,
2266 .llseek = seq_lseek,
2267 .release = seq_release_net
2268 };
2269
2270 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2271 .name = "tcp",
2272 .family = AF_INET,
2273 .seq_fops = &tcp_afinfo_seq_fops,
2274 .seq_ops = {
2275 .show = tcp4_seq_show,
2276 },
2277 };
2278
2279 static int __net_init tcp4_proc_init_net(struct net *net)
2280 {
2281 return tcp_proc_register(net, &tcp4_seq_afinfo);
2282 }
2283
2284 static void __net_exit tcp4_proc_exit_net(struct net *net)
2285 {
2286 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2287 }
2288
2289 static struct pernet_operations tcp4_net_ops = {
2290 .init = tcp4_proc_init_net,
2291 .exit = tcp4_proc_exit_net,
2292 };
2293
2294 int __init tcp4_proc_init(void)
2295 {
2296 return register_pernet_subsys(&tcp4_net_ops);
2297 }
2298
2299 void tcp4_proc_exit(void)
2300 {
2301 unregister_pernet_subsys(&tcp4_net_ops);
2302 }
2303 #endif /* CONFIG_PROC_FS */
2304
2305 struct proto tcp_prot = {
2306 .name = "TCP",
2307 .owner = THIS_MODULE,
2308 .close = tcp_close,
2309 .connect = tcp_v4_connect,
2310 .disconnect = tcp_disconnect,
2311 .accept = inet_csk_accept,
2312 .ioctl = tcp_ioctl,
2313 .init = tcp_v4_init_sock,
2314 .destroy = tcp_v4_destroy_sock,
2315 .shutdown = tcp_shutdown,
2316 .setsockopt = tcp_setsockopt,
2317 .getsockopt = tcp_getsockopt,
2318 .recvmsg = tcp_recvmsg,
2319 .sendmsg = tcp_sendmsg,
2320 .sendpage = tcp_sendpage,
2321 .backlog_rcv = tcp_v4_do_rcv,
2322 .release_cb = tcp_release_cb,
2323 .hash = inet_hash,
2324 .unhash = inet_unhash,
2325 .get_port = inet_csk_get_port,
2326 .enter_memory_pressure = tcp_enter_memory_pressure,
2327 .stream_memory_free = tcp_stream_memory_free,
2328 .sockets_allocated = &tcp_sockets_allocated,
2329 .orphan_count = &tcp_orphan_count,
2330 .memory_allocated = &tcp_memory_allocated,
2331 .memory_pressure = &tcp_memory_pressure,
2332 .sysctl_mem = sysctl_tcp_mem,
2333 .sysctl_wmem = sysctl_tcp_wmem,
2334 .sysctl_rmem = sysctl_tcp_rmem,
2335 .max_header = MAX_TCP_HEADER,
2336 .obj_size = sizeof(struct tcp_sock),
2337 .slab_flags = SLAB_DESTROY_BY_RCU,
2338 .twsk_prot = &tcp_timewait_sock_ops,
2339 .rsk_prot = &tcp_request_sock_ops,
2340 .h.hashinfo = &tcp_hashinfo,
2341 .no_autobind = true,
2342 #ifdef CONFIG_COMPAT
2343 .compat_setsockopt = compat_tcp_setsockopt,
2344 .compat_getsockopt = compat_tcp_getsockopt,
2345 #endif
2346 .diag_destroy = tcp_abort,
2347 };
2348 EXPORT_SYMBOL(tcp_prot);
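/*
 * This is the proto instance behind an ordinary IPv4 stream socket:
 * af_inet.c maps SOCK_STREAM/IPPROTO_TCP onto tcp_prot, so, roughly,
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);   invokes tcp_v4_init_sock()
 *	connect(fd, addr, addrlen);                 invokes tcp_v4_connect()
 *	send(fd, buf, len, 0);                      ends up in tcp_sendmsg()
 *
 * via the callbacks above.
 */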
2349
2350 static void __net_exit tcp_sk_exit(struct net *net)
2351 {
2352 int cpu;
2353
2354 for_each_possible_cpu(cpu)
2355 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2356 free_percpu(net->ipv4.tcp_sk);
2357 }
2358
2359 static int __net_init tcp_sk_init(struct net *net)
2360 {
2361 int res, cpu;
2362
2363 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2364 if (!net->ipv4.tcp_sk)
2365 return -ENOMEM;
2366
2367 for_each_possible_cpu(cpu) {
2368 struct sock *sk;
2369
2370 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2371 IPPROTO_TCP, net);
2372 if (res)
2373 goto fail;
2374 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2375 }
2376
2377 	net->ipv4.sysctl_tcp_ecn = 2;		/* answer ECN requests from peers, never request it ourselves */
2378 	net->ipv4.sysctl_tcp_ecn_fallback = 1;	/* retry setup without ECN if ECN negotiation appears to fail */
2379
2380 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2381 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2382 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2383
2384 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2385 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2386 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2387
2388 return 0;
2389 fail:
2390 tcp_sk_exit(net);
2391
2392 return res;
2393 }
2394
2395 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2396 {
2397 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2398 }
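/*
 * Run once per batch of exiting namespaces: sweep any IPv4 TIME_WAIT
 * sockets that still sit in the shared hash on behalf of those
 * namespaces, so they do not outlive the struct net they belong to.
 */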
2399
2400 static struct pernet_operations __net_initdata tcp_sk_ops = {
2401 .init = tcp_sk_init,
2402 .exit = tcp_sk_exit,
2403 .exit_batch = tcp_sk_exit_batch,
2404 };
2405
2406 void __init tcp_v4_init(void)
2407 {
2408 inet_hashinfo_init(&tcp_hashinfo);
2409 if (register_pernet_subsys(&tcp_sk_ops))
2410 panic("Failed to create the TCP control socket.\n");
2411 }