[NET]: Introduce inet_connection_sock
[deliverable/linux.git] / net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26 /*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
98 };
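/*
 * tcp_hashinfo bundles TCP's lookup tables: the bind hash (bhash) keyed by
 * local port, the established hash (ehash) whose upper half holds the
 * TIME-WAIT sockets mentioned in the changelog above, and the listening
 * hash.  Only the listening-hash lock/waitqueue and the port allocation
 * rotor are initialized here; the tables themselves are sized and attached
 * elsewhere at boot.
 */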
99
100 /*
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
104 */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106
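/*
 * Two sockets bound to the same local port conflict unless they are bound
 * to different devices, or both allow address reuse (SO_REUSEADDR) while
 * the already-bound socket is not listening, or their receive addresses
 * differ with neither being a wildcard.  IPv6-only sockets are ignored
 * here, since they can never clash with an IPv4 binding.
 */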
107 static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
113
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !inet_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
126 }
127 }
128 }
129 return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock,
133 * if snum is zero it means select any available local port.
134 */
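/*
 * In outline: for an explicit snum the matching bind bucket is looked up
 * (or created) and checked for conflicts; for snum == 0 the shared port
 * rotor is advanced through [low, high], i.e.
 *
 *	rover++;
 *	if (rover > high)
 *		rover = low;
 *
 * until a port with no bind bucket at all is found, taking only the
 * per-bucket lock while a chain is inspected.  tb->fastreuse caches
 * whether every current owner permits SO_REUSEADDR sharing, so the common
 * case can skip the full conflict scan.
 */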
135 int inet_csk_get_port(struct inet_hashinfo *hashinfo,
136 struct sock *sk, unsigned short snum)
137 {
138 struct inet_bind_hashbucket *head;
139 struct hlist_node *node;
140 struct inet_bind_bucket *tb;
141 int ret;
142
143 local_bh_disable();
144 if (!snum) {
145 int low = sysctl_local_port_range[0];
146 int high = sysctl_local_port_range[1];
147 int remaining = (high - low) + 1;
148 int rover;
149
150 spin_lock(&hashinfo->portalloc_lock);
151 if (hashinfo->port_rover < low)
152 rover = low;
153 else
154 rover = hashinfo->port_rover;
155 do {
156 rover++;
157 if (rover > high)
158 rover = low;
159 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
160 spin_lock(&head->lock);
161 inet_bind_bucket_for_each(tb, node, &head->chain)
162 if (tb->port == rover)
163 goto next;
164 break;
165 next:
166 spin_unlock(&head->lock);
167 } while (--remaining > 0);
168 hashinfo->port_rover = rover;
169 spin_unlock(&hashinfo->portalloc_lock);
170
171 /* Exhausted local port range during search? It is not
172 * possible for us to be holding one of the bind hash
173 * locks if this test triggers, because if 'remaining'
174 * drops to zero, we broke out of the do/while loop at
175 * the top level, not from the 'break;' statement.
176 */
177 ret = 1;
178 if (unlikely(remaining <= 0))
179 goto fail;
180
181 /* OK, here is the one we will use. HEAD is
182 * non-NULL and we hold its lock.
183 */
184 snum = rover;
185 } else {
186 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
187 spin_lock(&head->lock);
188 inet_bind_bucket_for_each(tb, node, &head->chain)
189 if (tb->port == snum)
190 goto tb_found;
191 }
192 tb = NULL;
193 goto tb_not_found;
194 tb_found:
195 if (!hlist_empty(&tb->owners)) {
196 if (sk->sk_reuse > 1)
197 goto success;
198 if (tb->fastreuse > 0 &&
199 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
200 goto success;
201 } else {
202 ret = 1;
203 if (inet_csk_bind_conflict(sk, tb))
204 goto fail_unlock;
205 }
206 }
207 tb_not_found:
208 ret = 1;
209 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
210 goto fail_unlock;
211 if (hlist_empty(&tb->owners)) {
212 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
213 tb->fastreuse = 1;
214 else
215 tb->fastreuse = 0;
216 } else if (tb->fastreuse &&
217 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
218 tb->fastreuse = 0;
219 success:
220 if (!inet_csk(sk)->icsk_bind_hash)
221 inet_bind_hash(sk, tb, snum);
222 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
223 ret = 0;
224
225 fail_unlock:
226 spin_unlock(&head->lock);
227 fail:
228 local_bh_enable();
229 return ret;
230 }
231
232 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
233 {
234 return inet_csk_get_port(&tcp_hashinfo, sk, snum);
235 }
236
237 static void tcp_v4_hash(struct sock *sk)
238 {
239 inet_hash(&tcp_hashinfo, sk);
240 }
241
242 void tcp_unhash(struct sock *sk)
243 {
244 inet_unhash(&tcp_hashinfo, sk);
245 }
246
247 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
248 {
249 return secure_tcp_sequence_number(skb->nh.iph->daddr,
250 skb->nh.iph->saddr,
251 skb->h.th->dest,
252 skb->h.th->source);
253 }
254
255 /* called with local bh disabled */
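/*
 * Checks the 4-tuple this socket is about to use against both halves of
 * the established hash: a live established match means the tuple is
 * taken, while a TIME-WAIT match may be recycled when timestamps make old
 * duplicates detectable (see the comment below).  In the recycle case the
 * new connection inherits ts_recent and starts write_seq safely past the
 * old tw_snd_nxt.
 */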
256 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
257 struct inet_timewait_sock **twp)
258 {
259 struct inet_sock *inet = inet_sk(sk);
260 u32 daddr = inet->rcv_saddr;
261 u32 saddr = inet->daddr;
262 int dif = sk->sk_bound_dev_if;
263 INET_ADDR_COOKIE(acookie, saddr, daddr)
264 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
265 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
266 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
267 struct sock *sk2;
268 const struct hlist_node *node;
269 struct inet_timewait_sock *tw;
270
271 write_lock(&head->lock);
272
273 /* Check TIME-WAIT sockets first. */
274 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
275 tw = inet_twsk(sk2);
276
277 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
278 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
279 struct tcp_sock *tp = tcp_sk(sk);
280
281 /* With PAWS, it is safe from the viewpoint
282 of data integrity. Even without PAWS it
283 is safe provided sequence spaces do not
284 overlap i.e. at data rates <= 80Mbit/sec.
285
286 Actually, the idea is close to VJ's one,
287 only timestamp cache is held not per host,
288 but per port pair and TW bucket is used
289 as state holder.
290
291 If TW bucket has been already destroyed we
292 fall back to VJ's scheme and use initial
293 timestamp retrieved from peer table.
294 */
295 if (tcptw->tw_ts_recent_stamp &&
296 (!twp || (sysctl_tcp_tw_reuse &&
297 xtime.tv_sec -
298 tcptw->tw_ts_recent_stamp > 1))) {
299 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
300 if (tp->write_seq == 0)
301 tp->write_seq = 1;
302 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
303 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
304 sock_hold(sk2);
305 goto unique;
306 } else
307 goto not_unique;
308 }
309 }
310 tw = NULL;
311
312 /* And established part... */
313 sk_for_each(sk2, node, &head->chain) {
314 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
315 goto not_unique;
316 }
317
318 unique:
319 /* Must record num and sport now. Otherwise we will see
320 * a socket with a bogus identity in the hash table. */
321 inet->num = lport;
322 inet->sport = htons(lport);
323 sk->sk_hashent = hash;
324 BUG_TRAP(sk_unhashed(sk));
325 __sk_add_node(sk, &head->chain);
326 sock_prot_inc_use(sk->sk_prot);
327 write_unlock(&head->lock);
328
329 if (twp) {
330 *twp = tw;
331 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
332 } else if (tw) {
333 /* Silly. Should hash-dance instead... */
334 tcp_tw_deschedule(tw);
335 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
336
337 inet_twsk_put(tw);
338 }
339
340 return 0;
341
342 not_unique:
343 write_unlock(&head->lock);
344 return -EADDRNOTAVAIL;
345 }
346
347 static inline u32 connect_port_offset(const struct sock *sk)
348 {
349 const struct inet_sock *inet = inet_sk(sk);
350
351 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
352 inet->dport);
353 }
354
355 /*
356 * Bind a port for a connect operation and hash it.
357 */
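/*
 * When no local port is bound yet, candidate ports are probed starting at
 * a per-destination offset (a secure hash of the addresses and remote
 * port plus a static rotor hint), i.e. port = low + (i + offset) % range.
 * Buckets created by an earlier connect() carry fastreuse == -1 and may
 * be shared once __tcp_v4_check_established() proves the full 4-tuple
 * unique; buckets created by bind() (fastreuse >= 0) are skipped.
 */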
358 static inline int tcp_v4_hash_connect(struct sock *sk)
359 {
360 const unsigned short snum = inet_sk(sk)->num;
361 struct inet_bind_hashbucket *head;
362 struct inet_bind_bucket *tb;
363 int ret;
364
365 if (!snum) {
366 int low = sysctl_local_port_range[0];
367 int high = sysctl_local_port_range[1];
368 int range = high - low;
369 int i;
370 int port;
371 static u32 hint;
372 u32 offset = hint + connect_port_offset(sk);
373 struct hlist_node *node;
374 struct inet_timewait_sock *tw = NULL;
375
376 local_bh_disable();
377 for (i = 1; i <= range; i++) {
378 port = low + (i + offset) % range;
379 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
380 spin_lock(&head->lock);
381
382 /* Does not bother with rcv_saddr checks,
383 * because the established check is already
384 * unique enough.
385 */
386 inet_bind_bucket_for_each(tb, node, &head->chain) {
387 if (tb->port == port) {
388 BUG_TRAP(!hlist_empty(&tb->owners));
389 if (tb->fastreuse >= 0)
390 goto next_port;
391 if (!__tcp_v4_check_established(sk,
392 port,
393 &tw))
394 goto ok;
395 goto next_port;
396 }
397 }
398
399 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
400 if (!tb) {
401 spin_unlock(&head->lock);
402 break;
403 }
404 tb->fastreuse = -1;
405 goto ok;
406
407 next_port:
408 spin_unlock(&head->lock);
409 }
410 local_bh_enable();
411
412 return -EADDRNOTAVAIL;
413
414 ok:
415 hint += i;
416
417 /* Head lock still held and bh's disabled */
418 inet_bind_hash(sk, tb, port);
419 if (sk_unhashed(sk)) {
420 inet_sk(sk)->sport = htons(port);
421 __inet_hash(&tcp_hashinfo, sk, 0);
422 }
423 spin_unlock(&head->lock);
424
425 if (tw) {
426 tcp_tw_deschedule(tw);
427 inet_twsk_put(tw);
428 }
429
430 ret = 0;
431 goto out;
432 }
433
434 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
435 tb = inet_csk(sk)->icsk_bind_hash;
436 spin_lock_bh(&head->lock);
437 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
438 __inet_hash(&tcp_hashinfo, sk, 0);
439 spin_unlock_bh(&head->lock);
440 return 0;
441 } else {
442 spin_unlock(&head->lock);
443 /* No definite answer... Walk the established hash table */
444 ret = __tcp_v4_check_established(sk, snum, NULL);
445 out:
446 local_bh_enable();
447 return ret;
448 }
449 }
450
451 /* This will initiate an outgoing connection. */
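/*
 * Roughly: resolve a route to the destination (honouring a source route's
 * first hop), pick the source address from the route if none is bound,
 * optionally seed rx_opt.ts_recent from the inet_peer cache when
 * tcp_tw_recycle is enabled, move to SYN-SENT, bind and hash a local port
 * via tcp_v4_hash_connect(), choose the initial sequence number, and
 * finally let tcp_connect() build and send the SYN.
 */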
452 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
453 {
454 struct inet_sock *inet = inet_sk(sk);
455 struct tcp_sock *tp = tcp_sk(sk);
456 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
457 struct rtable *rt;
458 u32 daddr, nexthop;
459 int tmp;
460 int err;
461
462 if (addr_len < sizeof(struct sockaddr_in))
463 return -EINVAL;
464
465 if (usin->sin_family != AF_INET)
466 return -EAFNOSUPPORT;
467
468 nexthop = daddr = usin->sin_addr.s_addr;
469 if (inet->opt && inet->opt->srr) {
470 if (!daddr)
471 return -EINVAL;
472 nexthop = inet->opt->faddr;
473 }
474
475 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
476 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
477 IPPROTO_TCP,
478 inet->sport, usin->sin_port, sk);
479 if (tmp < 0)
480 return tmp;
481
482 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
483 ip_rt_put(rt);
484 return -ENETUNREACH;
485 }
486
487 if (!inet->opt || !inet->opt->srr)
488 daddr = rt->rt_dst;
489
490 if (!inet->saddr)
491 inet->saddr = rt->rt_src;
492 inet->rcv_saddr = inet->saddr;
493
494 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
495 /* Reset inherited state */
496 tp->rx_opt.ts_recent = 0;
497 tp->rx_opt.ts_recent_stamp = 0;
498 tp->write_seq = 0;
499 }
500
501 if (sysctl_tcp_tw_recycle &&
502 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
503 struct inet_peer *peer = rt_get_peer(rt);
504
505 /* VJ's idea. We save last timestamp seen from
506 * the destination in peer table, when entering state TIME-WAIT
507 * and initialize rx_opt.ts_recent from it, when trying new connection.
508 */
509
510 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
511 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
512 tp->rx_opt.ts_recent = peer->tcp_ts;
513 }
514 }
515
516 inet->dport = usin->sin_port;
517 inet->daddr = daddr;
518
519 tp->ext_header_len = 0;
520 if (inet->opt)
521 tp->ext_header_len = inet->opt->optlen;
522
523 tp->rx_opt.mss_clamp = 536;
524
525 /* Socket identity is still unknown (sport may be zero).
526 * However we set state to SYN-SENT and, without releasing the socket
527 * lock, select a source port, enter ourselves into the hash tables and
528 * complete initialization after this.
529 */
530 tcp_set_state(sk, TCP_SYN_SENT);
531 err = tcp_v4_hash_connect(sk);
532 if (err)
533 goto failure;
534
535 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
536 if (err)
537 goto failure;
538
539 /* OK, now commit destination to socket. */
540 sk_setup_caps(sk, &rt->u.dst);
541
542 if (!tp->write_seq)
543 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
544 inet->daddr,
545 inet->sport,
546 usin->sin_port);
547
548 inet->id = tp->write_seq ^ jiffies;
549
550 err = tcp_connect(sk);
551 rt = NULL;
552 if (err)
553 goto failure;
554
555 return 0;
556
557 failure:
558 /* This unhashes the socket and releases the local port, if necessary. */
559 tcp_set_state(sk, TCP_CLOSE);
560 ip_rt_put(rt);
561 sk->sk_route_caps = 0;
562 inet->dport = 0;
563 return err;
564 }
565
566 static inline int inet_iif(const struct sk_buff *skb)
567 {
568 return ((struct rtable *)skb->dst)->rt_iif;
569 }
570
571 static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
572 const u32 rnd, const u16 synq_hsize)
573 {
574 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
575 }
576
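/*
 * A listener's SYN queue is a chained hash over (remote address, remote
 * port), salted with the per-listener hash_rnd and masked by the table
 * size (a power of two).  inet_csk_search_req() walks the matching bucket
 * and also hands back the predecessor pointer, so the caller can unlink
 * the request without rescanning the chain.
 */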
577 struct request_sock *inet_csk_search_req(const struct sock *sk,
578 struct request_sock ***prevp,
579 const __u16 rport, const __u32 raddr,
580 const __u32 laddr)
581 {
582 const struct inet_connection_sock *icsk = inet_csk(sk);
583 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
584 struct request_sock *req, **prev;
585
586 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
587 lopt->nr_table_entries)];
588 (req = *prev) != NULL;
589 prev = &req->dl_next) {
590 const struct inet_request_sock *ireq = inet_rsk(req);
591
592 if (ireq->rmt_port == rport &&
593 ireq->rmt_addr == raddr &&
594 ireq->loc_addr == laddr &&
595 AF_INET_FAMILY(req->rsk_ops->family)) {
596 BUG_TRAP(!req->sk);
597 *prevp = prev;
598 break;
599 }
600 }
601
602 return req;
603 }
604
605 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
606 {
607 struct inet_connection_sock *icsk = inet_csk(sk);
608 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
609 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
610 lopt->hash_rnd, lopt->nr_table_entries);
611
612 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
613 inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
614 }
615
616
617 /*
618 * This routine does path mtu discovery as defined in RFC1191.
619 */
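/*
 * On an ICMP_FRAG_NEEDED the cached route is told the new MTU; if the
 * socket's cached path MTU (tp->pmtu_cookie) is now too large,
 * tcp_sync_mss() shrinks the MSS and tcp_simple_retransmit() resends the
 * dropped segment (the "fast path mtu discovery" credited in the
 * changelog above) rather than waiting for the retransmit timer.
 */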
620 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
621 u32 mtu)
622 {
623 struct dst_entry *dst;
624 struct inet_sock *inet = inet_sk(sk);
625 struct tcp_sock *tp = tcp_sk(sk);
626
627 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
628 * sent out by Linux are always < 576 bytes, so they should go through
629 * unfragmented).
630 */
631 if (sk->sk_state == TCP_LISTEN)
632 return;
633
634 /* We don't check in the dst entry whether PMTU discovery is forbidden
635 * on this route. We just assume that no packet-too-big packets
636 * are sent back when PMTU discovery is not active.
637 * There is a small race when the user changes this flag in the
638 * route, but I think that's acceptable.
639 */
640 if ((dst = __sk_dst_check(sk, 0)) == NULL)
641 return;
642
643 dst->ops->update_pmtu(dst, mtu);
644
645 /* Something is about to go wrong... Remember the soft error
646 * in case this connection will not be able to recover.
647 */
648 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
649 sk->sk_err_soft = EMSGSIZE;
650
651 mtu = dst_mtu(dst);
652
653 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
654 tp->pmtu_cookie > mtu) {
655 tcp_sync_mss(sk, mtu);
656
657 /* Resend the TCP packet because it's
658 * clear that the old packet has been
659 * dropped. This is the new "fast" path mtu
660 * discovery.
661 */
662 tcp_simple_retransmit(sk);
663 } /* else let the usual retransmit timer handle it */
664 }
665
666 /*
667 * This routine is called by the ICMP module when it gets some
668 * sort of error condition. If err < 0 then the socket should
669 * be closed and the error returned to the user. If err > 0
670 * it's just the icmp type << 8 | icmp code. After adjustment
671 * header points to the first 8 bytes of the tcp header. We need
672 * to find the appropriate port.
673 *
674 * The locking strategy used here is very "optimistic". When
675 * someone else accesses the socket the ICMP is just dropped
676 * and for some paths there is no check at all.
677 * A more general error queue to queue errors for later handling
678 * is probably better.
679 *
680 */
681
682 void tcp_v4_err(struct sk_buff *skb, u32 info)
683 {
684 struct iphdr *iph = (struct iphdr *)skb->data;
685 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
686 struct tcp_sock *tp;
687 struct inet_sock *inet;
688 int type = skb->h.icmph->type;
689 int code = skb->h.icmph->code;
690 struct sock *sk;
691 __u32 seq;
692 int err;
693
694 if (skb->len < (iph->ihl << 2) + 8) {
695 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
696 return;
697 }
698
699 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
700 th->source, inet_iif(skb));
701 if (!sk) {
702 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
703 return;
704 }
705 if (sk->sk_state == TCP_TIME_WAIT) {
706 inet_twsk_put((struct inet_timewait_sock *)sk);
707 return;
708 }
709
710 bh_lock_sock(sk);
711 /* If too many ICMPs get dropped on busy
712 * servers this needs to be solved differently.
713 */
714 if (sock_owned_by_user(sk))
715 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
716
717 if (sk->sk_state == TCP_CLOSE)
718 goto out;
719
720 tp = tcp_sk(sk);
721 seq = ntohl(th->seq);
722 if (sk->sk_state != TCP_LISTEN &&
723 !between(seq, tp->snd_una, tp->snd_nxt)) {
724 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
725 goto out;
726 }
727
728 switch (type) {
729 case ICMP_SOURCE_QUENCH:
730 /* Just silently ignore these. */
731 goto out;
732 case ICMP_PARAMETERPROB:
733 err = EPROTO;
734 break;
735 case ICMP_DEST_UNREACH:
736 if (code > NR_ICMP_UNREACH)
737 goto out;
738
739 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
740 if (!sock_owned_by_user(sk))
741 do_pmtu_discovery(sk, iph, info);
742 goto out;
743 }
744
745 err = icmp_err_convert[code].errno;
746 break;
747 case ICMP_TIME_EXCEEDED:
748 err = EHOSTUNREACH;
749 break;
750 default:
751 goto out;
752 }
753
754 switch (sk->sk_state) {
755 struct request_sock *req, **prev;
756 case TCP_LISTEN:
757 if (sock_owned_by_user(sk))
758 goto out;
759
760 req = inet_csk_search_req(sk, &prev, th->dest,
761 iph->daddr, iph->saddr);
762 if (!req)
763 goto out;
764
765 /* ICMPs are not backlogged, hence we cannot get
766 an established socket here.
767 */
768 BUG_TRAP(!req->sk);
769
770 if (seq != tcp_rsk(req)->snt_isn) {
771 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
772 goto out;
773 }
774
775 /*
776 * Still in SYN_RECV, just remove it silently.
777 * There is no good way to pass the error to the newly
778 * created socket, and POSIX does not want network
779 * errors returned from accept().
780 */
781 inet_csk_reqsk_queue_drop(sk, req, prev);
782 goto out;
783
784 case TCP_SYN_SENT:
785 case TCP_SYN_RECV: /* Cannot happen.
786 It can, e.g., if SYNs crossed.
787 */
788 if (!sock_owned_by_user(sk)) {
789 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
790 sk->sk_err = err;
791
792 sk->sk_error_report(sk);
793
794 tcp_done(sk);
795 } else {
796 sk->sk_err_soft = err;
797 }
798 goto out;
799 }
800
801 /* If we've already connected we will keep trying
802 * until we time out, or the user gives up.
803 *
804 * RFC 1122 4.2.3.9 allows us to consider as hard errors
805 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
806 * but it is obsoleted by pmtu discovery).
807 *
808 * Note that in the modern internet, where routing is unreliable
809 * and broken firewalls sit in every dark corner sending random
810 * errors ordered by their masters, even these two messages finally lose
811 * their original sense (even Linux sends invalid PORT_UNREACHs)
812 *
813 * Now we are in compliance with RFCs.
814 * --ANK (980905)
815 */
816
817 inet = inet_sk(sk);
818 if (!sock_owned_by_user(sk) && inet->recverr) {
819 sk->sk_err = err;
820 sk->sk_error_report(sk);
821 } else { /* Only an error on timeout */
822 sk->sk_err_soft = err;
823 }
824
825 out:
826 bh_unlock_sock(sk);
827 sock_put(sk);
828 }
829
830 /* This routine computes an IPv4 TCP checksum. */
831 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
832 struct sk_buff *skb)
833 {
834 struct inet_sock *inet = inet_sk(sk);
835
836 if (skb->ip_summed == CHECKSUM_HW) {
837 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
838 skb->csum = offsetof(struct tcphdr, check);
839 } else {
840 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
841 csum_partial((char *)th,
842 th->doff << 2,
843 skb->csum));
844 }
845 }
846
847 /*
848 * This routine will send an RST to the other tcp.
849 *
850 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
851 * for reset.
852 * Answer: if a packet causes an RST, it is not for a socket
853 * existing in our system; if it is matched to a socket,
854 * it is just a duplicate segment or a bug in the other side's TCP.
855 * So we build the reply based only on the parameters
856 * that arrived with the segment.
857 * Exception: precedence violation. We do not implement it in any case.
858 */
859
860 static void tcp_v4_send_reset(struct sk_buff *skb)
861 {
862 struct tcphdr *th = skb->h.th;
863 struct tcphdr rth;
864 struct ip_reply_arg arg;
865
866 /* Never send a reset in response to a reset. */
867 if (th->rst)
868 return;
869
870 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
871 return;
872
873 /* Swap the send and the receive. */
874 memset(&rth, 0, sizeof(struct tcphdr));
875 rth.dest = th->source;
876 rth.source = th->dest;
877 rth.doff = sizeof(struct tcphdr) / 4;
878 rth.rst = 1;
879
880 if (th->ack) {
881 rth.seq = th->ack_seq;
882 } else {
883 rth.ack = 1;
884 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
885 skb->len - (th->doff << 2));
886 }
887
888 memset(&arg, 0, sizeof arg);
889 arg.iov[0].iov_base = (unsigned char *)&rth;
890 arg.iov[0].iov_len = sizeof rth;
891 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
892 skb->nh.iph->saddr, /*XXX*/
893 sizeof(struct tcphdr), IPPROTO_TCP, 0);
894 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
895
896 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
897
898 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
899 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
900 }
901
902 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
903    outside of socket context, is certainly ugly. What can I do?
904 */
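/*
 * tcp_v4_send_ack() builds a bare ACK, optionally carrying a timestamp
 * option padded with two NOPs as laid out in rep.tsopt[], and sends it
 * the same way as the reset above: over tcp_socket via ip_send_reply().
 */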
905
906 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
907 u32 win, u32 ts)
908 {
909 struct tcphdr *th = skb->h.th;
910 struct {
911 struct tcphdr th;
912 u32 tsopt[3];
913 } rep;
914 struct ip_reply_arg arg;
915
916 memset(&rep.th, 0, sizeof(struct tcphdr));
917 memset(&arg, 0, sizeof arg);
918
919 arg.iov[0].iov_base = (unsigned char *)&rep;
920 arg.iov[0].iov_len = sizeof(rep.th);
921 if (ts) {
922 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
923 (TCPOPT_TIMESTAMP << 8) |
924 TCPOLEN_TIMESTAMP);
925 rep.tsopt[1] = htonl(tcp_time_stamp);
926 rep.tsopt[2] = htonl(ts);
927 arg.iov[0].iov_len = sizeof(rep);
928 }
929
930 /* Swap the send and the receive. */
931 rep.th.dest = th->source;
932 rep.th.source = th->dest;
933 rep.th.doff = arg.iov[0].iov_len / 4;
934 rep.th.seq = htonl(seq);
935 rep.th.ack_seq = htonl(ack);
936 rep.th.ack = 1;
937 rep.th.window = htons(win);
938
939 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
940 skb->nh.iph->saddr, /*XXX*/
941 arg.iov[0].iov_len, IPPROTO_TCP, 0);
942 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
943
944 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
945
946 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
947 }
948
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951 struct inet_timewait_sock *tw = inet_twsk(sk);
952 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
955 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
956
957 inet_twsk_put(tw);
958 }
959
960 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
961 {
962 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
963 req->ts_recent);
964 }
965
966 struct dst_entry* inet_csk_route_req(struct sock *sk,
967 const struct request_sock *req)
968 {
969 struct rtable *rt;
970 const struct inet_request_sock *ireq = inet_rsk(req);
971 struct ip_options *opt = inet_rsk(req)->opt;
972 struct flowi fl = { .oif = sk->sk_bound_dev_if,
973 .nl_u = { .ip4_u =
974 { .daddr = ((opt && opt->srr) ?
975 opt->faddr :
976 ireq->rmt_addr),
977 .saddr = ireq->loc_addr,
978 .tos = RT_CONN_FLAGS(sk) } },
979 .proto = sk->sk_protocol,
980 .uli_u = { .ports =
981 { .sport = inet_sk(sk)->sport,
982 .dport = ireq->rmt_port } } };
983
984 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
985 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
986 return NULL;
987 }
988 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
989 ip_rt_put(rt);
990 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
991 return NULL;
992 }
993 return &rt->u.dst;
994 }
995
996 /*
997 * Send a SYN-ACK in reply to a SYN (also used to retransmit SYN-ACKs).
998 * This still operates on a request_sock only, not on a big
999 * socket.
1000 */
1001 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1002 struct dst_entry *dst)
1003 {
1004 const struct inet_request_sock *ireq = inet_rsk(req);
1005 int err = -1;
1006 struct sk_buff * skb;
1007
1008 /* First, grab a route. */
1009 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1010 goto out;
1011
1012 skb = tcp_make_synack(sk, dst, req);
1013
1014 if (skb) {
1015 struct tcphdr *th = skb->h.th;
1016
1017 th->check = tcp_v4_check(th, skb->len,
1018 ireq->loc_addr,
1019 ireq->rmt_addr,
1020 csum_partial((char *)th, skb->len,
1021 skb->csum));
1022
1023 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1024 ireq->rmt_addr,
1025 ireq->opt);
1026 if (err == NET_XMIT_CN)
1027 err = 0;
1028 }
1029
1030 out:
1031 dst_release(dst);
1032 return err;
1033 }
1034
1035 /*
1036 * IPv4 request_sock destructor.
1037 */
1038 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1039 {
1040 if (inet_rsk(req)->opt)
1041 kfree(inet_rsk(req)->opt);
1042 }
1043
1044 static inline void syn_flood_warning(struct sk_buff *skb)
1045 {
1046 static unsigned long warntime;
1047
1048 if (time_after(jiffies, (warntime + HZ * 60))) {
1049 warntime = jiffies;
1050 printk(KERN_INFO
1051 "possible SYN flooding on port %d. Sending cookies.\n",
1052 ntohs(skb->h.th->dest));
1053 }
1054 }
1055
1056 /*
1057 * Save and compile IPv4 options into the request_sock if needed.
1058 */
1059 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1060 struct sk_buff *skb)
1061 {
1062 struct ip_options *opt = &(IPCB(skb)->opt);
1063 struct ip_options *dopt = NULL;
1064
1065 if (opt && opt->optlen) {
1066 int opt_size = optlength(opt);
1067 dopt = kmalloc(opt_size, GFP_ATOMIC);
1068 if (dopt) {
1069 if (ip_options_echo(dopt, skb)) {
1070 kfree(dopt);
1071 dopt = NULL;
1072 }
1073 }
1074 }
1075 return dopt;
1076 }
1077
1078 struct request_sock_ops tcp_request_sock_ops = {
1079 .family = PF_INET,
1080 .obj_size = sizeof(struct tcp_request_sock),
1081 .rtx_syn_ack = tcp_v4_send_synack,
1082 .send_ack = tcp_v4_reqsk_send_ack,
1083 .destructor = tcp_v4_reqsk_destructor,
1084 .send_reset = tcp_v4_send_reset,
1085 };
1086
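/*
 * Passive-open entry point for an incoming SYN.  In outline: refuse SYNs
 * aimed at broadcast/multicast, fall back to syncookies (when configured)
 * once the SYN queue is full, drop early if the accept backlog is full
 * and more than one young request is already queued, then allocate a
 * request_sock, parse the options, pick or reconstruct the ISN, send the
 * SYN-ACK, and for the non-cookie case hash the request into the
 * listener's SYN table.
 */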
1087 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1088 {
1089 struct inet_request_sock *ireq;
1090 struct tcp_options_received tmp_opt;
1091 struct request_sock *req;
1092 __u32 saddr = skb->nh.iph->saddr;
1093 __u32 daddr = skb->nh.iph->daddr;
1094 __u32 isn = TCP_SKB_CB(skb)->when;
1095 struct dst_entry *dst = NULL;
1096 #ifdef CONFIG_SYN_COOKIES
1097 int want_cookie = 0;
1098 #else
1099 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1100 #endif
1101
1102 /* Never answer to SYNs sent to broadcast or multicast */
1103 if (((struct rtable *)skb->dst)->rt_flags &
1104 (RTCF_BROADCAST | RTCF_MULTICAST))
1105 goto drop;
1106
1107 /* TW buckets are converted to open requests without
1108 * limitation: they conserve resources and the peer is
1109 * evidently a real one.
1110 */
1111 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1112 #ifdef CONFIG_SYN_COOKIES
1113 if (sysctl_tcp_syncookies) {
1114 want_cookie = 1;
1115 } else
1116 #endif
1117 goto drop;
1118 }
1119
1120 /* Accept backlog is full. If we have already queued enough
1121 * warm entries in the syn queue, drop the request. It is better than
1122 * clogging the syn queue with openreqs with exponentially increasing
1123 * timeout.
1124 */
1125 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1126 goto drop;
1127
1128 req = reqsk_alloc(&tcp_request_sock_ops);
1129 if (!req)
1130 goto drop;
1131
1132 tcp_clear_options(&tmp_opt);
1133 tmp_opt.mss_clamp = 536;
1134 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1135
1136 tcp_parse_options(skb, &tmp_opt, 0);
1137
1138 if (want_cookie) {
1139 tcp_clear_options(&tmp_opt);
1140 tmp_opt.saw_tstamp = 0;
1141 }
1142
1143 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1144 /* Some OSes (unknown ones, but I see them on a web server that
1145 * contains information of interest only to Windows
1146 * users) do not send their timestamp in the SYN. It is an easy case:
1147 * we simply do not advertise TS support.
1148 */
1149 tmp_opt.saw_tstamp = 0;
1150 tmp_opt.tstamp_ok = 0;
1151 }
1152 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1153
1154 tcp_openreq_init(req, &tmp_opt, skb);
1155
1156 ireq = inet_rsk(req);
1157 ireq->loc_addr = daddr;
1158 ireq->rmt_addr = saddr;
1159 ireq->opt = tcp_v4_save_options(sk, skb);
1160 if (!want_cookie)
1161 TCP_ECN_create_request(req, skb->h.th);
1162
1163 if (want_cookie) {
1164 #ifdef CONFIG_SYN_COOKIES
1165 syn_flood_warning(skb);
1166 #endif
1167 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1168 } else if (!isn) {
1169 struct inet_peer *peer = NULL;
1170
1171 /* VJ's idea. We save last timestamp seen
1172 * from the destination in peer table, when entering
1173 * state TIME-WAIT, and check against it before
1174 * accepting new connection request.
1175 *
1176 * If "isn" is not zero, this request hit alive
1177 * timewait bucket, so that all the necessary checks
1178 * are made in the function processing timewait state.
1179 */
1180 if (tmp_opt.saw_tstamp &&
1181 sysctl_tcp_tw_recycle &&
1182 (dst = inet_csk_route_req(sk, req)) != NULL &&
1183 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1184 peer->v4daddr == saddr) {
1185 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1186 (s32)(peer->tcp_ts - req->ts_recent) >
1187 TCP_PAWS_WINDOW) {
1188 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1189 dst_release(dst);
1190 goto drop_and_free;
1191 }
1192 }
1193 /* Kill the following clause, if you dislike this way. */
1194 else if (!sysctl_tcp_syncookies &&
1195 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1196 (sysctl_max_syn_backlog >> 2)) &&
1197 (!peer || !peer->tcp_ts_stamp) &&
1198 (!dst || !dst_metric(dst, RTAX_RTT))) {
1199 /* Without syncookies the last quarter of the
1200 * backlog is reserved for destinations
1201 * proven to be alive.
1202 * It means that we keep communicating
1203 * with destinations already remembered
1204 * by the moment of the synflood.
1205 */
1206 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1207 "request from %u.%u."
1208 "%u.%u/%u\n",
1209 NIPQUAD(saddr),
1210 ntohs(skb->h.th->source)));
1211 dst_release(dst);
1212 goto drop_and_free;
1213 }
1214
1215 isn = tcp_v4_init_sequence(sk, skb);
1216 }
1217 tcp_rsk(req)->snt_isn = isn;
1218
1219 if (tcp_v4_send_synack(sk, req, dst))
1220 goto drop_and_free;
1221
1222 if (want_cookie) {
1223 reqsk_free(req);
1224 } else {
1225 tcp_v4_synq_add(sk, req);
1226 }
1227 return 0;
1228
1229 drop_and_free:
1230 reqsk_free(req);
1231 drop:
1232 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1233 return 0;
1234 }
1235
1236
1237 /*
1238 * The three way handshake has completed - we got a valid ACK -
1239 * now create the new socket.
1240 */
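/*
 * The child socket is filled from the request_sock: addresses and any
 * saved IP options move over, the route's MTU and advertised-MSS metric
 * seed the new socket's MSS state, and the child inherits the listener's
 * bound port before being hashed into the established table.
 */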
1241 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1242 struct request_sock *req,
1243 struct dst_entry *dst)
1244 {
1245 struct inet_request_sock *ireq;
1246 struct inet_sock *newinet;
1247 struct tcp_sock *newtp;
1248 struct sock *newsk;
1249
1250 if (sk_acceptq_is_full(sk))
1251 goto exit_overflow;
1252
1253 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1254 goto exit;
1255
1256 newsk = tcp_create_openreq_child(sk, req, skb);
1257 if (!newsk)
1258 goto exit;
1259
1260 sk_setup_caps(newsk, dst);
1261
1262 newtp = tcp_sk(newsk);
1263 newinet = inet_sk(newsk);
1264 ireq = inet_rsk(req);
1265 newinet->daddr = ireq->rmt_addr;
1266 newinet->rcv_saddr = ireq->loc_addr;
1267 newinet->saddr = ireq->loc_addr;
1268 newinet->opt = ireq->opt;
1269 ireq->opt = NULL;
1270 newinet->mc_index = inet_iif(skb);
1271 newinet->mc_ttl = skb->nh.iph->ttl;
1272 newtp->ext_header_len = 0;
1273 if (newinet->opt)
1274 newtp->ext_header_len = newinet->opt->optlen;
1275 newinet->id = newtp->write_seq ^ jiffies;
1276
1277 tcp_sync_mss(newsk, dst_mtu(dst));
1278 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1279 tcp_initialize_rcv_mss(newsk);
1280
1281 __inet_hash(&tcp_hashinfo, newsk, 0);
1282 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1283
1284 return newsk;
1285
1286 exit_overflow:
1287 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1288 exit:
1289 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1290 dst_release(dst);
1291 return NULL;
1292 }
1293
1294 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1295 {
1296 struct tcphdr *th = skb->h.th;
1297 struct iphdr *iph = skb->nh.iph;
1298 struct sock *nsk;
1299 struct request_sock **prev;
1300 /* Find possible connection requests. */
1301 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1302 iph->saddr, iph->daddr);
1303 if (req)
1304 return tcp_check_req(sk, skb, req, prev);
1305
1306 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1307 th->source, skb->nh.iph->daddr,
1308 ntohs(th->dest), inet_iif(skb));
1309
1310 if (nsk) {
1311 if (nsk->sk_state != TCP_TIME_WAIT) {
1312 bh_lock_sock(nsk);
1313 return nsk;
1314 }
1315 inet_twsk_put((struct inet_timewait_sock *)nsk);
1316 return NULL;
1317 }
1318
1319 #ifdef CONFIG_SYN_COOKIES
1320 if (!th->rst && !th->syn && th->ack)
1321 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1322 #endif
1323 return sk;
1324 }
1325
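/*
 * Receive checksum strategy: a hardware-verified sum is trusted if it
 * matches; short segments (<= 76 bytes) are verified in software right
 * here; longer ones only get the pseudo-header folded into skb->csum so
 * the check can be completed later, e.g. while copying to user space.
 */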
1326 static int tcp_v4_checksum_init(struct sk_buff *skb)
1327 {
1328 if (skb->ip_summed == CHECKSUM_HW) {
1329 skb->ip_summed = CHECKSUM_UNNECESSARY;
1330 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1331 skb->nh.iph->daddr, skb->csum))
1332 return 0;
1333
1334 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1335 skb->ip_summed = CHECKSUM_NONE;
1336 }
1337 if (skb->len <= 76) {
1338 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1339 skb->nh.iph->daddr,
1340 skb_checksum(skb, 0, skb->len, 0)))
1341 return -1;
1342 skb->ip_summed = CHECKSUM_UNNECESSARY;
1343 } else {
1344 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1345 skb->nh.iph->saddr,
1346 skb->nh.iph->daddr, 0);
1347 }
1348 return 0;
1349 }
1350
1351
1352 /* The socket must have its spinlock held when we get
1353 * here.
1354 *
1355 * We have a potential double-lock case here, so even when
1356 * doing backlog processing we use the BH locking scheme.
1357 * This is because we cannot sleep with the original spinlock
1358 * held.
1359 */
1360 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1361 {
1362 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1363 TCP_CHECK_TIMER(sk);
1364 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1365 goto reset;
1366 TCP_CHECK_TIMER(sk);
1367 return 0;
1368 }
1369
1370 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1371 goto csum_err;
1372
1373 if (sk->sk_state == TCP_LISTEN) {
1374 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1375 if (!nsk)
1376 goto discard;
1377
1378 if (nsk != sk) {
1379 if (tcp_child_process(sk, nsk, skb))
1380 goto reset;
1381 return 0;
1382 }
1383 }
1384
1385 TCP_CHECK_TIMER(sk);
1386 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1387 goto reset;
1388 TCP_CHECK_TIMER(sk);
1389 return 0;
1390
1391 reset:
1392 tcp_v4_send_reset(skb);
1393 discard:
1394 kfree_skb(skb);
1395 /* Be careful here. If this function gets more complicated and
1396 * gcc suffers from register pressure on the x86, sk (in %ebx)
1397 * might be destroyed here. This current version compiles correctly,
1398 * but you have been warned.
1399 */
1400 return 0;
1401
1402 csum_err:
1403 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1404 goto discard;
1405 }
1406
1407 /*
1408 * From tcp_input.c
1409 */
1410
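/*
 * Main receive path, called in softirq context.  It validates the header
 * and checksum, fills in the TCP control block, looks the segment up in
 * the established or listening hash, and then either processes it
 * directly, prequeues it for a waiting reader, or backlogs it when the
 * owner holds the socket lock.  TIME-WAIT sockets take the do_time_wait
 * branch, where a new SYN may be handed to a matching listener.
 */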
1411 int tcp_v4_rcv(struct sk_buff *skb)
1412 {
1413 struct tcphdr *th;
1414 struct sock *sk;
1415 int ret;
1416
1417 if (skb->pkt_type != PACKET_HOST)
1418 goto discard_it;
1419
1420 /* Count it even if it's bad */
1421 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1422
1423 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1424 goto discard_it;
1425
1426 th = skb->h.th;
1427
1428 if (th->doff < sizeof(struct tcphdr) / 4)
1429 goto bad_packet;
1430 if (!pskb_may_pull(skb, th->doff * 4))
1431 goto discard_it;
1432
1433 /* An explanation is required here, I think.
1434 * Packet length and doff are validated by header prediction,
1435 * provided the case of th->doff == 0 is eliminated.
1436 * So, we defer the checks. */
1437 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1438 tcp_v4_checksum_init(skb) < 0))
1439 goto bad_packet;
1440
1441 th = skb->h.th;
1442 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1443 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1444 skb->len - th->doff * 4);
1445 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1446 TCP_SKB_CB(skb)->when = 0;
1447 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1448 TCP_SKB_CB(skb)->sacked = 0;
1449
1450 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1451 skb->nh.iph->daddr, ntohs(th->dest),
1452 inet_iif(skb));
1453
1454 if (!sk)
1455 goto no_tcp_socket;
1456
1457 process:
1458 if (sk->sk_state == TCP_TIME_WAIT)
1459 goto do_time_wait;
1460
1461 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1462 goto discard_and_relse;
1463
1464 if (sk_filter(sk, skb, 0))
1465 goto discard_and_relse;
1466
1467 skb->dev = NULL;
1468
1469 bh_lock_sock(sk);
1470 ret = 0;
1471 if (!sock_owned_by_user(sk)) {
1472 if (!tcp_prequeue(sk, skb))
1473 ret = tcp_v4_do_rcv(sk, skb);
1474 } else
1475 sk_add_backlog(sk, skb);
1476 bh_unlock_sock(sk);
1477
1478 sock_put(sk);
1479
1480 return ret;
1481
1482 no_tcp_socket:
1483 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1484 goto discard_it;
1485
1486 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1487 bad_packet:
1488 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1489 } else {
1490 tcp_v4_send_reset(skb);
1491 }
1492
1493 discard_it:
1494 /* Discard frame. */
1495 kfree_skb(skb);
1496 return 0;
1497
1498 discard_and_relse:
1499 sock_put(sk);
1500 goto discard_it;
1501
1502 do_time_wait:
1503 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1504 inet_twsk_put((struct inet_timewait_sock *) sk);
1505 goto discard_it;
1506 }
1507
1508 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1509 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1510 inet_twsk_put((struct inet_timewait_sock *) sk);
1511 goto discard_it;
1512 }
1513 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1514 skb, th)) {
1515 case TCP_TW_SYN: {
1516 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1517 skb->nh.iph->daddr,
1518 ntohs(th->dest),
1519 inet_iif(skb));
1520 if (sk2) {
1521 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1522 inet_twsk_put((struct inet_timewait_sock *)sk);
1523 sk = sk2;
1524 goto process;
1525 }
1526 /* Fall through to ACK */
1527 }
1528 case TCP_TW_ACK:
1529 tcp_v4_timewait_ack(sk, skb);
1530 break;
1531 case TCP_TW_RST:
1532 goto no_tcp_socket;
1533 case TCP_TW_SUCCESS:;
1534 }
1535 goto discard_it;
1536 }
1537
1538 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1539 {
1540 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1541 struct inet_sock *inet = inet_sk(sk);
1542
1543 sin->sin_family = AF_INET;
1544 sin->sin_addr.s_addr = inet->daddr;
1545 sin->sin_port = inet->dport;
1546 }
1547
1548 /* VJ's idea. Save last timestamp seen from this destination
1549 * and hold it at least for normal timewait interval to use for duplicate
1550 * segment detection in subsequent connections, before they enter synchronized
1551 * state.
1552 */
1553
1554 int tcp_v4_remember_stamp(struct sock *sk)
1555 {
1556 struct inet_sock *inet = inet_sk(sk);
1557 struct tcp_sock *tp = tcp_sk(sk);
1558 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1559 struct inet_peer *peer = NULL;
1560 int release_it = 0;
1561
1562 if (!rt || rt->rt_dst != inet->daddr) {
1563 peer = inet_getpeer(inet->daddr, 1);
1564 release_it = 1;
1565 } else {
1566 if (!rt->peer)
1567 rt_bind_peer(rt, 1);
1568 peer = rt->peer;
1569 }
1570
1571 if (peer) {
1572 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1573 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1574 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1575 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1576 peer->tcp_ts = tp->rx_opt.ts_recent;
1577 }
1578 if (release_it)
1579 inet_putpeer(peer);
1580 return 1;
1581 }
1582
1583 return 0;
1584 }
1585
1586 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1587 {
1588 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1589
1590 if (peer) {
1591 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1592
1593 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1594 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1595 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1596 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1597 peer->tcp_ts = tcptw->tw_ts_recent;
1598 }
1599 inet_putpeer(peer);
1600 return 1;
1601 }
1602
1603 return 0;
1604 }
1605
1606 struct tcp_func ipv4_specific = {
1607 .queue_xmit = ip_queue_xmit,
1608 .send_check = tcp_v4_send_check,
1609 .rebuild_header = inet_sk_rebuild_header,
1610 .conn_request = tcp_v4_conn_request,
1611 .syn_recv_sock = tcp_v4_syn_recv_sock,
1612 .remember_stamp = tcp_v4_remember_stamp,
1613 .net_header_len = sizeof(struct iphdr),
1614 .setsockopt = ip_setsockopt,
1615 .getsockopt = ip_getsockopt,
1616 .addr2sockaddr = v4_addr2sockaddr,
1617 .sockaddr_len = sizeof(struct sockaddr_in),
1618 };
1619
1620 /* NOTE: A lot of things are set to zero explicitly by the call to
1621 * sk_alloc(), so they need not be done here.
1622 */
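/*
 * Defaults worth noting below: RTO and mdev start at TCP_TIMEOUT_INIT,
 * snd_cwnd starts at 2 segments (see the bandaid comment), ssthresh is
 * effectively infinite, mss_cache starts at the conservative 536, and
 * tcp_init_congestion_ops is attached as the initial congestion control.
 */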
1623 static int tcp_v4_init_sock(struct sock *sk)
1624 {
1625 struct tcp_sock *tp = tcp_sk(sk);
1626
1627 skb_queue_head_init(&tp->out_of_order_queue);
1628 tcp_init_xmit_timers(sk);
1629 tcp_prequeue_init(tp);
1630
1631 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
1632 tp->mdev = TCP_TIMEOUT_INIT;
1633
1634 /* So many TCP implementations out there (incorrectly) count the
1635 * initial SYN frame in their delayed-ACK and congestion control
1636 * algorithms that we must have the following bandaid to talk
1637 * efficiently to them. -DaveM
1638 */
1639 tp->snd_cwnd = 2;
1640
1641 /* See draft-stevens-tcpca-spec-01 for discussion of the
1642 * initialization of these values.
1643 */
1644 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1645 tp->snd_cwnd_clamp = ~0;
1646 tp->mss_cache = 536;
1647
1648 tp->reordering = sysctl_tcp_reordering;
1649 tp->ca_ops = &tcp_init_congestion_ops;
1650
1651 sk->sk_state = TCP_CLOSE;
1652
1653 sk->sk_write_space = sk_stream_write_space;
1654 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1655
1656 tp->af_specific = &ipv4_specific;
1657
1658 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1659 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1660
1661 atomic_inc(&tcp_sockets_allocated);
1662
1663 return 0;
1664 }
1665
1666 int tcp_v4_destroy_sock(struct sock *sk)
1667 {
1668 struct tcp_sock *tp = tcp_sk(sk);
1669
1670 tcp_clear_xmit_timers(sk);
1671
1672 tcp_cleanup_congestion_control(tp);
1673
1674 /* Clean up the write buffer. */
1675 sk_stream_writequeue_purge(sk);
1676
1677 /* Cleans up our, hopefully empty, out_of_order_queue. */
1678 __skb_queue_purge(&tp->out_of_order_queue);
1679
1680 /* Clean the prequeue; it really must be empty */
1681 __skb_queue_purge(&tp->ucopy.prequeue);
1682
1683 /* Clean up a referenced TCP bind bucket. */
1684 if (inet_csk(sk)->icsk_bind_hash)
1685 inet_put_port(&tcp_hashinfo, sk);
1686
1687 /*
1688 * If sendmsg cached page exists, toss it.
1689 */
1690 if (sk->sk_sndmsg_page) {
1691 __free_page(sk->sk_sndmsg_page);
1692 sk->sk_sndmsg_page = NULL;
1693 }
1694
1695 atomic_dec(&tcp_sockets_allocated);
1696
1697 return 0;
1698 }
1699
1700 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1701
1702 #ifdef CONFIG_PROC_FS
1703 /* Proc filesystem TCP sock list dumping. */
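/*
 * The iterators below walk, in order: every listening-hash bucket
 * (descending into each listener's SYN table for open requests), then
 * every established-hash bucket followed by its TIME-WAIT twin in the
 * upper half of ehash.  tcp4_seq_show() prints one line per entry in the
 * classic /proc/net/tcp layout shown in its header.
 */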
1704
1705 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1706 {
1707 return hlist_empty(head) ? NULL :
1708 list_entry(head->first, struct inet_timewait_sock, tw_node);
1709 }
1710
1711 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1712 {
1713 return tw->tw_node.next ?
1714 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1715 }
1716
1717 static void *listening_get_next(struct seq_file *seq, void *cur)
1718 {
1719 struct inet_connection_sock *icsk;
1720 struct hlist_node *node;
1721 struct sock *sk = cur;
1722 struct tcp_iter_state* st = seq->private;
1723
1724 if (!sk) {
1725 st->bucket = 0;
1726 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1727 goto get_sk;
1728 }
1729
1730 ++st->num;
1731
1732 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1733 struct request_sock *req = cur;
1734
1735 icsk = inet_csk(st->syn_wait_sk);
1736 req = req->dl_next;
1737 while (1) {
1738 while (req) {
1739 if (req->rsk_ops->family == st->family) {
1740 cur = req;
1741 goto out;
1742 }
1743 req = req->dl_next;
1744 }
1745 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1746 break;
1747 get_req:
1748 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1749 }
1750 sk = sk_next(st->syn_wait_sk);
1751 st->state = TCP_SEQ_STATE_LISTENING;
1752 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1753 } else {
1754 icsk = inet_csk(sk);
1755 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1756 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1757 goto start_req;
1758 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1759 sk = sk_next(sk);
1760 }
1761 get_sk:
1762 sk_for_each_from(sk, node) {
1763 if (sk->sk_family == st->family) {
1764 cur = sk;
1765 goto out;
1766 }
1767 icsk = inet_csk(sk);
1768 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1769 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1770 start_req:
1771 st->uid = sock_i_uid(sk);
1772 st->syn_wait_sk = sk;
1773 st->state = TCP_SEQ_STATE_OPENREQ;
1774 st->sbucket = 0;
1775 goto get_req;
1776 }
1777 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1778 }
1779 if (++st->bucket < INET_LHTABLE_SIZE) {
1780 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1781 goto get_sk;
1782 }
1783 cur = NULL;
1784 out:
1785 return cur;
1786 }
1787
1788 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1789 {
1790 void *rc = listening_get_next(seq, NULL);
1791
1792 while (rc && *pos) {
1793 rc = listening_get_next(seq, rc);
1794 --*pos;
1795 }
1796 return rc;
1797 }
1798
1799 static void *established_get_first(struct seq_file *seq)
1800 {
1801 struct tcp_iter_state* st = seq->private;
1802 void *rc = NULL;
1803
1804 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1805 struct sock *sk;
1806 struct hlist_node *node;
1807 struct inet_timewait_sock *tw;
1808
1809 /* We can reschedule _before_ having picked the target: */
1810 cond_resched_softirq();
1811
1812 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1813 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1814 if (sk->sk_family != st->family) {
1815 continue;
1816 }
1817 rc = sk;
1818 goto out;
1819 }
1820 st->state = TCP_SEQ_STATE_TIME_WAIT;
1821 inet_twsk_for_each(tw, node,
1822 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1823 if (tw->tw_family != st->family) {
1824 continue;
1825 }
1826 rc = tw;
1827 goto out;
1828 }
1829 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1830 st->state = TCP_SEQ_STATE_ESTABLISHED;
1831 }
1832 out:
1833 return rc;
1834 }
1835
1836 static void *established_get_next(struct seq_file *seq, void *cur)
1837 {
1838 struct sock *sk = cur;
1839 struct inet_timewait_sock *tw;
1840 struct hlist_node *node;
1841 struct tcp_iter_state* st = seq->private;
1842
1843 ++st->num;
1844
1845 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1846 tw = cur;
1847 tw = tw_next(tw);
1848 get_tw:
1849 while (tw && tw->tw_family != st->family) {
1850 tw = tw_next(tw);
1851 }
1852 if (tw) {
1853 cur = tw;
1854 goto out;
1855 }
1856 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1857 st->state = TCP_SEQ_STATE_ESTABLISHED;
1858
1859 /* We can reschedule between buckets: */
1860 cond_resched_softirq();
1861
1862 if (++st->bucket < tcp_hashinfo.ehash_size) {
1863 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1864 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1865 } else {
1866 cur = NULL;
1867 goto out;
1868 }
1869 } else
1870 sk = sk_next(sk);
1871
1872 sk_for_each_from(sk, node) {
1873 if (sk->sk_family == st->family)
1874 goto found;
1875 }
1876
1877 st->state = TCP_SEQ_STATE_TIME_WAIT;
1878 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1879 goto get_tw;
1880 found:
1881 cur = sk;
1882 out:
1883 return cur;
1884 }
1885
1886 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1887 {
1888 void *rc = established_get_first(seq);
1889
1890 while (rc && pos) {
1891 rc = established_get_next(seq, rc);
1892 --pos;
1893 }
1894 return rc;
1895 }
1896
1897 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1898 {
1899 void *rc;
1900 struct tcp_iter_state* st = seq->private;
1901
1902 inet_listen_lock(&tcp_hashinfo);
1903 st->state = TCP_SEQ_STATE_LISTENING;
1904 rc = listening_get_idx(seq, &pos);
1905
1906 if (!rc) {
1907 inet_listen_unlock(&tcp_hashinfo);
1908 local_bh_disable();
1909 st->state = TCP_SEQ_STATE_ESTABLISHED;
1910 rc = established_get_idx(seq, pos);
1911 }
1912
1913 return rc;
1914 }
1915
1916 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1917 {
1918 struct tcp_iter_state* st = seq->private;
1919 st->state = TCP_SEQ_STATE_LISTENING;
1920 st->num = 0;
1921 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1922 }
1923
1924 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1925 {
1926 void *rc = NULL;
1927 struct tcp_iter_state* st;
1928
1929 if (v == SEQ_START_TOKEN) {
1930 rc = tcp_get_idx(seq, 0);
1931 goto out;
1932 }
1933 st = seq->private;
1934
1935 switch (st->state) {
1936 case TCP_SEQ_STATE_OPENREQ:
1937 case TCP_SEQ_STATE_LISTENING:
1938 rc = listening_get_next(seq, v);
1939 if (!rc) {
1940 inet_listen_unlock(&tcp_hashinfo);
1941 local_bh_disable();
1942 st->state = TCP_SEQ_STATE_ESTABLISHED;
1943 rc = established_get_first(seq);
1944 }
1945 break;
1946 case TCP_SEQ_STATE_ESTABLISHED:
1947 case TCP_SEQ_STATE_TIME_WAIT:
1948 rc = established_get_next(seq, v);
1949 break;
1950 }
1951 out:
1952 ++*pos;
1953 return rc;
1954 }
1955
1956 static void tcp_seq_stop(struct seq_file *seq, void *v)
1957 {
1958 struct tcp_iter_state* st = seq->private;
1959
1960 switch (st->state) {
1961 case TCP_SEQ_STATE_OPENREQ:
1962 if (v) {
1963 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1964 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1965 }
1966 case TCP_SEQ_STATE_LISTENING:
1967 if (v != SEQ_START_TOKEN)
1968 inet_listen_unlock(&tcp_hashinfo);
1969 break;
1970 case TCP_SEQ_STATE_TIME_WAIT:
1971 case TCP_SEQ_STATE_ESTABLISHED:
1972 if (v)
1973 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1974 local_bh_enable();
1975 break;
1976 }
1977 }
1978
1979 static int tcp_seq_open(struct inode *inode, struct file *file)
1980 {
1981 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1982 struct seq_file *seq;
1983 struct tcp_iter_state *s;
1984 int rc;
1985
1986 if (unlikely(afinfo == NULL))
1987 return -EINVAL;
1988
1989 s = kmalloc(sizeof(*s), GFP_KERNEL);
1990 if (!s)
1991 return -ENOMEM;
1992 memset(s, 0, sizeof(*s));
1993 s->family = afinfo->family;
1994 s->seq_ops.start = tcp_seq_start;
1995 s->seq_ops.next = tcp_seq_next;
1996 s->seq_ops.show = afinfo->seq_show;
1997 s->seq_ops.stop = tcp_seq_stop;
1998
1999 rc = seq_open(file, &s->seq_ops);
2000 if (rc)
2001 goto out_kfree;
2002 seq = file->private_data;
2003 seq->private = s;
2004 out:
2005 return rc;
2006 out_kfree:
2007 kfree(s);
2008 goto out;
2009 }
2010
2011 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2012 {
2013 int rc = 0;
2014 struct proc_dir_entry *p;
2015
2016 if (!afinfo)
2017 return -EINVAL;
2018 afinfo->seq_fops->owner = afinfo->owner;
2019 afinfo->seq_fops->open = tcp_seq_open;
2020 afinfo->seq_fops->read = seq_read;
2021 afinfo->seq_fops->llseek = seq_lseek;
2022 afinfo->seq_fops->release = seq_release_private;
2023
2024 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2025 if (p)
2026 p->data = afinfo;
2027 else
2028 rc = -ENOMEM;
2029 return rc;
2030 }
2031
2032 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2033 {
2034 if (!afinfo)
2035 return;
2036 proc_net_remove(afinfo->name);
2037 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2038 }
2039
2040 static void get_openreq4(struct sock *sk, struct request_sock *req,
2041 char *tmpbuf, int i, int uid)
2042 {
2043 const struct inet_request_sock *ireq = inet_rsk(req);
2044 int ttd = req->expires - jiffies;
2045
2046 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2047 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2048 i,
2049 ireq->loc_addr,
2050 ntohs(inet_sk(sk)->sport),
2051 ireq->rmt_addr,
2052 ntohs(ireq->rmt_port),
2053 TCP_SYN_RECV,
2054 0, 0, /* could print option size, but that is af dependent. */
2055 1, /* timers active (only the expire timer) */
2056 jiffies_to_clock_t(ttd),
2057 req->retrans,
2058 uid,
2059 0, /* non standard timer */
2060 0, /* open_requests have no inode */
2061 atomic_read(&sk->sk_refcnt),
2062 req);
2063 }
2064
2065 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2066 {
2067 int timer_active;
2068 unsigned long timer_expires;
2069 struct tcp_sock *tp = tcp_sk(sp);
2070 const struct inet_connection_sock *icsk = inet_csk(sp);
2071 struct inet_sock *inet = inet_sk(sp);
2072 unsigned int dest = inet->daddr;
2073 unsigned int src = inet->rcv_saddr;
2074 __u16 destp = ntohs(inet->dport);
2075 __u16 srcp = ntohs(inet->sport);
2076
2077 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2078 timer_active = 1;
2079 timer_expires = icsk->icsk_timeout;
2080 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2081 timer_active = 4;
2082 timer_expires = icsk->icsk_timeout;
2083 } else if (timer_pending(&sp->sk_timer)) {
2084 timer_active = 2;
2085 timer_expires = sp->sk_timer.expires;
2086 } else {
2087 timer_active = 0;
2088 timer_expires = jiffies;
2089 }
2090
2091 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2092 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2093 i, src, srcp, dest, destp, sp->sk_state,
2094 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2095 timer_active,
2096 jiffies_to_clock_t(timer_expires - jiffies),
2097 icsk->icsk_retransmits,
2098 sock_i_uid(sp),
2099 tp->probes_out,
2100 sock_i_ino(sp),
2101 atomic_read(&sp->sk_refcnt), sp,
2102 icsk->icsk_rto,
2103 icsk->icsk_ack.ato,
2104 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2105 tp->snd_cwnd,
2106 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2107 }
2108
2109 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2110 {
2111 unsigned int dest, src;
2112 __u16 destp, srcp;
2113 int ttd = tw->tw_ttd - jiffies;
2114
2115 if (ttd < 0)
2116 ttd = 0;
2117
2118 dest = tw->tw_daddr;
2119 src = tw->tw_rcv_saddr;
2120 destp = ntohs(tw->tw_dport);
2121 srcp = ntohs(tw->tw_sport);
2122
2123 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2124 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2125 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2126 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2127 atomic_read(&tw->tw_refcnt), tw);
2128 }
2129
2130 #define TMPSZ 150
2131
2132 static int tcp4_seq_show(struct seq_file *seq, void *v)
2133 {
2134 struct tcp_iter_state* st;
2135 char tmpbuf[TMPSZ + 1];
2136
2137 if (v == SEQ_START_TOKEN) {
2138 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2139 " sl local_address rem_address st tx_queue "
2140 "rx_queue tr tm->when retrnsmt uid timeout "
2141 "inode");
2142 goto out;
2143 }
2144 st = seq->private;
2145
2146 switch (st->state) {
2147 case TCP_SEQ_STATE_LISTENING:
2148 case TCP_SEQ_STATE_ESTABLISHED:
2149 get_tcp4_sock(v, tmpbuf, st->num);
2150 break;
2151 case TCP_SEQ_STATE_OPENREQ:
2152 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2153 break;
2154 case TCP_SEQ_STATE_TIME_WAIT:
2155 get_timewait4_sock(v, tmpbuf, st->num);
2156 break;
2157 }
2158 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2159 out:
2160 return 0;
2161 }
2162
2163 static struct file_operations tcp4_seq_fops;
2164 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2165 .owner = THIS_MODULE,
2166 .name = "tcp",
2167 .family = AF_INET,
2168 .seq_show = tcp4_seq_show,
2169 .seq_fops = &tcp4_seq_fops,
2170 };
2171
2172 int __init tcp4_proc_init(void)
2173 {
2174 return tcp_proc_register(&tcp4_seq_afinfo);
2175 }
2176
2177 void tcp4_proc_exit(void)
2178 {
2179 tcp_proc_unregister(&tcp4_seq_afinfo);
2180 }
2181 #endif /* CONFIG_PROC_FS */
2182
2183 struct proto tcp_prot = {
2184 .name = "TCP",
2185 .owner = THIS_MODULE,
2186 .close = tcp_close,
2187 .connect = tcp_v4_connect,
2188 .disconnect = tcp_disconnect,
2189 .accept = inet_csk_accept,
2190 .ioctl = tcp_ioctl,
2191 .init = tcp_v4_init_sock,
2192 .destroy = tcp_v4_destroy_sock,
2193 .shutdown = tcp_shutdown,
2194 .setsockopt = tcp_setsockopt,
2195 .getsockopt = tcp_getsockopt,
2196 .sendmsg = tcp_sendmsg,
2197 .recvmsg = tcp_recvmsg,
2198 .backlog_rcv = tcp_v4_do_rcv,
2199 .hash = tcp_v4_hash,
2200 .unhash = tcp_unhash,
2201 .get_port = tcp_v4_get_port,
2202 .enter_memory_pressure = tcp_enter_memory_pressure,
2203 .sockets_allocated = &tcp_sockets_allocated,
2204 .memory_allocated = &tcp_memory_allocated,
2205 .memory_pressure = &tcp_memory_pressure,
2206 .sysctl_mem = sysctl_tcp_mem,
2207 .sysctl_wmem = sysctl_tcp_wmem,
2208 .sysctl_rmem = sysctl_tcp_rmem,
2209 .max_header = MAX_TCP_HEADER,
2210 .obj_size = sizeof(struct tcp_sock),
2211 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2212 .rsk_prot = &tcp_request_sock_ops,
2213 };
2214
2215
2216
2217 void __init tcp_v4_init(struct net_proto_family *ops)
2218 {
2219 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2220 if (err < 0)
2221 panic("Failed to create the TCP control socket.\n");
2222 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2223 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2224
2225 /* Unhash it so that IP input processing does not even
2226 * see it, we do not wish this socket to see incoming
2227 * packets.
2228 */
2229 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2230 }
2231
2232 EXPORT_SYMBOL(ipv4_specific);
2233 EXPORT_SYMBOL(inet_bind_bucket_create);
2234 EXPORT_SYMBOL(tcp_hashinfo);
2235 EXPORT_SYMBOL(tcp_prot);
2236 EXPORT_SYMBOL(tcp_unhash);
2237 EXPORT_SYMBOL(tcp_v4_conn_request);
2238 EXPORT_SYMBOL(tcp_v4_connect);
2239 EXPORT_SYMBOL(tcp_v4_do_rcv);
2240 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2241 EXPORT_SYMBOL(tcp_v4_send_check);
2242 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2243
2244 #ifdef CONFIG_PROC_FS
2245 EXPORT_SYMBOL(tcp_proc_register);
2246 EXPORT_SYMBOL(tcp_proc_unregister);
2247 #endif
2248 EXPORT_SYMBOL(sysctl_local_port_range);
2249 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2250 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2251