[NET]: Introduce inet_connection_sock
[deliverable/linux.git] / net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26 /*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
98 };
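/*
 * tcp_hashinfo bundles TCP's lookup tables: the bind hash (bhash) keyed by
 * local port, the established hash (ehash) whose upper half holds the
 * TIME-WAIT sockets mentioned in the changelog above, and the listening
 * hash.  Only the listening-hash lock/waitqueue and the port allocation
 * rotor are initialized here; the tables themselves are sized and attached
 * elsewhere at boot.
 */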
99
100 /*
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
104 */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106
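/*
 * Two sockets bound to the same local port conflict unless they are bound
 * to different devices, or both allow address reuse (SO_REUSEADDR) while
 * the already-bound socket is not listening, or their receive addresses
 * differ with neither being a wildcard.  IPv6-only sockets are ignored
 * here, since they can never clash with an IPv4 binding.
 */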
107 static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
113
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !inet_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
126 }
127 }
128 }
129 return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock,
133 * if snum is zero it means select any available local port.
134 */
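/*
 * In outline: for an explicit snum the matching bind bucket is looked up
 * (or created) and checked for conflicts; for snum == 0 the shared port
 * rotor is advanced through [low, high], i.e.
 *
 *	rover++;
 *	if (rover > high)
 *		rover = low;
 *
 * until a port with no bind bucket at all is found, taking only the
 * per-bucket lock while a chain is inspected.  tb->fastreuse caches
 * whether every current owner permits SO_REUSEADDR sharing, so the common
 * case can skip the full conflict scan.
 */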
135 int inet_csk_get_port(struct inet_hashinfo *hashinfo,
136 struct sock *sk, unsigned short snum)
137 {
138 struct inet_bind_hashbucket *head;
139 struct hlist_node *node;
140 struct inet_bind_bucket *tb;
141 int ret;
142
143 local_bh_disable();
144 if (!snum) {
145 int low = sysctl_local_port_range[0];
146 int high = sysctl_local_port_range[1];
147 int remaining = (high - low) + 1;
148 int rover;
149
150 spin_lock(&hashinfo->portalloc_lock);
151 if (hashinfo->port_rover < low)
152 rover = low;
153 else
154 rover = hashinfo->port_rover;
155 do {
156 rover++;
157 if (rover > high)
158 rover = low;
159 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
160 spin_lock(&head->lock);
161 inet_bind_bucket_for_each(tb, node, &head->chain)
162 if (tb->port == rover)
163 goto next;
164 break;
165 next:
166 spin_unlock(&head->lock);
167 } while (--remaining > 0);
168 hashinfo->port_rover = rover;
169 spin_unlock(&hashinfo->portalloc_lock);
170
171 /* Exhausted local port range during search? It is not
172 * possible for us to be holding one of the bind hash
173 * locks if this test triggers, because if 'remaining'
174 * drops to zero, we broke out of the do/while loop at
175 * the top level, not from the 'break;' statement.
176 */
177 ret = 1;
178 if (unlikely(remaining <= 0))
179 goto fail;
180
181 /* OK, here is the one we will use. HEAD is
182 * non-NULL and we hold its lock.
183 */
184 snum = rover;
185 } else {
186 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
187 spin_lock(&head->lock);
188 inet_bind_bucket_for_each(tb, node, &head->chain)
189 if (tb->port == snum)
190 goto tb_found;
191 }
192 tb = NULL;
193 goto tb_not_found;
194 tb_found:
195 if (!hlist_empty(&tb->owners)) {
196 if (sk->sk_reuse > 1)
197 goto success;
198 if (tb->fastreuse > 0 &&
199 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
200 goto success;
201 } else {
202 ret = 1;
203 if (inet_csk_bind_conflict(sk, tb))
204 goto fail_unlock;
205 }
206 }
207 tb_not_found:
208 ret = 1;
209 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
210 goto fail_unlock;
211 if (hlist_empty(&tb->owners)) {
212 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
213 tb->fastreuse = 1;
214 else
215 tb->fastreuse = 0;
216 } else if (tb->fastreuse &&
217 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
218 tb->fastreuse = 0;
219 success:
220 if (!inet_csk(sk)->icsk_bind_hash)
221 inet_bind_hash(sk, tb, snum);
222 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
223 ret = 0;
224
225 fail_unlock:
226 spin_unlock(&head->lock);
227 fail:
228 local_bh_enable();
229 return ret;
230 }
231
232 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
233 {
234 return inet_csk_get_port(&tcp_hashinfo, sk, snum);
235 }
236
237 static void tcp_v4_hash(struct sock *sk)
238 {
239 inet_hash(&tcp_hashinfo, sk);
240 }
241
242 void tcp_unhash(struct sock *sk)
243 {
244 inet_unhash(&tcp_hashinfo, sk);
245 }
246
247 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
248 {
249 return secure_tcp_sequence_number(skb->nh.iph->daddr,
250 skb->nh.iph->saddr,
251 skb->h.th->dest,
252 skb->h.th->source);
253 }
254
255 /* called with local bh disabled */
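/*
 * Checks the 4-tuple this socket is about to use against both halves of
 * the established hash: a live established match means the tuple is
 * taken, while a TIME-WAIT match may be recycled when timestamps make old
 * duplicates detectable (see the comment below).  In the recycle case the
 * new connection inherits ts_recent and starts write_seq safely past the
 * old tw_snd_nxt.
 */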
256 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
257 struct inet_timewait_sock **twp)
258 {
259 struct inet_sock *inet = inet_sk(sk);
260 u32 daddr = inet->rcv_saddr;
261 u32 saddr = inet->daddr;
262 int dif = sk->sk_bound_dev_if;
263 INET_ADDR_COOKIE(acookie, saddr, daddr)
264 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
265 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
266 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
267 struct sock *sk2;
268 const struct hlist_node *node;
269 struct inet_timewait_sock *tw;
270
271 write_lock(&head->lock);
272
273 /* Check TIME-WAIT sockets first. */
274 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
275 tw = inet_twsk(sk2);
276
277 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
278 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
279 struct tcp_sock *tp = tcp_sk(sk);
280
281 /* With PAWS, it is safe from the viewpoint
282 of data integrity. Even without PAWS it
283 is safe provided sequence spaces do not
284 overlap i.e. at data rates <= 80Mbit/sec.
285
286 Actually, the idea is close to VJ's one,
287 only timestamp cache is held not per host,
288 but per port pair and TW bucket is used
289 as state holder.
290
291 If TW bucket has been already destroyed we
292 fall back to VJ's scheme and use initial
293 timestamp retrieved from peer table.
294 */
295 if (tcptw->tw_ts_recent_stamp &&
296 (!twp || (sysctl_tcp_tw_reuse &&
297 xtime.tv_sec -
298 tcptw->tw_ts_recent_stamp > 1))) {
299 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
300 if (tp->write_seq == 0)
301 tp->write_seq = 1;
302 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
303 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
304 sock_hold(sk2);
305 goto unique;
306 } else
307 goto not_unique;
308 }
309 }
310 tw = NULL;
311
312 /* And established part... */
313 sk_for_each(sk2, node, &head->chain) {
314 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
315 goto not_unique;
316 }
317
318 unique:
319 /* Must record num and sport now. Otherwise we will see
320 * a socket with a bogus identity in the hash table. */
321 inet->num = lport;
322 inet->sport = htons(lport);
323 sk->sk_hashent = hash;
324 BUG_TRAP(sk_unhashed(sk));
325 __sk_add_node(sk, &head->chain);
326 sock_prot_inc_use(sk->sk_prot);
327 write_unlock(&head->lock);
328
329 if (twp) {
330 *twp = tw;
331 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
332 } else if (tw) {
333 /* Silly. Should hash-dance instead... */
334 tcp_tw_deschedule(tw);
335 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
336
337 inet_twsk_put(tw);
338 }
339
340 return 0;
341
342 not_unique:
343 write_unlock(&head->lock);
344 return -EADDRNOTAVAIL;
345 }
346
347 static inline u32 connect_port_offset(const struct sock *sk)
348 {
349 const struct inet_sock *inet = inet_sk(sk);
350
351 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
352 inet->dport);
353 }
354
355 /*
356 * Bind a port for a connect operation and hash it.
357 */
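/*
 * When no local port is bound yet, candidate ports are probed starting at
 * a per-destination offset (a secure hash of the addresses and remote
 * port plus a static rotor hint), i.e. port = low + (i + offset) % range.
 * Buckets created by an earlier connect() carry fastreuse == -1 and may
 * be shared once __tcp_v4_check_established() proves the full 4-tuple
 * unique; buckets created by bind() (fastreuse >= 0) are skipped.
 */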
358 static inline int tcp_v4_hash_connect(struct sock *sk)
359 {
360 const unsigned short snum = inet_sk(sk)->num;
361 struct inet_bind_hashbucket *head;
362 struct inet_bind_bucket *tb;
363 int ret;
364
365 if (!snum) {
366 int low = sysctl_local_port_range[0];
367 int high = sysctl_local_port_range[1];
368 int range = high - low;
369 int i;
370 int port;
371 static u32 hint;
372 u32 offset = hint + connect_port_offset(sk);
373 struct hlist_node *node;
374 struct inet_timewait_sock *tw = NULL;
375
376 local_bh_disable();
377 for (i = 1; i <= range; i++) {
378 port = low + (i + offset) % range;
379 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
380 spin_lock(&head->lock);
381
382 /* Does not bother with rcv_saddr checks,
383 * because the established check is already
384 * unique enough.
385 */
386 inet_bind_bucket_for_each(tb, node, &head->chain) {
387 if (tb->port == port) {
388 BUG_TRAP(!hlist_empty(&tb->owners));
389 if (tb->fastreuse >= 0)
390 goto next_port;
391 if (!__tcp_v4_check_established(sk,
392 port,
393 &tw))
394 goto ok;
395 goto next_port;
396 }
397 }
398
399 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
400 if (!tb) {
401 spin_unlock(&head->lock);
402 break;
403 }
404 tb->fastreuse = -1;
405 goto ok;
406
407 next_port:
408 spin_unlock(&head->lock);
409 }
410 local_bh_enable();
411
412 return -EADDRNOTAVAIL;
413
414 ok:
415 hint += i;
416
417 /* Head lock still held and bh's disabled */
418 inet_bind_hash(sk, tb, port);
419 if (sk_unhashed(sk)) {
420 inet_sk(sk)->sport = htons(port);
421 __inet_hash(&tcp_hashinfo, sk, 0);
422 }
423 spin_unlock(&head->lock);
424
425 if (tw) {
426 tcp_tw_deschedule(tw);
427 inet_twsk_put(tw);
428 }
429
430 ret = 0;
431 goto out;
432 }
433
434 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
435 tb = inet_csk(sk)->icsk_bind_hash;
436 spin_lock_bh(&head->lock);
437 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
438 __inet_hash(&tcp_hashinfo, sk, 0);
439 spin_unlock_bh(&head->lock);
440 return 0;
441 } else {
442 spin_unlock(&head->lock);
443 /* No definite answer... Walk the established hash table */
444 ret = __tcp_v4_check_established(sk, snum, NULL);
445 out:
446 local_bh_enable();
447 return ret;
448 }
449 }
450
451 /* This will initiate an outgoing connection. */
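/*
 * Roughly: resolve a route to the destination (honouring a source route's
 * first hop), pick the source address from the route if none is bound,
 * optionally seed rx_opt.ts_recent from the inet_peer cache when
 * tcp_tw_recycle is enabled, move to SYN-SENT, bind and hash a local port
 * via tcp_v4_hash_connect(), choose the initial sequence number, and
 * finally let tcp_connect() build and send the SYN.
 */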
452 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
453 {
454 struct inet_sock *inet = inet_sk(sk);
455 struct tcp_sock *tp = tcp_sk(sk);
456 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
457 struct rtable *rt;
458 u32 daddr, nexthop;
459 int tmp;
460 int err;
461
462 if (addr_len < sizeof(struct sockaddr_in))
463 return -EINVAL;
464
465 if (usin->sin_family != AF_INET)
466 return -EAFNOSUPPORT;
467
468 nexthop = daddr = usin->sin_addr.s_addr;
469 if (inet->opt && inet->opt->srr) {
470 if (!daddr)
471 return -EINVAL;
472 nexthop = inet->opt->faddr;
473 }
474
475 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
476 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
477 IPPROTO_TCP,
478 inet->sport, usin->sin_port, sk);
479 if (tmp < 0)
480 return tmp;
481
482 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
483 ip_rt_put(rt);
484 return -ENETUNREACH;
485 }
486
487 if (!inet->opt || !inet->opt->srr)
488 daddr = rt->rt_dst;
489
490 if (!inet->saddr)
491 inet->saddr = rt->rt_src;
492 inet->rcv_saddr = inet->saddr;
493
494 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
495 /* Reset inherited state */
496 tp->rx_opt.ts_recent = 0;
497 tp->rx_opt.ts_recent_stamp = 0;
498 tp->write_seq = 0;
499 }
500
501 if (sysctl_tcp_tw_recycle &&
502 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
503 struct inet_peer *peer = rt_get_peer(rt);
504
505 /* VJ's idea. We save last timestamp seen from
506 * the destination in peer table, when entering state TIME-WAIT
507 * and initialize rx_opt.ts_recent from it, when trying new connection.
508 */
509
510 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
511 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
512 tp->rx_opt.ts_recent = peer->tcp_ts;
513 }
514 }
515
516 inet->dport = usin->sin_port;
517 inet->daddr = daddr;
518
519 tp->ext_header_len = 0;
520 if (inet->opt)
521 tp->ext_header_len = inet->opt->optlen;
522
523 tp->rx_opt.mss_clamp = 536;
524
525 /* Socket identity is still unknown (sport may be zero).
526 * However we set state to SYN-SENT and, without releasing the socket
527 * lock, select a source port, enter ourselves into the hash tables and
528 * complete initialization after this.
529 */
530 tcp_set_state(sk, TCP_SYN_SENT);
531 err = tcp_v4_hash_connect(sk);
532 if (err)
533 goto failure;
534
535 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
536 if (err)
537 goto failure;
538
539 /* OK, now commit destination to socket. */
540 sk_setup_caps(sk, &rt->u.dst);
541
542 if (!tp->write_seq)
543 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
544 inet->daddr,
545 inet->sport,
546 usin->sin_port);
547
548 inet->id = tp->write_seq ^ jiffies;
549
550 err = tcp_connect(sk);
551 rt = NULL;
552 if (err)
553 goto failure;
554
555 return 0;
556
557 failure:
558 /* This unhashes the socket and releases the local port, if necessary. */
559 tcp_set_state(sk, TCP_CLOSE);
560 ip_rt_put(rt);
561 sk->sk_route_caps = 0;
562 inet->dport = 0;
563 return err;
564 }
565
566 static inline int inet_iif(const struct sk_buff *skb)
567 {
568 return ((struct rtable *)skb->dst)->rt_iif;
569 }
570
571 static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
572 const u32 rnd, const u16 synq_hsize)
573 {
574 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
575 }
576
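/*
 * A listener's SYN queue is a chained hash over (remote address, remote
 * port), salted with the per-listener hash_rnd and masked by the table
 * size (a power of two).  inet_csk_search_req() walks the matching bucket
 * and also hands back the predecessor pointer, so the caller can unlink
 * the request without rescanning the chain.
 */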
577 struct request_sock *inet_csk_search_req(const struct sock *sk,
578 struct request_sock ***prevp,
579 const __u16 rport, const __u32 raddr,
580 const __u32 laddr)
581 {
582 const struct inet_connection_sock *icsk = inet_csk(sk);
583 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
584 struct request_sock *req, **prev;
585
586 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
587 lopt->nr_table_entries)];
588 (req = *prev) != NULL;
589 prev = &req->dl_next) {
590 const struct inet_request_sock *ireq = inet_rsk(req);
591
592 if (ireq->rmt_port == rport &&
593 ireq->rmt_addr == raddr &&
594 ireq->loc_addr == laddr &&
595 AF_INET_FAMILY(req->rsk_ops->family)) {
596 BUG_TRAP(!req->sk);
597 *prevp = prev;
598 break;
599 }
600 }
601
602 return req;
603 }
604
605 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
606 {
607 struct inet_connection_sock *icsk = inet_csk(sk);
608 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
609 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
610 lopt->hash_rnd, lopt->nr_table_entries);
611
612 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
613 inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
614 }
615
616
617 /*
618 * This routine does path mtu discovery as defined in RFC1191.
619 */
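/*
 * On an ICMP_FRAG_NEEDED the cached route is told the new MTU; if the
 * socket's cached path MTU (tp->pmtu_cookie) is now too large,
 * tcp_sync_mss() shrinks the MSS and tcp_simple_retransmit() resends the
 * dropped segment (the "fast path mtu discovery" credited in the
 * changelog above) rather than waiting for the retransmit timer.
 */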
620 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
621 u32 mtu)
622 {
623 struct dst_entry *dst;
624 struct inet_sock *inet = inet_sk(sk);
625 struct tcp_sock *tp = tcp_sk(sk);
626
627 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
628 * sent out by Linux are always < 576 bytes, so they should go through
629 * unfragmented).
630 */
631 if (sk->sk_state == TCP_LISTEN)
632 return;
633
634 /* We don't check in the dst entry whether PMTU discovery is forbidden
635 * on this route. We just assume that no packet-too-big packets
636 * are sent back when PMTU discovery is not active.
637 * There is a small race when the user changes this flag in the
638 * route, but I think that's acceptable.
639 */
640 if ((dst = __sk_dst_check(sk, 0)) == NULL)
641 return;
642
643 dst->ops->update_pmtu(dst, mtu);
644
645 /* Something is about to go wrong... Remember the soft error
646 * in case this connection will not be able to recover.
647 */
648 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
649 sk->sk_err_soft = EMSGSIZE;
650
651 mtu = dst_mtu(dst);
652
653 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
654 tp->pmtu_cookie > mtu) {
655 tcp_sync_mss(sk, mtu);
656
657 /* Resend the TCP packet because it's
658 * clear that the old packet has been
659 * dropped. This is the new "fast" path mtu
660 * discovery.
661 */
662 tcp_simple_retransmit(sk);
663 } /* else let the usual retransmit timer handle it */
664 }
665
666 /*
667 * This routine is called by the ICMP module when it gets some
668 * sort of error condition. If err < 0 then the socket should
669 * be closed and the error returned to the user. If err > 0
670 * it's just the icmp type << 8 | icmp code. After adjustment
671 * header points to the first 8 bytes of the tcp header. We need
672 * to find the appropriate port.
673 *
674 * The locking strategy used here is very "optimistic". When
675 * someone else accesses the socket the ICMP is just dropped
676 * and for some paths there is no check at all.
677 * A more general error queue to queue errors for later handling
678 * is probably better.
679 *
680 */
681
682 void tcp_v4_err(struct sk_buff *skb, u32 info)
683 {
684 struct iphdr *iph = (struct iphdr *)skb->data;
685 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
686 struct tcp_sock *tp;
687 struct inet_sock *inet;
688 int type = skb->h.icmph->type;
689 int code = skb->h.icmph->code;
690 struct sock *sk;
691 __u32 seq;
692 int err;
693
694 if (skb->len < (iph->ihl << 2) + 8) {
695 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
696 return;
697 }
698
699 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
700 th->source, inet_iif(skb));
701 if (!sk) {
702 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
703 return;
704 }
705 if (sk->sk_state == TCP_TIME_WAIT) {
706 inet_twsk_put((struct inet_timewait_sock *)sk);
707 return;
708 }
709
710 bh_lock_sock(sk);
711 /* If too many ICMPs get dropped on busy
712 * servers this needs to be solved differently.
713 */
714 if (sock_owned_by_user(sk))
715 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
716
717 if (sk->sk_state == TCP_CLOSE)
718 goto out;
719
720 tp = tcp_sk(sk);
721 seq = ntohl(th->seq);
722 if (sk->sk_state != TCP_LISTEN &&
723 !between(seq, tp->snd_una, tp->snd_nxt)) {
724 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
725 goto out;
726 }
727
728 switch (type) {
729 case ICMP_SOURCE_QUENCH:
730 /* Just silently ignore these. */
731 goto out;
732 case ICMP_PARAMETERPROB:
733 err = EPROTO;
734 break;
735 case ICMP_DEST_UNREACH:
736 if (code > NR_ICMP_UNREACH)
737 goto out;
738
739 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
740 if (!sock_owned_by_user(sk))
741 do_pmtu_discovery(sk, iph, info);
742 goto out;
743 }
744
745 err = icmp_err_convert[code].errno;
746 break;
747 case ICMP_TIME_EXCEEDED:
748 err = EHOSTUNREACH;
749 break;
750 default:
751 goto out;
752 }
753
754 switch (sk->sk_state) {
755 struct request_sock *req, **prev;
756 case TCP_LISTEN:
757 if (sock_owned_by_user(sk))
758 goto out;
759
760 req = inet_csk_search_req(sk, &prev, th->dest,
761 iph->daddr, iph->saddr);
762 if (!req)
763 goto out;
764
765 /* ICMPs are not backlogged, hence we cannot get
766 an established socket here.
767 */
768 BUG_TRAP(!req->sk);
769
770 if (seq != tcp_rsk(req)->snt_isn) {
771 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
772 goto out;
773 }
774
775 /*
776 * Still in SYN_RECV, just remove it silently.
777 * There is no good way to pass the error to the newly
778 * created socket, and POSIX does not want network
779 * errors returned from accept().
780 */
781 inet_csk_reqsk_queue_drop(sk, req, prev);
782 goto out;
783
784 case TCP_SYN_SENT:
785 case TCP_SYN_RECV: /* Cannot happen.
786 It can, e.g., if SYNs crossed.
787 */
788 if (!sock_owned_by_user(sk)) {
789 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
790 sk->sk_err = err;
791
792 sk->sk_error_report(sk);
793
794 tcp_done(sk);
795 } else {
796 sk->sk_err_soft = err;
797 }
798 goto out;
799 }
800
801 /* If we've already connected we will keep trying
802 * until we time out, or the user gives up.
803 *
804 * RFC 1122 4.2.3.9 allows us to consider as hard errors
805 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
806 * but it is obsoleted by pmtu discovery).
807 *
808 * Note that in the modern internet, where routing is unreliable
809 * and broken firewalls sit in every dark corner sending random
810 * errors ordered by their masters, even these two messages finally lose
811 * their original sense (even Linux sends invalid PORT_UNREACHs)
812 *
813 * Now we are in compliance with RFCs.
814 * --ANK (980905)
815 */
816
817 inet = inet_sk(sk);
818 if (!sock_owned_by_user(sk) && inet->recverr) {
819 sk->sk_err = err;
820 sk->sk_error_report(sk);
821 } else { /* Only an error on timeout */
822 sk->sk_err_soft = err;
823 }
824
825 out:
826 bh_unlock_sock(sk);
827 sock_put(sk);
828 }
829
830 /* This routine computes an IPv4 TCP checksum. */
831 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
832 struct sk_buff *skb)
833 {
834 struct inet_sock *inet = inet_sk(sk);
835
836 if (skb->ip_summed == CHECKSUM_HW) {
837 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
838 skb->csum = offsetof(struct tcphdr, check);
839 } else {
840 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
841 csum_partial((char *)th,
842 th->doff << 2,
843 skb->csum));
844 }
845 }
846
847 /*
848 * This routine will send an RST to the other tcp.
849 *
850 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
851 * for reset.
852 * Answer: if a packet causes an RST, it is not for a socket
853 * existing in our system; if it is matched to a socket,
854 * it is just a duplicate segment or a bug in the other side's TCP.
855 * So we build the reply based only on the parameters
856 * that arrived with the segment.
857 * Exception: precedence violation. We do not implement it in any case.
858 */
859
860 static void tcp_v4_send_reset(struct sk_buff *skb)
861 {
862 struct tcphdr *th = skb->h.th;
863 struct tcphdr rth;
864 struct ip_reply_arg arg;
865
866 /* Never send a reset in response to a reset. */
867 if (th->rst)
868 return;
869
870 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
871 return;
872
873 /* Swap the send and the receive. */
874 memset(&rth, 0, sizeof(struct tcphdr));
875 rth.dest = th->source;
876 rth.source = th->dest;
877 rth.doff = sizeof(struct tcphdr) / 4;
878 rth.rst = 1;
879
880 if (th->ack) {
881 rth.seq = th->ack_seq;
882 } else {
883 rth.ack = 1;
884 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
885 skb->len - (th->doff << 2));
886 }
887
888 memset(&arg, 0, sizeof arg);
889 arg.iov[0].iov_base = (unsigned char *)&rth;
890 arg.iov[0].iov_len = sizeof rth;
891 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
892 skb->nh.iph->saddr, /*XXX*/
893 sizeof(struct tcphdr), IPPROTO_TCP, 0);
894 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
895
896 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
897
898 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
899 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
900 }
901
902 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
903    outside of socket context, is certainly ugly. What can I do?
904 */
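/*
 * tcp_v4_send_ack() builds a bare ACK, optionally carrying a timestamp
 * option padded with two NOPs as laid out in rep.tsopt[], and sends it
 * the same way as the reset above: over tcp_socket via ip_send_reply().
 */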
905
906 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
907 u32 win, u32 ts)
908 {
909 struct tcphdr *th = skb->h.th;
910 struct {
911 struct tcphdr th;
912 u32 tsopt[3];
913 } rep;
914 struct ip_reply_arg arg;
915
916 memset(&rep.th, 0, sizeof(struct tcphdr));
917 memset(&arg, 0, sizeof arg);
918
919 arg.iov[0].iov_base = (unsigned char *)&rep;
920 arg.iov[0].iov_len = sizeof(rep.th);
921 if (ts) {
922 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
923 (TCPOPT_TIMESTAMP << 8) |
924 TCPOLEN_TIMESTAMP);
925 rep.tsopt[1] = htonl(tcp_time_stamp);
926 rep.tsopt[2] = htonl(ts);
927 arg.iov[0].iov_len = sizeof(rep);
928 }
929
930 /* Swap the send and the receive. */
931 rep.th.dest = th->source;
932 rep.th.source = th->dest;
933 rep.th.doff = arg.iov[0].iov_len / 4;
934 rep.th.seq = htonl(seq);
935 rep.th.ack_seq = htonl(ack);
936 rep.th.ack = 1;
937 rep.th.window = htons(win);
938
939 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
940 skb->nh.iph->saddr, /*XXX*/
941 arg.iov[0].iov_len, IPPROTO_TCP, 0);
942 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
943
944 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
945
946 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
947 }
948
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951 struct inet_timewait_sock *tw = inet_twsk(sk);
952 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
955 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
956
957 inet_twsk_put(tw);
958 }
959
960 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
961 {
962 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
963 req->ts_recent);
964 }
965
966 struct dst_entry* inet_csk_route_req(struct sock *sk,
967 const struct request_sock *req)
968 {
969 struct rtable *rt;
970 const struct inet_request_sock *ireq = inet_rsk(req);
971 struct ip_options *opt = inet_rsk(req)->opt;
972 struct flowi fl = { .oif = sk->sk_bound_dev_if,
973 .nl_u = { .ip4_u =
974 { .daddr = ((opt && opt->srr) ?
975 opt->faddr :
976 ireq->rmt_addr),
977 .saddr = ireq->loc_addr,
978 .tos = RT_CONN_FLAGS(sk) } },
979 .proto = sk->sk_protocol,
980 .uli_u = { .ports =
981 { .sport = inet_sk(sk)->sport,
982 .dport = ireq->rmt_port } } };
983
984 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
985 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
986 return NULL;
987 }
988 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
989 ip_rt_put(rt);
990 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
991 return NULL;
992 }
993 return &rt->u.dst;
994 }
995
996 /*
997 * Send a SYN-ACK in reply to a SYN (also used to retransmit SYN-ACKs).
998 * This still operates on a request_sock only, not on a big
999 * socket.
1000 */
1001 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1002 struct dst_entry *dst)
1003 {
1004 const struct inet_request_sock *ireq = inet_rsk(req);
1005 int err = -1;
1006 struct sk_buff * skb;
1007
1008 /* First, grab a route. */
1009 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1010 goto out;
1011
1012 skb = tcp_make_synack(sk, dst, req);
1013
1014 if (skb) {
1015 struct tcphdr *th = skb->h.th;
1016
1017 th->check = tcp_v4_check(th, skb->len,
1018 ireq->loc_addr,
1019 ireq->rmt_addr,
1020 csum_partial((char *)th, skb->len,
1021 skb->csum));
1022
1023 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1024 ireq->rmt_addr,
1025 ireq->opt);
1026 if (err == NET_XMIT_CN)
1027 err = 0;
1028 }
1029
1030 out:
1031 dst_release(dst);
1032 return err;
1033 }
1034
1035 /*
1036 * IPv4 request_sock destructor.
1037 */
1038 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1039 {
1040 if (inet_rsk(req)->opt)
1041 kfree(inet_rsk(req)->opt);
1042 }
1043
1044 static inline void syn_flood_warning(struct sk_buff *skb)
1045 {
1046 static unsigned long warntime;
1047
1048 if (time_after(jiffies, (warntime + HZ * 60))) {
1049 warntime = jiffies;
1050 printk(KERN_INFO
1051 "possible SYN flooding on port %d. Sending cookies.\n",
1052 ntohs(skb->h.th->dest));
1053 }
1054 }
1055
1056 /*
1057 * Save and compile IPv4 options into the request_sock if needed.
1058 */
1059 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1060 struct sk_buff *skb)
1061 {
1062 struct ip_options *opt = &(IPCB(skb)->opt);
1063 struct ip_options *dopt = NULL;
1064
1065 if (opt && opt->optlen) {
1066 int opt_size = optlength(opt);
1067 dopt = kmalloc(opt_size, GFP_ATOMIC);
1068 if (dopt) {
1069 if (ip_options_echo(dopt, skb)) {
1070 kfree(dopt);
1071 dopt = NULL;
1072 }
1073 }
1074 }
1075 return dopt;
1076 }
1077
1078 struct request_sock_ops tcp_request_sock_ops = {
1079 .family = PF_INET,
1080 .obj_size = sizeof(struct tcp_request_sock),
1081 .rtx_syn_ack = tcp_v4_send_synack,
1082 .send_ack = tcp_v4_reqsk_send_ack,
1083 .destructor = tcp_v4_reqsk_destructor,
1084 .send_reset = tcp_v4_send_reset,
1085 };
1086
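/*
 * Passive-open entry point for an incoming SYN.  In outline: refuse SYNs
 * aimed at broadcast/multicast, fall back to syncookies (when configured)
 * once the SYN queue is full, drop early if the accept backlog is full
 * and more than one young request is already queued, then allocate a
 * request_sock, parse the options, pick or reconstruct the ISN, send the
 * SYN-ACK, and for the non-cookie case hash the request into the
 * listener's SYN table.
 */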
1087 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1088 {
1089 struct inet_request_sock *ireq;
1090 struct tcp_options_received tmp_opt;
1091 struct request_sock *req;
1092 __u32 saddr = skb->nh.iph->saddr;
1093 __u32 daddr = skb->nh.iph->daddr;
1094 __u32 isn = TCP_SKB_CB(skb)->when;
1095 struct dst_entry *dst = NULL;
1096 #ifdef CONFIG_SYN_COOKIES
1097 int want_cookie = 0;
1098 #else
1099 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1100 #endif
1101
1102 /* Never answer to SYNs sent to broadcast or multicast */
1103 if (((struct rtable *)skb->dst)->rt_flags &
1104 (RTCF_BROADCAST | RTCF_MULTICAST))
1105 goto drop;
1106
1107 /* TW buckets are converted to open requests without
1108 * limitation: they conserve resources and the peer is
1109 * evidently a real one.
1110 */
1111 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1112 #ifdef CONFIG_SYN_COOKIES
1113 if (sysctl_tcp_syncookies) {
1114 want_cookie = 1;
1115 } else
1116 #endif
1117 goto drop;
1118 }
1119
1120 /* Accept backlog is full. If we have already queued enough
1121 * warm entries in the syn queue, drop the request. It is better than
1122 * clogging the syn queue with openreqs with exponentially increasing
1123 * timeout.
1124 */
1125 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1126 goto drop;
1127
1128 req = reqsk_alloc(&tcp_request_sock_ops);
1129 if (!req)
1130 goto drop;
1131
1132 tcp_clear_options(&tmp_opt);
1133 tmp_opt.mss_clamp = 536;
1134 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1135
1136 tcp_parse_options(skb, &tmp_opt, 0);
1137
1138 if (want_cookie) {
1139 tcp_clear_options(&tmp_opt);
1140 tmp_opt.saw_tstamp = 0;
1141 }
1142
1143 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1144 /* Some OSes (unknown ones, but I see them on a web server that
1145 * contains information of interest only to Windows
1146 * users) do not send their timestamp in the SYN. It is an easy case:
1147 * we simply do not advertise TS support.
1148 */
1149 tmp_opt.saw_tstamp = 0;
1150 tmp_opt.tstamp_ok = 0;
1151 }
1152 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1153
1154 tcp_openreq_init(req, &tmp_opt, skb);
1155
1156 ireq = inet_rsk(req);
1157 ireq->loc_addr = daddr;
1158 ireq->rmt_addr = saddr;
1159 ireq->opt = tcp_v4_save_options(sk, skb);
1160 if (!want_cookie)
1161 TCP_ECN_create_request(req, skb->h.th);
1162
1163 if (want_cookie) {
1164 #ifdef CONFIG_SYN_COOKIES
1165 syn_flood_warning(skb);
1166 #endif
1167 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1168 } else if (!isn) {
1169 struct inet_peer *peer = NULL;
1170
1171 /* VJ's idea. We save last timestamp seen
1172 * from the destination in peer table, when entering
1173 * state TIME-WAIT, and check against it before
1174 * accepting new connection request.
1175 *
1176 * If "isn" is not zero, this request hit alive
1177 * timewait bucket, so that all the necessary checks
1178 * are made in the function processing timewait state.
1179 */
1180 if (tmp_opt.saw_tstamp &&
1181 sysctl_tcp_tw_recycle &&
1182 (dst = inet_csk_route_req(sk, req)) != NULL &&
1183 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1184 peer->v4daddr == saddr) {
1185 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1186 (s32)(peer->tcp_ts - req->ts_recent) >
1187 TCP_PAWS_WINDOW) {
1188 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1189 dst_release(dst);
1190 goto drop_and_free;
1191 }
1192 }
1193 /* Kill the following clause, if you dislike this way. */
1194 else if (!sysctl_tcp_syncookies &&
1195 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1196 (sysctl_max_syn_backlog >> 2)) &&
1197 (!peer || !peer->tcp_ts_stamp) &&
1198 (!dst || !dst_metric(dst, RTAX_RTT))) {
1199 /* Without syncookies the last quarter of the
1200 * backlog is reserved for destinations
1201 * proven to be alive.
1202 * It means that we keep communicating
1203 * with destinations already remembered
1204 * by the moment of the synflood.
1205 */
1206 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1207 "request from %u.%u."
1208 "%u.%u/%u\n",
1209 NIPQUAD(saddr),
1210 ntohs(skb->h.th->source)));
1211 dst_release(dst);
1212 goto drop_and_free;
1213 }
1214
1215 isn = tcp_v4_init_sequence(sk, skb);
1216 }
1217 tcp_rsk(req)->snt_isn = isn;
1218
1219 if (tcp_v4_send_synack(sk, req, dst))
1220 goto drop_and_free;
1221
1222 if (want_cookie) {
1223 reqsk_free(req);
1224 } else {
1225 tcp_v4_synq_add(sk, req);
1226 }
1227 return 0;
1228
1229 drop_and_free:
1230 reqsk_free(req);
1231 drop:
1232 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1233 return 0;
1234 }
1235
1236
1237 /*
1238 * The three way handshake has completed - we got a valid ACK -
1239 * now create the new socket.
1240 */
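/*
 * The child socket is filled from the request_sock: addresses and any
 * saved IP options move over, the route's MTU and advertised-MSS metric
 * seed the new socket's MSS state, and the child inherits the listener's
 * bound port before being hashed into the established table.
 */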
1241 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1242 struct request_sock *req,
1243 struct dst_entry *dst)
1244 {
1245 struct inet_request_sock *ireq;
1246 struct inet_sock *newinet;
1247 struct tcp_sock *newtp;
1248 struct sock *newsk;
1249
1250 if (sk_acceptq_is_full(sk))
1251 goto exit_overflow;
1252
1253 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1254 goto exit;
1255
1256 newsk = tcp_create_openreq_child(sk, req, skb);
1257 if (!newsk)
1258 goto exit;
1259
1260 sk_setup_caps(newsk, dst);
1261
1262 newtp = tcp_sk(newsk);
1263 newinet = inet_sk(newsk);
1264 ireq = inet_rsk(req);
1265 newinet->daddr = ireq->rmt_addr;
1266 newinet->rcv_saddr = ireq->loc_addr;
1267 newinet->saddr = ireq->loc_addr;
1268 newinet->opt = ireq->opt;
1269 ireq->opt = NULL;
1270 newinet->mc_index = inet_iif(skb);
1271 newinet->mc_ttl = skb->nh.iph->ttl;
1272 newtp->ext_header_len = 0;
1273 if (newinet->opt)
1274 newtp->ext_header_len = newinet->opt->optlen;
1275 newinet->id = newtp->write_seq ^ jiffies;
1276
1277 tcp_sync_mss(newsk, dst_mtu(dst));
1278 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1279 tcp_initialize_rcv_mss(newsk);
1280
1281 __inet_hash(&tcp_hashinfo, newsk, 0);
1282 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1283
1284 return newsk;
1285
1286 exit_overflow:
1287 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1288 exit:
1289 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1290 dst_release(dst);
1291 return NULL;
1292 }
1293
1294 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1295 {
1296 struct tcphdr *th = skb->h.th;
1297 struct iphdr *iph = skb->nh.iph;
1298 struct sock *nsk;
1299 struct request_sock **prev;
1300 /* Find possible connection requests. */
1301 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1302 iph->saddr, iph->daddr);
1303 if (req)
1304 return tcp_check_req(sk, skb, req, prev);
1305
1306 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1307 th->source, skb->nh.iph->daddr,
1308 ntohs(th->dest), inet_iif(skb));
1309
1310 if (nsk) {
1311 if (nsk->sk_state != TCP_TIME_WAIT) {
1312 bh_lock_sock(nsk);
1313 return nsk;
1314 }
1315 inet_twsk_put((struct inet_timewait_sock *)nsk);
1316 return NULL;
1317 }
1318
1319 #ifdef CONFIG_SYN_COOKIES
1320 if (!th->rst && !th->syn && th->ack)
1321 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1322 #endif
1323 return sk;
1324 }
1325
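/*
 * Receive checksum strategy: a hardware-verified sum is trusted if it
 * matches; short segments (<= 76 bytes) are verified in software right
 * here; longer ones only get the pseudo-header folded into skb->csum so
 * the check can be completed later, e.g. while copying to user space.
 */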
1326 static int tcp_v4_checksum_init(struct sk_buff *skb)
1327 {
1328 if (skb->ip_summed == CHECKSUM_HW) {
1329 skb->ip_summed = CHECKSUM_UNNECESSARY;
1330 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1331 skb->nh.iph->daddr, skb->csum))
1332 return 0;
1333
1334 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1335 skb->ip_summed = CHECKSUM_NONE;
1336 }
1337 if (skb->len <= 76) {
1338 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1339 skb->nh.iph->daddr,
1340 skb_checksum(skb, 0, skb->len, 0)))
1341 return -1;
1342 skb->ip_summed = CHECKSUM_UNNECESSARY;
1343 } else {
1344 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1345 skb->nh.iph->saddr,
1346 skb->nh.iph->daddr, 0);
1347 }
1348 return 0;
1349 }
1350
1351
1352 /* The socket must have its spinlock held when we get
1353 * here.
1354 *
1355 * We have a potential double-lock case here, so even when
1356 * doing backlog processing we use the BH locking scheme.
1357 * This is because we cannot sleep with the original spinlock
1358 * held.
1359 */
1360 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1361 {
1362 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1363 TCP_CHECK_TIMER(sk);
1364 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1365 goto reset;
1366 TCP_CHECK_TIMER(sk);
1367 return 0;
1368 }
1369
1370 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1371 goto csum_err;
1372
1373 if (sk->sk_state == TCP_LISTEN) {
1374 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1375 if (!nsk)
1376 goto discard;
1377
1378 if (nsk != sk) {
1379 if (tcp_child_process(sk, nsk, skb))
1380 goto reset;
1381 return 0;
1382 }
1383 }
1384
1385 TCP_CHECK_TIMER(sk);
1386 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1387 goto reset;
1388 TCP_CHECK_TIMER(sk);
1389 return 0;
1390
1391 reset:
1392 tcp_v4_send_reset(skb);
1393 discard:
1394 kfree_skb(skb);
1395 /* Be careful here. If this function gets more complicated and
1396 * gcc suffers from register pressure on the x86, sk (in %ebx)
1397 * might be destroyed here. This current version compiles correctly,
1398 * but you have been warned.
1399 */
1400 return 0;
1401
1402 csum_err:
1403 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1404 goto discard;
1405 }
1406
1407 /*
1408 * From tcp_input.c
1409 */
1410
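/*
 * Main receive path, called in softirq context.  It validates the header
 * and checksum, fills in the TCP control block, looks the segment up in
 * the established or listening hash, and then either processes it
 * directly, prequeues it for a waiting reader, or backlogs it when the
 * owner holds the socket lock.  TIME-WAIT sockets take the do_time_wait
 * branch, where a new SYN may be handed to a matching listener.
 */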
1411 int tcp_v4_rcv(struct sk_buff *skb)
1412 {
1413 struct tcphdr *th;
1414 struct sock *sk;
1415 int ret;
1416
1417 if (skb->pkt_type != PACKET_HOST)
1418 goto discard_it;
1419
1420 /* Count it even if it's bad */
1421 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1422
1423 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1424 goto discard_it;
1425
1426 th = skb->h.th;
1427
1428 if (th->doff < sizeof(struct tcphdr) / 4)
1429 goto bad_packet;
1430 if (!pskb_may_pull(skb, th->doff * 4))
1431 goto discard_it;
1432
1433 /* An explanation is required here, I think.
1434 * Packet length and doff are validated by header prediction,
1435 * provided the case of th->doff == 0 is eliminated.
1436 * So, we defer the checks. */
1437 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1438 tcp_v4_checksum_init(skb) < 0))
1439 goto bad_packet;
1440
1441 th = skb->h.th;
1442 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1443 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1444 skb->len - th->doff * 4);
1445 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1446 TCP_SKB_CB(skb)->when = 0;
1447 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1448 TCP_SKB_CB(skb)->sacked = 0;
1449
1450 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1451 skb->nh.iph->daddr, ntohs(th->dest),
1452 inet_iif(skb));
1453
1454 if (!sk)
1455 goto no_tcp_socket;
1456
1457 process:
1458 if (sk->sk_state == TCP_TIME_WAIT)
1459 goto do_time_wait;
1460
1461 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1462 goto discard_and_relse;
1463
1464 if (sk_filter(sk, skb, 0))
1465 goto discard_and_relse;
1466
1467 skb->dev = NULL;
1468
1469 bh_lock_sock(sk);
1470 ret = 0;
1471 if (!sock_owned_by_user(sk)) {
1472 if (!tcp_prequeue(sk, skb))
1473 ret = tcp_v4_do_rcv(sk, skb);
1474 } else
1475 sk_add_backlog(sk, skb);
1476 bh_unlock_sock(sk);
1477
1478 sock_put(sk);
1479
1480 return ret;
1481
1482 no_tcp_socket:
1483 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1484 goto discard_it;
1485
1486 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1487 bad_packet:
1488 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1489 } else {
1490 tcp_v4_send_reset(skb);
1491 }
1492
1493 discard_it:
1494 /* Discard frame. */
1495 kfree_skb(skb);
1496 return 0;
1497
1498 discard_and_relse:
1499 sock_put(sk);
1500 goto discard_it;
1501
1502 do_time_wait:
1503 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1504 inet_twsk_put((struct inet_timewait_sock *) sk);
1505 goto discard_it;
1506 }
1507
1508 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1509 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1510 inet_twsk_put((struct inet_timewait_sock *) sk);
1511 goto discard_it;
1512 }
1513 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1514 skb, th)) {
1515 case TCP_TW_SYN: {
1516 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1517 skb->nh.iph->daddr,
1518 ntohs(th->dest),
1519 inet_iif(skb));
1520 if (sk2) {
1521 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1522 inet_twsk_put((struct inet_timewait_sock *)sk);
1523 sk = sk2;
1524 goto process;
1525 }
1526 /* Fall through to ACK */
1527 }
1528 case TCP_TW_ACK:
1529 tcp_v4_timewait_ack(sk, skb);
1530 break;
1531 case TCP_TW_RST:
1532 goto no_tcp_socket;
1533 case TCP_TW_SUCCESS:;
1534 }
1535 goto discard_it;
1536 }
1537
1538 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1539 {
1540 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1541 struct inet_sock *inet = inet_sk(sk);
1542
1543 sin->sin_family = AF_INET;
1544 sin->sin_addr.s_addr = inet->daddr;
1545 sin->sin_port = inet->dport;
1546 }
1547
1548 /* VJ's idea. Save last timestamp seen from this destination
1549 * and hold it at least for normal timewait interval to use for duplicate
1550 * segment detection in subsequent connections, before they enter synchronized
1551 * state.
1552 */
1553
1554 int tcp_v4_remember_stamp(struct sock *sk)
1555 {
1556 struct inet_sock *inet = inet_sk(sk);
1557 struct tcp_sock *tp = tcp_sk(sk);
1558 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1559 struct inet_peer *peer = NULL;
1560 int release_it = 0;
1561
1562 if (!rt || rt->rt_dst != inet->daddr) {
1563 peer = inet_getpeer(inet->daddr, 1);
1564 release_it = 1;
1565 } else {
1566 if (!rt->peer)
1567 rt_bind_peer(rt, 1);
1568 peer = rt->peer;
1569 }
1570
1571 if (peer) {
1572 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1573 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1574 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1575 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1576 peer->tcp_ts = tp->rx_opt.ts_recent;
1577 }
1578 if (release_it)
1579 inet_putpeer(peer);
1580 return 1;
1581 }
1582
1583 return 0;
1584 }
1585
1586 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1587 {
1588 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1589
1590 if (peer) {
1591 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1592
1593 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1594 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1595 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1596 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1597 peer->tcp_ts = tcptw->tw_ts_recent;
1598 }
1599 inet_putpeer(peer);
1600 return 1;
1601 }
1602
1603 return 0;
1604 }
1605
1606 struct tcp_func ipv4_specific = {
1607 .queue_xmit = ip_queue_xmit,
1608 .send_check = tcp_v4_send_check,
1609 .rebuild_header = inet_sk_rebuild_header,
1610 .conn_request = tcp_v4_conn_request,
1611 .syn_recv_sock = tcp_v4_syn_recv_sock,
1612 .remember_stamp = tcp_v4_remember_stamp,
1613 .net_header_len = sizeof(struct iphdr),
1614 .setsockopt = ip_setsockopt,
1615 .getsockopt = ip_getsockopt,
1616 .addr2sockaddr = v4_addr2sockaddr,
1617 .sockaddr_len = sizeof(struct sockaddr_in),
1618 };
1619
1620 /* NOTE: A lot of things are set to zero explicitly by the call to
1621 * sk_alloc(), so they need not be done here.
1622 */
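/*
 * Defaults worth noting below: RTO and mdev start at TCP_TIMEOUT_INIT,
 * snd_cwnd starts at 2 segments (see the bandaid comment), ssthresh is
 * effectively infinite, mss_cache starts at the conservative 536, and
 * tcp_init_congestion_ops is attached as the initial congestion control.
 */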
1623 static int tcp_v4_init_sock(struct sock *sk)
1624 {
1625 struct tcp_sock *tp = tcp_sk(sk);
1626
1627 skb_queue_head_init(&tp->out_of_order_queue);
1628 tcp_init_xmit_timers(sk);
1629 tcp_prequeue_init(tp);
1630
1631 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
1632 tp->mdev = TCP_TIMEOUT_INIT;
1633
1634 /* So many TCP implementations out there (incorrectly) count the
1635 * initial SYN frame in their delayed-ACK and congestion control
1636 * algorithms that we must have the following bandaid to talk
1637 * efficiently to them. -DaveM
1638 */
1639 tp->snd_cwnd = 2;
1640
1641 /* See draft-stevens-tcpca-spec-01 for discussion of the
1642 * initialization of these values.
1643 */
1644 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1645 tp->snd_cwnd_clamp = ~0;
1646 tp->mss_cache = 536;
1647
1648 tp->reordering = sysctl_tcp_reordering;
1649 tp->ca_ops = &tcp_init_congestion_ops;
1650
1651 sk->sk_state = TCP_CLOSE;
1652
1653 sk->sk_write_space = sk_stream_write_space;
1654 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1655
1656 tp->af_specific = &ipv4_specific;
1657
1658 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1659 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1660
1661 atomic_inc(&tcp_sockets_allocated);
1662
1663 return 0;
1664 }
1665
1666 int tcp_v4_destroy_sock(struct sock *sk)
1667 {
1668 struct tcp_sock *tp = tcp_sk(sk);
1669
1670 tcp_clear_xmit_timers(sk);
1671
1672 tcp_cleanup_congestion_control(tp);
1673
1674 /* Clean up the write buffer. */
1675 sk_stream_writequeue_purge(sk);
1676
1677 /* Cleans up our, hopefully empty, out_of_order_queue. */
1678 __skb_queue_purge(&tp->out_of_order_queue);
1679
1680 /* Clean the prequeue; it really must be empty */
1681 __skb_queue_purge(&tp->ucopy.prequeue);
1682
1683 /* Clean up a referenced TCP bind bucket. */
1684 if (inet_csk(sk)->icsk_bind_hash)
1685 inet_put_port(&tcp_hashinfo, sk);
1686
1687 /*
1688 * If sendmsg cached page exists, toss it.
1689 */
1690 if (sk->sk_sndmsg_page) {
1691 __free_page(sk->sk_sndmsg_page);
1692 sk->sk_sndmsg_page = NULL;
1693 }
1694
1695 atomic_dec(&tcp_sockets_allocated);
1696
1697 return 0;
1698 }
1699
1700 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1701
1702 #ifdef CONFIG_PROC_FS
1703 /* Proc filesystem TCP sock list dumping. */
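/*
 * The iterators below walk, in order: every listening-hash bucket
 * (descending into each listener's SYN table for open requests), then
 * every established-hash bucket followed by its TIME-WAIT twin in the
 * upper half of ehash.  tcp4_seq_show() prints one line per entry in the
 * classic /proc/net/tcp layout shown in its header.
 */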
1704
1705 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1706 {
1707 return hlist_empty(head) ? NULL :
1708 list_entry(head->first, struct inet_timewait_sock, tw_node);
1709 }
1710
1711 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1712 {
1713 return tw->tw_node.next ?
1714 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1715 }
1716
1717 static void *listening_get_next(struct seq_file *seq, void *cur)
1718 {
1719 struct inet_connection_sock *icsk;
1720 struct hlist_node *node;
1721 struct sock *sk = cur;
1722 struct tcp_iter_state* st = seq->private;
1723
1724 if (!sk) {
1725 st->bucket = 0;
1726 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1727 goto get_sk;
1728 }
1729
1730 ++st->num;
1731
1732 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1733 struct request_sock *req = cur;
1734
1735 icsk = inet_csk(st->syn_wait_sk);
1736 req = req->dl_next;
1737 while (1) {
1738 while (req) {
1739 if (req->rsk_ops->family == st->family) {
1740 cur = req;
1741 goto out;
1742 }
1743 req = req->dl_next;
1744 }
1745 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1746 break;
1747 get_req:
1748 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1749 }
1750 sk = sk_next(st->syn_wait_sk);
1751 st->state = TCP_SEQ_STATE_LISTENING;
1752 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1753 } else {
1754 icsk = inet_csk(sk);
1755 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1756 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1757 goto start_req;
1758 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1759 sk = sk_next(sk);
1760 }
1761 get_sk:
1762 sk_for_each_from(sk, node) {
1763 if (sk->sk_family == st->family) {
1764 cur = sk;
1765 goto out;
1766 }
1767 icsk = inet_csk(sk);
1768 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1769 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1770 start_req:
1771 st->uid = sock_i_uid(sk);
1772 st->syn_wait_sk = sk;
1773 st->state = TCP_SEQ_STATE_OPENREQ;
1774 st->sbucket = 0;
1775 goto get_req;
1776 }
1777 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1778 }
1779 if (++st->bucket < INET_LHTABLE_SIZE) {
1780 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1781 goto get_sk;
1782 }
1783 cur = NULL;
1784 out:
1785 return cur;
1786 }
1787
1788 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1789 {
1790 void *rc = listening_get_next(seq, NULL);
1791
1792 while (rc && *pos) {
1793 rc = listening_get_next(seq, rc);
1794 --*pos;
1795 }
1796 return rc;
1797 }
1798
1799 static void *established_get_first(struct seq_file *seq)
1800 {
1801 struct tcp_iter_state* st = seq->private;
1802 void *rc = NULL;
1803
1804 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1805 struct sock *sk;
1806 struct hlist_node *node;
1807 struct inet_timewait_sock *tw;
1808
1809 /* We can reschedule _before_ having picked the target: */
1810 cond_resched_softirq();
1811
1812 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1813 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1814 if (sk->sk_family != st->family) {
1815 continue;
1816 }
1817 rc = sk;
1818 goto out;
1819 }
1820 st->state = TCP_SEQ_STATE_TIME_WAIT;
1821 inet_twsk_for_each(tw, node,
1822 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1823 if (tw->tw_family != st->family) {
1824 continue;
1825 }
1826 rc = tw;
1827 goto out;
1828 }
1829 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1830 st->state = TCP_SEQ_STATE_ESTABLISHED;
1831 }
1832 out:
1833 return rc;
1834 }
1835
1836 static void *established_get_next(struct seq_file *seq, void *cur)
1837 {
1838 struct sock *sk = cur;
1839 struct inet_timewait_sock *tw;
1840 struct hlist_node *node;
1841 struct tcp_iter_state* st = seq->private;
1842
1843 ++st->num;
1844
1845 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1846 tw = cur;
1847 tw = tw_next(tw);
1848 get_tw:
1849 while (tw && tw->tw_family != st->family) {
1850 tw = tw_next(tw);
1851 }
1852 if (tw) {
1853 cur = tw;
1854 goto out;
1855 }
1856 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1857 st->state = TCP_SEQ_STATE_ESTABLISHED;
1858
1859 /* We can reschedule between buckets: */
1860 cond_resched_softirq();
1861
1862 if (++st->bucket < tcp_hashinfo.ehash_size) {
1863 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1864 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1865 } else {
1866 cur = NULL;
1867 goto out;
1868 }
1869 } else
1870 sk = sk_next(sk);
1871
1872 sk_for_each_from(sk, node) {
1873 if (sk->sk_family == st->family)
1874 goto found;
1875 }
1876
1877 st->state = TCP_SEQ_STATE_TIME_WAIT;
1878 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1879 goto get_tw;
1880 found:
1881 cur = sk;
1882 out:
1883 return cur;
1884 }
1885
1886 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1887 {
1888 void *rc = established_get_first(seq);
1889
1890 while (rc && pos) {
1891 rc = established_get_next(seq, rc);
1892 --pos;
1893 }
1894 return rc;
1895 }
1896
1897 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1898 {
1899 void *rc;
1900 struct tcp_iter_state* st = seq->private;
1901
1902 inet_listen_lock(&tcp_hashinfo);
1903 st->state = TCP_SEQ_STATE_LISTENING;
1904 rc = listening_get_idx(seq, &pos);
1905
1906 if (!rc) {
1907 inet_listen_unlock(&tcp_hashinfo);
1908 local_bh_disable();
1909 st->state = TCP_SEQ_STATE_ESTABLISHED;
1910 rc = established_get_idx(seq, pos);
1911 }
1912
1913 return rc;
1914 }
1915
1916 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1917 {
1918 struct tcp_iter_state* st = seq->private;
1919 st->state = TCP_SEQ_STATE_LISTENING;
1920 st->num = 0;
1921 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1922 }
1923
1924 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1925 {
1926 void *rc = NULL;
1927 struct tcp_iter_state* st;
1928
1929 if (v == SEQ_START_TOKEN) {
1930 rc = tcp_get_idx(seq, 0);
1931 goto out;
1932 }
1933 st = seq->private;
1934
1935 switch (st->state) {
1936 case TCP_SEQ_STATE_OPENREQ:
1937 case TCP_SEQ_STATE_LISTENING:
1938 rc = listening_get_next(seq, v);
1939 if (!rc) {
1940 inet_listen_unlock(&tcp_hashinfo);
1941 local_bh_disable();
1942 st->state = TCP_SEQ_STATE_ESTABLISHED;
1943 rc = established_get_first(seq);
1944 }
1945 break;
1946 case TCP_SEQ_STATE_ESTABLISHED:
1947 case TCP_SEQ_STATE_TIME_WAIT:
1948 rc = established_get_next(seq, v);
1949 break;
1950 }
1951 out:
1952 ++*pos;
1953 return rc;
1954 }
1955
1956 static void tcp_seq_stop(struct seq_file *seq, void *v)
1957 {
1958 struct tcp_iter_state* st = seq->private;
1959
1960 switch (st->state) {
1961 case TCP_SEQ_STATE_OPENREQ:
1962 if (v) {
1963 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1964 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1965 }
1966 case TCP_SEQ_STATE_LISTENING:
1967 if (v != SEQ_START_TOKEN)
1968 inet_listen_unlock(&tcp_hashinfo);
1969 break;
1970 case TCP_SEQ_STATE_TIME_WAIT:
1971 case TCP_SEQ_STATE_ESTABLISHED:
1972 if (v)
1973 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1974 local_bh_enable();
1975 break;
1976 }
1977 }
1978
1979 static int tcp_seq_open(struct inode *inode, struct file *file)
1980 {
1981 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1982 struct seq_file *seq;
1983 struct tcp_iter_state *s;
1984 int rc;
1985
1986 if (unlikely(afinfo == NULL))
1987 return -EINVAL;
1988
1989 s = kmalloc(sizeof(*s), GFP_KERNEL);
1990 if (!s)
1991 return -ENOMEM;
1992 memset(s, 0, sizeof(*s));
1993 s->family = afinfo->family;
1994 s->seq_ops.start = tcp_seq_start;
1995 s->seq_ops.next = tcp_seq_next;
1996 s->seq_ops.show = afinfo->seq_show;
1997 s->seq_ops.stop = tcp_seq_stop;
1998
1999 rc = seq_open(file, &s->seq_ops);
2000 if (rc)
2001 goto out_kfree;
2002 seq = file->private_data;
2003 seq->private = s;
2004 out:
2005 return rc;
2006 out_kfree:
2007 kfree(s);
2008 goto out;
2009 }
2010
2011 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2012 {
2013 int rc = 0;
2014 struct proc_dir_entry *p;
2015
2016 if (!afinfo)
2017 return -EINVAL;
2018 afinfo->seq_fops->owner = afinfo->owner;
2019 afinfo->seq_fops->open = tcp_seq_open;
2020 afinfo->seq_fops->read = seq_read;
2021 afinfo->seq_fops->llseek = seq_lseek;
2022 afinfo->seq_fops->release = seq_release_private;
2023
2024 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2025 if (p)
2026 p->data = afinfo;
2027 else
2028 rc = -ENOMEM;
2029 return rc;
2030 }
2031
2032 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2033 {
2034 if (!afinfo)
2035 return;
2036 proc_net_remove(afinfo->name);
2037 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2038 }
2039
2040 static void get_openreq4(struct sock *sk, struct request_sock *req,
2041 char *tmpbuf, int i, int uid)
2042 {
2043 const struct inet_request_sock *ireq = inet_rsk(req);
2044 int ttd = req->expires - jiffies;
2045
2046 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2047 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2048 i,
2049 ireq->loc_addr,
2050 ntohs(inet_sk(sk)->sport),
2051 ireq->rmt_addr,
2052 ntohs(ireq->rmt_port),
2053 TCP_SYN_RECV,
2054 0, 0, /* could print option size, but that is af dependent. */
2055 1, /* timers active (only the expire timer) */
2056 jiffies_to_clock_t(ttd),
2057 req->retrans,
2058 uid,
2059 0, /* non standard timer */
2060 0, /* open_requests have no inode */
2061 atomic_read(&sk->sk_refcnt),
2062 req);
2063 }
2064
2065 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2066 {
2067 int timer_active;
2068 unsigned long timer_expires;
2069 struct tcp_sock *tp = tcp_sk(sp);
2070 const struct inet_connection_sock *icsk = inet_csk(sp);
2071 struct inet_sock *inet = inet_sk(sp);
2072 unsigned int dest = inet->daddr;
2073 unsigned int src = inet->rcv_saddr;
2074 __u16 destp = ntohs(inet->dport);
2075 __u16 srcp = ntohs(inet->sport);
2076
2077 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2078 timer_active = 1;
2079 timer_expires = icsk->icsk_timeout;
2080 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2081 timer_active = 4;
2082 timer_expires = icsk->icsk_timeout;
2083 } else if (timer_pending(&sp->sk_timer)) {
2084 timer_active = 2;
2085 timer_expires = sp->sk_timer.expires;
2086 } else {
2087 timer_active = 0;
2088 timer_expires = jiffies;
2089 }
2090
2091 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2092 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2093 i, src, srcp, dest, destp, sp->sk_state,
2094 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2095 timer_active,
2096 jiffies_to_clock_t(timer_expires - jiffies),
2097 icsk->icsk_retransmits,
2098 sock_i_uid(sp),
2099 tp->probes_out,
2100 sock_i_ino(sp),
2101 atomic_read(&sp->sk_refcnt), sp,
2102 icsk->icsk_rto,
2103 icsk->icsk_ack.ato,
2104 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2105 tp->snd_cwnd,
2106 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2107 }
2108
2109 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2110 {
2111 unsigned int dest, src;
2112 __u16 destp, srcp;
2113 int ttd = tw->tw_ttd - jiffies;
2114
2115 if (ttd < 0)
2116 ttd = 0;
2117
2118 dest = tw->tw_daddr;
2119 src = tw->tw_rcv_saddr;
2120 destp = ntohs(tw->tw_dport);
2121 srcp = ntohs(tw->tw_sport);
2122
2123 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2124 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2125 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2126 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2127 atomic_read(&tw->tw_refcnt), tw);
2128 }
2129
2130 #define TMPSZ 150
2131
2132 static int tcp4_seq_show(struct seq_file *seq, void *v)
2133 {
2134 struct tcp_iter_state* st;
2135 char tmpbuf[TMPSZ + 1];
2136
2137 if (v == SEQ_START_TOKEN) {
2138 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2139 " sl local_address rem_address st tx_queue "
2140 "rx_queue tr tm->when retrnsmt uid timeout "
2141 "inode");
2142 goto out;
2143 }
2144 st = seq->private;
2145
2146 switch (st->state) {
2147 case TCP_SEQ_STATE_LISTENING:
2148 case TCP_SEQ_STATE_ESTABLISHED:
2149 get_tcp4_sock(v, tmpbuf, st->num);
2150 break;
2151 case TCP_SEQ_STATE_OPENREQ:
2152 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2153 break;
2154 case TCP_SEQ_STATE_TIME_WAIT:
2155 get_timewait4_sock(v, tmpbuf, st->num);
2156 break;
2157 }
2158 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2159 out:
2160 return 0;
2161 }
2162
2163 static struct file_operations tcp4_seq_fops;
2164 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2165 .owner = THIS_MODULE,
2166 .name = "tcp",
2167 .family = AF_INET,
2168 .seq_show = tcp4_seq_show,
2169 .seq_fops = &tcp4_seq_fops,
2170 };
2171
2172 int __init tcp4_proc_init(void)
2173 {
2174 return tcp_proc_register(&tcp4_seq_afinfo);
2175 }
2176
2177 void tcp4_proc_exit(void)
2178 {
2179 tcp_proc_unregister(&tcp4_seq_afinfo);
2180 }
2181 #endif /* CONFIG_PROC_FS */
2182
2183 struct proto tcp_prot = {
2184 .name = "TCP",
2185 .owner = THIS_MODULE,
2186 .close = tcp_close,
2187 .connect = tcp_v4_connect,
2188 .disconnect = tcp_disconnect,
2189 .accept = inet_csk_accept,
2190 .ioctl = tcp_ioctl,
2191 .init = tcp_v4_init_sock,
2192 .destroy = tcp_v4_destroy_sock,
2193 .shutdown = tcp_shutdown,
2194 .setsockopt = tcp_setsockopt,
2195 .getsockopt = tcp_getsockopt,
2196 .sendmsg = tcp_sendmsg,
2197 .recvmsg = tcp_recvmsg,
2198 .backlog_rcv = tcp_v4_do_rcv,
2199 .hash = tcp_v4_hash,
2200 .unhash = tcp_unhash,
2201 .get_port = tcp_v4_get_port,
2202 .enter_memory_pressure = tcp_enter_memory_pressure,
2203 .sockets_allocated = &tcp_sockets_allocated,
2204 .memory_allocated = &tcp_memory_allocated,
2205 .memory_pressure = &tcp_memory_pressure,
2206 .sysctl_mem = sysctl_tcp_mem,
2207 .sysctl_wmem = sysctl_tcp_wmem,
2208 .sysctl_rmem = sysctl_tcp_rmem,
2209 .max_header = MAX_TCP_HEADER,
2210 .obj_size = sizeof(struct tcp_sock),
2211 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2212 .rsk_prot = &tcp_request_sock_ops,
2213 };
2214
2215
2216
2217 void __init tcp_v4_init(struct net_proto_family *ops)
2218 {
2219 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2220 if (err < 0)
2221 panic("Failed to create the TCP control socket.\n");
2222 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2223 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2224
2225 /* Unhash it so that IP input processing does not even
2226 * see it, we do not wish this socket to see incoming
2227 * packets.
2228 */
2229 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2230 }
2231
2232 EXPORT_SYMBOL(ipv4_specific);
2233 EXPORT_SYMBOL(inet_bind_bucket_create);
2234 EXPORT_SYMBOL(tcp_hashinfo);
2235 EXPORT_SYMBOL(tcp_prot);
2236 EXPORT_SYMBOL(tcp_unhash);
2237 EXPORT_SYMBOL(tcp_v4_conn_request);
2238 EXPORT_SYMBOL(tcp_v4_connect);
2239 EXPORT_SYMBOL(tcp_v4_do_rcv);
2240 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2241 EXPORT_SYMBOL(tcp_v4_send_check);
2242 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2243
2244 #ifdef CONFIG_PROC_FS
2245 EXPORT_SYMBOL(tcp_proc_register);
2246 EXPORT_SYMBOL(tcp_proc_unregister);
2247 #endif
2248 EXPORT_SYMBOL(sysctl_local_port_range);
2249 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2250 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2251