net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
   42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/tcp.h>
  68 #include <net/ipv6.h>
  69 #include <net/inet_common.h>
  70 #include <net/xfrm.h>
  71
  72 #include <linux/inet.h>
  73 #include <linux/ipv6.h>
  74 #include <linux/stddef.h>
  75 #include <linux/proc_fs.h>
  76 #include <linux/seq_file.h>
  77
  78 extern int sysctl_ip_dynaddr;
  79 int sysctl_tcp_tw_reuse;
  80 int sysctl_tcp_low_latency;
  81
  82 /* Check TCP sequence numbers in ICMP packets. */
  83 #define ICMP_MIN_LENGTH 8
  84
  85 /* Socket used for sending RSTs */
  86 static struct socket *tcp_socket;
  87
  88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                        struct sk_buff *skb);
  90
  91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94         .__tcp_lhash_wait
  95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97 };
  98
  99 /*
 100  * This array holds the first and last local port number.
 101  * For high-usage systems, use sysctl to change this to
 102  * 32768-61000
 103  */
 104 int sysctl_local_port_range[2] = { 1024, 4999 };
 105 int tcp_port_rover = 1024 - 1;
 106
 107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                  __u32 faddr, __u16 fport)
 109 {
 110         int h = (laddr ^ lport) ^ (faddr ^ fport);
 111         h ^= h >> 16;
 112         h ^= h >> 8;
 113         return h & (tcp_ehash_size - 1);
 114 }
 115
 116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117 {
 118         struct inet_sock *inet = inet_sk(sk);
 119         __u32 laddr = inet->rcv_saddr;
 120         __u16 lport = inet->num;
 121         __u32 faddr = inet->daddr;
 122         __u16 fport = inet->dport;
 123
 124         return tcp_hashfn(laddr, lport, faddr, fport);
 125 }
 126
 127 /* Allocate and initialize a new TCP local port bind bucket.
 128  * The bindhash mutex for snum's hash chain must be held here.
 129  */
 130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                           unsigned short snum)
 132 {
 133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                       SLAB_ATOMIC);
 135         if (tb) {
 136                 tb->port = snum;
 137                 tb->fastreuse = 0;
 138                 INIT_HLIST_HEAD(&tb->owners);
 139                 hlist_add_head(&tb->node, &head->chain);
 140         }
 141         return tb;
 142 }
 143
 144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
 145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146 {
 147         if (hlist_empty(&tb->owners)) {
 148                 __hlist_del(&tb->node);
 149                 kmem_cache_free(tcp_bucket_cachep, tb);
 150         }
 151 }
 152
 153 /* Caller must disable local BH processing. */
 154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155 {
 156         struct tcp_bind_hashbucket *head =
 157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158         struct tcp_bind_bucket *tb;
 159
 160         spin_lock(&head->lock);
 161         tb = tcp_sk(sk)->bind_hash;
 162         sk_add_bind_node(child, &tb->owners);
 163         tcp_sk(child)->bind_hash = tb;
 164         spin_unlock(&head->lock);
 165 }
 166
 167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168 {
 169         local_bh_disable();
 170         __tcp_inherit_port(sk, child);
 171         local_bh_enable();
 172 }
 173
 174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                    unsigned short snum)
 176 {
 177         inet_sk(sk)->num = snum;
 178         sk_add_bind_node(sk, &tb->owners);
 179         tcp_sk(sk)->bind_hash = tb;
 180 }
 181
 182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183 {
 184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185         struct sock *sk2;
 186         struct hlist_node *node;
 187         int reuse = sk->sk_reuse;
 188
 189         sk_for_each_bound(sk2, node, &tb->owners) {
 190                 if (sk != sk2 &&
 191                     !tcp_v6_ipv6only(sk2) &&
 192                     (!sk->sk_bound_dev_if ||
 193                      !sk2->sk_bound_dev_if ||
 194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                         if (!reuse || !sk2->sk_reuse ||
 196                             sk2->sk_state == TCP_LISTEN) {
 197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                     sk2_rcv_saddr == sk_rcv_saddr)
 200                                         break;
 201                         }
 202                 }
 203         }
 204         return node != NULL;
 205 }
 206
 207 /* Obtain a reference to a local port for the given sock,
 208  * if snum is zero it means select any available local port.
 209  */
 210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211 {
 212         struct tcp_bind_hashbucket *head;
 213         struct hlist_node *node;
 214         struct tcp_bind_bucket *tb;
 215         int ret;
 216
 217         local_bh_disable();
 218         if (!snum) {
 219                 int low = sysctl_local_port_range[0];
 220                 int high = sysctl_local_port_range[1];
 221                 int remaining = (high - low) + 1;
 222                 int rover;
 223
 224                 spin_lock(&tcp_portalloc_lock);
 225                 if (tcp_port_rover < low)
 226                         rover = low;
 227                 else
 228                         rover = tcp_port_rover;
 229                 do {
 230                         rover++;
 231                         if (rover > high)
 232                                 rover = low;
 233                         head = &tcp_bhash[tcp_bhashfn(rover)];
 234                         spin_lock(&head->lock);
 235                         tb_for_each(tb, node, &head->chain)
 236                                 if (tb->port == rover)
 237                                         goto next;
 238                         break;
 239                 next:
 240                         spin_unlock(&head->lock);
 241                 } while (--remaining > 0);
 242                 tcp_port_rover = rover;
 243                 spin_unlock(&tcp_portalloc_lock);
 244
 245                 /* Exhausted local port range during search?  It is not
 246                  * possible for us to be holding one of the bind hash
 247                  * locks if this test triggers, because if 'remaining'
 248                  * drops to zero, we broke out of the do/while loop at
 249                  * the top level, not from the 'break;' statement.
 250                  */
 251                 ret = 1;
 252                 if (unlikely(remaining <= 0))
 253                         goto fail;
 254
 255                 /* OK, here is the one we will use.  HEAD is
 256                  * non-NULL and we hold it's mutex.
 257                  */
 258                 snum = rover;
 259         } else {
 260                 head = &tcp_bhash[tcp_bhashfn(snum)];
 261                 spin_lock(&head->lock);
 262                 tb_for_each(tb, node, &head->chain)
 263                         if (tb->port == snum)
 264                                 goto tb_found;
 265         }
 266         tb = NULL;
 267         goto tb_not_found;
 268 tb_found:
 269         if (!hlist_empty(&tb->owners)) {
 270                 if (sk->sk_reuse > 1)
 271                         goto success;
 272                 if (tb->fastreuse > 0 &&
 273                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 274                         goto success;
 275                 } else {
 276                         ret = 1;
 277                         if (tcp_bind_conflict(sk, tb))
 278                                 goto fail_unlock;
 279                 }
 280         }
 281 tb_not_found:
 282         ret = 1;
 283         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 284                 goto fail_unlock;
 285         if (hlist_empty(&tb->owners)) {
 286                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 287                         tb->fastreuse = 1;
 288                 else
 289                         tb->fastreuse = 0;
 290         } else if (tb->fastreuse &&
 291                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 292                 tb->fastreuse = 0;
 293 success:
 294         if (!tcp_sk(sk)->bind_hash)
 295                 tcp_bind_hash(sk, tb, snum);
 296         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 297         ret = 0;
 298
 299 fail_unlock:
 300         spin_unlock(&head->lock);
 301 fail:
 302         local_bh_enable();
 303         return ret;
 304 }
 305
 306 /* Get rid of any references to a local port held by the
 307  * given sock.
 308  */
 309 static void __tcp_put_port(struct sock *sk)
 310 {
 311         struct inet_sock *inet = inet_sk(sk);
 312         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 313         struct tcp_bind_bucket *tb;
 314
 315         spin_lock(&head->lock);
 316         tb = tcp_sk(sk)->bind_hash;
 317         __sk_del_bind_node(sk);
 318         tcp_sk(sk)->bind_hash = NULL;
 319         inet->num = 0;
 320         tcp_bucket_destroy(tb);
 321         spin_unlock(&head->lock);
 322 }
 323
 324 void tcp_put_port(struct sock *sk)
 325 {
 326         local_bh_disable();
 327         __tcp_put_port(sk);
 328         local_bh_enable();
 329 }
 330
 331 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 332  * Look, when several writers sleep and reader wakes them up, all but one
 333  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 334  * this, _but_ remember, it adds useless work on UP machines (wake up each
 335  * exclusive lock release). It should be ifdefed really.
 336  */
 337
 338 void tcp_listen_wlock(void)
 339 {
 340         write_lock(&tcp_lhash_lock);
 341
 342         if (atomic_read(&tcp_lhash_users)) {
 343                 DEFINE_WAIT(wait);
 344
 345                 for (;;) {
 346                         prepare_to_wait_exclusive(&tcp_lhash_wait,
 347                                                 &wait, TASK_UNINTERRUPTIBLE);
 348                         if (!atomic_read(&tcp_lhash_users))
 349                                 break;
 350                         write_unlock_bh(&tcp_lhash_lock);
 351                         schedule();
 352                         write_lock_bh(&tcp_lhash_lock);
 353                 }
 354
 355                 finish_wait(&tcp_lhash_wait, &wait);
 356         }
 357 }
 358
 359 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 360 {
 361         struct hlist_head *list;
 362         rwlock_t *lock;
 363
 364         BUG_TRAP(sk_unhashed(sk));
 365         if (listen_possible && sk->sk_state == TCP_LISTEN) {
 366                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 367                 lock = &tcp_lhash_lock;
 368                 tcp_listen_wlock();
 369         } else {
 370                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 371                 lock = &tcp_ehash[sk->sk_hashent].lock;
 372                 write_lock(lock);
 373         }
 374         __sk_add_node(sk, list);
 375         sock_prot_inc_use(sk->sk_prot);
 376         write_unlock(lock);
 377         if (listen_possible && sk->sk_state == TCP_LISTEN)
 378                 wake_up(&tcp_lhash_wait);
 379 }
 380
 381 static void tcp_v4_hash(struct sock *sk)
 382 {
 383         if (sk->sk_state != TCP_CLOSE) {
 384                 local_bh_disable();
 385                 __tcp_v4_hash(sk, 1);
 386                 local_bh_enable();
 387         }
 388 }
 389
 390 void tcp_unhash(struct sock *sk)
 391 {
 392         rwlock_t *lock;
 393
 394         if (sk_unhashed(sk))
 395                 goto ende;
 396
 397         if (sk->sk_state == TCP_LISTEN) {
 398                 local_bh_disable();
 399                 tcp_listen_wlock();
 400                 lock = &tcp_lhash_lock;
 401         } else {
 402                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 403                 lock = &head->lock;
 404                 write_lock_bh(&head->lock);
 405         }
 406
 407         if (__sk_del_node_init(sk))
 408                 sock_prot_dec_use(sk->sk_prot);
 409         write_unlock_bh(lock);
 410
 411  ende:
 412         if (sk->sk_state == TCP_LISTEN)
 413                 wake_up(&tcp_lhash_wait);
 414 }
 415
 416 /* Don't inline this cruft.  Here are some nice properties to
 417  * exploit here.  The BSD API does not allow a listening TCP
 418  * to specify the remote port nor the remote address for the
 419  * connection.  So always assume those are both wildcarded
 420  * during the search since they can never be otherwise.
 421  */
 422 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 423                                              unsigned short hnum, int dif)
 424 {
 425         struct sock *result = NULL, *sk;
 426         struct hlist_node *node;
 427         int score, hiscore;
 428
 429         hiscore=-1;
 430         sk_for_each(sk, node, head) {
 431                 struct inet_sock *inet = inet_sk(sk);
 432
 433                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
 434                         __u32 rcv_saddr = inet->rcv_saddr;
 435
 436                         score = (sk->sk_family == PF_INET ? 1 : 0);
 437                         if (rcv_saddr) {
 438                                 if (rcv_saddr != daddr)
 439                                         continue;
 440                                 score+=2;
 441                         }
 442                         if (sk->sk_bound_dev_if) {
 443                                 if (sk->sk_bound_dev_if != dif)
 444                                         continue;
 445                                 score+=2;
 446                         }
 447                         if (score == 5)
 448                                 return sk;
 449                         if (score > hiscore) {
 450                                 hiscore = score;
 451                                 result = sk;
 452                         }
 453                 }
 454         }
 455         return result;
 456 }
 457
 458 /* Optimize the common listener case. */
 459 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
 460                 unsigned short hnum, int dif)
 461 {
 462         struct sock *sk = NULL;
 463         struct hlist_head *head;
 464
 465         read_lock(&tcp_lhash_lock);
 466         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 467         if (!hlist_empty(head)) {
 468                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
 469
 470                 if (inet->num == hnum && !sk->sk_node.next &&
 471                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 472                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 473                     !sk->sk_bound_dev_if)
 474                         goto sherry_cache;
 475                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 476         }
 477         if (sk) {
 478 sherry_cache:
 479                 sock_hold(sk);
 480         }
 481         read_unlock(&tcp_lhash_lock);
 482         return sk;
 483 }
 484
 485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 486  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 487  *
 488  * Local BH must be disabled here.
 489  */
 490
 491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 492                                                        u32 daddr, u16 hnum,
 493                                                        int dif)
 494 {
 495         struct tcp_ehash_bucket *head;
 496         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 497         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 498         struct sock *sk;
 499         struct hlist_node *node;
 500         /* Optimize here for direct hit, only listening connections can
 501          * have wildcards anyways.
 502          */
 503         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 504         head = &tcp_ehash[hash];
 505         read_lock(&head->lock);
 506         sk_for_each(sk, node, &head->chain) {
 507                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 508                         goto hit; /* You sunk my battleship! */
 509         }
 510
 511         /* Must check for a TIME_WAIT'er before going to listener hash. */
 512         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 513                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 514                         goto hit;
 515         }
 516         sk = NULL;
 517 out:
 518         read_unlock(&head->lock);
 519         return sk;
 520 hit:
 521         sock_hold(sk);
 522         goto out;
 523 }
 524
 525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 526                                            u32 daddr, u16 hnum, int dif)
 527 {
 528         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 529                                                       daddr, hnum, dif);
 530
 531         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 532 }
 533
 534 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 535                                   u16 dport, int dif)
 536 {
 537         struct sock *sk;
 538
 539         local_bh_disable();
 540         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 541         local_bh_enable();
 542
 543         return sk;
 544 }
 545
 546 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
 547
 548 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 549 {
 550         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 551                                           skb->nh.iph->saddr,
 552                                           skb->h.th->dest,
 553                                           skb->h.th->source);
 554 }
 555
 556 /* called with local bh disabled */
 557 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 558                                       struct tcp_tw_bucket **twp)
 559 {
 560         struct inet_sock *inet = inet_sk(sk);
 561         u32 daddr = inet->rcv_saddr;
 562         u32 saddr = inet->daddr;
 563         int dif = sk->sk_bound_dev_if;
 564         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 565         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 566         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 567         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 568         struct sock *sk2;
 569         struct hlist_node *node;
 570         struct tcp_tw_bucket *tw;
 571
 572         write_lock(&head->lock);
 573
 574         /* Check TIME-WAIT sockets first. */
 575         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 576                 tw = (struct tcp_tw_bucket *)sk2;
 577
 578                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 579                         struct tcp_sock *tp = tcp_sk(sk);
 580
 581                         /* With PAWS, it is safe from the viewpoint
 582                            of data integrity. Even without PAWS it
 583                            is safe provided sequence spaces do not
 584                            overlap i.e. at data rates <= 80Mbit/sec.
 585
 586                            Actually, the idea is close to VJ's one,
 587                            only timestamp cache is held not per host,
 588                            but per port pair and TW bucket is used
 589                            as state holder.
 590
 591                            If TW bucket has been already destroyed we
 592                            fall back to VJ's scheme and use initial
 593                            timestamp retrieved from peer table.
 594                          */
 595                         if (tw->tw_ts_recent_stamp &&
 596                             (!twp || (sysctl_tcp_tw_reuse &&
 597                                       xtime.tv_sec -
 598                                       tw->tw_ts_recent_stamp > 1))) {
 599                                 if ((tp->write_seq =
 600                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
 601                                         tp->write_seq = 1;
 602                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
 603                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
 604                                 sock_hold(sk2);
 605                                 goto unique;
 606                         } else
 607                                 goto not_unique;
 608                 }
 609         }
 610         tw = NULL;
 611
 612         /* And established part... */
 613         sk_for_each(sk2, node, &head->chain) {
 614                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 615                         goto not_unique;
 616         }
 617
 618 unique:
 619         /* Must record num and sport now. Otherwise we will see
 620          * in hash table socket with a funny identity. */
 621         inet->num = lport;
 622         inet->sport = htons(lport);
 623         sk->sk_hashent = hash;
 624         BUG_TRAP(sk_unhashed(sk));
 625         __sk_add_node(sk, &head->chain);
 626         sock_prot_inc_use(sk->sk_prot);
 627         write_unlock(&head->lock);
 628
 629         if (twp) {
 630                 *twp = tw;
 631                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 632         } else if (tw) {
 633                 /* Silly. Should hash-dance instead... */
 634                 tcp_tw_deschedule(tw);
 635                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 636
 637                 tcp_tw_put(tw);
 638         }
 639
 640         return 0;
 641
 642 not_unique:
 643         write_unlock(&head->lock);
 644         return -EADDRNOTAVAIL;
 645 }
 646
 647 static inline u32 connect_port_offset(const struct sock *sk)
 648 {
 649         const struct inet_sock *inet = inet_sk(sk);
 650
 651         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 652                                          inet->dport);
 653 }
 654
 655 /*
 656  * Bind a port for a connect operation and hash it.
 657  */
 658 static inline int tcp_v4_hash_connect(struct sock *sk)
 659 {
 660         unsigned short snum = inet_sk(sk)->num;
 661         struct tcp_bind_hashbucket *head;
 662         struct tcp_bind_bucket *tb;
 663         int ret;
 664
 665         if (!snum) {
 666                 int low = sysctl_local_port_range[0];
 667                 int high = sysctl_local_port_range[1];
 668                 int range = high - low;
 669                 int i;
 670                 int port;
 671                 static u32 hint;
 672                 u32 offset = hint + connect_port_offset(sk);
 673                 struct hlist_node *node;
 674                 struct tcp_tw_bucket *tw = NULL;
 675
 676                 local_bh_disable();
 677                 for (i = 1; i <= range; i++) {
 678                         port = low + (i + offset) % range;
 679                         head = &tcp_bhash[tcp_bhashfn(port)];
 680                         spin_lock(&head->lock);
 681
 682                         /* Does not bother with rcv_saddr checks,
 683                          * because the established check is already
 684                          * unique enough.
 685                          */
 686                         tb_for_each(tb, node, &head->chain) {
 687                                 if (tb->port == port) {
 688                                         BUG_TRAP(!hlist_empty(&tb->owners));
 689                                         if (tb->fastreuse >= 0)
 690                                                 goto next_port;
 691                                         if (!__tcp_v4_check_established(sk,
 692                                                                         port,
 693                                                                         &tw))
 694                                                 goto ok;
 695                                         goto next_port;
 696                                 }
 697                         }
 698
 699                         tb = tcp_bucket_create(head, port);
 700                         if (!tb) {
 701                                 spin_unlock(&head->lock);
 702                                 break;
 703                         }
 704                         tb->fastreuse = -1;
 705                         goto ok;
 706
 707                 next_port:
 708                         spin_unlock(&head->lock);
 709                 }
 710                 local_bh_enable();
 711
 712                 return -EADDRNOTAVAIL;
 713
 714 ok:
 715                 hint += i;
 716
 717                 /* Head lock still held and bh's disabled */
 718                 tcp_bind_hash(sk, tb, port);
 719                 if (sk_unhashed(sk)) {
 720                         inet_sk(sk)->sport = htons(port);
 721                         __tcp_v4_hash(sk, 0);
 722                 }
 723                 spin_unlock(&head->lock);
 724
 725                 if (tw) {
 726                         tcp_tw_deschedule(tw);
 727                         tcp_tw_put(tw);
 728                 }
 729
 730                 ret = 0;
 731                 goto out;
 732         }
 733
 734         head  = &tcp_bhash[tcp_bhashfn(snum)];
 735         tb  = tcp_sk(sk)->bind_hash;
 736         spin_lock_bh(&head->lock);
 737         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 738                 __tcp_v4_hash(sk, 0);
 739                 spin_unlock_bh(&head->lock);
 740                 return 0;
 741         } else {
 742                 spin_unlock(&head->lock);
 743                 /* No definite answer... Walk to established hash table */
 744                 ret = __tcp_v4_check_established(sk, snum, NULL);
 745 out:
 746                 local_bh_enable();
 747                 return ret;
 748         }
 749 }
 750
 751 /* This will initiate an outgoing connection. */
 752 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 753 {
 754         struct inet_sock *inet = inet_sk(sk);
 755         struct tcp_sock *tp = tcp_sk(sk);
 756         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 757         struct rtable *rt;
 758         u32 daddr, nexthop;
 759         int tmp;
 760         int err;
 761
 762         if (addr_len < sizeof(struct sockaddr_in))
 763                 return -EINVAL;
 764
 765         if (usin->sin_family != AF_INET)
 766                 return -EAFNOSUPPORT;
 767
 768         nexthop = daddr = usin->sin_addr.s_addr;
 769         if (inet->opt && inet->opt->srr) {
 770                 if (!daddr)
 771                         return -EINVAL;
 772                 nexthop = inet->opt->faddr;
 773         }
 774
 775         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 776                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 777                                IPPROTO_TCP,
 778                                inet->sport, usin->sin_port, sk);
 779         if (tmp < 0)
 780                 return tmp;
 781
 782         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 783                 ip_rt_put(rt);
 784                 return -ENETUNREACH;
 785         }
 786
 787         if (!inet->opt || !inet->opt->srr)
 788                 daddr = rt->rt_dst;
 789
 790         if (!inet->saddr)
 791                 inet->saddr = rt->rt_src;
 792         inet->rcv_saddr = inet->saddr;
 793
 794         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 795                 /* Reset inherited state */
 796                 tp->rx_opt.ts_recent       = 0;
 797                 tp->rx_opt.ts_recent_stamp = 0;
 798                 tp->write_seq              = 0;
 799         }
 800
 801         if (sysctl_tcp_tw_recycle &&
 802             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 803                 struct inet_peer *peer = rt_get_peer(rt);
 804
 805                 /* VJ's idea. We save last timestamp seen from
 806                  * the destination in peer table, when entering state TIME-WAIT
 807                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 808                  */
 809
 810                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 811                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 812                         tp->rx_opt.ts_recent = peer->tcp_ts;
 813                 }
 814         }
 815
 816         inet->dport = usin->sin_port;
 817         inet->daddr = daddr;
 818
 819         tp->ext_header_len = 0;
 820         if (inet->opt)
 821                 tp->ext_header_len = inet->opt->optlen;
 822
 823         tp->rx_opt.mss_clamp = 536;
 824
 825         /* Socket identity is still unknown (sport may be zero).
 826          * However we set state to SYN-SENT and not releasing socket
 827          * lock select source port, enter ourselves into the hash tables and
 828          * complete initialization after this.
 829          */
 830         tcp_set_state(sk, TCP_SYN_SENT);
 831         err = tcp_v4_hash_connect(sk);
 832         if (err)
 833                 goto failure;
 834
 835         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 836         if (err)
 837                 goto failure;
 838
 839         /* OK, now commit destination to socket.  */
 840         __sk_dst_set(sk, &rt->u.dst);
 841         tcp_v4_setup_caps(sk, &rt->u.dst);
 842
 843         if (!tp->write_seq)
 844                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 845                                                            inet->daddr,
 846                                                            inet->sport,
 847                                                            usin->sin_port);
 848
 849         inet->id = tp->write_seq ^ jiffies;
 850
 851         err = tcp_connect(sk);
 852         rt = NULL;
 853         if (err)
 854                 goto failure;
 855
 856         return 0;
 857
 858 failure:
 859         /* This unhashes the socket and releases the local port, if necessary. */
 860         tcp_set_state(sk, TCP_CLOSE);
 861         ip_rt_put(rt);
 862         sk->sk_route_caps = 0;
 863         inet->dport = 0;
 864         return err;
 865 }
 866
 867 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 868 {
 869         return ((struct rtable *)skb->dst)->rt_iif;
 870 }
 871
 872 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 873 {
 874         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 875 }
 876
 877 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 878                                               struct request_sock ***prevp,
 879                                               __u16 rport,
 880                                               __u32 raddr, __u32 laddr)
 881 {
 882         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 883         struct request_sock *req, **prev;
 884
 885         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 886              (req = *prev) != NULL;
 887              prev = &req->dl_next) {
 888                 const struct inet_request_sock *ireq = inet_rsk(req);
 889
 890                 if (ireq->rmt_port == rport &&
 891                     ireq->rmt_addr == raddr &&
 892                     ireq->loc_addr == laddr &&
 893                     TCP_INET_FAMILY(req->rsk_ops->family)) {
 894                         BUG_TRAP(!req->sk);
 895                         *prevp = prev;
 896                         break;
 897                 }
 898         }
 899
 900         return req;
 901 }
 902
 903 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 904 {
 905         struct tcp_sock *tp = tcp_sk(sk);
 906         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 907         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 908
 909         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 910         tcp_synq_added(sk);
 911 }
 912
 913
 914 /*
 915  * This routine does path mtu discovery as defined in RFC1191.
 916  */
 917 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 918                                      u32 mtu)
 919 {
 920         struct dst_entry *dst;
 921         struct inet_sock *inet = inet_sk(sk);
 922         struct tcp_sock *tp = tcp_sk(sk);
 923
 924         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 925          * send out by Linux are always <576bytes so they should go through
 926          * unfragmented).
 927          */
 928         if (sk->sk_state == TCP_LISTEN)
 929                 return;
 930
 931         /* We don't check in the destentry if pmtu discovery is forbidden
 932          * on this route. We just assume that no packet_to_big packets
 933          * are send back when pmtu discovery is not active.
 934          * There is a small race when the user changes this flag in the
 935          * route, but I think that's acceptable.
 936          */
 937         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 938                 return;
 939
 940         dst->ops->update_pmtu(dst, mtu);
 941
 942         /* Something is about to be wrong... Remember soft error
 943          * for the case, if this connection will not able to recover.
 944          */
 945         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 946                 sk->sk_err_soft = EMSGSIZE;
 947
 948         mtu = dst_mtu(dst);
 949
 950         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 951             tp->pmtu_cookie > mtu) {
 952                 tcp_sync_mss(sk, mtu);
 953
 954                 /* Resend the TCP packet because it's
 955                  * clear that the old packet has been
 956                  * dropped. This is the new "fast" path mtu
 957                  * discovery.
 958                  */
 959                 tcp_simple_retransmit(sk);
 960         } /* else let the usual retransmit timer handle it */
 961 }
 962
 963 /*
 964  * This routine is called by the ICMP module when it gets some
 965  * sort of error condition.  If err < 0 then the socket should
 966  * be closed and the error returned to the user.  If err > 0
 967  * it's just the icmp type << 8 | icmp code.  After adjustment
 968  * header points to the first 8 bytes of the tcp header.  We need
 969  * to find the appropriate port.
 970  *
 971  * The locking strategy used here is very "optimistic". When
 972  * someone else accesses the socket the ICMP is just dropped
 973  * and for some paths there is no check at all.
 974  * A more general error queue to queue errors for later handling
 975  * is probably better.
 976  *
 977  */
 978
 979 void tcp_v4_err(struct sk_buff *skb, u32 info)
 980 {
 981         struct iphdr *iph = (struct iphdr *)skb->data;
 982         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 983         struct tcp_sock *tp;
 984         struct inet_sock *inet;
 985         int type = skb->h.icmph->type;
 986         int code = skb->h.icmph->code;
 987         struct sock *sk;
 988         __u32 seq;
 989         int err;
 990
 991         if (skb->len < (iph->ihl << 2) + 8) {
 992                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 993                 return;
 994         }
 995
 996         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
 997                            th->source, tcp_v4_iif(skb));
 998         if (!sk) {
 999                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1000                 return;
1001         }
1002         if (sk->sk_state == TCP_TIME_WAIT) {
1003                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1004                 return;
1005         }
1006
1007         bh_lock_sock(sk);
1008         /* If too many ICMPs get dropped on busy
1009          * servers this needs to be solved differently.
1010          */
1011         if (sock_owned_by_user(sk))
1012                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1013
1014         if (sk->sk_state == TCP_CLOSE)
1015                 goto out;
1016
1017         tp = tcp_sk(sk);
1018         seq = ntohl(th->seq);
1019         if (sk->sk_state != TCP_LISTEN &&
1020             !between(seq, tp->snd_una, tp->snd_nxt)) {
1021                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1022                 goto out;
1023         }
1024
1025         switch (type) {
1026         case ICMP_SOURCE_QUENCH:
1027                 /* Just silently ignore these. */
1028                 goto out;
1029         case ICMP_PARAMETERPROB:
1030                 err = EPROTO;
1031                 break;
1032         case ICMP_DEST_UNREACH:
1033                 if (code > NR_ICMP_UNREACH)
1034                         goto out;
1035
1036                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1037                         if (!sock_owned_by_user(sk))
1038                                 do_pmtu_discovery(sk, iph, info);
1039                         goto out;
1040                 }
1041
1042                 err = icmp_err_convert[code].errno;
1043                 break;
1044         case ICMP_TIME_EXCEEDED:
1045                 err = EHOSTUNREACH;
1046                 break;
1047         default:
1048                 goto out;
1049         }
1050
1051         switch (sk->sk_state) {
1052                 struct request_sock *req, **prev;
1053         case TCP_LISTEN:
1054                 if (sock_owned_by_user(sk))
1055                         goto out;
1056
1057                 req = tcp_v4_search_req(tp, &prev, th->dest,
1058                                         iph->daddr, iph->saddr);
1059                 if (!req)
1060                         goto out;
1061
1062                 /* ICMPs are not backlogged, hence we cannot get
1063                    an established socket here.
1064                  */
1065                 BUG_TRAP(!req->sk);
1066
1067                 if (seq != tcp_rsk(req)->snt_isn) {
1068                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1069                         goto out;
1070                 }
1071
1072                 /*
1073                  * Still in SYN_RECV, just remove it silently.
1074                  * There is no good way to pass the error to the newly
1075                  * created socket, and POSIX does not want network
1076                  * errors returned from accept().
1077                  */
1078                 tcp_synq_drop(sk, req, prev);
1079                 goto out;
1080
1081         case TCP_SYN_SENT:
1082         case TCP_SYN_RECV:  /* Cannot happen.
1083                                It can f.e. if SYNs crossed.
1084                              */
1085                 if (!sock_owned_by_user(sk)) {
1086                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1087                         sk->sk_err = err;
1088
1089                         sk->sk_error_report(sk);
1090
1091                         tcp_done(sk);
1092                 } else {
1093                         sk->sk_err_soft = err;
1094                 }
1095                 goto out;
1096         }
1097
1098         /* If we've already connected we will keep trying
1099          * until we time out, or the user gives up.
1100          *
1101          * rfc1122 4.2.3.9 allows to consider as hard errors
1102          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1103          * but it is obsoleted by pmtu discovery).
1104          *
1105          * Note, that in modern internet, where routing is unreliable
1106          * and in each dark corner broken firewalls sit, sending random
1107          * errors ordered by their masters even this two messages finally lose
1108          * their original sense (even Linux sends invalid PORT_UNREACHs)
1109          *
1110          * Now we are in compliance with RFCs.
1111          *                                                      --ANK (980905)
1112          */
1113
1114         inet = inet_sk(sk);
1115         if (!sock_owned_by_user(sk) && inet->recverr) {
1116                 sk->sk_err = err;
1117                 sk->sk_error_report(sk);
1118         } else  { /* Only an error on timeout */
1119                 sk->sk_err_soft = err;
1120         }
1121
1122 out:
1123         bh_unlock_sock(sk);
1124         sock_put(sk);
1125 }
1126
1127 /* This routine computes an IPv4 TCP checksum. */
1128 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1129                        struct sk_buff *skb)
1130 {
1131         struct inet_sock *inet = inet_sk(sk);
1132
1133         if (skb->ip_summed == CHECKSUM_HW) {
1134                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1135                 skb->csum = offsetof(struct tcphdr, check);
1136         } else {
1137                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1138                                          csum_partial((char *)th,
1139                                                       th->doff << 2,
1140                                                       skb->csum));
1141         }
1142 }
1143
1144 /*
1145  *      This routine will send an RST to the other tcp.
1146  *
1147  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1148  *                    for reset.
1149  *      Answer: if a packet caused RST, it is not for a socket
1150  *              existing in our system, if it is matched to a socket,
1151  *              it is just duplicate segment or bug in other side's TCP.
1152  *              So that we build reply only basing on parameters
1153  *              arrived with segment.
1154  *      Exception: precedence violation. We do not implement it in any case.
1155  */
1156
1157 static void tcp_v4_send_reset(struct sk_buff *skb)
1158 {
1159         struct tcphdr *th = skb->h.th;
1160         struct tcphdr rth;
1161         struct ip_reply_arg arg;
1162
1163         /* Never send a reset in response to a reset. */
1164         if (th->rst)
1165                 return;
1166
1167         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1168                 return;
1169
1170         /* Swap the send and the receive. */
1171         memset(&rth, 0, sizeof(struct tcphdr));
1172         rth.dest   = th->source;
1173         rth.source = th->dest;
1174         rth.doff   = sizeof(struct tcphdr) / 4;
1175         rth.rst    = 1;
1176
1177         if (th->ack) {
1178                 rth.seq = th->ack_seq;
1179         } else {
1180                 rth.ack = 1;
1181                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1182                                     skb->len - (th->doff << 2));
1183         }
1184
1185         memset(&arg, 0, sizeof arg);
1186         arg.iov[0].iov_base = (unsigned char *)&rth;
1187         arg.iov[0].iov_len  = sizeof rth;
1188         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1189                                       skb->nh.iph->saddr, /*XXX*/
1190                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1191         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1192
1193         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1194
1195         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1196         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1197 }
1198
1199 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1200    outside socket context is ugly, certainly. What can I do?
1201  */
1202
1203 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1204                             u32 win, u32 ts)
1205 {
1206         struct tcphdr *th = skb->h.th;
1207         struct {
1208                 struct tcphdr th;
1209                 u32 tsopt[3];
1210         } rep;
1211         struct ip_reply_arg arg;
1212
1213         memset(&rep.th, 0, sizeof(struct tcphdr));
1214         memset(&arg, 0, sizeof arg);
1215
1216         arg.iov[0].iov_base = (unsigned char *)&rep;
1217         arg.iov[0].iov_len  = sizeof(rep.th);
1218         if (ts) {
1219                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1220                                      (TCPOPT_TIMESTAMP << 8) |
1221                                      TCPOLEN_TIMESTAMP);
1222                 rep.tsopt[1] = htonl(tcp_time_stamp);
1223                 rep.tsopt[2] = htonl(ts);
1224                 arg.iov[0].iov_len = sizeof(rep);
1225         }
1226
1227         /* Swap the send and the receive. */
1228         rep.th.dest    = th->source;
1229         rep.th.source  = th->dest;
1230         rep.th.doff    = arg.iov[0].iov_len / 4;
1231         rep.th.seq     = htonl(seq);
1232         rep.th.ack_seq = htonl(ack);
1233         rep.th.ack     = 1;
1234         rep.th.window  = htons(win);
1235
1236         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1237                                       skb->nh.iph->saddr, /*XXX*/
1238                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1239         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1240
1241         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1242
1243         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1244 }
1245
1246 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1247 {
1248         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1249
1250         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1251                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1252
1253         tcp_tw_put(tw);
1254 }
1255
1256 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1257 {
1258         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1259                         req->ts_recent);
1260 }
1261
1262 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1263                                           struct request_sock *req)
1264 {
1265         struct rtable *rt;
1266         const struct inet_request_sock *ireq = inet_rsk(req);
1267         struct ip_options *opt = inet_rsk(req)->opt;
1268         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269                             .nl_u = { .ip4_u =
1270                                       { .daddr = ((opt && opt->srr) ?
1271                                                   opt->faddr :
1272                                                   ireq->rmt_addr),
1273                                         .saddr = ireq->loc_addr,
1274                                         .tos = RT_CONN_FLAGS(sk) } },
1275                             .proto = IPPROTO_TCP,
1276                             .uli_u = { .ports =
1277                                        { .sport = inet_sk(sk)->sport,
1278                                          .dport = ireq->rmt_port } } };
1279
1280         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1282                 return NULL;
1283         }
1284         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1285                 ip_rt_put(rt);
1286                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1287                 return NULL;
1288         }
1289         return &rt->u.dst;
1290 }
1291
1292 /*
1293  *      Send a SYN-ACK after having received an ACK.
1294  *      This still operates on a request_sock only, not on a big
1295  *      socket.
1296  */
1297 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1298                               struct dst_entry *dst)
1299 {
1300         const struct inet_request_sock *ireq = inet_rsk(req);
1301         int err = -1;
1302         struct sk_buff * skb;
1303
1304         /* First, grab a route. */
1305         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1306                 goto out;
1307
1308         skb = tcp_make_synack(sk, dst, req);
1309
1310         if (skb) {
1311                 struct tcphdr *th = skb->h.th;
1312
1313                 th->check = tcp_v4_check(th, skb->len,
1314                                          ireq->loc_addr,
1315                                          ireq->rmt_addr,
1316                                          csum_partial((char *)th, skb->len,
1317                                                       skb->csum));
1318
1319                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1320                                             ireq->rmt_addr,
1321                                             ireq->opt);
1322                 if (err == NET_XMIT_CN)
1323                         err = 0;
1324         }
1325
1326 out:
1327         dst_release(dst);
1328         return err;
1329 }
1330
1331 /*
1332  *      IPv4 request_sock destructor.
1333  */
1334 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1335 {
1336         if (inet_rsk(req)->opt)
1337                 kfree(inet_rsk(req)->opt);
1338 }
1339
1340 static inline void syn_flood_warning(struct sk_buff *skb)
1341 {
1342         static unsigned long warntime;
1343
1344         if (time_after(jiffies, (warntime + HZ * 60))) {
1345                 warntime = jiffies;
1346                 printk(KERN_INFO
1347                        "possible SYN flooding on port %d. Sending cookies.\n",
1348                        ntohs(skb->h.th->dest));
1349         }
1350 }
1351
1352 /*
1353  * Save and compile IPv4 options into the request_sock if needed.
1354  */
1355 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1356                                                      struct sk_buff *skb)
1357 {
1358         struct ip_options *opt = &(IPCB(skb)->opt);
1359         struct ip_options *dopt = NULL;
1360
1361         if (opt && opt->optlen) {
1362                 int opt_size = optlength(opt);
1363                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1364                 if (dopt) {
1365                         if (ip_options_echo(dopt, skb)) {
1366                                 kfree(dopt);
1367                                 dopt = NULL;
1368                         }
1369                 }
1370         }
1371         return dopt;
1372 }
1373
1374 struct request_sock_ops tcp_request_sock_ops = {
1375         .family         =       PF_INET,
1376         .obj_size       =       sizeof(struct tcp_request_sock),
1377         .rtx_syn_ack    =       tcp_v4_send_synack,
1378         .send_ack       =       tcp_v4_reqsk_send_ack,
1379         .destructor     =       tcp_v4_reqsk_destructor,
1380         .send_reset     =       tcp_v4_send_reset,
1381 };
1382
1383 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1384 {
1385         struct inet_request_sock *ireq;
1386         struct tcp_options_received tmp_opt;
1387         struct request_sock *req;
1388         __u32 saddr = skb->nh.iph->saddr;
1389         __u32 daddr = skb->nh.iph->daddr;
1390         __u32 isn = TCP_SKB_CB(skb)->when;
1391         struct dst_entry *dst = NULL;
1392 #ifdef CONFIG_SYN_COOKIES
1393         int want_cookie = 0;
1394 #else
1395 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1396 #endif
1397
1398         /* Never answer to SYNs send to broadcast or multicast */
1399         if (((struct rtable *)skb->dst)->rt_flags &
1400             (RTCF_BROADCAST | RTCF_MULTICAST))
1401                 goto drop;
1402
1403         /* TW buckets are converted to open requests without
1404          * limitations, they conserve resources and peer is
1405          * evidently real one.
1406          */
1407         if (tcp_synq_is_full(sk) && !isn) {
1408 #ifdef CONFIG_SYN_COOKIES
1409                 if (sysctl_tcp_syncookies) {
1410                         want_cookie = 1;
1411                 } else
1412 #endif
1413                 goto drop;
1414         }
1415
1416         /* Accept backlog is full. If we have already queued enough
1417          * of warm entries in syn queue, drop request. It is better than
1418          * clogging syn queue with openreqs with exponentially increasing
1419          * timeout.
1420          */
1421         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1422                 goto drop;
1423
1424         req = reqsk_alloc(&tcp_request_sock_ops);
1425         if (!req)
1426                 goto drop;
1427
1428         tcp_clear_options(&tmp_opt);
1429         tmp_opt.mss_clamp = 536;
1430         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1431
1432         tcp_parse_options(skb, &tmp_opt, 0);
1433
1434         if (want_cookie) {
1435                 tcp_clear_options(&tmp_opt);
1436                 tmp_opt.saw_tstamp = 0;
1437         }
1438
1439         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1440                 /* Some OSes (unknown ones, but I see them on web server, which
1441                  * contains information interesting only for windows'
1442                  * users) do not send their stamp in SYN. It is easy case.
1443                  * We simply do not advertise TS support.
1444                  */
1445                 tmp_opt.saw_tstamp = 0;
1446                 tmp_opt.tstamp_ok  = 0;
1447         }
1448         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1449
1450         tcp_openreq_init(req, &tmp_opt, skb);
1451
1452         ireq = inet_rsk(req);
1453         ireq->loc_addr = daddr;
1454         ireq->rmt_addr = saddr;
1455         ireq->opt = tcp_v4_save_options(sk, skb);
1456         if (!want_cookie)
1457                 TCP_ECN_create_request(req, skb->h.th);
1458
1459         if (want_cookie) {
1460 #ifdef CONFIG_SYN_COOKIES
1461                 syn_flood_warning(skb);
1462 #endif
1463                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1464         } else if (!isn) {
1465                 struct inet_peer *peer = NULL;
1466
1467                 /* VJ's idea. We save last timestamp seen
1468                  * from the destination in peer table, when entering
1469                  * state TIME-WAIT, and check against it before
1470                  * accepting new connection request.
1471                  *
1472                  * If "isn" is not zero, this request hit alive
1473                  * timewait bucket, so that all the necessary checks
1474                  * are made in the function processing timewait state.
1475                  */
1476                 if (tmp_opt.saw_tstamp &&
1477                     sysctl_tcp_tw_recycle &&
1478                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1479                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1480                     peer->v4daddr == saddr) {
1481                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1482                             (s32)(peer->tcp_ts - req->ts_recent) >
1483                                                         TCP_PAWS_WINDOW) {
1484                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1485                                 dst_release(dst);
1486                                 goto drop_and_free;
1487                         }
1488                 }
1489                 /* Kill the following clause, if you dislike this way. */
1490                 else if (!sysctl_tcp_syncookies &&
1491                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1492                           (sysctl_max_syn_backlog >> 2)) &&
1493                          (!peer || !peer->tcp_ts_stamp) &&
1494                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1495                         /* Without syncookies last quarter of
1496                          * backlog is filled with destinations,
1497                          * proven to be alive.
1498                          * It means that we continue to communicate
1499                          * to destinations, already remembered
1500                          * to the moment of synflood.
1501                          */
1502                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1503                                               "request from %u.%u."
1504                                               "%u.%u/%u\n",
1505                                               NIPQUAD(saddr),
1506                                               ntohs(skb->h.th->source)));
1507                         dst_release(dst);
1508                         goto drop_and_free;
1509                 }
1510
1511                 isn = tcp_v4_init_sequence(sk, skb);
1512         }
1513         tcp_rsk(req)->snt_isn = isn;
1514
1515         if (tcp_v4_send_synack(sk, req, dst))
1516                 goto drop_and_free;
1517
1518         if (want_cookie) {
1519                 reqsk_free(req);
1520         } else {
1521                 tcp_v4_synq_add(sk, req);
1522         }
1523         return 0;
1524
1525 drop_and_free:
1526         reqsk_free(req);
1527 drop:
1528         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1529         return 0;
1530 }
1531
1532
1533 /*
1534  * The three way handshake has completed - we got a valid synack -
1535  * now create the new socket.
1536  */
1537 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1538                                   struct request_sock *req,
1539                                   struct dst_entry *dst)
1540 {
1541         struct inet_request_sock *ireq;
1542         struct inet_sock *newinet;
1543         struct tcp_sock *newtp;
1544         struct sock *newsk;
1545
1546         if (sk_acceptq_is_full(sk))
1547                 goto exit_overflow;
1548
1549         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1550                 goto exit;
1551
1552         newsk = tcp_create_openreq_child(sk, req, skb);
1553         if (!newsk)
1554                 goto exit;
1555
1556         newsk->sk_dst_cache = dst;
1557         tcp_v4_setup_caps(newsk, dst);
1558
1559         newtp                 = tcp_sk(newsk);
1560         newinet               = inet_sk(newsk);
1561         ireq                  = inet_rsk(req);
1562         newinet->daddr        = ireq->rmt_addr;
1563         newinet->rcv_saddr    = ireq->loc_addr;
1564         newinet->saddr        = ireq->loc_addr;
1565         newinet->opt          = ireq->opt;
1566         ireq->opt             = NULL;
1567         newinet->mc_index     = tcp_v4_iif(skb);
1568         newinet->mc_ttl       = skb->nh.iph->ttl;
1569         newtp->ext_header_len = 0;
1570         if (newinet->opt)
1571                 newtp->ext_header_len = newinet->opt->optlen;
1572         newinet->id = newtp->write_seq ^ jiffies;
1573
1574         tcp_sync_mss(newsk, dst_mtu(dst));
1575         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1576         tcp_initialize_rcv_mss(newsk);
1577
1578         __tcp_v4_hash(newsk, 0);
1579         __tcp_inherit_port(sk, newsk);
1580
1581         return newsk;
1582
1583 exit_overflow:
1584         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1585 exit:
1586         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1587         dst_release(dst);
1588         return NULL;
1589 }
1590
1591 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1592 {
1593         struct tcphdr *th = skb->h.th;
1594         struct iphdr *iph = skb->nh.iph;
1595         struct tcp_sock *tp = tcp_sk(sk);
1596         struct sock *nsk;
1597         struct request_sock **prev;
1598         /* Find possible connection requests. */
1599         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1600                                                      iph->saddr, iph->daddr);
1601         if (req)
1602                 return tcp_check_req(sk, skb, req, prev);
1603
1604         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1605                                           th->source,
1606                                           skb->nh.iph->daddr,
1607                                           ntohs(th->dest),
1608                                           tcp_v4_iif(skb));
1609
1610         if (nsk) {
1611                 if (nsk->sk_state != TCP_TIME_WAIT) {
1612                         bh_lock_sock(nsk);
1613                         return nsk;
1614                 }
1615                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1616                 return NULL;
1617         }
1618
1619 #ifdef CONFIG_SYN_COOKIES
1620         if (!th->rst && !th->syn && th->ack)
1621                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1622 #endif
1623         return sk;
1624 }
1625
1626 static int tcp_v4_checksum_init(struct sk_buff *skb)
1627 {
1628         if (skb->ip_summed == CHECKSUM_HW) {
1629                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1630                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1631                                   skb->nh.iph->daddr, skb->csum))
1632                         return 0;
1633
1634                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1635                 skb->ip_summed = CHECKSUM_NONE;
1636         }
1637         if (skb->len <= 76) {
1638                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1639                                  skb->nh.iph->daddr,
1640                                  skb_checksum(skb, 0, skb->len, 0)))
1641                         return -1;
1642                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1643         } else {
1644                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1645                                           skb->nh.iph->saddr,
1646                                           skb->nh.iph->daddr, 0);
1647         }
1648         return 0;
1649 }
1650
1651
/* Process one received segment for socket @sk.
 *
 * The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Every path in this function returns 0.  On the reset, discard and
 * checksum-error paths the skb is freed with kfree_skb() first.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                TCP_CHECK_TIMER(sk);
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
                TCP_CHECK_TIMER(sk);
                return 0;
        }

        /* Slow path: drop segments shorter than their own header length
         * or with a bad checksum, counting them as input errors.
         */
        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                /* tcp_v4_hnd_req() yields NULL (drop the segment), sk itself
                 * (continue with the state machine below) or a different
                 * socket, which then gets the segment via tcp_child_process().
                 */
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb))
                                goto reset;
                        return 0;
                }
        }

        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
        TCP_CHECK_TIMER(sk);
        return 0;

reset:
        /* Answer with a reset, then fall through to free the skb. */
        tcp_v4_send_reset(skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(TCP_MIB_INERRS);
        goto discard;
}
1706
1707 /*
1708  *      From tcp_input.c
1709  */
1710
/* Handle one incoming IPv4 TCP segment.
 *
 * Checks the header length and checksum, fills in TCP_SKB_CB(skb), looks
 * up the owning socket with __tcp_v4_lookup() and then, under
 * bh_lock_sock(), either prequeues the segment, processes it with
 * tcp_v4_do_rcv(), or adds it to the socket backlog when the socket is
 * owned by a user.  TIME_WAIT entries are handled under do_time_wait.
 *
 * Returns the value of tcp_v4_do_rcv() when that was called, 0 on every
 * other path.  On the discard paths the skb is freed with kfree_skb().
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = skb->h.th;

        /* doff is in 32-bit words; it must cover at least the fixed header. */
        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
             tcp_v4_checksum_init(skb) < 0))
                goto bad_packet;

        /* NOTE(review): th is re-read here, presumably because the pulls
         * above may have moved the header - confirm.
         */
        th = skb->h.th;
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        /* SYN and FIN each add one to the end sequence, on top of the
         * payload length (skb->len minus the TCP header).
         */
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        /* The IP TOS byte is stored in the control block's flags field. */
        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
                             skb->nh.iph->daddr, ntohs(th->dest),
                             tcp_v4_iif(skb));

        if (!sk)
                goto no_tcp_socket;

process:
        /* Also reached from do_time_wait with sk set to a listener. */
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                /* Process now unless tcp_prequeue() took the segment. */
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        /* bad_packet is also a jump target for the header and checksum
         * checks above: such segments are counted as input errors and
         * dropped without a reset.  Valid segments with no socket get a
         * reset.
         */
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        /* Here sk is really a tcp_tw_bucket; every exit that does not hand
         * it on calls tcp_tw_put() or leaves that to the callee.
         */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                tcp_tw_put((struct tcp_tw_bucket *) sk);
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                tcp_tw_put((struct tcp_tw_bucket *) sk);
                goto discard_it;
        }
        switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
                                           skb, th, skb->len)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
                                                          ntohs(th->dest),
                                                          tcp_v4_iif(skb));
                if (sk2) {
                        /* A listener exists: retire the TIME_WAIT bucket
                         * and process the segment against the listener.
                         */
                        tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
                        tcp_tw_put((struct tcp_tw_bucket *)sk);
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
1836
1837 /* With per-bucket locks this operation is not-atomic, so that
1838  * this version is not worse.
1839  */
1840 static void __tcp_v4_rehash(struct sock *sk)
1841 {
1842         sk->sk_prot->unhash(sk);
1843         sk->sk_prot->hash(sk);
1844 }
1845
1846 static int tcp_v4_reselect_saddr(struct sock *sk)
1847 {
1848         struct inet_sock *inet = inet_sk(sk);
1849         int err;
1850         struct rtable *rt;
1851         __u32 old_saddr = inet->saddr;
1852         __u32 new_saddr;
1853         __u32 daddr = inet->daddr;
1854
1855         if (inet->opt && inet->opt->srr)
1856                 daddr = inet->opt->faddr;
1857
1858         /* Query new route. */
1859         err = ip_route_connect(&rt, daddr, 0,
1860                                RT_CONN_FLAGS(sk),
1861                                sk->sk_bound_dev_if,
1862                                IPPROTO_TCP,
1863                                inet->sport, inet->dport, sk);
1864         if (err)
1865                 return err;
1866
1867         __sk_dst_set(sk, &rt->u.dst);
1868         tcp_v4_setup_caps(sk, &rt->u.dst);
1869
1870         new_saddr = rt->rt_src;
1871
1872         if (new_saddr == old_saddr)
1873                 return 0;
1874
1875         if (sysctl_ip_dynaddr > 1) {
1876                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1877                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1878                        NIPQUAD(old_saddr),
1879                        NIPQUAD(new_saddr));
1880         }
1881
1882         inet->saddr = new_saddr;
1883         inet->rcv_saddr = new_saddr;
1884
1885         /* XXX The only one ugly spot where we need to
1886          * XXX really change the sockets identity after
1887          * XXX it has entered the hashes. -DaveM
1888          *
1889          * Besides that, it does not check for connection
1890          * uniqueness. Wait for troubles.
1891          */
1892         __tcp_v4_rehash(sk);
1893         return 0;
1894 }
1895
1896 int tcp_v4_rebuild_header(struct sock *sk)
1897 {
1898         struct inet_sock *inet = inet_sk(sk);
1899         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1900         u32 daddr;
1901         int err;
1902
1903         /* Route is OK, nothing to do. */
1904         if (rt)
1905                 return 0;
1906
1907         /* Reroute. */
1908         daddr = inet->daddr;
1909         if (inet->opt && inet->opt->srr)
1910                 daddr = inet->opt->faddr;
1911
1912         {
1913                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1914                                     .nl_u = { .ip4_u =
1915                                               { .daddr = daddr,
1916                                                 .saddr = inet->saddr,
1917                                                 .tos = RT_CONN_FLAGS(sk) } },
1918                                     .proto = IPPROTO_TCP,
1919                                     .uli_u = { .ports =
1920                                                { .sport = inet->sport,
1921                                                  .dport = inet->dport } } };
1922
1923                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1924         }
1925         if (!err) {
1926                 __sk_dst_set(sk, &rt->u.dst);
1927                 tcp_v4_setup_caps(sk, &rt->u.dst);
1928                 return 0;
1929         }
1930
1931         /* Routing failed... */
1932         sk->sk_route_caps = 0;
1933
1934         if (!sysctl_ip_dynaddr ||
1935             sk->sk_state != TCP_SYN_SENT ||
1936             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1937             (err = tcp_v4_reselect_saddr(sk)) != 0)
1938                 sk->sk_err_soft = -err;
1939
1940         return err;
1941 }
1942
1943 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1944 {
1945         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1946         struct inet_sock *inet = inet_sk(sk);
1947
1948         sin->sin_family         = AF_INET;
1949         sin->sin_addr.s_addr    = inet->daddr;
1950         sin->sin_port           = inet->dport;
1951 }
1952
1953 /* VJ's idea. Save last timestamp seen from this destination
1954  * and hold it at least for normal timewait interval to use for duplicate
1955  * segment detection in subsequent connections, before they enter synchronized
1956  * state.
1957  */
1958
1959 int tcp_v4_remember_stamp(struct sock *sk)
1960 {
1961         struct inet_sock *inet = inet_sk(sk);
1962         struct tcp_sock *tp = tcp_sk(sk);
1963         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1964         struct inet_peer *peer = NULL;
1965         int release_it = 0;
1966
1967         if (!rt || rt->rt_dst != inet->daddr) {
1968                 peer = inet_getpeer(inet->daddr, 1);
1969                 release_it = 1;
1970         } else {
1971                 if (!rt->peer)
1972                         rt_bind_peer(rt, 1);
1973                 peer = rt->peer;
1974         }
1975
1976         if (peer) {
1977                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1978                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1979                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1980                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1981                         peer->tcp_ts = tp->rx_opt.ts_recent;
1982                 }
1983                 if (release_it)
1984                         inet_putpeer(peer);
1985                 return 1;
1986         }
1987
1988         return 0;
1989 }
1990
1991 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1992 {
1993         struct inet_peer *peer = NULL;
1994
1995         peer = inet_getpeer(tw->tw_daddr, 1);
1996
1997         if (peer) {
1998                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1999                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2000                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2001                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2002                         peer->tcp_ts = tw->tw_ts_recent;
2003                 }
2004                 inet_putpeer(peer);
2005                 return 1;
2006         }
2007
2008         return 0;
2009 }
2010
2011 struct tcp_func ipv4_specific = {
2012         .queue_xmit     =       ip_queue_xmit,
2013         .send_check     =       tcp_v4_send_check,
2014         .rebuild_header =       tcp_v4_rebuild_header,
2015         .conn_request   =       tcp_v4_conn_request,
2016         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2017         .remember_stamp =       tcp_v4_remember_stamp,
2018         .net_header_len =       sizeof(struct iphdr),
2019         .setsockopt     =       ip_setsockopt,
2020         .getsockopt     =       ip_getsockopt,
2021         .addr2sockaddr  =       v4_addr2sockaddr,
2022         .sockaddr_len   =       sizeof(struct sockaddr_in),
2023 };
2024
2025 /* NOTE: A lot of things set to zero explicitly by call to
2026  *       sk_alloc() so need not be done here.
2027  */
2028 static int tcp_v4_init_sock(struct sock *sk)
2029 {
2030         struct tcp_sock *tp = tcp_sk(sk);
2031
2032         skb_queue_head_init(&tp->out_of_order_queue);
2033         tcp_init_xmit_timers(sk);
2034         tcp_prequeue_init(tp);
2035
2036         tp->rto  = TCP_TIMEOUT_INIT;
2037         tp->mdev = TCP_TIMEOUT_INIT;
2038
2039         /* So many TCP implementations out there (incorrectly) count the
2040          * initial SYN frame in their delayed-ACK and congestion control
2041          * algorithms that we must have the following bandaid to talk
2042          * efficiently to them.  -DaveM
2043          */
2044         tp->snd_cwnd = 2;
2045
2046         /* See draft-stevens-tcpca-spec-01 for discussion of the
2047          * initialization of these values.
2048          */
2049         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2050         tp->snd_cwnd_clamp = ~0;
2051         tp->mss_cache = 536;
2052
2053         tp->reordering = sysctl_tcp_reordering;
2054         tp->ca_ops = &tcp_init_congestion_ops;
2055
2056         sk->sk_state = TCP_CLOSE;
2057
2058         sk->sk_write_space = sk_stream_write_space;
2059         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2060
2061         tp->af_specific = &ipv4_specific;
2062
2063         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2064         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2065
2066         atomic_inc(&tcp_sockets_allocated);
2067
2068         return 0;
2069 }
2070
2071 int tcp_v4_destroy_sock(struct sock *sk)
2072 {
2073         struct tcp_sock *tp = tcp_sk(sk);
2074
2075         tcp_clear_xmit_timers(sk);
2076
2077         tcp_cleanup_congestion_control(tp);
2078
2079         /* Cleanup up the write buffer. */
2080         sk_stream_writequeue_purge(sk);
2081
2082         /* Cleans up our, hopefully empty, out_of_order_queue. */
2083         __skb_queue_purge(&tp->out_of_order_queue);
2084
2085         /* Clean prequeue, it must be empty really */
2086         __skb_queue_purge(&tp->ucopy.prequeue);
2087
2088         /* Clean up a referenced TCP bind bucket. */
2089         if (tp->bind_hash)
2090                 tcp_put_port(sk);
2091
2092         /*
2093          * If sendmsg cached page exists, toss it.
2094          */
2095         if (sk->sk_sndmsg_page) {
2096                 __free_page(sk->sk_sndmsg_page);
2097                 sk->sk_sndmsg_page = NULL;
2098         }
2099
2100         atomic_dec(&tcp_sockets_allocated);
2101
2102         return 0;
2103 }
2104
2105 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2106
2107 #ifdef CONFIG_PROC_FS
2108 /* Proc filesystem TCP sock list dumping. */
2109
2110 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2111 {
2112         return hlist_empty(head) ? NULL :
2113                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2114 }
2115
2116 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2117 {
2118         return tw->tw_node.next ?
2119                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2120 }
2121
/* Advance the /proc iterator through the listening hash.
 *
 * @cur is NULL to start at bucket 0, otherwise the entry returned by the
 * previous call: a listening socket, or a request_sock when st->state is
 * TCP_SEQ_STATE_OPENREQ.
 *
 * Sockets whose family matches st->family are returned.  A socket of
 * another family is not returned, but its pending requests are still
 * scanned, and requests whose rsk_ops->family matches are returned.
 *
 * When a request_sock is returned, st->state is TCP_SEQ_STATE_OPENREQ
 * and accept_queue.syn_wait_lock of st->syn_wait_sk is still read-held.
 * The lock is released once that socket's SYN table has been walked.
 *
 * Returns the next matching entry, or NULL after the last bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct tcp_sock *tp;
        struct hlist_node *node;
        struct sock *sk = cur;
        struct tcp_iter_state* st = seq->private;

        if (!sk) {
                /* First call: start from the head of bucket 0. */
                st->bucket = 0;
                sk = sk_head(&tcp_listening_hash[0]);
                goto get_sk;
        }

        ++st->num;

        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                /* Continue walking the SYN table of st->syn_wait_sk; its
                 * syn_wait_lock was taken on an earlier call.
                 */
                struct request_sock *req = cur;

                tp = tcp_sk(st->syn_wait_sk);
                req = req->dl_next;
                while (1) {
                        while (req) {
                                if (req->rsk_ops->family == st->family) {
                                        cur = req;
                                        goto out;
                                }
                                req = req->dl_next;
                        }
                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
                                break;
get_req:
                        /* Also entered from start_req with sbucket == 0. */
                        req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
                }
                /* SYN table exhausted: go back to walking sockets. */
                sk        = sk_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
        } else {
                /* cur was a listening socket: visit its pending requests
                 * (if any) before moving on to the next socket.
                 */
                tp = tcp_sk(sk);
                read_lock_bh(&tp->accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&tp->accept_queue))
                        goto start_req;
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
                sk = sk_next(sk);
        }
get_sk:
        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
                }
                /* Wrong family: skip the socket itself, but still look
                 * at its request queue.
                 */
                tp = tcp_sk(sk);
                read_lock_bh(&tp->accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
                        /* Switch to request mode; the lock stays held. */
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
                        st->sbucket     = 0;
                        goto get_req;
                }
                read_unlock_bh(&tp->accept_queue.syn_wait_lock);
        }
        if (++st->bucket < TCP_LHTABLE_SIZE) {
                sk = sk_head(&tcp_listening_hash[st->bucket]);
                goto get_sk;
        }
        cur = NULL;
out:
        return cur;
}
2192
2193 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2194 {
2195         void *rc = listening_get_next(seq, NULL);
2196
2197         while (rc && *pos) {
2198                 rc = listening_get_next(seq, rc);
2199                 --*pos;
2200         }
2201         return rc;
2202 }
2203
2204 static void *established_get_first(struct seq_file *seq)
2205 {
2206         struct tcp_iter_state* st = seq->private;
2207         void *rc = NULL;
2208
2209         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2210                 struct sock *sk;
2211                 struct hlist_node *node;
2212                 struct tcp_tw_bucket *tw;
2213
2214                 /* We can reschedule _before_ having picked the target: */
2215                 cond_resched_softirq();
2216
2217                 read_lock(&tcp_ehash[st->bucket].lock);
2218                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2219                         if (sk->sk_family != st->family) {
2220                                 continue;
2221                         }
2222                         rc = sk;
2223                         goto out;
2224                 }
2225                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2226                 tw_for_each(tw, node,
2227                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2228                         if (tw->tw_family != st->family) {
2229                                 continue;
2230                         }
2231                         rc = tw;
2232                         goto out;
2233                 }
2234                 read_unlock(&tcp_ehash[st->bucket].lock);
2235                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2236         }
2237 out:
2238         return rc;
2239 }
2240
/* Advance the /proc iterator through the established/TIME_WAIT hash.
 *
 * @cur is the entry returned by the previous call: a sock, or a
 * tcp_tw_bucket when st->state is TCP_SEQ_STATE_TIME_WAIT.  For each
 * bucket the sockets in tcp_ehash[bucket] are walked first, then the
 * TIME_WAIT entries in tcp_ehash[bucket + tcp_ehash_size].  Entries
 * whose family differs from st->family are skipped.
 *
 * On entry tcp_ehash[st->bucket].lock is expected to be read-held.  It
 * is still held for the current bucket when an entry is returned, is
 * released when moving on to the next bucket, and is not held when
 * NULL is returned after the last bucket.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct tcp_tw_bucket *tw;
        struct hlist_node *node;
        struct tcp_iter_state* st = seq->private;

        ++st->num;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
get_tw:
                /* Also entered from below once the socket chain of the
                 * current bucket is exhausted.
                 */
                while (tw && tw->tw_family != st->family) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                /* Bucket finished: unlock it before moving on. */
                read_unlock(&tcp_ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                /* We can reschedule between buckets: */
                cond_resched_softirq();

                if (++st->bucket < tcp_ehash_size) {
                        read_lock(&tcp_ehash[st->bucket].lock);
                        sk = sk_head(&tcp_ehash[st->bucket].chain);
                } else {
                        cur = NULL;
                        goto out;
                }
        } else
                sk = sk_next(sk);

        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family)
                        goto found;
        }

        /* No more sockets here: switch to this bucket's TIME_WAIT chain. */
        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}
2290
2291 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2292 {
2293         void *rc = established_get_first(seq);
2294
2295         while (rc && pos) {
2296                 rc = established_get_next(seq, rc);
2297                 --pos;
2298         }
2299         return rc;
2300 }
2301
2302 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2303 {
2304         void *rc;
2305         struct tcp_iter_state* st = seq->private;
2306
2307         tcp_listen_lock();
2308         st->state = TCP_SEQ_STATE_LISTENING;
2309         rc        = listening_get_idx(seq, &pos);
2310
2311         if (!rc) {
2312                 tcp_listen_unlock();
2313                 local_bh_disable();
2314                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2315                 rc        = established_get_idx(seq, pos);
2316         }
2317
2318         return rc;
2319 }
2320
2321 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2322 {
2323         struct tcp_iter_state* st = seq->private;
2324         st->state = TCP_SEQ_STATE_LISTENING;
2325         st->num = 0;
2326         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2327 }
2328
2329 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2330 {
2331         void *rc = NULL;
2332         struct tcp_iter_state* st;
2333
2334         if (v == SEQ_START_TOKEN) {
2335                 rc = tcp_get_idx(seq, 0);
2336                 goto out;
2337         }
2338         st = seq->private;
2339
2340         switch (st->state) {
2341         case TCP_SEQ_STATE_OPENREQ:
2342         case TCP_SEQ_STATE_LISTENING:
2343                 rc = listening_get_next(seq, v);
2344                 if (!rc) {
2345                         tcp_listen_unlock();
2346                         local_bh_disable();
2347                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2348                         rc        = established_get_first(seq);
2349                 }
2350                 break;
2351         case TCP_SEQ_STATE_ESTABLISHED:
2352         case TCP_SEQ_STATE_TIME_WAIT:
2353                 rc = established_get_next(seq, v);
2354                 break;
2355         }
2356 out:
2357         ++*pos;
2358         return rc;
2359 }
2360
2361 static void tcp_seq_stop(struct seq_file *seq, void *v)
2362 {
2363         struct tcp_iter_state* st = seq->private;
2364
2365         switch (st->state) {
2366         case TCP_SEQ_STATE_OPENREQ:
2367                 if (v) {
2368                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2369                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2370                 }
2371         case TCP_SEQ_STATE_LISTENING:
2372                 if (v != SEQ_START_TOKEN)
2373                         tcp_listen_unlock();
2374                 break;
2375         case TCP_SEQ_STATE_TIME_WAIT:
2376         case TCP_SEQ_STATE_ESTABLISHED:
2377                 if (v)
2378                         read_unlock(&tcp_ehash[st->bucket].lock);
2379                 local_bh_enable();
2380                 break;
2381         }
2382 }
2383
2384 static int tcp_seq_open(struct inode *inode, struct file *file)
2385 {
2386         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2387         struct seq_file *seq;
2388         struct tcp_iter_state *s;
2389         int rc;
2390
2391         if (unlikely(afinfo == NULL))
2392                 return -EINVAL;
2393
2394         s = kmalloc(sizeof(*s), GFP_KERNEL);
2395         if (!s)
2396                 return -ENOMEM;
2397         memset(s, 0, sizeof(*s));
2398         s->family               = afinfo->family;
2399         s->seq_ops.start        = tcp_seq_start;
2400         s->seq_ops.next         = tcp_seq_next;
2401         s->seq_ops.show         = afinfo->seq_show;
2402         s->seq_ops.stop         = tcp_seq_stop;
2403
2404         rc = seq_open(file, &s->seq_ops);
2405         if (rc)
2406                 goto out_kfree;
2407         seq          = file->private_data;
2408         seq->private = s;
2409 out:
2410         return rc;
2411 out_kfree:
2412         kfree(s);
2413         goto out;
2414 }
2415
2416 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2417 {
2418         int rc = 0;
2419         struct proc_dir_entry *p;
2420
2421         if (!afinfo)
2422                 return -EINVAL;
2423         afinfo->seq_fops->owner         = afinfo->owner;
2424         afinfo->seq_fops->open          = tcp_seq_open;
2425         afinfo->seq_fops->read          = seq_read;
2426         afinfo->seq_fops->llseek        = seq_lseek;
2427         afinfo->seq_fops->release       = seq_release_private;
2428
2429         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2430         if (p)
2431                 p->data = afinfo;
2432         else
2433                 rc = -ENOMEM;
2434         return rc;
2435 }
2436
2437 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2438 {
2439         if (!afinfo)
2440                 return;
2441         proc_net_remove(afinfo->name);
2442         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2443 }
2444
2445 static void get_openreq4(struct sock *sk, struct request_sock *req,
2446                          char *tmpbuf, int i, int uid)
2447 {
2448         const struct inet_request_sock *ireq = inet_rsk(req);
2449         int ttd = req->expires - jiffies;
2450
2451         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2452                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2453                 i,
2454                 ireq->loc_addr,
2455                 ntohs(inet_sk(sk)->sport),
2456                 ireq->rmt_addr,
2457                 ntohs(ireq->rmt_port),
2458                 TCP_SYN_RECV,
2459                 0, 0, /* could print option size, but that is af dependent. */
2460                 1,    /* timers active (only the expire timer) */
2461                 jiffies_to_clock_t(ttd),
2462                 req->retrans,
2463                 uid,
2464                 0,  /* non standard timer */
2465                 0, /* open_requests have no inode */
2466                 atomic_read(&sk->sk_refcnt),
2467                 req);
2468 }
2469
2470 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2471 {
2472         int timer_active;
2473         unsigned long timer_expires;
2474         struct tcp_sock *tp = tcp_sk(sp);
2475         struct inet_sock *inet = inet_sk(sp);
2476         unsigned int dest = inet->daddr;
2477         unsigned int src = inet->rcv_saddr;
2478         __u16 destp = ntohs(inet->dport);
2479         __u16 srcp = ntohs(inet->sport);
2480
2481         if (tp->pending == TCP_TIME_RETRANS) {
2482                 timer_active    = 1;
2483                 timer_expires   = tp->timeout;
2484         } else if (tp->pending == TCP_TIME_PROBE0) {
2485                 timer_active    = 4;
2486                 timer_expires   = tp->timeout;
2487         } else if (timer_pending(&sp->sk_timer)) {
2488                 timer_active    = 2;
2489                 timer_expires   = sp->sk_timer.expires;
2490         } else {
2491                 timer_active    = 0;
2492                 timer_expires = jiffies;
2493         }
2494
2495         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2496                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2497                 i, src, srcp, dest, destp, sp->sk_state,
2498                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2499                 timer_active,
2500                 jiffies_to_clock_t(timer_expires - jiffies),
2501                 tp->retransmits,
2502                 sock_i_uid(sp),
2503                 tp->probes_out,
2504                 sock_i_ino(sp),
2505                 atomic_read(&sp->sk_refcnt), sp,
2506                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2507                 tp->snd_cwnd,
2508                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2509 }
2510
2511 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2512 {
2513         unsigned int dest, src;
2514         __u16 destp, srcp;
2515         int ttd = tw->tw_ttd - jiffies;
2516
2517         if (ttd < 0)
2518                 ttd = 0;
2519
2520         dest  = tw->tw_daddr;
2521         src   = tw->tw_rcv_saddr;
2522         destp = ntohs(tw->tw_dport);
2523         srcp  = ntohs(tw->tw_sport);
2524
2525         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2526                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2527                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2529                 atomic_read(&tw->tw_refcnt), tw);
2530 }
2531
2532 #define TMPSZ 150
2533
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2535 {
2536         struct tcp_iter_state* st;
2537         char tmpbuf[TMPSZ + 1];
2538
2539         if (v == SEQ_START_TOKEN) {
2540                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2541                            "  sl  local_address rem_address   st tx_queue "
2542                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2543                            "inode");
2544                 goto out;
2545         }
2546         st = seq->private;
2547
2548         switch (st->state) {
2549         case TCP_SEQ_STATE_LISTENING:
2550         case TCP_SEQ_STATE_ESTABLISHED:
2551                 get_tcp4_sock(v, tmpbuf, st->num);
2552                 break;
2553         case TCP_SEQ_STATE_OPENREQ:
2554                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2555                 break;
2556         case TCP_SEQ_STATE_TIME_WAIT:
2557                 get_timewait4_sock(v, tmpbuf, st->num);
2558                 break;
2559         }
2560         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2561 out:
2562         return 0;
2563 }
2564
2565 static struct file_operations tcp4_seq_fops;
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2567         .owner          = THIS_MODULE,
2568         .name           = "tcp",
2569         .family         = AF_INET,
2570         .seq_show       = tcp4_seq_show,
2571         .seq_fops       = &tcp4_seq_fops,
2572 };
2573
2574 int __init tcp4_proc_init(void)
2575 {
2576         return tcp_proc_register(&tcp4_seq_afinfo);
2577 }
2578
2579 void tcp4_proc_exit(void)
2580 {
2581         tcp_proc_unregister(&tcp4_seq_afinfo);
2582 }
2583 #endif /* CONFIG_PROC_FS */
2584
2585 struct proto tcp_prot = {
2586         .name                   = "TCP",
2587         .owner                  = THIS_MODULE,
2588         .close                  = tcp_close,
2589         .connect                = tcp_v4_connect,
2590         .disconnect             = tcp_disconnect,
2591         .accept                 = tcp_accept,
2592         .ioctl                  = tcp_ioctl,
2593         .init                   = tcp_v4_init_sock,
2594         .destroy                = tcp_v4_destroy_sock,
2595         .shutdown               = tcp_shutdown,
2596         .setsockopt             = tcp_setsockopt,
2597         .getsockopt             = tcp_getsockopt,
2598         .sendmsg                = tcp_sendmsg,
2599         .recvmsg                = tcp_recvmsg,
2600         .backlog_rcv            = tcp_v4_do_rcv,
2601         .hash                   = tcp_v4_hash,
2602         .unhash                 = tcp_unhash,
2603         .get_port               = tcp_v4_get_port,
2604         .enter_memory_pressure  = tcp_enter_memory_pressure,
2605         .sockets_allocated      = &tcp_sockets_allocated,
2606         .memory_allocated       = &tcp_memory_allocated,
2607         .memory_pressure        = &tcp_memory_pressure,
2608         .sysctl_mem             = sysctl_tcp_mem,
2609         .sysctl_wmem            = sysctl_tcp_wmem,
2610         .sysctl_rmem            = sysctl_tcp_rmem,
2611         .max_header             = MAX_TCP_HEADER,
2612         .obj_size               = sizeof(struct tcp_sock),
2613         .rsk_prot               = &tcp_request_sock_ops,
2614 };
2615
2616
2617
2618 void __init tcp_v4_init(struct net_proto_family *ops)
2619 {
2620         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2621         if (err < 0)
2622                 panic("Failed to create the TCP control socket.\n");
2623         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2624         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2625
2626         /* Unhash it so that IP input processing does not even
2627          * see it, we do not wish this socket to see incoming
2628          * packets.
2629          */
2630         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2631 }
2632
2633 EXPORT_SYMBOL(ipv4_specific);
2634 EXPORT_SYMBOL(tcp_bind_hash);
2635 EXPORT_SYMBOL(tcp_bucket_create);
2636 EXPORT_SYMBOL(tcp_hashinfo);
2637 EXPORT_SYMBOL(tcp_inherit_port);
2638 EXPORT_SYMBOL(tcp_listen_wlock);
2639 EXPORT_SYMBOL(tcp_port_rover);
2640 EXPORT_SYMBOL(tcp_prot);
2641 EXPORT_SYMBOL(tcp_put_port);
2642 EXPORT_SYMBOL(tcp_unhash);
2643 EXPORT_SYMBOL(tcp_v4_conn_request);
2644 EXPORT_SYMBOL(tcp_v4_connect);
2645 EXPORT_SYMBOL(tcp_v4_do_rcv);
2646 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2647 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2648 EXPORT_SYMBOL(tcp_v4_send_check);
2649 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2650
2651 #ifdef CONFIG_PROC_FS
2652 EXPORT_SYMBOL(tcp_proc_register);
2653 EXPORT_SYMBOL(tcp_proc_unregister);
2654 #endif
2655 EXPORT_SYMBOL(sysctl_local_port_range);
2656 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2657 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2658