net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
  87 int sysctl_tcp_tw_reuse __read_mostly;
  88 int sysctl_tcp_low_latency __read_mostly;
  89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  90
  91 #ifdef CONFIG_TCP_MD5SIG
  92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  94 #endif
  95
  96 struct inet_hashinfo tcp_hashinfo;
  97 EXPORT_SYMBOL(tcp_hashinfo);
  98
  99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 100 {
 101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 102                                           ip_hdr(skb)->saddr,
 103                                           tcp_hdr(skb)->dest,
 104                                           tcp_hdr(skb)->source);
 105 }
 106
 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108 {
 109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110         struct tcp_sock *tp = tcp_sk(sk);
 111
 112         /* With PAWS, it is safe from the viewpoint
 113            of data integrity. Even without PAWS it is safe provided sequence
 114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 115
 116            Actually, the idea is close to VJ's one, only timestamp cache is
 117            held not per host, but per port pair and TW bucket is used as state
 118            holder.
 119
 120            If TW bucket has been already destroyed we fall back to VJ's scheme
 121            and use initial timestamp retrieved from peer table.
 122          */
 123         if (tcptw->tw_ts_recent_stamp &&
 124             (!twp || (sysctl_tcp_tw_reuse &&
 125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 127                 if (tp->write_seq == 0)
 128                         tp->write_seq = 1;
 129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 131                 sock_hold(sktw);
 132                 return 1;
 133         }
 134
 135         return 0;
 136 }
 137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 138
 139 /* This will initiate an outgoing connection. */
 140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 141 {
 142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 143         struct inet_sock *inet = inet_sk(sk);
 144         struct tcp_sock *tp = tcp_sk(sk);
 145         __be16 orig_sport, orig_dport;
 146         __be32 daddr, nexthop;
 147         struct flowi4 *fl4;
 148         struct rtable *rt;
 149         int err;
 150         struct ip_options_rcu *inet_opt;
 151
 152         if (addr_len < sizeof(struct sockaddr_in))
 153                 return -EINVAL;
 154
 155         if (usin->sin_family != AF_INET)
 156                 return -EAFNOSUPPORT;
 157
 158         nexthop = daddr = usin->sin_addr.s_addr;
 159         inet_opt = rcu_dereference_protected(inet->inet_opt,
 160                                              lockdep_sock_is_held(sk));
 161         if (inet_opt && inet_opt->opt.srr) {
 162                 if (!daddr)
 163                         return -EINVAL;
 164                 nexthop = inet_opt->opt.faddr;
 165         }
 166
 167         orig_sport = inet->inet_sport;
 168         orig_dport = usin->sin_port;
 169         fl4 = &inet->cork.fl.u.ip4;
 170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                               IPPROTO_TCP,
 173                               orig_sport, orig_dport, sk);
 174         if (IS_ERR(rt)) {
 175                 err = PTR_ERR(rt);
 176                 if (err == -ENETUNREACH)
 177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                 return err;
 179         }
 180
 181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                 ip_rt_put(rt);
 183                 return -ENETUNREACH;
 184         }
 185
 186         if (!inet_opt || !inet_opt->opt.srr)
 187                 daddr = fl4->daddr;
 188
 189         if (!inet->inet_saddr)
 190                 inet->inet_saddr = fl4->saddr;
 191         sk_rcv_saddr_set(sk, inet->inet_saddr);
 192
 193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 194                 /* Reset inherited state */
 195                 tp->rx_opt.ts_recent       = 0;
 196                 tp->rx_opt.ts_recent_stamp = 0;
 197                 if (likely(!tp->repair))
 198                         tp->write_seq      = 0;
 199         }
 200
 201         if (tcp_death_row.sysctl_tw_recycle &&
 202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 204
 205         inet->inet_dport = usin->sin_port;
 206         sk_daddr_set(sk, daddr);
 207
 208         inet_csk(sk)->icsk_ext_hdr_len = 0;
 209         if (inet_opt)
 210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 211
 212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 213
 214         /* Socket identity is still unknown (sport may be zero).
 215          * However we set state to SYN-SENT and not releasing socket
 216          * lock select source port, enter ourselves into the hash tables and
 217          * complete initialization after this.
 218          */
 219         tcp_set_state(sk, TCP_SYN_SENT);
 220         err = inet_hash_connect(&tcp_death_row, sk);
 221         if (err)
 222                 goto failure;
 223
 224         sk_set_txhash(sk);
 225
 226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227                                inet->inet_sport, inet->inet_dport, sk);
 228         if (IS_ERR(rt)) {
 229                 err = PTR_ERR(rt);
 230                 rt = NULL;
 231                 goto failure;
 232         }
 233         /* OK, now commit destination to socket.  */
 234         sk->sk_gso_type = SKB_GSO_TCPV4;
 235         sk_setup_caps(sk, &rt->dst);
 236
 237         if (!tp->write_seq && likely(!tp->repair))
 238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239                                                            inet->inet_daddr,
 240                                                            inet->inet_sport,
 241                                                            usin->sin_port);
 242
 243         inet->inet_id = tp->write_seq ^ jiffies;
 244
 245         err = tcp_connect(sk);
 246
 247         rt = NULL;
 248         if (err)
 249                 goto failure;
 250
 251         return 0;
 252
 253 failure:
 254         /*
 255          * This unhashes the socket and releases the local port,
 256          * if necessary.
 257          */
 258         tcp_set_state(sk, TCP_CLOSE);
 259         ip_rt_put(rt);
 260         sk->sk_route_caps = 0;
 261         inet->inet_dport = 0;
 262         return err;
 263 }
 264 EXPORT_SYMBOL(tcp_v4_connect);
 265
 266 /*
 267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 268  * It can be called through tcp_release_cb() if socket was owned by user
 269  * at the time tcp_v4_err() was called to handle ICMP message.
 270  */
 271 void tcp_v4_mtu_reduced(struct sock *sk)
 272 {
 273         struct dst_entry *dst;
 274         struct inet_sock *inet = inet_sk(sk);
 275         u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277         dst = inet_csk_update_pmtu(sk, mtu);
 278         if (!dst)
 279                 return;
 280
 281         /* Something is about to be wrong... Remember soft error
 282          * for the case, if this connection will not able to recover.
 283          */
 284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285                 sk->sk_err_soft = EMSGSIZE;
 286
 287         mtu = dst_mtu(dst);
 288
 289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 290             ip_sk_accept_pmtu(sk) &&
 291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 292                 tcp_sync_mss(sk, mtu);
 293
 294                 /* Resend the TCP packet because it's
 295                  * clear that the old packet has been
 296                  * dropped. This is the new "fast" path mtu
 297                  * discovery.
 298                  */
 299                 tcp_simple_retransmit(sk);
 300         } /* else let the usual retransmit timer handle it */
 301 }
 302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 303
 304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 305 {
 306         struct dst_entry *dst = __sk_dst_check(sk, 0);
 307
 308         if (dst)
 309                 dst->ops->redirect(dst, sk, skb);
 310 }
 311
 312
 313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 315 {
 316         struct request_sock *req = inet_reqsk(sk);
 317         struct net *net = sock_net(sk);
 318
 319         /* ICMPs are not backlogged, hence we cannot get
 320          * an established socket here.
 321          */
 322         if (seq != tcp_rsk(req)->snt_isn) {
 323                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 324         } else if (abort) {
 325                 /*
 326                  * Still in SYN_RECV, just remove it silently.
 327                  * There is no good way to pass the error to the newly
 328                  * created socket, and POSIX does not want network
 329                  * errors returned from accept().
 330                  */
 331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 332                 tcp_listendrop(req->rsk_listener);
 333         }
 334         reqsk_put(req);
 335 }
 336 EXPORT_SYMBOL(tcp_req_err);
 337
 338 /*
 339  * This routine is called by the ICMP module when it gets some
 340  * sort of error condition.  If err < 0 then the socket should
 341  * be closed and the error returned to the user.  If err > 0
 342  * it's just the icmp type << 8 | icmp code.  After adjustment
 343  * header points to the first 8 bytes of the tcp header.  We need
 344  * to find the appropriate port.
 345  *
 346  * The locking strategy used here is very "optimistic". When
 347  * someone else accesses the socket the ICMP is just dropped
 348  * and for some paths there is no check at all.
 349  * A more general error queue to queue errors for later handling
 350  * is probably better.
 351  *
 352  */
 353
 354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 355 {
 356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 358         struct inet_connection_sock *icsk;
 359         struct tcp_sock *tp;
 360         struct inet_sock *inet;
 361         const int type = icmp_hdr(icmp_skb)->type;
 362         const int code = icmp_hdr(icmp_skb)->code;
 363         struct sock *sk;
 364         struct sk_buff *skb;
 365         struct request_sock *fastopen;
 366         __u32 seq, snd_una;
 367         __u32 remaining;
 368         int err;
 369         struct net *net = dev_net(icmp_skb->dev);
 370
 371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 372                                        th->dest, iph->saddr, ntohs(th->source),
 373                                        inet_iif(icmp_skb));
 374         if (!sk) {
 375                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 376                 return;
 377         }
 378         if (sk->sk_state == TCP_TIME_WAIT) {
 379                 inet_twsk_put(inet_twsk(sk));
 380                 return;
 381         }
 382         seq = ntohl(th->seq);
 383         if (sk->sk_state == TCP_NEW_SYN_RECV)
 384                 return tcp_req_err(sk, seq,
 385                                   type == ICMP_PARAMETERPROB ||
 386                                   type == ICMP_TIME_EXCEEDED ||
 387                                   (type == ICMP_DEST_UNREACH &&
 388                                    (code == ICMP_NET_UNREACH ||
 389                                     code == ICMP_HOST_UNREACH)));
 390
 391         bh_lock_sock(sk);
 392         /* If too many ICMPs get dropped on busy
 393          * servers this needs to be solved differently.
 394          * We do take care of PMTU discovery (RFC1191) special case :
 395          * we can receive locally generated ICMP messages while socket is held.
 396          */
 397         if (sock_owned_by_user(sk)) {
 398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 399                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 400         }
 401         if (sk->sk_state == TCP_CLOSE)
 402                 goto out;
 403
 404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 405                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 406                 goto out;
 407         }
 408
 409         icsk = inet_csk(sk);
 410         tp = tcp_sk(sk);
 411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 412         fastopen = tp->fastopen_rsk;
 413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 414         if (sk->sk_state != TCP_LISTEN &&
 415             !between(seq, snd_una, tp->snd_nxt)) {
 416                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 417                 goto out;
 418         }
 419
 420         switch (type) {
 421         case ICMP_REDIRECT:
 422                 do_redirect(icmp_skb, sk);
 423                 goto out;
 424         case ICMP_SOURCE_QUENCH:
 425                 /* Just silently ignore these. */
 426                 goto out;
 427         case ICMP_PARAMETERPROB:
 428                 err = EPROTO;
 429                 break;
 430         case ICMP_DEST_UNREACH:
 431                 if (code > NR_ICMP_UNREACH)
 432                         goto out;
 433
 434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 435                         /* We are not interested in TCP_LISTEN and open_requests
 436                          * (SYN-ACKs send out by Linux are always <576bytes so
 437                          * they should go through unfragmented).
 438                          */
 439                         if (sk->sk_state == TCP_LISTEN)
 440                                 goto out;
 441
 442                         tp->mtu_info = info;
 443                         if (!sock_owned_by_user(sk)) {
 444                                 tcp_v4_mtu_reduced(sk);
 445                         } else {
 446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 447                                         sock_hold(sk);
 448                         }
 449                         goto out;
 450                 }
 451
 452                 err = icmp_err_convert[code].errno;
 453                 /* check if icmp_skb allows revert of backoff
 454                  * (see draft-zimmermann-tcp-lcd) */
 455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 456                         break;
 457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 458                     !icsk->icsk_backoff || fastopen)
 459                         break;
 460
 461                 if (sock_owned_by_user(sk))
 462                         break;
 463
 464                 icsk->icsk_backoff--;
 465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 466                                                TCP_TIMEOUT_INIT;
 467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 468
 469                 skb = tcp_write_queue_head(sk);
 470                 BUG_ON(!skb);
 471
 472                 remaining = icsk->icsk_rto -
 473                             min(icsk->icsk_rto,
 474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 475
 476                 if (remaining) {
 477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 478                                                   remaining, TCP_RTO_MAX);
 479                 } else {
 480                         /* RTO revert clocked out retransmission.
 481                          * Will retransmit now */
 482                         tcp_retransmit_timer(sk);
 483                 }
 484
 485                 break;
 486         case ICMP_TIME_EXCEEDED:
 487                 err = EHOSTUNREACH;
 488                 break;
 489         default:
 490                 goto out;
 491         }
 492
 493         switch (sk->sk_state) {
 494         case TCP_SYN_SENT:
 495         case TCP_SYN_RECV:
 496                 /* Only in fast or simultaneous open. If a fast open socket is
 497                  * is already accepted it is treated as a connected one below.
 498                  */
 499                 if (fastopen && !fastopen->sk)
 500                         break;
 501
 502                 if (!sock_owned_by_user(sk)) {
 503                         sk->sk_err = err;
 504
 505                         sk->sk_error_report(sk);
 506
 507                         tcp_done(sk);
 508                 } else {
 509                         sk->sk_err_soft = err;
 510                 }
 511                 goto out;
 512         }
 513
 514         /* If we've already connected we will keep trying
 515          * until we time out, or the user gives up.
 516          *
 517          * rfc1122 4.2.3.9 allows to consider as hard errors
 518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 519          * but it is obsoleted by pmtu discovery).
 520          *
 521          * Note, that in modern internet, where routing is unreliable
 522          * and in each dark corner broken firewalls sit, sending random
 523          * errors ordered by their masters even this two messages finally lose
 524          * their original sense (even Linux sends invalid PORT_UNREACHs)
 525          *
 526          * Now we are in compliance with RFCs.
 527          *                                                      --ANK (980905)
 528          */
 529
 530         inet = inet_sk(sk);
 531         if (!sock_owned_by_user(sk) && inet->recverr) {
 532                 sk->sk_err = err;
 533                 sk->sk_error_report(sk);
 534         } else  { /* Only an error on timeout */
 535                 sk->sk_err_soft = err;
 536         }
 537
 538 out:
 539         bh_unlock_sock(sk);
 540         sock_put(sk);
 541 }
 542
 543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 544 {
 545         struct tcphdr *th = tcp_hdr(skb);
 546
 547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 549                 skb->csum_start = skb_transport_header(skb) - skb->head;
 550                 skb->csum_offset = offsetof(struct tcphdr, check);
 551         } else {
 552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 553                                          csum_partial(th,
 554                                                       th->doff << 2,
 555                                                       skb->csum));
 556         }
 557 }
 558
 559 /* This routine computes an IPv4 TCP checksum. */
 560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 561 {
 562         const struct inet_sock *inet = inet_sk(sk);
 563
 564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 565 }
 566 EXPORT_SYMBOL(tcp_v4_send_check);
 567
 568 /*
 569  *      This routine will send an RST to the other tcp.
 570  *
 571  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 572  *                    for reset.
 573  *      Answer: if a packet caused RST, it is not for a socket
 574  *              existing in our system, if it is matched to a socket,
 575  *              it is just duplicate segment or bug in other side's TCP.
 576  *              So that we build reply only basing on parameters
 577  *              arrived with segment.
 578  *      Exception: precedence violation. We do not implement it in any case.
 579  */
 580
 581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 582 {
 583         const struct tcphdr *th = tcp_hdr(skb);
 584         struct {
 585                 struct tcphdr th;
 586 #ifdef CONFIG_TCP_MD5SIG
 587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 588 #endif
 589         } rep;
 590         struct ip_reply_arg arg;
 591 #ifdef CONFIG_TCP_MD5SIG
 592         struct tcp_md5sig_key *key = NULL;
 593         const __u8 *hash_location = NULL;
 594         unsigned char newhash[16];
 595         int genhash;
 596         struct sock *sk1 = NULL;
 597 #endif
 598         struct net *net;
 599
 600         /* Never send a reset in response to a reset. */
 601         if (th->rst)
 602                 return;
 603
 604         /* If sk not NULL, it means we did a successful lookup and incoming
 605          * route had to be correct. prequeue might have dropped our dst.
 606          */
 607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 608                 return;
 609
 610         /* Swap the send and the receive. */
 611         memset(&rep, 0, sizeof(rep));
 612         rep.th.dest   = th->source;
 613         rep.th.source = th->dest;
 614         rep.th.doff   = sizeof(struct tcphdr) / 4;
 615         rep.th.rst    = 1;
 616
 617         if (th->ack) {
 618                 rep.th.seq = th->ack_seq;
 619         } else {
 620                 rep.th.ack = 1;
 621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 622                                        skb->len - (th->doff << 2));
 623         }
 624
 625         memset(&arg, 0, sizeof(arg));
 626         arg.iov[0].iov_base = (unsigned char *)&rep;
 627         arg.iov[0].iov_len  = sizeof(rep.th);
 628
 629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 630 #ifdef CONFIG_TCP_MD5SIG
 631         rcu_read_lock();
 632         hash_location = tcp_parse_md5sig_option(th);
 633         if (sk && sk_fullsock(sk)) {
 634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 635                                         &ip_hdr(skb)->saddr, AF_INET);
 636         } else if (hash_location) {
 637                 /*
 638                  * active side is lost. Try to find listening socket through
 639                  * source port, and then find md5 key through listening socket.
 640                  * we are not loose security here:
 641                  * Incoming packet is checked with md5 hash with finding key,
 642                  * no RST generated if md5 hash doesn't match.
 643                  */
 644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 645                                              ip_hdr(skb)->saddr,
 646                                              th->source, ip_hdr(skb)->daddr,
 647                                              ntohs(th->source), inet_iif(skb));
 648                 /* don't send rst if it can't find key */
 649                 if (!sk1)
 650                         goto out;
 651
 652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 653                                         &ip_hdr(skb)->saddr, AF_INET);
 654                 if (!key)
 655                         goto out;
 656
 657
 658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 660                         goto out;
 661
 662         }
 663
 664         if (key) {
 665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 666                                    (TCPOPT_NOP << 16) |
 667                                    (TCPOPT_MD5SIG << 8) |
 668                                    TCPOLEN_MD5SIG);
 669                 /* Update length and the length the header thinks exists */
 670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 671                 rep.th.doff = arg.iov[0].iov_len / 4;
 672
 673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 674                                      key, ip_hdr(skb)->saddr,
 675                                      ip_hdr(skb)->daddr, &rep.th);
 676         }
 677 #endif
 678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 679                                       ip_hdr(skb)->saddr, /* XXX */
 680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 683
 684         /* When socket is gone, all binding information is lost.
 685          * routing might fail in this case. No choice here, if we choose to force
 686          * input interface, we will misroute in case of asymmetric route.
 687          */
 688         if (sk)
 689                 arg.bound_dev_if = sk->sk_bound_dev_if;
 690
 691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 693
 694         arg.tos = ip_hdr(skb)->tos;
 695         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 696                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 697                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 698                               &arg, arg.iov[0].iov_len);
 699
 700         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 701         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 702
 703 #ifdef CONFIG_TCP_MD5SIG
 704 out:
 705         rcu_read_unlock();
 706 #endif
 707 }
 708
 709 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 710    outside socket context is ugly, certainly. What can I do?
 711  */
 712
/*
 * Build a bare ACK segment on the stack and transmit it as a reply to @skb
 * through the per-cpu control socket found in net->ipv4.tcp_sk.
 *
 * @net:         namespace providing the control socket and MIB counters
 * @skb:         received segment being answered; its ports are swapped
 *               into the reply and its IP addresses feed the checksum
 * @seq:         sequence number of the reply (host byte order)
 * @ack:         acknowledgment number of the reply (host byte order)
 * @win:         value written to the window field through htons()
 * @tsval:       timestamp value, emitted only when @tsecr is non-zero
 * @tsecr:       timestamp echo; zero suppresses the timestamp option
 * @oif:         when non-zero, copied into arg.bound_dev_if
 * @key:         MD5 key used to sign the reply, or NULL (only examined
 *               when CONFIG_TCP_MD5SIG is enabled)
 * @reply_flags: copied into arg.flags
 * @tos:         copied into arg.tos
 */
static void tcp_v4_send_ack(struct net *net,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        /* Reply buffer: TCP header followed by enough option words for a
         * timestamp option and, if configured, an MD5 signature option.
         */
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;

        /* Only the header is zeroed; option words are written on demand
         * and iov_len below limits what is actually sent.
         */
        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                /* NOP, NOP, TIMESTAMP kind/len, then tsval and tsecr. */
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        /* Data offset is expressed in 32-bit words. */
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                /* The MD5 option follows the three timestamp words when
                 * the timestamp option was emitted above.
                 */
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                /* Header grew: recompute doff before it gets hashed. */
                rep.th.doff = arg.iov[0].iov_len/4;

                /* The reply goes back to the sender, so the received
                 * saddr is passed as daddr and vice versa.
                 */
                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        /* Pseudo-header checksum seed over the swapped addresses. */
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        /* Offset of the check field, counted in 16-bit units. */
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
 784
 785 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 786 {
 787         struct inet_timewait_sock *tw = inet_twsk(sk);
 788         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 789
 790         tcp_v4_send_ack(sock_net(sk), skb,
 791                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 792                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 793                         tcp_time_stamp + tcptw->tw_ts_offset,
 794                         tcptw->tw_ts_recent,
 795                         tw->tw_bound_dev_if,
 796                         tcp_twsk_md5_key(tcptw),
 797                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 798                         tw->tw_tos
 799                         );
 800
 801         inet_twsk_put(tw);
 802 }
 803
 804 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 805                                   struct request_sock *req)
 806 {
 807         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 808          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 809          */
 810         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 811                                              tcp_sk(sk)->snd_nxt;
 812
 813         tcp_v4_send_ack(sock_net(sk), skb, seq,
 814                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
 815                         tcp_time_stamp,
 816                         req->ts_recent,
 817                         0,
 818                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 819                                           AF_INET),
 820                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 821                         ip_hdr(skb)->tos);
 822 }
 823
 824 /*
 825  *      Send a SYN-ACK after having received a SYN.
 826  *      This still operates on a request_sock only, not on a big
 827  *      socket.
 828  */
 829 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 830                               struct flowi *fl,
 831                               struct request_sock *req,
 832                               struct tcp_fastopen_cookie *foc,
 833                               enum tcp_synack_type synack_type)
 834 {
 835         const struct inet_request_sock *ireq = inet_rsk(req);
 836         struct flowi4 fl4;
 837         int err = -1;
 838         struct sk_buff *skb;
 839
 840         /* First, grab a route. */
 841         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 842                 return -1;
 843
 844         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 845
 846         if (skb) {
 847                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 848
 849                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 850                                             ireq->ir_rmt_addr,
 851                                             ireq->opt);
 852                 err = net_xmit_eval(err);
 853         }
 854
 855         return err;
 856 }
 857
 858 /*
 859  *      IPv4 request_sock destructor.
 860  */
 861 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 862 {
 863         kfree(inet_rsk(req)->opt);
 864 }
 865
 866 #ifdef CONFIG_TCP_MD5SIG
 867 /*
 868  * RFC2385 MD5 checksumming requires a mapping of
 869  * IP address->MD5 Key.
 870  * We need to maintain these in the sk structure.
 871  */
 872
 873 /* Find the Key structure for an address.  */
 874 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 875                                          const union tcp_md5_addr *addr,
 876                                          int family)
 877 {
 878         const struct tcp_sock *tp = tcp_sk(sk);
 879         struct tcp_md5sig_key *key;
 880         unsigned int size = sizeof(struct in_addr);
 881         const struct tcp_md5sig_info *md5sig;
 882
 883         /* caller either holds rcu_read_lock() or socket lock */
 884         md5sig = rcu_dereference_check(tp->md5sig_info,
 885                                        lockdep_sock_is_held(sk));
 886         if (!md5sig)
 887                 return NULL;
 888 #if IS_ENABLED(CONFIG_IPV6)
 889         if (family == AF_INET6)
 890                 size = sizeof(struct in6_addr);
 891 #endif
 892         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 893                 if (key->family != family)
 894                         continue;
 895                 if (!memcmp(&key->addr, addr, size))
 896                         return key;
 897         }
 898         return NULL;
 899 }
 900 EXPORT_SYMBOL(tcp_md5_do_lookup);
 901
 902 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 903                                          const struct sock *addr_sk)
 904 {
 905         const union tcp_md5_addr *addr;
 906
 907         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 908         return tcp_md5_do_lookup(sk, addr, AF_INET);
 909 }
 910 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 911
/* This can be called on a newly created socket, from other files.
 *
 * Install (or update) the MD5 key for peer @addr of @family on @sk.
 *
 * @newkey/@newkeylen: key material copied into the entry. The length is
 *                     not bounded here; tcp_v4_parse_md5_keys() rejects
 *                     lengths above TCP_MD5SIG_MAXKEYLEN before calling.
 * @gfp:               allocation flags for the info block and the key.
 *
 * Returns 0 on success or -ENOMEM when an allocation (or the MD5 pool
 * setup) fails.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                /* First key on this socket: create the list head lazily. */
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                /* GSO capabilities are masked out once MD5 is in use. */
                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        /* Allocated with sock_kmalloc(); the delete paths in this file
         * subtract sizeof(*key) from sk->sk_omem_alloc again.
         */
        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        /* Fill the entry completely before publishing it on the list. */
        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
 959
 960 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 961 {
 962         struct tcp_md5sig_key *key;
 963
 964         key = tcp_md5_do_lookup(sk, addr, family);
 965         if (!key)
 966                 return -ENOENT;
 967         hlist_del_rcu(&key->node);
 968         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 969         kfree_rcu(key, rcu);
 970         return 0;
 971 }
 972 EXPORT_SYMBOL(tcp_md5_do_del);
 973
 974 static void tcp_clear_md5_list(struct sock *sk)
 975 {
 976         struct tcp_sock *tp = tcp_sk(sk);
 977         struct tcp_md5sig_key *key;
 978         struct hlist_node *n;
 979         struct tcp_md5sig_info *md5sig;
 980
 981         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 982
 983         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
 984                 hlist_del_rcu(&key->node);
 985                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 986                 kfree_rcu(key, rcu);
 987         }
 988 }
 989
 990 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 991                                  int optlen)
 992 {
 993         struct tcp_md5sig cmd;
 994         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 995
 996         if (optlen < sizeof(cmd))
 997                 return -EINVAL;
 998
 999         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1000                 return -EFAULT;
1001
1002         if (sin->sin_family != AF_INET)
1003                 return -EINVAL;
1004
1005         if (!cmd.tcpm_keylen)
1006                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1007                                       AF_INET);
1008
1009         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1010                 return -EINVAL;
1011
1012         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1013                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1014                               GFP_KERNEL);
1015 }
1016
1017 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1018                                         __be32 daddr, __be32 saddr, int nbytes)
1019 {
1020         struct tcp4_pseudohdr *bp;
1021         struct scatterlist sg;
1022
1023         bp = &hp->md5_blk.ip4;
1024
1025         /*
1026          * 1. the TCP pseudo-header (in the order: source IP address,
1027          * destination IP address, zero-padded protocol number, and
1028          * segment length)
1029          */
1030         bp->saddr = saddr;
1031         bp->daddr = daddr;
1032         bp->pad = 0;
1033         bp->protocol = IPPROTO_TCP;
1034         bp->len = cpu_to_be16(nbytes);
1035
1036         sg_init_one(&sg, bp, sizeof(*bp));
1037         ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1038         return crypto_ahash_update(hp->md5_req);
1039 }
1040
1041 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1042                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1043 {
1044         struct tcp_md5sig_pool *hp;
1045         struct ahash_request *req;
1046
1047         hp = tcp_get_md5sig_pool();
1048         if (!hp)
1049                 goto clear_hash_noput;
1050         req = hp->md5_req;
1051
1052         if (crypto_ahash_init(req))
1053                 goto clear_hash;
1054         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1055                 goto clear_hash;
1056         if (tcp_md5_hash_header(hp, th))
1057                 goto clear_hash;
1058         if (tcp_md5_hash_key(hp, key))
1059                 goto clear_hash;
1060         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1061         if (crypto_ahash_final(req))
1062                 goto clear_hash;
1063
1064         tcp_put_md5sig_pool();
1065         return 0;
1066
1067 clear_hash:
1068         tcp_put_md5sig_pool();
1069 clear_hash_noput:
1070         memset(md5_hash, 0, 16);
1071         return 1;
1072 }
1073
1074 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1075                         const struct sock *sk,
1076                         const struct sk_buff *skb)
1077 {
1078         struct tcp_md5sig_pool *hp;
1079         struct ahash_request *req;
1080         const struct tcphdr *th = tcp_hdr(skb);
1081         __be32 saddr, daddr;
1082
1083         if (sk) { /* valid for establish/request sockets */
1084                 saddr = sk->sk_rcv_saddr;
1085                 daddr = sk->sk_daddr;
1086         } else {
1087                 const struct iphdr *iph = ip_hdr(skb);
1088                 saddr = iph->saddr;
1089                 daddr = iph->daddr;
1090         }
1091
1092         hp = tcp_get_md5sig_pool();
1093         if (!hp)
1094                 goto clear_hash_noput;
1095         req = hp->md5_req;
1096
1097         if (crypto_ahash_init(req))
1098                 goto clear_hash;
1099
1100         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1101                 goto clear_hash;
1102         if (tcp_md5_hash_header(hp, th))
1103                 goto clear_hash;
1104         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1105                 goto clear_hash;
1106         if (tcp_md5_hash_key(hp, key))
1107                 goto clear_hash;
1108         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1109         if (crypto_ahash_final(req))
1110                 goto clear_hash;
1111
1112         tcp_put_md5sig_pool();
1113         return 0;
1114
1115 clear_hash:
1116         tcp_put_md5sig_pool();
1117 clear_hash_noput:
1118         memset(md5_hash, 0, 16);
1119         return 1;
1120 }
1121 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1122
1123 #endif
1124
1125 /* Called with rcu_read_lock() */
1126 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1127                                     const struct sk_buff *skb)
1128 {
1129 #ifdef CONFIG_TCP_MD5SIG
1130         /*
1131          * This gets called for each TCP segment that arrives
1132          * so we want to be efficient.
1133          * We have 3 drop cases:
1134          * o No MD5 hash and one expected.
1135          * o MD5 hash and we're not expecting one.
1136          * o MD5 hash and its wrong.
1137          */
1138         const __u8 *hash_location = NULL;
1139         struct tcp_md5sig_key *hash_expected;
1140         const struct iphdr *iph = ip_hdr(skb);
1141         const struct tcphdr *th = tcp_hdr(skb);
1142         int genhash;
1143         unsigned char newhash[16];
1144
1145         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1146                                           AF_INET);
1147         hash_location = tcp_parse_md5sig_option(th);
1148
1149         /* We've parsed the options - do we have a hash? */
1150         if (!hash_expected && !hash_location)
1151                 return false;
1152
1153         if (hash_expected && !hash_location) {
1154                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1155                 return true;
1156         }
1157
1158         if (!hash_expected && hash_location) {
1159                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1160                 return true;
1161         }
1162
1163         /* Okay, so this is hash_expected and hash_location -
1164          * so we need to calculate the checksum.
1165          */
1166         genhash = tcp_v4_md5_hash_skb(newhash,
1167                                       hash_expected,
1168                                       NULL, skb);
1169
1170         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1171                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1172                                      &iph->saddr, ntohs(th->source),
1173                                      &iph->daddr, ntohs(th->dest),
1174                                      genhash ? " tcp_v4_calc_md5_hash failed"
1175                                      : "");
1176                 return true;
1177         }
1178         return false;
1179 #endif
1180         return false;
1181 }
1182
1183 static void tcp_v4_init_req(struct request_sock *req,
1184                             const struct sock *sk_listener,
1185                             struct sk_buff *skb)
1186 {
1187         struct inet_request_sock *ireq = inet_rsk(req);
1188
1189         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1190         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1191         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1192         ireq->opt = tcp_v4_save_options(skb);
1193 }
1194
1195 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1196                                           struct flowi *fl,
1197                                           const struct request_sock *req,
1198                                           bool *strict)
1199 {
1200         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1201
1202         if (strict) {
1203                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1204                         *strict = true;
1205                 else
1206                         *strict = false;
1207         }
1208
1209         return dst;
1210 }
1211
/* request_sock operations for TCP over IPv4; handed to tcp_conn_request()
 * by tcp_v4_conn_request().
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        /* Each request is allocated as a struct tcp_request_sock. */
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        /* Frees the IP options saved in the request. */
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};
1221
/* IPv4 specific TCP request hooks; handed to tcp_conn_request() together
 * with tcp_request_sock_ops by tcp_v4_conn_request().
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        /* MD5 signature support is compiled in only on demand. */
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_sequence,
        .send_synack    =       tcp_v4_send_synack,
};
1236
1237 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1238 {
1239         /* Never answer to SYNs send to broadcast or multicast */
1240         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241                 goto drop;
1242
1243         return tcp_conn_request(&tcp_request_sock_ops,
1244                                 &tcp_request_sock_ipv4_ops, sk, skb);
1245
1246 drop:
1247         tcp_listendrop(sk);
1248         return 0;
1249 }
1250 EXPORT_SYMBOL(tcp_v4_conn_request);
1251
1252
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:         listening socket
 * @skb:        segment that triggered the creation of the child
 * @req:        request socket the child is built from
 * @dst:        route to use, or NULL to look one up here
 * @req_unhash: request whose hash slot the child takes over
 * @own_req:    set from the result of inet_ehash_nolisten()
 *
 * Returns the new child socket, or NULL after accounting a listen drop.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        /* Copy addressing and IP-level state from the request and the
         * triggering packet into the child.
         */
        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr           = ireq->ir_loc_addr;
        /* The saved IP options move from the request to the child; the
         * request pointer is cleared so they are not freed with it.
         */
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        /* Derive MSS values from the route, capped by the listener's
         * user_mss when one is set and smaller.
         */
        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric_advmss(dst);
        if (tcp_sk(sk)->rx_opt.user_mss &&
            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

        /* Error exits: the labels fall through so each path undoes only
         * what was set up before the failure.
         */
exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        /* The child already exists here, so it has to be torn down. */
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1357
/* With CONFIG_SYN_COOKIES, hand every non-SYN segment to
 * cookie_v4_check() and return its result; otherwise return @sk
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v4_check(sk, skb);
#endif
        return sk;
}
1368
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Always returns 0; on the reset/discard/checksum-error paths the skb
 * is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sock *rsk;

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst = sk->sk_rx_dst;

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        /* Drop the cached input route when the packet came
                         * in on another interface or the route no longer
                         * validates.
                         */
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                            !dst->ops->check(dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
                }
                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
                return 0;
        }

        if (tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                /* May return the listener itself, another socket, or NULL
                 * when the segment is to be discarded.
                 */
                struct sock *nsk = tcp_v4_cookie_check(sk, skb);

                if (!nsk)
                        goto discard;
                if (nsk != sk) {
                        sock_rps_save_rxhash(nsk, skb);
                        sk_mark_napi_id(nsk, skb);
                        if (tcp_child_process(sk, nsk, skb)) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);

        if (tcp_rcv_state_process(sk, skb)) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        /* rsk is the socket on whose behalf the reset is sent. */
        tcp_v4_send_reset(rsk, skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1440
1441 void tcp_v4_early_demux(struct sk_buff *skb)
1442 {
1443         const struct iphdr *iph;
1444         const struct tcphdr *th;
1445         struct sock *sk;
1446
1447         if (skb->pkt_type != PACKET_HOST)
1448                 return;
1449
1450         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1451                 return;
1452
1453         iph = ip_hdr(skb);
1454         th = tcp_hdr(skb);
1455
1456         if (th->doff < sizeof(struct tcphdr) / 4)
1457                 return;
1458
1459         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1460                                        iph->saddr, th->source,
1461                                        iph->daddr, ntohs(th->dest),
1462                                        skb->skb_iif);
1463         if (sk) {
1464                 skb->sk = sk;
1465                 skb->destructor = sock_edemux;
1466                 if (sk_fullsock(sk)) {
1467                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1468
1469                         if (dst)
1470                                 dst = dst_check(dst, 0);
1471                         if (dst &&
1472                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1473                                 skb_dst_set_noref(skb, dst);
1474                 }
1475         }
1476 }
1477
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)                               --ANK
 *
 * Returns true when the skb was taken over by the prequeue, false when
 * the caller has to process it itself.
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Prequeueing is skipped in low-latency mode or when no reader
         * task is registered.
         */
        if (sysctl_tcp_low_latency || !tp->ucopy.task)
                return false;

        /* A segment with no payload is not queued onto an empty prequeue. */
        if (skb->len <= tcp_hdrlen(skb) &&
            skb_queue_len(&tp->ucopy.prequeue) == 0)
                return false;

        /* Before escaping RCU protected region, we need to take care of skb
         * dst. Prequeue is only enabled for established sockets.
         * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
         * Instead of doing full sk_rx_dst validity here, let's perform
         * an optimistic check.
         */
        if (likely(sk->sk_rx_dst))
                skb_dst_drop(skb);
        else
                skb_dst_force_safe(skb);

        __skb_queue_tail(&tp->ucopy.prequeue, skb);
        tp->ucopy.memory += skb->truesize;
        if (tp->ucopy.memory > sk->sk_rcvbuf) {
                struct sk_buff *skb1;

                BUG_ON(sock_owned_by_user(sk));

                /* Prequeue exceeded the receive buffer: run everything
                 * queued so far through sk_backlog_rcv() right away.
                 */
                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
                        sk_backlog_rcv(sk, skb1);
                        NET_INC_STATS_BH(sock_net(sk),
                                         LINUX_MIB_TCPPREQUEUEDROPPED);
                }

                tp->ucopy.memory = 0;
        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                /* First skb on the prequeue: wake the reader and, unless
                 * an ACK is already scheduled, arm the delayed-ACK timer.
                 */
                wake_up_interruptible_sync_poll(sk_sleep(sk),
                                           POLLIN | POLLRDNORM | POLLRDBAND);
                if (!inet_csk_ack_scheduled(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                                  (3 * tcp_rto_min(sk)) / 4,
                                                  TCP_RTO_MAX);
        }
        return true;
}
EXPORT_SYMBOL(tcp_prequeue);
1532
1533 /*
1534  *      From tcp_input.c
1535  */
1536
1537 int tcp_v4_rcv(struct sk_buff *skb)
1538 {
1539         struct net *net = dev_net(skb->dev);
1540         const struct iphdr *iph;
1541         const struct tcphdr *th;
1542         bool refcounted;
1543         struct sock *sk;
1544         int ret;
1545
1546         if (skb->pkt_type != PACKET_HOST)
1547                 goto discard_it;
1548
1549         /* Count it even if it's bad */
1550         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1551
1552         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1553                 goto discard_it;
1554
1555         th = tcp_hdr(skb);
1556
1557         if (th->doff < sizeof(struct tcphdr) / 4)
1558                 goto bad_packet;
1559         if (!pskb_may_pull(skb, th->doff * 4))
1560                 goto discard_it;
1561
1562         /* An explanation is required here, I think.
1563          * Packet length and doff are validated by header prediction,
1564          * provided case of th->doff==0 is eliminated.
1565          * So, we defer the checks. */
1566
1567         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1568                 goto csum_error;
1569
1570         th = tcp_hdr(skb);
1571         iph = ip_hdr(skb);
1572         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1573          * barrier() makes sure compiler wont play fool^Waliasing games.
1574          */
1575         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1576                 sizeof(struct inet_skb_parm));
1577         barrier();
1578
1579         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1580         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1581                                     skb->len - th->doff * 4);
1582         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1583         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1584         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1585         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1586         TCP_SKB_CB(skb)->sacked  = 0;
1587
1588 lookup:
1589         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1590                                th->dest, &refcounted);
1591         if (!sk)
1592                 goto no_tcp_socket;
1593
1594 process:
1595         if (sk->sk_state == TCP_TIME_WAIT)
1596                 goto do_time_wait;
1597
1598         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1599                 struct request_sock *req = inet_reqsk(sk);
1600                 struct sock *nsk;
1601
1602                 sk = req->rsk_listener;
1603                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1604                         reqsk_put(req);
1605                         goto discard_it;
1606                 }
1607                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1608                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1609                         goto lookup;
1610                 }
1611                 /* We own a reference on the listener, increase it again
1612                  * as we might lose it too soon.
1613                  */
1614                 sock_hold(sk);
1615                 refcounted = true;
1616                 nsk = tcp_check_req(sk, skb, req, false);
1617                 if (!nsk) {
1618                         reqsk_put(req);
1619                         goto discard_and_relse;
1620                 }
1621                 if (nsk == sk) {
1622                         reqsk_put(req);
1623                 } else if (tcp_child_process(sk, nsk, skb)) {
1624                         tcp_v4_send_reset(nsk, skb);
1625                         goto discard_and_relse;
1626                 } else {
1627                         sock_put(sk);
1628                         return 0;
1629                 }
1630         }
1631         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1632                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1633                 goto discard_and_relse;
1634         }
1635
1636         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1637                 goto discard_and_relse;
1638
1639         if (tcp_v4_inbound_md5_hash(sk, skb))
1640                 goto discard_and_relse;
1641
1642         nf_reset(skb);
1643
1644         if (sk_filter(sk, skb))
1645                 goto discard_and_relse;
1646
1647         skb->dev = NULL;
1648
1649         if (sk->sk_state == TCP_LISTEN) {
1650                 ret = tcp_v4_do_rcv(sk, skb);
1651                 goto put_and_return;
1652         }
1653
1654         sk_incoming_cpu_update(sk);
1655
1656         bh_lock_sock_nested(sk);
1657         tcp_segs_in(tcp_sk(sk), skb);
1658         ret = 0;
1659         if (!sock_owned_by_user(sk)) {
1660                 if (!tcp_prequeue(sk, skb))
1661                         ret = tcp_v4_do_rcv(sk, skb);
1662         } else if (unlikely(sk_add_backlog(sk, skb,
1663                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1664                 bh_unlock_sock(sk);
1665                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1666                 goto discard_and_relse;
1667         }
1668         bh_unlock_sock(sk);
1669
1670 put_and_return:
1671         if (refcounted)
1672                 sock_put(sk);
1673
1674         return ret;
1675
1676 no_tcp_socket:
1677         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1678                 goto discard_it;
1679
1680         if (tcp_checksum_complete(skb)) {
1681 csum_error:
1682                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1683 bad_packet:
1684                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1685         } else {
1686                 tcp_v4_send_reset(NULL, skb);
1687         }
1688
1689 discard_it:
1690         /* Discard frame. */
1691         kfree_skb(skb);
1692         return 0;
1693
1694 discard_and_relse:
1695         sk_drops_add(sk, skb);
1696         if (refcounted)
1697                 sock_put(sk);
1698         goto discard_it;
1699
1700 do_time_wait:
1701         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1702                 inet_twsk_put(inet_twsk(sk));
1703                 goto discard_it;
1704         }
1705
1706         if (tcp_checksum_complete(skb)) {
1707                 inet_twsk_put(inet_twsk(sk));
1708                 goto csum_error;
1709         }
1710         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1711         case TCP_TW_SYN: {
1712                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1713                                                         &tcp_hashinfo, skb,
1714                                                         __tcp_hdrlen(th),
1715                                                         iph->saddr, th->source,
1716                                                         iph->daddr, th->dest,
1717                                                         inet_iif(skb));
1718                 if (sk2) {
1719                         inet_twsk_deschedule_put(inet_twsk(sk));
1720                         sk = sk2;
1721                         refcounted = false;
1722                         goto process;
1723                 }
1724                 /* Fall through to ACK */
1725         }
1726         case TCP_TW_ACK:
1727                 tcp_v4_timewait_ack(sk, skb);
1728                 break;
1729         case TCP_TW_RST:
1730                 tcp_v4_send_reset(sk, skb);
1731                 inet_twsk_deschedule_put(inet_twsk(sk));
1732                 goto discard_it;
1733         case TCP_TW_SUCCESS:;
1734         }
1735         goto discard_it;
1736 }
1737
1738 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1739         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1740         .twsk_unique    = tcp_twsk_unique,
1741         .twsk_destructor= tcp_twsk_destructor,
1742 };
1743
1744 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1745 {
1746         struct dst_entry *dst = skb_dst(skb);
1747
1748         if (dst && dst_hold_safe(dst)) {
1749                 sk->sk_rx_dst = dst;
1750                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1751         }
1752 }
1753 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1754
1755 const struct inet_connection_sock_af_ops ipv4_specific = {
1756         .queue_xmit        = ip_queue_xmit,
1757         .send_check        = tcp_v4_send_check,
1758         .rebuild_header    = inet_sk_rebuild_header,
1759         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1760         .conn_request      = tcp_v4_conn_request,
1761         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1762         .net_header_len    = sizeof(struct iphdr),
1763         .setsockopt        = ip_setsockopt,
1764         .getsockopt        = ip_getsockopt,
1765         .addr2sockaddr     = inet_csk_addr2sockaddr,
1766         .sockaddr_len      = sizeof(struct sockaddr_in),
1767         .bind_conflict     = inet_csk_bind_conflict,
1768 #ifdef CONFIG_COMPAT
1769         .compat_setsockopt = compat_ip_setsockopt,
1770         .compat_getsockopt = compat_ip_getsockopt,
1771 #endif
1772         .mtu_reduced       = tcp_v4_mtu_reduced,
1773 };
1774 EXPORT_SYMBOL(ipv4_specific);
1775
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature hooks; installed by
 * tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup    = tcp_v4_md5_lookup,
	.md5_parse     = tcp_v4_parse_md5_keys,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1783
1784 /* NOTE: A lot of things set to zero explicitly by call to
1785  *       sk_alloc() so need not be done here.
1786  */
1787 static int tcp_v4_init_sock(struct sock *sk)
1788 {
1789         struct inet_connection_sock *icsk = inet_csk(sk);
1790
1791         tcp_init_sock(sk);
1792
1793         icsk->icsk_af_ops = &ipv4_specific;
1794
1795 #ifdef CONFIG_TCP_MD5SIG
1796         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1797 #endif
1798
1799         return 0;
1800 }
1801
1802 void tcp_v4_destroy_sock(struct sock *sk)
1803 {
1804         struct tcp_sock *tp = tcp_sk(sk);
1805
1806         tcp_clear_xmit_timers(sk);
1807
1808         tcp_cleanup_congestion_control(sk);
1809
1810         /* Cleanup up the write buffer. */
1811         tcp_write_queue_purge(sk);
1812
1813         /* Cleans up our, hopefully empty, out_of_order_queue. */
1814         __skb_queue_purge(&tp->out_of_order_queue);
1815
1816 #ifdef CONFIG_TCP_MD5SIG
1817         /* Clean up the MD5 key list, if any */
1818         if (tp->md5sig_info) {
1819                 tcp_clear_md5_list(sk);
1820                 kfree_rcu(tp->md5sig_info, rcu);
1821                 tp->md5sig_info = NULL;
1822         }
1823 #endif
1824
1825         /* Clean prequeue, it must be empty really */
1826         __skb_queue_purge(&tp->ucopy.prequeue);
1827
1828         /* Clean up a referenced TCP bind bucket. */
1829         if (inet_csk(sk)->icsk_bind_hash)
1830                 inet_put_port(sk);
1831
1832         BUG_ON(tp->fastopen_rsk);
1833
1834         /* If socket is aborted during connect operation */
1835         tcp_free_fastopen_req(tp);
1836         tcp_saved_syn_free(tp);
1837
1838         sk_sockets_allocated_dec(sk);
1839
1840         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1841                 sock_release_memcg(sk);
1842 }
1843 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1844
1845 #ifdef CONFIG_PROC_FS
1846 /* Proc filesystem TCP sock list dumping. */
1847
1848 /*
1849  * Get next listener socket follow cur.  If cur is NULL, get first socket
1850  * starting from bucket given in st->bucket; when st->bucket is zero the
1851  * very first socket in the hash table is returned.
1852  */
1853 static void *listening_get_next(struct seq_file *seq, void *cur)
1854 {
1855         struct tcp_iter_state *st = seq->private;
1856         struct net *net = seq_file_net(seq);
1857         struct inet_listen_hashbucket *ilb;
1858         struct inet_connection_sock *icsk;
1859         struct sock *sk = cur;
1860
1861         if (!sk) {
1862 get_head:
1863                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1864                 spin_lock_bh(&ilb->lock);
1865                 sk = sk_head(&ilb->head);
1866                 st->offset = 0;
1867                 goto get_sk;
1868         }
1869         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870         ++st->num;
1871         ++st->offset;
1872
1873         sk = sk_next(sk);
1874 get_sk:
1875         sk_for_each_from(sk) {
1876                 if (!net_eq(sock_net(sk), net))
1877                         continue;
1878                 if (sk->sk_family == st->family)
1879                         return sk;
1880                 icsk = inet_csk(sk);
1881         }
1882         spin_unlock_bh(&ilb->lock);
1883         st->offset = 0;
1884         if (++st->bucket < INET_LHTABLE_SIZE)
1885                 goto get_head;
1886         return NULL;
1887 }
1888
1889 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1890 {
1891         struct tcp_iter_state *st = seq->private;
1892         void *rc;
1893
1894         st->bucket = 0;
1895         st->offset = 0;
1896         rc = listening_get_next(seq, NULL);
1897
1898         while (rc && *pos) {
1899                 rc = listening_get_next(seq, rc);
1900                 --*pos;
1901         }
1902         return rc;
1903 }
1904
1905 static inline bool empty_bucket(const struct tcp_iter_state *st)
1906 {
1907         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1908 }
1909
1910 /*
1911  * Get first established socket starting from bucket given in st->bucket.
1912  * If st->bucket is zero, the very first socket in the hash is returned.
1913  */
1914 static void *established_get_first(struct seq_file *seq)
1915 {
1916         struct tcp_iter_state *st = seq->private;
1917         struct net *net = seq_file_net(seq);
1918         void *rc = NULL;
1919
1920         st->offset = 0;
1921         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1922                 struct sock *sk;
1923                 struct hlist_nulls_node *node;
1924                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1925
1926                 /* Lockless fast path for the common case of empty buckets */
1927                 if (empty_bucket(st))
1928                         continue;
1929
1930                 spin_lock_bh(lock);
1931                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1932                         if (sk->sk_family != st->family ||
1933                             !net_eq(sock_net(sk), net)) {
1934                                 continue;
1935                         }
1936                         rc = sk;
1937                         goto out;
1938                 }
1939                 spin_unlock_bh(lock);
1940         }
1941 out:
1942         return rc;
1943 }
1944
1945 static void *established_get_next(struct seq_file *seq, void *cur)
1946 {
1947         struct sock *sk = cur;
1948         struct hlist_nulls_node *node;
1949         struct tcp_iter_state *st = seq->private;
1950         struct net *net = seq_file_net(seq);
1951
1952         ++st->num;
1953         ++st->offset;
1954
1955         sk = sk_nulls_next(sk);
1956
1957         sk_nulls_for_each_from(sk, node) {
1958                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1959                         return sk;
1960         }
1961
1962         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1963         ++st->bucket;
1964         return established_get_first(seq);
1965 }
1966
1967 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1968 {
1969         struct tcp_iter_state *st = seq->private;
1970         void *rc;
1971
1972         st->bucket = 0;
1973         rc = established_get_first(seq);
1974
1975         while (rc && pos) {
1976                 rc = established_get_next(seq, rc);
1977                 --pos;
1978         }
1979         return rc;
1980 }
1981
1982 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1983 {
1984         void *rc;
1985         struct tcp_iter_state *st = seq->private;
1986
1987         st->state = TCP_SEQ_STATE_LISTENING;
1988         rc        = listening_get_idx(seq, &pos);
1989
1990         if (!rc) {
1991                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1992                 rc        = established_get_idx(seq, pos);
1993         }
1994
1995         return rc;
1996 }
1997
1998 static void *tcp_seek_last_pos(struct seq_file *seq)
1999 {
2000         struct tcp_iter_state *st = seq->private;
2001         int offset = st->offset;
2002         int orig_num = st->num;
2003         void *rc = NULL;
2004
2005         switch (st->state) {
2006         case TCP_SEQ_STATE_LISTENING:
2007                 if (st->bucket >= INET_LHTABLE_SIZE)
2008                         break;
2009                 st->state = TCP_SEQ_STATE_LISTENING;
2010                 rc = listening_get_next(seq, NULL);
2011                 while (offset-- && rc)
2012                         rc = listening_get_next(seq, rc);
2013                 if (rc)
2014                         break;
2015                 st->bucket = 0;
2016                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2017                 /* Fallthrough */
2018         case TCP_SEQ_STATE_ESTABLISHED:
2019                 if (st->bucket > tcp_hashinfo.ehash_mask)
2020                         break;
2021                 rc = established_get_first(seq);
2022                 while (offset-- && rc)
2023                         rc = established_get_next(seq, rc);
2024         }
2025
2026         st->num = orig_num;
2027
2028         return rc;
2029 }
2030
2031 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2032 {
2033         struct tcp_iter_state *st = seq->private;
2034         void *rc;
2035
2036         if (*pos && *pos == st->last_pos) {
2037                 rc = tcp_seek_last_pos(seq);
2038                 if (rc)
2039                         goto out;
2040         }
2041
2042         st->state = TCP_SEQ_STATE_LISTENING;
2043         st->num = 0;
2044         st->bucket = 0;
2045         st->offset = 0;
2046         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2047
2048 out:
2049         st->last_pos = *pos;
2050         return rc;
2051 }
2052
2053 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2054 {
2055         struct tcp_iter_state *st = seq->private;
2056         void *rc = NULL;
2057
2058         if (v == SEQ_START_TOKEN) {
2059                 rc = tcp_get_idx(seq, 0);
2060                 goto out;
2061         }
2062
2063         switch (st->state) {
2064         case TCP_SEQ_STATE_LISTENING:
2065                 rc = listening_get_next(seq, v);
2066                 if (!rc) {
2067                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2068                         st->bucket = 0;
2069                         st->offset = 0;
2070                         rc        = established_get_first(seq);
2071                 }
2072                 break;
2073         case TCP_SEQ_STATE_ESTABLISHED:
2074                 rc = established_get_next(seq, v);
2075                 break;
2076         }
2077 out:
2078         ++*pos;
2079         st->last_pos = *pos;
2080         return rc;
2081 }
2082
2083 static void tcp_seq_stop(struct seq_file *seq, void *v)
2084 {
2085         struct tcp_iter_state *st = seq->private;
2086
2087         switch (st->state) {
2088         case TCP_SEQ_STATE_LISTENING:
2089                 if (v != SEQ_START_TOKEN)
2090                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2091                 break;
2092         case TCP_SEQ_STATE_ESTABLISHED:
2093                 if (v)
2094                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2095                 break;
2096         }
2097 }
2098
2099 int tcp_seq_open(struct inode *inode, struct file *file)
2100 {
2101         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2102         struct tcp_iter_state *s;
2103         int err;
2104
2105         err = seq_open_net(inode, file, &afinfo->seq_ops,
2106                           sizeof(struct tcp_iter_state));
2107         if (err < 0)
2108                 return err;
2109
2110         s = ((struct seq_file *)file->private_data)->private;
2111         s->family               = afinfo->family;
2112         s->last_pos             = 0;
2113         return 0;
2114 }
2115 EXPORT_SYMBOL(tcp_seq_open);
2116
2117 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2118 {
2119         int rc = 0;
2120         struct proc_dir_entry *p;
2121
2122         afinfo->seq_ops.start           = tcp_seq_start;
2123         afinfo->seq_ops.next            = tcp_seq_next;
2124         afinfo->seq_ops.stop            = tcp_seq_stop;
2125
2126         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2127                              afinfo->seq_fops, afinfo);
2128         if (!p)
2129                 rc = -ENOMEM;
2130         return rc;
2131 }
2132 EXPORT_SYMBOL(tcp_proc_register);
2133
2134 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2135 {
2136         remove_proc_entry(afinfo->name, net->proc_net);
2137 }
2138 EXPORT_SYMBOL(tcp_proc_unregister);
2139
2140 static void get_openreq4(const struct request_sock *req,
2141                          struct seq_file *f, int i)
2142 {
2143         const struct inet_request_sock *ireq = inet_rsk(req);
2144         long delta = req->rsk_timer.expires - jiffies;
2145
2146         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2147                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2148                 i,
2149                 ireq->ir_loc_addr,
2150                 ireq->ir_num,
2151                 ireq->ir_rmt_addr,
2152                 ntohs(ireq->ir_rmt_port),
2153                 TCP_SYN_RECV,
2154                 0, 0, /* could print option size, but that is af dependent. */
2155                 1,    /* timers active (only the expire timer) */
2156                 jiffies_delta_to_clock_t(delta),
2157                 req->num_timeout,
2158                 from_kuid_munged(seq_user_ns(f),
2159                                  sock_i_uid(req->rsk_listener)),
2160                 0,  /* non standard timer */
2161                 0, /* open_requests have no inode */
2162                 0,
2163                 req);
2164 }
2165
2166 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2167 {
2168         int timer_active;
2169         unsigned long timer_expires;
2170         const struct tcp_sock *tp = tcp_sk(sk);
2171         const struct inet_connection_sock *icsk = inet_csk(sk);
2172         const struct inet_sock *inet = inet_sk(sk);
2173         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2174         __be32 dest = inet->inet_daddr;
2175         __be32 src = inet->inet_rcv_saddr;
2176         __u16 destp = ntohs(inet->inet_dport);
2177         __u16 srcp = ntohs(inet->inet_sport);
2178         int rx_queue;
2179         int state;
2180
2181         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2182             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2183             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2184                 timer_active    = 1;
2185                 timer_expires   = icsk->icsk_timeout;
2186         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2187                 timer_active    = 4;
2188                 timer_expires   = icsk->icsk_timeout;
2189         } else if (timer_pending(&sk->sk_timer)) {
2190                 timer_active    = 2;
2191                 timer_expires   = sk->sk_timer.expires;
2192         } else {
2193                 timer_active    = 0;
2194                 timer_expires = jiffies;
2195         }
2196
2197         state = sk_state_load(sk);
2198         if (state == TCP_LISTEN)
2199                 rx_queue = sk->sk_ack_backlog;
2200         else
2201                 /* Because we don't lock the socket,
2202                  * we might find a transient negative value.
2203                  */
2204                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2205
2206         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2207                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2208                 i, src, srcp, dest, destp, state,
2209                 tp->write_seq - tp->snd_una,
2210                 rx_queue,
2211                 timer_active,
2212                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2213                 icsk->icsk_retransmits,
2214                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2215                 icsk->icsk_probes_out,
2216                 sock_i_ino(sk),
2217                 atomic_read(&sk->sk_refcnt), sk,
2218                 jiffies_to_clock_t(icsk->icsk_rto),
2219                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2220                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2221                 tp->snd_cwnd,
2222                 state == TCP_LISTEN ?
2223                     fastopenq->max_qlen :
2224                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2225 }
2226
2227 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2228                                struct seq_file *f, int i)
2229 {
2230         long delta = tw->tw_timer.expires - jiffies;
2231         __be32 dest, src;
2232         __u16 destp, srcp;
2233
2234         dest  = tw->tw_daddr;
2235         src   = tw->tw_rcv_saddr;
2236         destp = ntohs(tw->tw_dport);
2237         srcp  = ntohs(tw->tw_sport);
2238
2239         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2240                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2241                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2242                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2243                 atomic_read(&tw->tw_refcnt), tw);
2244 }
2245
2246 #define TMPSZ 150
2247
2248 static int tcp4_seq_show(struct seq_file *seq, void *v)
2249 {
2250         struct tcp_iter_state *st;
2251         struct sock *sk = v;
2252
2253         seq_setwidth(seq, TMPSZ - 1);
2254         if (v == SEQ_START_TOKEN) {
2255                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2256                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2257                            "inode");
2258                 goto out;
2259         }
2260         st = seq->private;
2261
2262         if (sk->sk_state == TCP_TIME_WAIT)
2263                 get_timewait4_sock(v, seq, st->num);
2264         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2265                 get_openreq4(v, seq, st->num);
2266         else
2267                 get_tcp4_sock(v, seq, st->num);
2268 out:
2269         seq_pad(seq, '\n');
2270         return 0;
2271 }
2272
2273 static const struct file_operations tcp_afinfo_seq_fops = {
2274         .owner   = THIS_MODULE,
2275         .open    = tcp_seq_open,
2276         .read    = seq_read,
2277         .llseek  = seq_lseek,
2278         .release = seq_release_net
2279 };
2280
2281 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2282         .name           = "tcp",
2283         .family         = AF_INET,
2284         .seq_fops       = &tcp_afinfo_seq_fops,
2285         .seq_ops        = {
2286                 .show           = tcp4_seq_show,
2287         },
2288 };
2289
2290 static int __net_init tcp4_proc_init_net(struct net *net)
2291 {
2292         return tcp_proc_register(net, &tcp4_seq_afinfo);
2293 }
2294
2295 static void __net_exit tcp4_proc_exit_net(struct net *net)
2296 {
2297         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2298 }
2299
2300 static struct pernet_operations tcp4_net_ops = {
2301         .init = tcp4_proc_init_net,
2302         .exit = tcp4_proc_exit_net,
2303 };
2304
2305 int __init tcp4_proc_init(void)
2306 {
2307         return register_pernet_subsys(&tcp4_net_ops);
2308 }
2309
2310 void tcp4_proc_exit(void)
2311 {
2312         unregister_pernet_subsys(&tcp4_net_ops);
2313 }
2314 #endif /* CONFIG_PROC_FS */
2315
2316 struct proto tcp_prot = {
2317         .name                   = "TCP",
2318         .owner                  = THIS_MODULE,
2319         .close                  = tcp_close,
2320         .connect                = tcp_v4_connect,
2321         .disconnect             = tcp_disconnect,
2322         .accept                 = inet_csk_accept,
2323         .ioctl                  = tcp_ioctl,
2324         .init                   = tcp_v4_init_sock,
2325         .destroy                = tcp_v4_destroy_sock,
2326         .shutdown               = tcp_shutdown,
2327         .setsockopt             = tcp_setsockopt,
2328         .getsockopt             = tcp_getsockopt,
2329         .recvmsg                = tcp_recvmsg,
2330         .sendmsg                = tcp_sendmsg,
2331         .sendpage               = tcp_sendpage,
2332         .backlog_rcv            = tcp_v4_do_rcv,
2333         .release_cb             = tcp_release_cb,
2334         .hash                   = inet_hash,
2335         .unhash                 = inet_unhash,
2336         .get_port               = inet_csk_get_port,
2337         .enter_memory_pressure  = tcp_enter_memory_pressure,
2338         .stream_memory_free     = tcp_stream_memory_free,
2339         .sockets_allocated      = &tcp_sockets_allocated,
2340         .orphan_count           = &tcp_orphan_count,
2341         .memory_allocated       = &tcp_memory_allocated,
2342         .memory_pressure        = &tcp_memory_pressure,
2343         .sysctl_mem             = sysctl_tcp_mem,
2344         .sysctl_wmem            = sysctl_tcp_wmem,
2345         .sysctl_rmem            = sysctl_tcp_rmem,
2346         .max_header             = MAX_TCP_HEADER,
2347         .obj_size               = sizeof(struct tcp_sock),
2348         .slab_flags             = SLAB_DESTROY_BY_RCU,
2349         .twsk_prot              = &tcp_timewait_sock_ops,
2350         .rsk_prot               = &tcp_request_sock_ops,
2351         .h.hashinfo             = &tcp_hashinfo,
2352         .no_autobind            = true,
2353 #ifdef CONFIG_COMPAT
2354         .compat_setsockopt      = compat_tcp_setsockopt,
2355         .compat_getsockopt      = compat_tcp_getsockopt,
2356 #endif
2357         .diag_destroy           = tcp_abort,
2358 };
2359 EXPORT_SYMBOL(tcp_prot);
2360
2361 static void __net_exit tcp_sk_exit(struct net *net)
2362 {
2363         int cpu;
2364
2365         for_each_possible_cpu(cpu)
2366                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2367         free_percpu(net->ipv4.tcp_sk);
2368 }
2369
2370 static int __net_init tcp_sk_init(struct net *net)
2371 {
2372         int res, cpu;
2373
2374         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2375         if (!net->ipv4.tcp_sk)
2376                 return -ENOMEM;
2377
2378         for_each_possible_cpu(cpu) {
2379                 struct sock *sk;
2380
2381                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2382                                            IPPROTO_TCP, net);
2383                 if (res)
2384                         goto fail;
2385                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2386                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2387         }
2388
2389         net->ipv4.sysctl_tcp_ecn = 2;
2390         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2391
2392         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2393         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2394         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2395
2396         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2397         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2398         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2399
2400         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2401         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2402         net->ipv4.sysctl_tcp_syncookies = 1;
2403         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2404         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2405         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2406         net->ipv4.sysctl_tcp_orphan_retries = 0;
2407         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2408         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2409
2410         return 0;
2411 fail:
2412         tcp_sk_exit(net);
2413
2414         return res;
2415 }
2416
2417 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2418 {
2419         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2420 }
2421
2422 static struct pernet_operations __net_initdata tcp_sk_ops = {
2423        .init       = tcp_sk_init,
2424        .exit       = tcp_sk_exit,
2425        .exit_batch = tcp_sk_exit_batch,
2426 };
2427
2428 void __init tcp_v4_init(void)
2429 {
2430         inet_hashinfo_init(&tcp_hashinfo);
2431         if (register_pernet_subsys(&tcp_sk_ops))
2432                 panic("Failed to create the TCP control socket.\n");
2433 }