Merge branch 'tcp-more-reliable-window-probes'
author: David S. Miller <davem@davemloft.net>
Sat, 9 May 2015 20:42:32 +0000 (16:42 -0400)
committer: David S. Miller <davem@davemloft.net>
Sat, 9 May 2015 20:42:32 +0000 (16:42 -0400)
Eric Dumazet says:

====================
tcp: more reliable window probes

This series addresses a problem caused by small rto_min timers in data
centers (DC), leading to either timer storms or early flow terminations.

We also add two new SNMP counters for proper monitoring:
TCPWinProbe and TCPKeepAlive

v2: added TCPKeepAlive counter, as suggested by Yuchung & Neal
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/tcp.h
include/uapi/linux/snmp.h
net/ipv4/proc.c
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c

index 6d204f3f9df8cafb82d856db08769a7d24dfd79e..b8ea12880fd960d2141d788402e50b940a191380 100644 (file)
@@ -527,7 +527,7 @@ int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
-int tcp_write_wakeup(struct sock *);
+int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
 void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 int tcp_send_synack(struct sock *);
@@ -1043,14 +1043,31 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
        return tp->is_cwnd_limited;
 }
 
-static inline void tcp_check_probe_timer(struct sock *sk)
+/* Something is really bad, we could not queue an additional packet,
+ * because qdisc is full or receiver sent a 0 window.
+ * We do not want to add fuel to the fire, or abort too early,
+ * so make sure the timer we arm now is at least 200ms in the future,
+ * regardless of current icsk_rto value (as it could be ~2ms)
+ */
+static inline unsigned long tcp_probe0_base(const struct sock *sk)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
-       const struct inet_connection_sock *icsk = inet_csk(sk);
+       return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
+}
 
-       if (!tp->packets_out && !icsk->icsk_pending)
+/* Variant of inet_csk_rto_backoff() used for zero window probes */
+static inline unsigned long tcp_probe0_when(const struct sock *sk,
+                                           unsigned long max_when)
+{
+       u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;
+
+       return (unsigned long)min_t(u64, when, max_when);
+}
+
+static inline void tcp_check_probe_timer(struct sock *sk)
+{
+       if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-                                         icsk->icsk_rto, TCP_RTO_MAX);
+                                         tcp_probe0_base(sk), TCP_RTO_MAX);
 }
 
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
index 6a6fb747c78db0bfb763212c826832f206107a0d..eee8968407f063b5d9c4776a30ebe45e5b782f2d 100644 (file)
@@ -276,6 +276,8 @@ enum
        LINUX_MIB_TCPACKSKIPPEDFINWAIT2,        /* TCPACKSkippedFinWait2 */
        LINUX_MIB_TCPACKSKIPPEDTIMEWAIT,        /* TCPACKSkippedTimeWait */
        LINUX_MIB_TCPACKSKIPPEDCHALLENGE,       /* TCPACKSkippedChallenge */
+       LINUX_MIB_TCPWINPROBE,                  /* TCPWinProbe */
+       LINUX_MIB_TCPKEEPALIVE,                 /* TCPKeepAlive */
        __LINUX_MIB_MAX
 };
 
index e1f3b911dd1e3739a63e38b63a1b9a7b29bfd7f0..da5d483e236ac1e37b631c6091219fbefbe497b4 100644 (file)
@@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = {
        SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
        SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
        SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+       SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+       SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
        SNMP_MIB_SENTINEL
 };
 
index df2ca615cd0c2221b439a921aeb28bdf2317b85c..cf8b20ff665852a370bfd5ed6adb51143546bf6b 100644 (file)
@@ -3233,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk)
                 * This function is not for random using!
                 */
        } else {
-               unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+               unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
 
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
                                          when, TCP_RTO_MAX);
index a369e8a70b2c775bfee94d7f329ee892c2cdc895..7386d32cd670be48e69c4bac0ff4d6e3d8689d81 100644 (file)
@@ -3382,7 +3382,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
  * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
  * out-of-date with SND.UNA-1 to probe window.
  */
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -3400,6 +3400,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
         */
        tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
        skb_mstamp_get(&skb->skb_mstamp);
+       NET_INC_STATS_BH(sock_net(sk), mib);
        return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
@@ -3407,12 +3408,12 @@ void tcp_send_window_probe(struct sock *sk)
 {
        if (sk->sk_state == TCP_ESTABLISHED) {
                tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
-               tcp_xmit_probe_skb(sk, 0);
+               tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
        }
 }
 
 /* Initiate keepalive or window probe from timer. */
-int tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk, int mib)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -3449,8 +3450,8 @@ int tcp_write_wakeup(struct sock *sk)
                return err;
        } else {
                if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
-                       tcp_xmit_probe_skb(sk, 1);
-               return tcp_xmit_probe_skb(sk, 0);
+                       tcp_xmit_probe_skb(sk, 1, mib);
+               return tcp_xmit_probe_skb(sk, 0, mib);
        }
 }
 
@@ -3464,7 +3465,7 @@ void tcp_send_probe0(struct sock *sk)
        unsigned long probe_max;
        int err;
 
-       err = tcp_write_wakeup(sk);
+       err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
        if (tp->packets_out || !tcp_send_head(sk)) {
                /* Cancel probe timer, if it is not required. */
@@ -3490,7 +3491,7 @@ void tcp_send_probe0(struct sock *sk)
                probe_max = TCP_RESOURCE_PROBE_INTERVAL;
        }
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-                                 inet_csk_rto_backoff(icsk, probe_max),
+                                 tcp_probe0_when(sk, probe_max),
                                  TCP_RTO_MAX);
 }
 
index 8c65dc147d8bcfb58e14c20b774711ffbcc30d5a..65bf670e87143667b3e04c9cf48d8b9455c18049 100644 (file)
@@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data)
                        tcp_write_err(sk);
                        goto out;
                }
-               if (tcp_write_wakeup(sk) <= 0) {
+               if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
                        icsk->icsk_probes_out++;
                        elapsed = keepalive_intvl_when(tp);
                } else {
This page took 0.031844 seconds and 5 git commands to generate.