Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d377f4854cb853c454541062fb6f8438608d470c..432c36649db3dd8d579ddb05ae886dd3251dcb66 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
                        tcp_enter_quickack_mode((struct sock *)tp);
                break;
        case INET_ECN_CE:
-               tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+               if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+                       /* Better not delay acks, sender can have a very low cwnd */
+                       tcp_enter_quickack_mode((struct sock *)tp);
+                       tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+               }
                /* fall through */
        default:
                tp->ecn_flags |= TCP_ECN_SEEN;
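The hunk above changes the receiver side of ECN: only the first CE-marked segment of an episode forces the connection into quickack mode, since the sender may be running with a very small cwnd while it waits for the ECE feedback. A minimal user-space sketch of that behaviour follows; the struct, flag value and function name are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define DEMAND_CWR 0x1                  /* stand-in for TCP_ECN_DEMAND_CWR */

struct rx_state { unsigned int ecn_flags; };

/* Returns true when a CE mark should force an immediate (quick) ACK. */
static bool ce_forces_quickack(struct rx_state *st)
{
        if (st->ecn_flags & DEMAND_CWR)
                return false;           /* ECE already pending for the sender */
        st->ecn_flags |= DEMAND_CWR;    /* first CE of this episode */
        return true;                    /* don't delay the ACK carrying ECE */
}

int main(void)
{
        struct rx_state st = { 0 };

        for (int seg = 1; seg <= 3; seg++)      /* three CE-marked segments */
                printf("CE #%d -> quickack: %d\n", seg, ce_forces_quickack(&st));
        /* Prints 1, 0, 0: only the first CE triggers quickack mode.
         * In the kernel the flag is cleared once the sender's CWR is seen.
         */
        return 0;
}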
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       const struct inet_connection_sock *icsk = inet_csk(sk);
-
-       tp->prior_ssthresh = 0;
-       tp->bytes_acked = 0;
-       if (icsk->icsk_ca_state < TCP_CA_CWR) {
-               tp->undo_marker = 0;
-               if (set_ssthresh)
-                       tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-               tp->snd_cwnd = min(tp->snd_cwnd,
-                                  tcp_packets_in_flight(tp) + 1U);
-               tp->snd_cwnd_cnt = 0;
-               tp->high_seq = tp->snd_nxt;
-               tp->snd_cwnd_stamp = tcp_time_stamp;
-               TCP_ECN_queue_cwr(tp);
-
-               tcp_set_ca_state(sk, TCP_CA_CWR);
-       }
-}
-
 /*
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
        tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
-       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
-       return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk, int flag)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       int decr = tp->snd_cwnd_cnt + 1;
-
-       if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
-           (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
-               tp->snd_cwnd_cnt = decr & 1;
-               decr >>= 1;
-
-               if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
-                       tp->snd_cwnd -= decr;
-
-               tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-               tp->snd_cwnd_stamp = tcp_time_stamp;
-       }
-}
-
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
        return false;
 }
 
-static inline void tcp_complete_cwr(struct sock *sk)
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the number of packets in flight is larger than ssthresh, PRR
+ *      spreads the cwnd reductions across a full RTT.
+ *   2) If the number of packets in flight is lower than ssthresh (such as
+ *      due to excess losses and/or application stalls), do not perform any
+ *      further cwnd reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
-       /* Do not moderate cwnd if it's already undone in cwr or recovery. */
-       if (tp->undo_marker) {
-               if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
-                       tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-                       tp->snd_cwnd_stamp = tcp_time_stamp;
-               } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
-                       /* PRR algorithm. */
-                       tp->snd_cwnd = tp->snd_ssthresh;
-                       tp->snd_cwnd_stamp = tcp_time_stamp;
-               }
+       tp->high_seq = tp->snd_nxt;
+       tp->bytes_acked = 0;
+       tp->snd_cwnd_cnt = 0;
+       tp->prior_cwnd = tp->snd_cwnd;
+       tp->prr_delivered = 0;
+       tp->prr_out = 0;
+       if (set_ssthresh)
+               tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+       TCP_ECN_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+                              int fast_rexmit)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       int sndcnt = 0;
+       int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+       tp->prr_delivered += newly_acked_sacked;
+       if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+               u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+                              tp->prior_cwnd - 1;
+               sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+       } else {
+               sndcnt = min_t(int, delta,
+                              max_t(int, tp->prr_delivered - tp->prr_out,
+                                    newly_acked_sacked) + 1);
+       }
+
+       sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+       tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
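To make the arithmetic in tcp_cwnd_reduction() concrete, here is a small user-space sketch (not kernel code; all numbers are made up) that walks one RTT worth of ACKs through the reduction. Each ACK is assumed to deliver exactly one segment, the episode starts from prior_cwnd = 20 with ssthresh = 10, and for simplicity only the proportional (first) branch is exercised.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint32_t prior_cwnd = 20, ssthresh = 10;  /* hypothetical */
        uint32_t in_flight = prior_cwnd;
        uint32_t prr_delivered = 0, prr_out = 0;

        for (int ack = 1; ack <= (int)prior_cwnd; ack++) {
                prr_delivered++;        /* one segment newly delivered */
                in_flight--;            /* ... so it has left the network */

                /* ceil(ssthresh * delivered / prior_cwnd) - already sent */
                uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
                                    prior_cwnd - 1;
                int sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
                if (sndcnt < 0)
                        sndcnt = 0;     /* the kernel clamps with max() */

                prr_out += sndcnt;
                in_flight += sndcnt;    /* assume the sndcnt segments go out now */
                printf("ack %2d: sndcnt=%d cwnd=%u\n", ack, sndcnt, in_flight);
        }
        /* Ends with in_flight == ssthresh (10): the cut from 20 to 10 is
         * spread across the whole RTT, roughly one withheld send per two
         * ACKs, instead of being applied in one step.  The second branch
         * (the min_t/max_t expression above) instead slow-starts back up
         * toward ssthresh once packets in flight have fallen below it.
         */
        return 0;
}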
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+       if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+           (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+               tp->snd_cwnd = tp->snd_ssthresh;
+               tp->snd_cwnd_stamp = tcp_time_stamp;
        }
        tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       tp->prior_ssthresh = 0;
+       tp->bytes_acked = 0;
+       if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+               tp->undo_marker = 0;
+               tcp_init_cwnd_reduction(sk, set_ssthresh);
+               tcp_set_ca_state(sk, TCP_CA_CWR);
+       }
+}
+
 static void tcp_try_keep_open(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
        }
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
                if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
                        tcp_moderate_cwnd(tp);
        } else {
-               tcp_cwnd_down(sk, flag);
+               tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
        }
 }
 
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-/* This function implements the PRR algorithm, specifcally the PRR-SSRB
- * (proportional rate reduction with slow start reduction bound) as described in
- * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
- * It computes the number of packets to send (sndcnt) based on packets newly
- * delivered:
- *   1) If the packets in flight is larger than ssthresh, PRR spreads the
- *     cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *     losses and/or application stalls), do not perform any further cwnd
- *     reductions, but instead slow start up to ssthresh.
- */
-static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
-                                       int fast_rexmit, int flag)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       int sndcnt = 0;
-       int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-
-       if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
-               u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
-                              tp->prior_cwnd - 1;
-               sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-       } else {
-               sndcnt = min_t(int, delta,
-                              max_t(int, tp->prr_delivered - tp->prr_out,
-                                    newly_acked_sacked) + 1);
-       }
-
-       sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
-       tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
-}
-
 static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
        NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-       tp->high_seq = tp->snd_nxt;
        tp->prior_ssthresh = 0;
        tp->undo_marker = tp->snd_una;
        tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                if (!ece_ack)
                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
-               tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-               TCP_ECN_queue_cwr(tp);
+               tcp_init_cwnd_reduction(sk, true);
        }
-
-       tp->bytes_acked = 0;
-       tp->snd_cwnd_cnt = 0;
-       tp->prior_cwnd = tp->snd_cwnd;
-       tp->prr_delivered = 0;
-       tp->prr_out = 0;
        tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
@@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
                        /* CWR is to be held something *above* high_seq
                         * is ACKed for CWR bit to reach receiver. */
                        if (tp->snd_una != tp->high_seq) {
-                               tcp_complete_cwr(sk);
+                               tcp_end_cwnd_reduction(sk);
                                tcp_set_ca_state(sk, TCP_CA_Open);
                        }
                        break;
@@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
                                tcp_reset_reno_sack(tp);
                        if (tcp_try_undo_recovery(sk))
                                return;
-                       tcp_complete_cwr(sk);
+                       tcp_end_cwnd_reduction(sk);
                        break;
                }
        }
@@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
                        tcp_try_undo_dsack(sk);
 
                if (!tcp_time_to_recover(sk, flag)) {
-                       tcp_try_to_open(sk, flag);
+                       tcp_try_to_open(sk, flag, newly_acked_sacked);
                        return;
                }
 
@@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 
        if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
                tcp_update_scoreboard(sk, fast_rexmit);
-       tp->prr_delivered += newly_acked_sacked;
-       tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
+       tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
        tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
+       /* If the retrans timer is currently being used by Fast Open
+        * for SYN-ACK retrans purpose, stay put.
+        */
+       if (tp->fastopen_rsk)
+               return;
+
        if (!tp->packets_out) {
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
        } else {
@@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-               !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+               !tcp_in_cwnd_reduction(sk);
 }
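tcp_in_cwnd_reduction() is not defined in this file. Judging by the open-coded state test it replaces on the removed line above, it is presumably a small inline helper (likely in include/net/tcp.h, which is outside this diff) along these lines:

/* Sketch only, reconstructed from the removed check; the real definition
 * lives outside this diff.
 */
static inline bool tcp_in_cwnd_reduction(const struct sock *sk)
{
        return (TCPF_CA_CWR | TCPF_CA_Recovery) &
               (1 << inet_csk(sk)->icsk_ca_state);
}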
 
 /* Check that window update is acceptable.
@@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
 }
 
 /* A conservative spurious RTO response algorithm: reduce cwnd using
- * rate halving and continue in congestion avoidance.
+ * PRR and continue in congestion avoidance.
  */
-static void tcp_ratehalving_spur_to_response(struct sock *sk)
+static void tcp_cwr_spur_to_response(struct sock *sk)
 {
        tcp_enter_cwr(sk, 0);
 }
@@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
 static void tcp_undo_spur_to_response(struct sock *sk, int flag)
 {
        if (flag & FLAG_ECE)
-               tcp_ratehalving_spur_to_response(sk);
+               tcp_cwr_spur_to_response(sk);
        else
                tcp_undo_cwr(sk, true);
 }
@@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
                        tcp_conservative_spur_to_response(tp);
                        break;
                default:
-                       tcp_ratehalving_spur_to_response(sk);
+                       tcp_cwr_spur_to_response(sk);
                        break;
                }
                tp->frto_counter = 0;
@@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
        /* We want the right error as BSD sees it (and indeed as we do). */
        switch (sk->sk_state) {
@@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
                TCP_ECN_rcv_synack(tp, th);
 
-               tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+               tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                tcp_ack(sk, skb, FLAG_SLOWPATH);
 
                /* Ok.. it's good. Set up sequence numbers and
@@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 * never scaled.
                 */
                tp->snd_wnd = ntohs(th->window);
-               tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
                if (!tp->rx_opt.wscale_ok) {
                        tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5891,7 +5863,9 @@ discard:
                tcp_send_synack(sk);
 #if 0
                /* Note, we could accept data and URG from this segment.
-                * There are no obstacles to make this.
+                * There are no obstacles to doing this (except that we must
+                * either change tcp_recvmsg() to prevent it from returning data
+                * before 3WHS completes per RFC793, or employ TCP Fast Open).
                 *
                 * However, if we ignore data in ACKless segments sometimes,
                 * we have no reasons to accept it sometimes.
@@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct request_sock *req;
        int queued = 0;
 
        tp->rx_opt.saw_tstamp = 0;
@@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                return 0;
        }
 
+       req = tp->fastopen_rsk;
+       if (req != NULL) {
+               BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+                   sk->sk_state != TCP_FIN_WAIT1);
+
+               if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+                       goto discard;
+       }
        if (!tcp_validate_incoming(sk, skb, th, 0))
                return 0;
 
@@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                switch (sk->sk_state) {
                case TCP_SYN_RECV:
                        if (acceptable) {
-                               tp->copied_seq = tp->rcv_nxt;
+                               /* Once we leave TCP_SYN_RECV, we no longer
+                                * need req so release it.
+                                */
+                               if (req) {
+                                       tcp_synack_rtt_meas(sk, req);
+                                       tp->total_retrans = req->retrans;
+
+                                       reqsk_fastopen_remove(sk, req, false);
+                               } else {
+                                       /* Make sure socket is routed, for
+                                        * correct metrics.
+                                        */
+                                       icsk->icsk_af_ops->rebuild_header(sk);
+                                       tcp_init_congestion_control(sk);
+
+                                       tcp_mtup_init(sk);
+                                       tcp_init_buffer_space(sk);
+                                       tp->copied_seq = tp->rcv_nxt;
+                               }
                                smp_mb();
                                tcp_set_state(sk, TCP_ESTABLISHED);
                                sk->sk_state_change(sk);
@@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                if (tp->rx_opt.tstamp_ok)
                                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-                               /* Make sure socket is routed, for
-                                * correct metrics.
-                                */
-                               icsk->icsk_af_ops->rebuild_header(sk);
-
-                               tcp_init_metrics(sk);
-
-                               tcp_init_congestion_control(sk);
+                               if (req) {
+                                       /* Re-arm the timer because data may
+                                        * have been sent out. This is similar
+                                        * to the regular data transmission case
+                                        * when new data has just been ack'ed.
+                                        *
+                                        * (TFO) - we could try to be more
+                                        * aggressive and retransmit any data
+                                        * sooner based on when it was sent
+                                        * out.
+                                        */
+                                       tcp_rearm_rto(sk);
+                               } else
+                                       tcp_init_metrics(sk);
 
                                /* Prevent spurious tcp_cwnd_restart() on
                                 * first data packet.
                                 */
                                tp->lsndtime = tcp_time_stamp;
 
-                               tcp_mtup_init(sk);
                                tcp_initialize_rcv_mss(sk);
-                               tcp_init_buffer_space(sk);
                                tcp_fast_path_on(tp);
                        } else {
                                return 1;
@@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        break;
 
                case TCP_FIN_WAIT1:
+                       /* If we enter the TCP_FIN_WAIT1 state as a Fast Open
+                        * socket and this is the first acceptable ACK we have
+                        * received, it acknowledges our SYNACK, so stop the
+                        * SYNACK retransmit timer.
+                        */
+                       if (acceptable && req != NULL) {
+                               /* We no longer need the request sock. */
+                               reqsk_fastopen_remove(sk, req, false);
+                               tcp_rearm_rto(sk);
+                       }
                        if (tp->snd_una == tp->write_seq) {
                                struct dst_entry *dst;
 