Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

@@ -21,11 +21,11 @@
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2011 Joyent, Inc.  All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
 #include <sys/strlog.h>
 #include <sys/strsun.h>

@@ -592,11 +592,11 @@
                 tcp->tcp_rack_cur_max = 2;
         }
         mp = tcp_ack_mp(tcp);
 
         if (mp != NULL) {
-                BUMP_LOCAL(tcp->tcp_obsegs);
+                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                 TCPS_BUMP_MIB(tcps, tcpOutAck);
                 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
                 tcp_send_data(tcp, mp);
         }
 }

@@ -754,14 +754,18 @@
                         clock_t time_to_wait;
 
                         TCPS_BUMP_MIB(tcps, tcpTimRetrans);
                         if (!tcp->tcp_xmit_head)
                                 break;
-                        time_to_wait = ddi_get_lbolt() -
-                            (clock_t)tcp->tcp_xmit_head->b_prev;
-                        time_to_wait = tcp->tcp_rto -
-                            TICK_TO_MSEC(time_to_wait);
+#ifdef KERNEL_32
+                        time_to_wait = TICK_TO_MSEC(ddi_get_lbolt() -
+                            (clock_t)tcp->tcp_xmit_head->b_prev);
+#else
+                        time_to_wait = NSEC2MSEC(gethrtime() -
+                            (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
+#endif
+                        time_to_wait = tcp->tcp_rto - time_to_wait;
                         /*
                          * If the timer fires too early, 1 clock tick earlier,
                          * restart the timer.
                          */
                         if (time_to_wait > msec_per_tick) {

@@ -783,41 +787,12 @@
                                 if (connp->conn_debug) {
                                         (void) strlog(TCP_MOD_ID, 0, 1,
                                             SL_TRACE, "tcp_timer: zero win");
                                 }
                         } else {
-                                /*
-                                 * After retransmission, we need to do
-                                 * slow start.  Set the ssthresh to one
-                                 * half of current effective window and
-                                 * cwnd to one MSS.  Also reset
-                                 * tcp_cwnd_cnt.
-                                 *
-                                 * Note that if tcp_ssthresh is reduced because
-                                 * of ECN, do not reduce it again unless it is
-                                 * already one window of data away (tcp_cwr
-                                 * should then be cleared) or this is a
-                                 * timeout for a retransmitted segment.
-                                 */
-                                uint32_t npkt;
-
-                                if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
-                                        npkt = ((tcp->tcp_timer_backoff ?
-                                            tcp->tcp_cwnd_ssthresh :
-                                            tcp->tcp_snxt -
-                                            tcp->tcp_suna) >> 1) / tcp->tcp_mss;
-                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
-                                            tcp->tcp_mss;
+                                cc_cong_signal(tcp, NULL, CC_RTO);
                                 }
-                                tcp->tcp_cwnd = tcp->tcp_mss;
-                                tcp->tcp_cwnd_cnt = 0;
-                                if (tcp->tcp_ecn_ok) {
-                                        tcp->tcp_cwr = B_TRUE;
-                                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
-                                        tcp->tcp_ecn_cwr_sent = B_FALSE;
-                                }
-                        }
                         break;
                 }
                 /*
                  * We have something to send yet we cannot send.  The
                  * reason can be:

@@ -852,10 +827,11 @@
                         if (tcp->tcp_swnd == 0) {
                                 /* Extend window for zero window probe */
                                 tcp->tcp_swnd++;
                                 tcp->tcp_zero_win_probe = B_TRUE;
                                 TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
+                                tcp->tcp_cs.tcp_out_zwnd_probes++;
                         } else {
                                 /*
                                  * Handle timeout from sender SWS avoidance.
                                  * Reset our knowledge of the max send window
                                  * since the receiver might have reduced its

@@ -1010,23 +986,21 @@
                  * tcp_rtt_update so that we won't accidentally cache a
                  * bad value.  But only do this if this is not a zero
                  * window probe.
                  */
                 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
-                        tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
-                            (tcp->tcp_rtt_sa >> 5);
+                        tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
+                            (tcp->tcp_rtt_sa >> 5);
                         tcp->tcp_rtt_sa = 0;
                         tcp_ip_notify(tcp);
                         tcp->tcp_rtt_update = 0;
                 }
         }
 
 timer_rexmit:
         tcp->tcp_timer_backoff++;
-        if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
-            tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
-            tcp->tcp_rto_min) {
+        if ((ms = tcp_calculate_rto(tcp, tcps)) < tcp->tcp_rto_min) {
                 /*
                  * This means the original RTO is tcp_rexmit_interval_min.
                  * So we will use tcp_rexmit_interval_min as the RTO value
                  * and do the backoff.
                  */

@@ -1057,12 +1031,17 @@
         if (mss > tcp->tcp_mss)
                 mss = tcp->tcp_mss;
         if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
                 mss = tcp->tcp_swnd;
 
-        if ((mp = tcp->tcp_xmit_head) != NULL)
+        if ((mp = tcp->tcp_xmit_head) != NULL) {
+#ifdef KERNEL_32
                 mp->b_prev = (mblk_t *)ddi_get_lbolt();
+#else
+                mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+#endif
+        }
         mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
             B_TRUE);
 
         /*
          * When slow start after retransmission begins, start with

@@ -1089,10 +1068,12 @@
         }
 
         tcp->tcp_csuna = tcp->tcp_snxt;
         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
+        tcp->tcp_cs.tcp_out_retrans_segs++;
+        tcp->tcp_cs.tcp_out_retrans_bytes += mss;
         tcp_send_data(tcp, mp);
 
 }
 
 /*