Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics
@@ -21,11 +21,11 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#include <sys/types.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
@@ -592,11 +592,11 @@
tcp->tcp_rack_cur_max = 2;
}
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
tcp_send_data(tcp, mp);
}
}
@@ -754,14 +754,18 @@
clock_t time_to_wait;
TCPS_BUMP_MIB(tcps, tcpTimRetrans);
if (!tcp->tcp_xmit_head)
break;
- time_to_wait = ddi_get_lbolt() -
- (clock_t)tcp->tcp_xmit_head->b_prev;
- time_to_wait = tcp->tcp_rto -
- TICK_TO_MSEC(time_to_wait);
+#ifdef KERNEL_32
+ time_to_wait = TICK_TO_MSEC(ddi_get_lbolt() -
+ (clock_t)tcp->tcp_xmit_head->b_prev);
+#else
+ time_to_wait = NSEC2MSEC(gethrtime() -
+ (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
+#endif
+ time_to_wait = tcp->tcp_rto - time_to_wait;
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
*/
if (time_to_wait > msec_per_tick) {
@@ -783,41 +787,12 @@
if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE, "tcp_timer: zero win");
}
} else {
- /*
- * After retransmission, we need to do
- * slow start. Set the ssthresh to one
- * half of current effective window and
- * cwnd to one MSS. Also reset
- * tcp_cwnd_cnt.
- *
- * Note that if tcp_ssthresh is reduced because
- * of ECN, do not reduce it again unless it is
- * already one window of data away (tcp_cwr
- * should then be cleared) or this is a
- * timeout for a retransmitted segment.
- */
- uint32_t npkt;
-
- if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
- npkt = ((tcp->tcp_timer_backoff ?
- tcp->tcp_cwnd_ssthresh :
- tcp->tcp_snxt -
- tcp->tcp_suna) >> 1) / tcp->tcp_mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
- tcp->tcp_mss;
+ cc_cong_signal(tcp, NULL, CC_RTO);
}
- tcp->tcp_cwnd = tcp->tcp_mss;
- tcp->tcp_cwnd_cnt = 0;
- if (tcp->tcp_ecn_ok) {
- tcp->tcp_cwr = B_TRUE;
- tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
- }
- }
break;
}
/*
* We have something to send yet we cannot send. The
* reason can be:
@@ -852,10 +827,11 @@
if (tcp->tcp_swnd == 0) {
/* Extend window for zero window probe */
tcp->tcp_swnd++;
tcp->tcp_zero_win_probe = B_TRUE;
TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
+ tcp->tcp_cs.tcp_out_zwnd_probes++;
} else {
/*
* Handle timeout from sender SWS avoidance.
* Reset our knowledge of the max send window
* since the receiver might have reduced its
@@ -1010,23 +986,21 @@
* tcp_rtt_update so that we won't accidentally cache a
* bad value. But only do this if this is not a zero
* window probe.
*/
if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
- tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
- (tcp->tcp_rtt_sa >> 5);
+ tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
+ (tcp->tcp_rtt_sa >> 5);
tcp->tcp_rtt_sa = 0;
tcp_ip_notify(tcp);
tcp->tcp_rtt_update = 0;
}
}
timer_rexmit:
tcp->tcp_timer_backoff++;
- if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
- tcp->tcp_rto_min) {
+ if ((ms = tcp_calculate_rto(tcp, tcps)) < tcp->tcp_rto_min) {
/*
* This means the original RTO is tcp_rexmit_interval_min.
* So we will use tcp_rexmit_interval_min as the RTO value
* and do the backoff.
*/
@@ -1057,12 +1031,17 @@
if (mss > tcp->tcp_mss)
mss = tcp->tcp_mss;
if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
mss = tcp->tcp_swnd;
- if ((mp = tcp->tcp_xmit_head) != NULL)
+ if ((mp = tcp->tcp_xmit_head) != NULL) {
+#ifdef KERNEL_32
mp->b_prev = (mblk_t *)ddi_get_lbolt();
+#else
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+#endif
+ }
mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
B_TRUE);
/*
* When slow start after retransmission begins, start with
@@ -1089,10 +1068,12 @@
}
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += mss;
tcp_send_data(tcp, mp);
}
/*