Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics
*** 21,31 ****
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
! * Copyright (c) 2014 by Delphix. All rights reserved.
*/
#include <sys/types.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
--- 21,31 ----
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
! * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#include <sys/types.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
*** 592,602 ****
tcp->tcp_rack_cur_max = 2;
}
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
! BUMP_LOCAL(tcp->tcp_obsegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
tcp_send_data(tcp, mp);
}
}
--- 592,602 ----
tcp->tcp_rack_cur_max = 2;
}
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
tcp_send_data(tcp, mp);
}
}
*** 754,767 ****
clock_t time_to_wait;
TCPS_BUMP_MIB(tcps, tcpTimRetrans);
if (!tcp->tcp_xmit_head)
break;
! time_to_wait = ddi_get_lbolt() -
! (clock_t)tcp->tcp_xmit_head->b_prev;
! time_to_wait = tcp->tcp_rto -
! TICK_TO_MSEC(time_to_wait);
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
*/
if (time_to_wait > msec_per_tick) {
--- 754,771 ----
clock_t time_to_wait;
TCPS_BUMP_MIB(tcps, tcpTimRetrans);
if (!tcp->tcp_xmit_head)
break;
! #ifdef KERNEL_32
! time_to_wait = TICK_TO_MSEC(ddi_get_lbolt() -
! (clock_t)tcp->tcp_xmit_head->b_prev);
! #else
! time_to_wait = NSEC2MSEC(gethrtime() -
! (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
! #endif
! time_to_wait = tcp->tcp_rto - time_to_wait;
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
*/
if (time_to_wait > msec_per_tick) {
*** 783,823 ****
if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE, "tcp_timer: zero win");
}
} else {
! /*
! * After retransmission, we need to do
! * slow start. Set the ssthresh to one
! * half of current effective window and
! * cwnd to one MSS. Also reset
! * tcp_cwnd_cnt.
! *
! * Note that if tcp_ssthresh is reduced because
! * of ECN, do not reduce it again unless it is
! * already one window of data away (tcp_cwr
! * should then be cleared) or this is a
! * timeout for a retransmitted segment.
! */
! uint32_t npkt;
!
! if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
! npkt = ((tcp->tcp_timer_backoff ?
! tcp->tcp_cwnd_ssthresh :
! tcp->tcp_snxt -
! tcp->tcp_suna) >> 1) / tcp->tcp_mss;
! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
! tcp->tcp_mss;
}
- tcp->tcp_cwnd = tcp->tcp_mss;
- tcp->tcp_cwnd_cnt = 0;
- if (tcp->tcp_ecn_ok) {
- tcp->tcp_cwr = B_TRUE;
- tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
- }
- }
break;
}
/*
* We have something to send yet we cannot send. The
* reason can be:
--- 787,798 ----
if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE, "tcp_timer: zero win");
}
} else {
! cc_cong_signal(tcp, NULL, CC_RTO);
}
break;
}
/*
* We have something to send yet we cannot send. The
* reason can be:
*** 852,861 ****
--- 827,837 ----
if (tcp->tcp_swnd == 0) {
/* Extend window for zero window probe */
tcp->tcp_swnd++;
tcp->tcp_zero_win_probe = B_TRUE;
TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
+ tcp->tcp_cs.tcp_out_zwnd_probes++;
} else {
/*
* Handle timeout from sender SWS avoidance.
* Reset our knowledge of the max send window
* since the receiver might have reduced its
*** 1010,1032 ****
* tcp_rtt_update so that we won't accidentally cache a
* bad value. But only do this if this is not a zero
* window probe.
*/
if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
! tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
! (tcp->tcp_rtt_sa >> 5);
tcp->tcp_rtt_sa = 0;
tcp_ip_notify(tcp);
tcp->tcp_rtt_update = 0;
}
}
timer_rexmit:
tcp->tcp_timer_backoff++;
! if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
! tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
! tcp->tcp_rto_min) {
/*
* This means the original RTO is tcp_rexmit_interval_min.
* So we will use tcp_rexmit_interval_min as the RTO value
* and do the backoff.
*/
--- 986,1006 ----
* tcp_rtt_update so that we won't accidentally cache a
* bad value. But only do this if this is not a zero
* window probe.
*/
if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
! tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
! (tcp->tcp_rtt_sa >> 5);
tcp->tcp_rtt_sa = 0;
tcp_ip_notify(tcp);
tcp->tcp_rtt_update = 0;
}
}
timer_rexmit:
tcp->tcp_timer_backoff++;
! if ((ms = tcp_calculate_rto(tcp, tcps)) < tcp->tcp_rto_min) {
/*
* This means the original RTO is tcp_rexmit_interval_min.
* So we will use tcp_rexmit_interval_min as the RTO value
* and do the backoff.
*/
*** 1057,1068 ****
if (mss > tcp->tcp_mss)
mss = tcp->tcp_mss;
if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
mss = tcp->tcp_swnd;
! if ((mp = tcp->tcp_xmit_head) != NULL)
mp->b_prev = (mblk_t *)ddi_get_lbolt();
mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
B_TRUE);
/*
* When slow start after retransmission begins, start with
--- 1031,1047 ----
if (mss > tcp->tcp_mss)
mss = tcp->tcp_mss;
if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
mss = tcp->tcp_swnd;
! if ((mp = tcp->tcp_xmit_head) != NULL) {
! #ifdef KERNEL_32
mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ #else
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ #endif
+ }
mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
B_TRUE);
/*
* When slow start after retransmission begins, start with
*** 1089,1098 ****
--- 1068,1079 ----
}
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += mss;
tcp_send_data(tcp, mp);
}
/*