Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

*** 21,31 **** /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. ! * Copyright (c) 2014 by Delphix. All rights reserved. */ #include <sys/types.h> #include <sys/strlog.h> #include <sys/strsun.h> --- 21,31 ---- /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. ! * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #include <sys/types.h> #include <sys/strlog.h> #include <sys/strsun.h>
*** 592,602 **** tcp->tcp_rack_cur_max = 2; } mp = tcp_ack_mp(tcp); if (mp != NULL) { ! BUMP_LOCAL(tcp->tcp_obsegs); TCPS_BUMP_MIB(tcps, tcpOutAck); TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); tcp_send_data(tcp, mp); } } --- 592,602 ---- tcp->tcp_rack_cur_max = 2; } mp = tcp_ack_mp(tcp); if (mp != NULL) { ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutAck); TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); tcp_send_data(tcp, mp); } }
*** 754,767 **** clock_t time_to_wait; TCPS_BUMP_MIB(tcps, tcpTimRetrans); if (!tcp->tcp_xmit_head) break; ! time_to_wait = ddi_get_lbolt() - ! (clock_t)tcp->tcp_xmit_head->b_prev; ! time_to_wait = tcp->tcp_rto - ! TICK_TO_MSEC(time_to_wait); /* * If the timer fires too early, 1 clock tick earlier, * restart the timer. */ if (time_to_wait > msec_per_tick) { --- 754,771 ---- clock_t time_to_wait; TCPS_BUMP_MIB(tcps, tcpTimRetrans); if (!tcp->tcp_xmit_head) break; ! #ifdef KERNEL_32 ! time_to_wait = TICK_TO_MSEC(ddi_get_lbolt() - ! (clock_t)tcp->tcp_xmit_head->b_prev); ! #else ! time_to_wait = NSEC2MSEC(gethrtime() - ! (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev); ! #endif ! time_to_wait = tcp->tcp_rto - time_to_wait; /* * If the timer fires too early, 1 clock tick earlier, * restart the timer. */ if (time_to_wait > msec_per_tick) {
*** 783,823 **** if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } } else { ! /* ! * After retransmission, we need to do ! * slow start. Set the ssthresh to one ! * half of current effective window and ! * cwnd to one MSS. Also reset ! * tcp_cwnd_cnt. ! * ! * Note that if tcp_ssthresh is reduced because ! * of ECN, do not reduce it again unless it is ! * already one window of data away (tcp_cwr ! * should then be cleared) or this is a ! * timeout for a retransmitted segment. ! */ ! uint32_t npkt; ! ! if (!tcp->tcp_cwr || tcp->tcp_rexmit) { ! npkt = ((tcp->tcp_timer_backoff ? ! tcp->tcp_cwnd_ssthresh : ! tcp->tcp_snxt - ! tcp->tcp_suna) >> 1) / tcp->tcp_mss; ! tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * ! tcp->tcp_mss; } - tcp->tcp_cwnd = tcp->tcp_mss; - tcp->tcp_cwnd_cnt = 0; - if (tcp->tcp_ecn_ok) { - tcp->tcp_cwr = B_TRUE; - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } - } break; } /* * We have something to send yet we cannot send. The * reason can be: --- 787,798 ---- if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } } else { ! cc_cong_signal(tcp, NULL, CC_RTO); } break; } /* * We have something to send yet we cannot send. The * reason can be:
*** 852,861 **** --- 827,837 ---- if (tcp->tcp_swnd == 0) { /* Extend window for zero window probe */ tcp->tcp_swnd++; tcp->tcp_zero_win_probe = B_TRUE; TCPS_BUMP_MIB(tcps, tcpOutWinProbe); + tcp->tcp_cs.tcp_out_zwnd_probes++; } else { /* * Handle timeout from sender SWS avoidance. * Reset our knowledge of the max send window * since the receiver might have reduced its
*** 1010,1032 **** * tcp_rtt_update so that we won't accidentally cache a * bad value. But only do this if this is not a zero * window probe. */ if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5); tcp->tcp_rtt_sa = 0; tcp_ip_notify(tcp); tcp->tcp_rtt_update = 0; } } timer_rexmit: tcp->tcp_timer_backoff++; ! if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + ! tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < ! tcp->tcp_rto_min) { /* * This means the original RTO is tcp_rexmit_interval_min. * So we will use tcp_rexmit_interval_min as the RTO value * and do the backoff. */ --- 986,1006 ---- * tcp_rtt_update so that we won't accidentally cache a * bad value. But only do this if this is not a zero * window probe. */ if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5); tcp->tcp_rtt_sa = 0; tcp_ip_notify(tcp); tcp->tcp_rtt_update = 0; } } timer_rexmit: tcp->tcp_timer_backoff++; ! if ((ms = tcp_calculate_rto(tcp, tcps)) < tcp->tcp_rto_min) { /* * This means the original RTO is tcp_rexmit_interval_min. * So we will use tcp_rexmit_interval_min as the RTO value * and do the backoff. */
*** 1057,1068 **** if (mss > tcp->tcp_mss) mss = tcp->tcp_mss; if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) mss = tcp->tcp_swnd; ! if ((mp = tcp->tcp_xmit_head) != NULL) mp->b_prev = (mblk_t *)ddi_get_lbolt(); mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, B_TRUE); /* * When slow start after retransmission begins, start with --- 1031,1047 ---- if (mss > tcp->tcp_mss) mss = tcp->tcp_mss; if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) mss = tcp->tcp_swnd; ! if ((mp = tcp->tcp_xmit_head) != NULL) { ! #ifdef KERNEL_32 mp->b_prev = (mblk_t *)ddi_get_lbolt(); + #else + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + #endif + } mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, B_TRUE); /* * When slow start after retransmission begins, start with
*** 1089,1098 **** --- 1068,1079 ---- } tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += mss; tcp_send_data(tcp, mp); } /*