Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

*** 19,29 **** * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright (c) 2014 by Delphix. All rights reserved. */ /* This file contains all TCP output processing functions. */ #include <sys/types.h> --- 19,29 ---- * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ /* This file contains all TCP output processing functions. */ #include <sys/types.h>
*** 61,71 **** const int, int *, uint_t *, int *, mblk_t **, mblk_t *); static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, int, ip_recv_attr_t *, ip_stack_t *, conn_t *); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); ! static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int); /* * Functions called directly via squeue having a prototype of edesc_t. */ static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *); --- 61,71 ---- const int, int *, uint_t *, int *, mblk_t **, mblk_t *); static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, int, ip_recv_attr_t *, ip_stack_t *, conn_t *); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); ! static void tcp_fill_header(tcp_t *, uchar_t *, int); /* * Functions called directly via squeue having a prototype of edesc_t. */ static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
*** 78,87 **** --- 78,99 ---- * speedup is observed for values larger than sixteen. Zero will * disable the optimisation. */ static int tcp_tx_pull_len = 16; + static void + cc_after_idle(tcp_t *tcp) + { + uint32_t old_cwnd = tcp->tcp_cwnd; + + if (CC_ALGO(tcp)->after_idle != NULL) + CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv); + + DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd, + uint32_t, tcp->tcp_cwnd); + } + void tcp_wput(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); tcp_t *tcp;
*** 215,225 **** int32_t mss; int32_t num_sack_blk = 0; int32_t total_hdr_len; int32_t tcp_hdr_len; int rc; - tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; clock_t now = LBOLT_FASTPATH; tcpstate = tcp->tcp_state; if (mp == NULL) { --- 227,236 ----
*** 370,380 **** tcp_hdr_len = connp->conn_ht_ulp_len; } if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { ! TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); } if (tcpstate == TCPS_SYN_RCVD) { /* * The three-way connection establishment handshake is not * complete yet. We want to queue the data for transmission --- 381,391 ---- tcp_hdr_len = connp->conn_ht_ulp_len; } if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { ! cc_after_idle(tcp); } if (tcpstate == TCPS_SYN_RCVD) { /* * The three-way connection establishment handshake is not * complete yet. We want to queue the data for transmission
*** 451,461 **** --- 462,476 ---- /* Bypass all other unnecessary processing. */ goto done; } } + #ifdef KERNEL_32 local_time = (mblk_t *)now; + #else + local_time = (mblk_t *)(intptr_t)gethrtime(); + #endif /* * "Our" Nagle Algorithm. This is not the same as in the old * BSD. This is more in line with the true intent of Nagle. *
*** 1188,1198 **** * Reinitialize tcp_cwnd after idle. */ now = LBOLT_FASTPATH; if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { ! TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); } usable = tcp->tcp_swnd; /* tcp window size */ if (usable > tcp->tcp_cwnd) usable = tcp->tcp_cwnd; /* congestion window smaller */ --- 1203,1213 ---- * Reinitialize tcp_cwnd after idle. */ now = LBOLT_FASTPATH; if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { ! cc_after_idle(tcp); } usable = tcp->tcp_swnd; /* tcp window size */ if (usable > tcp->tcp_cwnd) usable = tcp->tcp_cwnd; /* congestion window smaller */
*** 1251,1261 **** --- 1266,1280 ---- tcp->tcp_snxt = snxt + len; tcp->tcp_rack = tcp->tcp_rnxt; if ((mp1 = dupb(mp)) == 0) goto no_memory; + #ifdef KERNEL_32 mp->b_prev = (mblk_t *)(uintptr_t)now; + #else + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + #endif mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ tcpha = tcp->tcp_tcpha; tcpha->tha_flags = (TH_ACK|TH_PUSH);
*** 1266,1276 **** tcpha->tha_seq = htonl(snxt); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); ! BUMP_LOCAL(tcp->tcp_obsegs); /* Update the latest receive window size in TCP header. */ tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_last_sent_len = (ushort_t)len; --- 1285,1297 ---- tcpha->tha_seq = htonl(snxt); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); ! tcp->tcp_cs.tcp_out_data_segs++; ! tcp->tcp_cs.tcp_out_data_bytes += len; /* Update the latest receive window size in TCP header. */ tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_last_sent_len = (ushort_t)len;
*** 1306,1321 **** } mp1->b_rptr = rptr; /* Fill in the timestamp option. */ if (tcp->tcp_snd_ts_ok) { ! uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; ! ! U32_TO_BE32(llbolt, ! (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, ! (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* copy header into outgoing packet */ --- 1327,1340 ---- } mp1->b_rptr = rptr; /* Fill in the timestamp option. */ if (tcp->tcp_snd_ts_ok) { ! U32_TO_BE32(now, ! (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, ! (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* copy header into outgoing packet */
*** 1955,1974 **** (mblk_t *)(uintptr_t)(*snxt); mp1 = mp1->b_cont; } *snxt += len; *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; ! BUMP_LOCAL(tcp->tcp_obsegs); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); tcp_send_data(tcp, mp); continue; } *snxt += len; /* Adjust later if we don't send all of len */ TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); if (*tail_unsent) { /* Are the bytes above us in flight? */ rptr = (*xmit_tail)->b_wptr - *tail_unsent; if (rptr != (*xmit_tail)->b_rptr) { --- 1974,1998 ---- (mblk_t *)(uintptr_t)(*snxt); mp1 = mp1->b_cont; } *snxt += len; *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + tcp->tcp_cs.tcp_out_data_segs++; + tcp->tcp_cs.tcp_out_data_bytes += len; tcp_send_data(tcp, mp); continue; } *snxt += len; /* Adjust later if we don't send all of len */ + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + tcp->tcp_cs.tcp_out_data_segs++; + tcp->tcp_cs.tcp_out_data_bytes += len; if (*tail_unsent) { /* Are the bytes above us in flight? */ rptr = (*xmit_tail)->b_wptr - *tail_unsent; if (rptr != (*xmit_tail)->b_rptr) {
*** 2061,2071 **** /* * Fill in the header using the template header, and add * options such as time-stamp, ECN and/or SACK, as needed. */ ! tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); mp->b_rptr = rptr; if (*tail_unsent) { int spill = *tail_unsent; --- 2085,2095 ---- /* * Fill in the header using the template header, and add * options such as time-stamp, ECN and/or SACK, as needed. */ ! tcp_fill_header(tcp, rptr, num_sack_blk); mp->b_rptr = rptr; if (*tail_unsent) { int spill = *tail_unsent;
*** 2140,2149 **** --- 2164,2174 ---- */ *usable -= spill; *snxt += spill; tcp->tcp_last_sent_len += spill; TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill); + tcp->tcp_cs.tcp_out_data_bytes += spill; /* * Adjust the checksum */ tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
*** 2188,2198 **** /* * Restore values of ixa_fragsize and ixa_extra_ident. */ ixa->ixa_fragsize = ixa->ixa_pmtu; ixa->ixa_extra_ident = 0; ! tcp->tcp_obsegs += num_lso_seg; TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { /* * Make sure to clean up LSO information. Wherever a --- 2213,2223 ---- /* * Restore values of ixa_fragsize and ixa_extra_ident. */ ixa->ixa_fragsize = ixa->ixa_pmtu; ixa->ixa_extra_ident = 0; ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { /* * Make sure to clean up LSO information. Wherever a
*** 2199,2209 **** * new mp uses the prepended header room after dupb(), * lso_info_cleanup() should be called. */ lso_info_cleanup(mp); tcp_send_data(tcp, mp); ! BUMP_LOCAL(tcp->tcp_obsegs); } } return (0); } --- 2224,2234 ---- * new mp uses the prepended header room after dupb(), * lso_info_cleanup() should be called. */ lso_info_cleanup(mp); tcp_send_data(tcp, mp); ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); } } return (0); }
*** 2279,2290 **** /* * We do not have a good algorithm to update ssthresh at this time. * So don't do any update. */ bzero(&uinfo, sizeof (uinfo)); ! uinfo.iulp_rtt = tcp->tcp_rtt_sa; ! uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; /* * Note that uinfo is kept for conn_faddr in the DCE. Could update even * if source routed but we don't. */ --- 2304,2315 ---- /* * We do not have a good algorithm to update ssthresh at this time. * So don't do any update. */ bzero(&uinfo, sizeof (uinfo)); ! uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa); ! uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd); /* * Note that uinfo is kept for conn_faddr in the DCE. Could update even * if source routed but we don't. */
*** 2415,2425 **** tcp->tcp_tcpha->tha_win = tcpha->tha_win; tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; TCPS_BUMP_MIB(tcps, tcpOutAck); } ! BUMP_LOCAL(tcp->tcp_obsegs); tcpha->tha_seq = htonl(seq); tcpha->tha_ack = htonl(ack); /* * Include the adjustment for a source route if any. */ --- 2440,2450 ---- tcp->tcp_tcpha->tha_win = tcpha->tha_win; tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; TCPS_BUMP_MIB(tcps, tcpOutAck); } ! TCPS_BUMP_MIB(tcps, tcpHCOutSegs); tcpha->tha_seq = htonl(seq); tcpha->tha_ack = htonl(ack); /* * Include the adjustment for a source route if any. */
*** 3384,3398 **** --- 3409,3429 ---- tcp_send_data(tcp, xmit_mp); /* * Update the send timestamp to avoid false retransmission. */ + #ifdef KERNEL_32 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + #else + snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + #endif TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += seg_len; /* * Update tcp_rexmit_max to extend this SACK recovery phase. * This happens when new data sent during fast recovery is * also lost. If TCP retransmits those new data, it needs * to extend SACK recover phase to avoid starting another
*** 3456,3468 **** --- 3487,3505 ---- win -= cnt; /* * Update the send timestamp to avoid false * retransmission. */ + #ifdef KERNEL_32 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + #else + old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + #endif TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += cnt; tcp->tcp_rexmit_nxt = snxt; } /* * If we have transmitted all we have at the time
*** 3616,3626 **** * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header * with the template header, as well as other options such as time-stamp, * ECN and/or SACK. */ static void ! tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) { tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; int hdrlen; conn_t *connp = tcp->tcp_connp; --- 3653,3663 ---- * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header * with the template header, as well as other options such as time-stamp, * ECN and/or SACK. */ static void ! tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk) { tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; int hdrlen; conn_t *connp = tcp->tcp_connp;
*** 3638,3648 **** src = (uint32_t *)connp->conn_ht_iphc; hdrlen = connp->conn_ht_iphc_len; /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { ! U32_TO_BE32((uint32_t)now, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); --- 3675,3685 ---- src = (uint32_t *)connp->conn_ht_iphc; hdrlen = connp->conn_ht_iphc_len; /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { ! U32_TO_BE32(LBOLT_FASTPATH, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);