DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

*** 21,31 ****
  /*
   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
   * Copyright 2017 Joyent, Inc.
!  * Copyright (c) 2014 by Delphix. All rights reserved.
   */

  /* This file contains all TCP input processing functions. */

  #include <sys/types.h>
--- 21,31 ----
  /*
   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
   * Copyright 2017 Joyent, Inc.
!  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
   */

  /* This file contains all TCP input processing functions. */

  #include <sys/types.h>
*** 164,177 ****
  static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
  static void tcp_process_options(tcp_t *, tcpha_t *);
  static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
  static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
  static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
! static void tcp_set_rto(tcp_t *, time_t);
  static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);

  /*
   * Set the MSS associated with a particular tcp based on its current value,
   * and a new one passed in. Observe minimums and maximums, and reset other
   * state variables that we want to view as multiples of MSS.
   *
   * The value of MSS could be either increased or decreased.
--- 164,304 ----
  static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
  static void tcp_process_options(tcp_t *, tcpha_t *);
  static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
  static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
  static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
! static void tcp_set_rto(tcp_t *, hrtime_t);
  static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);

  /*
+  * CC wrapper hook functions
+  */
+ static void
+ cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+     uint16_t type)
+ {
+     uint32_t old_cwnd = tcp->tcp_cwnd;
+
+     tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+     if (tcp->tcp_cwnd <= tcp->tcp_swnd)
+         tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+     else
+         tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+
+     if (type == CC_ACK) {
+         if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
+             if (tcp->tcp_ccv.flags & CCF_RTO)
+                 tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+             tcp->tcp_ccv.t_bytes_acked +=
+                 min(tcp->tcp_ccv.bytes_this_ack,
+                 tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+             if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+                 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+                 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+             }
+         } else {
+             tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+             tcp->tcp_ccv.t_bytes_acked = 0;
+         }
+     }
+
+     if (CC_ALGO(tcp)->ack_received != NULL) {
+         /*
+          * The FreeBSD code where this originated had a comment "Find
+          * a way to live without this" in several places where curack
+          * got set. If they eventually dump curack from the cc
+          * variables, we'll need to adapt our code.
+          */
+         tcp->tcp_ccv.curack = seg_ack;
+         CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+     }
+
+     DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
+         uint32_t, tcp->tcp_cwnd);
+ }
+
+ void
+ cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+ {
+     uint32_t old_cwnd = tcp->tcp_cwnd;
+     uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+
+     switch (type) {
+     case CC_NDUPACK:
+         if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+             tcp->tcp_rexmit_max = tcp->tcp_snxt;
+             if (tcp->tcp_ecn_ok) {
+                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+                 tcp->tcp_cwr = B_TRUE;
+                 tcp->tcp_ecn_cwr_sent = B_FALSE;
+             }
+         }
+         break;
+     case CC_ECN:
+         if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+             tcp->tcp_rexmit_max = tcp->tcp_snxt;
+             if (tcp->tcp_ecn_ok) {
+                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+                 tcp->tcp_cwr = B_TRUE;
+                 tcp->tcp_ecn_cwr_sent = B_FALSE;
+             }
+         }
+         break;
+     case CC_RTO:
+         tcp->tcp_ccv.flags |= CCF_RTO;
+         tcp->tcp_dupack_cnt = 0;
+         tcp->tcp_ccv.t_bytes_acked = 0;
+         /*
+          * Give up on fast recovery and congestion recovery if we were
+          * attempting either.
+          */
+         EXIT_RECOVERY(tcp->tcp_ccv.flags);
+         if (CC_ALGO(tcp)->cong_signal == NULL) {
+             /*
+              * RFC5681 Section 3.1
+              * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+              */
+             tcp->tcp_cwnd_ssthresh = max(
+                 (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+                 2) * tcp->tcp_mss;
+             tcp->tcp_cwnd = tcp->tcp_mss;
+         }
+
+         if (tcp->tcp_ecn_ok) {
+             tcp->tcp_cwr = B_TRUE;
+             tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+             tcp->tcp_ecn_cwr_sent = B_FALSE;
+         }
+         break;
+     }
+
+     if (CC_ALGO(tcp)->cong_signal != NULL) {
+         tcp->tcp_ccv.curack = seg_ack;
+         CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+     }
+
+     DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+         uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+         uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+ }
+
+ static void
+ cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+ {
+     uint32_t old_cwnd = tcp->tcp_cwnd;
+
+     if (CC_ALGO(tcp)->post_recovery != NULL) {
+         tcp->tcp_ccv.curack = seg_ack;
+         CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+     }
+     tcp->tcp_ccv.t_bytes_acked = 0;
+
+     DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+         uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
+ }
+
+ /*
   * Set the MSS associated with a particular tcp based on its current value,
   * and a new one passed in. Observe minimums and maximums, and reset other
   * state variables that we want to view as multiples of MSS.
   *
   * The value of MSS could be either increased or decreased.
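The hooks above are the heart of this change: instead of hard-coding NewReno's window arithmetic at each ACK and loss site, the input path now calls through CC_ALGO(tcp)->ack_received/cong_signal/post_recovery, mirroring FreeBSD's modular CC framework. For reviewers unfamiliar with that shape, here is a minimal user-level sketch of what an algorithm plugged in behind ack_received typically does. The names (cc_var_sketch, newreno_ack_received_sketch) and the simplified struct are illustrative stand-ins, not the kernel's types.

#include <stdint.h>

struct cc_var_sketch {
    uint32_t cwnd;           /* congestion window (bytes) */
    uint32_t ssthresh;       /* slow-start threshold (bytes) */
    uint32_t mss;            /* sender maximum segment size */
    uint32_t bytes_this_ack; /* bytes newly acknowledged by this ACK */
    uint32_t t_bytes_acked;  /* ABC-style byte counter */
};

static void
newreno_ack_received_sketch(struct cc_var_sketch *ccv)
{
    if (ccv->cwnd < ccv->ssthresh) {
        /* Slow start: grow cwnd by the bytes acked (RFC 5681). */
        ccv->cwnd += ccv->bytes_this_ack;
    } else {
        /*
         * Congestion avoidance: grow cwnd by roughly one MSS per
         * RTT, approximated by counting acked bytes until a full
         * window's worth has been covered.
         */
        ccv->t_bytes_acked += ccv->bytes_this_ack;
        if (ccv->t_bytes_acked >= ccv->cwnd) {
            ccv->t_bytes_acked -= ccv->cwnd;
            ccv->cwnd += ccv->mss;
        }
    }
}

The wrappers in the hunk keep the protocol-mandated bookkeeping (ECN state, rexmit_max, the DTrace probes) in one place, so an algorithm module only has to express its window policy.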
*** 546,555 ****
--- 673,685 ----
      /*
       * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
       * updated properly.
       */
      TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+     if (tcp->tcp_cc_algo->conn_init != NULL)
+         tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
  }

  /*
   * Add a new piece to the tcp reassembly queue. If the gap at the beginning
   * is filled, return as much as we can. The message passed in may be
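For context on the conn_init guard added above: each connection carries a pointer to its algorithm's hook table, and optional hooks are NULL-checked before use. A hypothetical sketch of that table's shape, modeled loosely on the FreeBSD-derived framework (cc_algo_sketch and conn_init_sketch are illustrative names, not the kernel's):

#include <stddef.h>

struct cc_algo_sketch {
    const char *name;
    void (*conn_init)(void *ccv);          /* optional */
    void (*ack_received)(void *ccv, int type);
    void (*cong_signal)(void *ccv, int type);
    void (*post_recovery)(void *ccv);
};

/* Optional hooks are NULL-checked before use, as in the hunk above. */
static void
conn_init_sketch(struct cc_algo_sketch *algo, void *ccv)
{
    if (algo->conn_init != NULL)
        algo->conn_init(ccv);
}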
*** 557,567 ****
   * number for this piece.
   */
  static mblk_t *
  tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
  {
!     uint32_t end;
      mblk_t *mp1;
      mblk_t *mp2;
      mblk_t *next_mp;
      uint32_t u1;
      tcp_stack_t *tcps = tcp->tcp_tcps;
--- 687,697 ----
   * number for this piece.
   */
  static mblk_t *
  tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
  {
!     uint32_t end, bytes;
      mblk_t *mp1;
      mblk_t *mp2;
      mblk_t *next_mp;
      uint32_t u1;
      tcp_stack_t *tcps = tcp->tcp_tcps;
*** 576,605 ****
      if (start == end) {
          /* Empty.  Blast it. */
          freeb(mp);
          continue;
      }
      mp->b_cont = NULL;
      TCP_REASS_SET_SEQ(mp, start);
      TCP_REASS_SET_END(mp, end);
      mp1 = tcp->tcp_reass_tail;
!     if (!mp1) {
!         tcp->tcp_reass_tail = mp;
          tcp->tcp_reass_head = mp;
-         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
-         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
-             end - start);
-         continue;
      }
-     /* New stuff completely beyond tail? */
-     if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
-         /* Link it on end. */
-         mp1->b_cont = mp;
          tcp->tcp_reass_tail = mp;
          TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
!         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
!             end - start);
          continue;
      }
      mp1 = tcp->tcp_reass_head;
      u1 = TCP_REASS_SEQ(mp1);
      /* New stuff at the front? */
--- 706,735 ----
      if (start == end) {
          /* Empty.  Blast it. */
          freeb(mp);
          continue;
      }
+     bytes = end - start;
      mp->b_cont = NULL;
      TCP_REASS_SET_SEQ(mp, start);
      TCP_REASS_SET_END(mp, end);
      mp1 = tcp->tcp_reass_tail;
!     if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
!         if (mp1 != NULL) {
!             /*
!              * New stuff is beyond the tail; link it on the
!              * end.
!              */
!             mp1->b_cont = mp;
!         } else {
              tcp->tcp_reass_head = mp;
          }
          tcp->tcp_reass_tail = mp;
          TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
!         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
!         tcp->tcp_cs.tcp_in_data_unorder_segs++;
!         tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
          continue;
      }
      mp1 = tcp->tcp_reass_head;
      u1 = TCP_REASS_SEQ(mp1);
      /* New stuff at the front? */
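The reassembly logic above leans on wraparound-safe sequence-number comparisons (SEQ_GT, SEQ_GEQ, and friends), which compare 32-bit sequence numbers modulo 2^32. A sketch of how such macros are conventionally defined; these follow the standard TCP idiom, but the kernel's own definitions live in a header, and the _SKETCH names here are illustrative:

#include <stdint.h>

#define SEQ_LT_SKETCH(a, b)  ((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ_SKETCH(a, b) ((int32_t)((a) - (b)) <= 0)
#define SEQ_GT_SKETCH(a, b)  ((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ_SKETCH(a, b) ((int32_t)((a) - (b)) >= 0)

/*
 * Example: with a = 0x00000010 and b = 0xfffffff0, SEQ_GT_SKETCH(a, b)
 * is 1: the unsigned difference is 0x20, which is positive as int32_t,
 * so "a" is after "b" even though the sequence space wrapped.
 */

Casting the unsigned difference to a signed type keeps the comparison correct across wrap as long as the two values are within 2^31 of each other.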
*** 2322,2333 ****
      int     urp;
      tcp_opt_t   tcpopt;
      ip_pkt_t    ipp;
      boolean_t   ofo_seg = B_FALSE; /* Out of order segment */
      uint32_t    cwnd;
-     uint32_t    add;
-     int     npkt;
      int     mss;
      conn_t      *connp = (conn_t *)arg;
      squeue_t    *sqp = (squeue_t *)arg2;
      tcp_t       *tcp = connp->conn_tcp;
      tcp_stack_t *tcps = tcp->tcp_tcps;
--- 2452,2461 ----
*** 2412,2422 ****
          tcp->tcp_last_recv_time = LBOLT_FASTPATH;
      }

      flags = (unsigned int)tcpha->tha_flags & 0xFF;

!     BUMP_LOCAL(tcp->tcp_ibsegs);
      DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

      if ((flags & TH_URG) && sqp != NULL) {
          /*
           * TCP can't handle urgent pointers that arrive before
--- 2540,2550 ----
          tcp->tcp_last_recv_time = LBOLT_FASTPATH;
      }

      flags = (unsigned int)tcpha->tha_flags & 0xFF;

!     TCPS_BUMP_MIB(tcps, tcpHCInSegs);
      DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

      if ((flags & TH_URG) && sqp != NULL) {
          /*
           * TCP can't handle urgent pointers that arrive before
*** 2599,2608 ****
--- 2727,2739 ----
                   * Set tcp_cwnd back to 1 MSS, per
                   * recommendation from
                   * draft-floyd-incr-init-win-01.txt,
                   * Increasing TCP's Initial Window.
                   */
+                 DTRACE_PROBE3(cwnd__retransmitted__syn,
+                     tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                     uint32_t, tcp->tcp_mss);
                  tcp->tcp_cwnd = tcp->tcp_mss;
              }

              tcp->tcp_swl1 = seg_seq;
              tcp->tcp_swl2 = seg_ack;
*** 2657,2667 ****
              (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
              tcp->tcp_ack_tid = 0;
          }
          tcp_send_data(tcp, ack_mp);
!         BUMP_LOCAL(tcp->tcp_obsegs);
          TCPS_BUMP_MIB(tcps, tcpOutAck);

          if (!IPCL_IS_NONSTR(connp)) {
              /* Send up T_CONN_CON */
              if (ira->ira_cred != NULL) {
--- 2788,2798 ----
              (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
              tcp->tcp_ack_tid = 0;
          }
          tcp_send_data(tcp, ack_mp);
!         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
          TCPS_BUMP_MIB(tcps, tcpOutAck);

          if (!IPCL_IS_NONSTR(connp)) {
              /* Send up T_CONN_CON */
              if (ira->ira_cred != NULL) {
*** 3046,3055 ****
--- 3177,3187 ----
      if (rgap < 0) {
          mblk_t *mp2;

          if (tcp->tcp_rwnd == 0) {
              TCPS_BUMP_MIB(tcps, tcpInWinProbe);
+             tcp->tcp_cs.tcp_in_zwnd_probes++;
          } else {
              TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
              TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes,
                  -rgap);
          }
*** 3295,3304 ****
--- 3427,3439 ----
              }
          }
      } else if (seg_len > 0) {
          TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
          TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
+         tcp->tcp_cs.tcp_in_data_inorder_segs++;
+         tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
+
          /*
           * If an out of order FIN was received before, and the seq
           * num and len of the new segment match that of the FIN,
           * put the FIN flag back in.
           */
*** 3360,3370 ****
      /*
       * urp could be -1 when the urp field in the packet is 0
       * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
       * byte was at seg_seq - 1, in which case we ignore the urgent flag.
       */
!     if (flags & TH_URG && urp >= 0) {
          if (!tcp->tcp_urp_last_valid ||
              SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
              /*
               * Non-STREAMS sockets handle the urgent data a little
               * differently from STREAMS based sockets. There is no
--- 3495,3505 ----
      /*
       * urp could be -1 when the urp field in the packet is 0
       * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
       * byte was at seg_seq - 1, in which case we ignore the urgent flag.
       */
!     if ((flags & TH_URG) && urp >= 0) {
          if (!tcp->tcp_urp_last_valid ||
              SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
              /*
               * Non-STREAMS sockets handle the urgent data a little
               * differently from STREAMS based sockets. There is no
*** 3817,3826 ****
--- 3952,3964 ----
          if (tcp->tcp_rexmit) {
              tcp->tcp_rexmit = B_FALSE;
              tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
              tcp->tcp_rexmit_max = tcp->tcp_snxt;
              tcp->tcp_ms_we_have_waited = 0;
+             DTRACE_PROBE3(cwnd__retransmitted__syn,
+                 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                 uint32_t, tcp->tcp_mss);
              tcp->tcp_cwnd = mss;
          }

          /*
           * We set the send window to zero here.
*** 3860,3897 ****
       * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
       * done once per window (or more loosely, per RTT).
       */
      if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
          tcp->tcp_cwr = B_FALSE;
!     if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
!         if (!tcp->tcp_cwr) {
!             npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
!             tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
!             tcp->tcp_cwnd = npkt * mss;
              /*
               * If the cwnd is 0, use the timer to clock out
               * new segments. This is required by the ECN spec.
               */
!             if (npkt == 0) {
                  TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-                 /*
-                  * This makes sure that when the ACK comes
-                  * back, we will increase tcp_cwnd by 1 MSS.
-                  */
-                 tcp->tcp_cwnd_cnt = 0;
-             }
              tcp->tcp_cwr = B_TRUE;
              /*
               * This marks the end of the current window of in
               * flight data. That is why we don't use
               * tcp_suna + tcp_swnd. Only data in flight can
               * provide ECN info.
               */
              tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
-             tcp->tcp_ecn_cwr_sent = B_FALSE;
          }
-     }

      mp1 = tcp->tcp_xmit_head;
      if (bytes_acked == 0) {
          if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
              int dupack_cnt;
--- 3998,4024 ----
       * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
       * done once per window (or more loosely, per RTT).
       */
      if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
          tcp->tcp_cwr = B_FALSE;
!     if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
!         cc_cong_signal(tcp, seg_ack, CC_ECN);
          /*
           * If the cwnd is 0, use the timer to clock out
           * new segments. This is required by the ECN spec.
           */
!         if (tcp->tcp_cwnd == 0)
              TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
          tcp->tcp_cwr = B_TRUE;
          /*
           * This marks the end of the current window of in
           * flight data. That is why we don't use
           * tcp_suna + tcp_swnd. Only data in flight can
           * provide ECN info.
           */
          tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
      }

      mp1 = tcp->tcp_xmit_head;
      if (bytes_acked == 0) {
          if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
              int dupack_cnt;
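When an ECE bit drives cc_cong_signal(tcp, seg_ack, CC_ECN), the new window is left to the algorithm's cong_signal hook; the fallback shown earlier in the CC_RTO case computes ssthresh from the amount of data in flight per RFC 5681 eq (4). Pulled out as a standalone helper with a worked example (ssthresh_after_loss is an illustrative name, not a kernel function):

#include <stdint.h>

/*
 * RFC 5681 eq (4): ssthresh = max(FlightSize / 2, 2 * SMSS).
 * With 10 segments of 1460 bytes in flight (snxt - suna = 14600):
 * (14600 / 2) / 1460 = 5 segments, max(5, 2) * 1460 = 7300 bytes.
 */
static uint32_t
ssthresh_after_loss(uint32_t snxt, uint32_t suna, uint32_t mss)
{
    uint32_t flight_segs = (snxt - suna) / 2 / mss;

    if (flight_segs < 2)
        flight_segs = 2;
    return (flight_segs * mss);
}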
*** 3908,3917 ****
--- 4035,4046 ----
              if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
                  !tcp->tcp_rexmit) {
                  /* Do Limited Transmit */
                  if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
                      tcps->tcps_dupack_fast_retransmit) {
+                     cc_ack_received(tcp, seg_ack,
+                         bytes_acked, CC_DUPACK);
                      /*
                       * RFC 3042
                       *
                       * What we need to do is temporarily
                       * increase tcp_cwnd so that new
*** 3954,3969 ****
                   * Adjust cwnd since the duplicate
                   * ack indicates that a packet was
                   * dropped (due to congestion.)
                   */
                  if (!tcp->tcp_cwr) {
!                     npkt = ((tcp->tcp_snxt -
!                         tcp->tcp_suna) >> 1) / mss;
!                     tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
!                         mss;
!                     tcp->tcp_cwnd = (npkt +
!                         tcp->tcp_dupack_cnt) * mss;
                  }
                  if (tcp->tcp_ecn_ok) {
                      tcp->tcp_cwr = B_TRUE;
                      tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                      tcp->tcp_ecn_cwr_sent = B_FALSE;
--- 4083,4096 ----
                   * Adjust cwnd since the duplicate
                   * ack indicates that a packet was
                   * dropped (due to congestion.)
                   */
                  if (!tcp->tcp_cwr) {
!                     cc_cong_signal(tcp, seg_ack,
!                         CC_NDUPACK);
!                     cc_ack_received(tcp, seg_ack,
!                         bytes_acked, CC_DUPACK);
                  }
                  if (tcp->tcp_ecn_ok) {
                      tcp->tcp_cwr = B_TRUE;
                      tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                      tcp->tcp_ecn_cwr_sent = B_FALSE;
*** 4021,4030 ****
--- 4148,4159 ----
                      } else {
                          flags |= TH_REXMIT_NEEDED;
                      } /* tcp_snd_sack_ok */
                  } else {
+                     cc_ack_received(tcp, seg_ack,
+                         bytes_acked, CC_DUPACK);
                      /*
                       * Here we perform congestion
                       * avoidance, but NOT slow start.
                       * This is known as the Fast
                       * Recovery Algorithm.
*** 4042,4051 ****
--- 4171,4184 ----
                       * cwnd.
                       */
                      cwnd = tcp->tcp_cwnd + mss;
                      if (cwnd > tcp->tcp_cwnd_max)
                          cwnd = tcp->tcp_cwnd_max;
+                     DTRACE_PROBE3(cwnd__fast__recovery,
+                         tcp_t *, tcp,
+                         uint32_t, tcp->tcp_cwnd,
+                         uint32_t, cwnd);
                      tcp->tcp_cwnd = cwnd;
                      if (tcp->tcp_unsent > 0)
                          flags |= TH_XMIT_NEEDED;
                  }
              }
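The probe added above brackets the fast-recovery inflation step of RFC 5681 section 3.2: each additional duplicate ACK signals that one more segment has left the network, so cwnd may be inflated by one MSS, capped at the connection's maximum window. As a trivial standalone sketch (the name is illustrative):

#include <stdint.h>

/* Inflate cwnd by one MSS per extra duplicate ACK, clamped to the max. */
static uint32_t
inflate_cwnd_on_dupack(uint32_t cwnd, uint32_t mss, uint32_t cwnd_max)
{
    cwnd += mss;
    if (cwnd > cwnd_max)
        cwnd = cwnd_max;
    return (cwnd);
}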
*** 4144,4154 ****
              }
              return;
          }
          mp = tcp_ack_mp(tcp);
          if (mp != NULL) {
!             BUMP_LOCAL(tcp->tcp_obsegs);
              TCPS_BUMP_MIB(tcps, tcpOutAck);
              tcp_send_data(tcp, mp);
          }
          return;
      }
--- 4277,4287 ----
              }
              return;
          }
          mp = tcp_ack_mp(tcp);
          if (mp != NULL) {
!             TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
              TCPS_BUMP_MIB(tcps, tcpOutAck);
              tcp_send_data(tcp, mp);
          }
          return;
      }
*** 4174,4192 ****
       */
      if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
          ASSERT(tcp->tcp_rexmit == B_FALSE);
          if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
              tcp->tcp_dupack_cnt = 0;
!             /*
!              * Restore the orig tcp_cwnd_ssthresh after
!              * fast retransmit phase.
!              */
!             if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
!                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
!             }
              tcp->tcp_rexmit_max = seg_ack;
-             tcp->tcp_cwnd_cnt = 0;

              /*
               * Remove all notsack info to avoid confusion with
               * the next fast retransmit/recovery phase.
               */
--- 4307,4320 ----
       */
      if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
          ASSERT(tcp->tcp_rexmit == B_FALSE);
          if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
              tcp->tcp_dupack_cnt = 0;
!
!             cc_post_recovery(tcp, seg_ack);
!
              tcp->tcp_rexmit_max = seg_ack;

              /*
               * Remove all notsack info to avoid confusion with
               * the next fast retransmit/recovery phase.
               */
*** 4211,4222 ****
               * original value when we started fast
               * recovery. This is to prevent overly
               * aggressive behaviour in sending new
               * segments.
               */
!             tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
                  tcps->tcps_dupack_fast_retransmit * mss;
              tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
              flags |= TH_REXMIT_NEEDED;
          }
      }
  } else {
--- 4339,4354 ----
               * original value when we started fast
               * recovery. This is to prevent overly
               * aggressive behaviour in sending new
               * segments.
               */
!             cwnd = tcp->tcp_cwnd_ssthresh +
                  tcps->tcps_dupack_fast_retransmit * mss;
+             DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+                 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                 uint32_t, cwnd);
+             tcp->tcp_cwnd = cwnd;
              tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
              flags |= TH_REXMIT_NEEDED;
          }
      }
  } else {
*** 4273,4341 ****
       * If TCP is not ECN capable or TCP is ECN capable but the
       * congestion experience bit is not set, increase the tcp_cwnd as
       * usual.
       */
      if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
!         cwnd = tcp->tcp_cwnd;
!         add = mss;
!
!         if (cwnd >= tcp->tcp_cwnd_ssthresh) {
!             /*
!              * This is to prevent an increase of less than 1 MSS of
!              * tcp_cwnd. With partial increase, tcp_wput_data()
!              * may send out tinygrams in order to preserve mblk
!              * boundaries.
!              *
!              * By initializing tcp_cwnd_cnt to new tcp_cwnd and
!              * decrementing it by 1 MSS for every ACK, tcp_cwnd is
!              * increased by 1 MSS for every RTT.
!              */
!             if (tcp->tcp_cwnd_cnt <= 0) {
!                 tcp->tcp_cwnd_cnt = cwnd + add;
!             } else {
!                 tcp->tcp_cwnd_cnt -= add;
!                 add = 0;
!             }
!         }
-         tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
-     }

      /* See if the latest urgent data has been acknowledged */
      if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
          SEQ_GT(seg_ack, tcp->tcp_urg))
          tcp->tcp_valid_bits &= ~TCP_URG_VALID;

-     /* Can we update the RTT estimates? */
-     if (tcp->tcp_snd_ts_ok) {
-         /* Ignore zero timestamp echo-reply. */
-         if (tcpopt.tcp_opt_ts_ecr != 0) {
-             tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
-                 (int32_t)tcpopt.tcp_opt_ts_ecr);
-         }
-
-         /* If needed, restart the timer. */
-         if (tcp->tcp_set_timer == 1) {
-             TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-             tcp->tcp_set_timer = 0;
-         }
          /*
!          * Update tcp_csuna in case the other side stops sending
!          * us timestamps.
          */
!         tcp->tcp_csuna = tcp->tcp_snxt;
!     } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
          /*
           * An ACK sequence we haven't seen before, so get the RTT
           * and update the RTO. But first check if the timestamp is
           * valid to use.
           */
          if ((mp1->b_next != NULL) &&
!             SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
              tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
                  (int32_t)(intptr_t)mp1->b_prev);
!         else
              TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);

          /* Remember the last sequence to be ACKed */
          tcp->tcp_csuna = seg_ack;
          if (tcp->tcp_set_timer == 1) {
              TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
--- 4405,4453 ----
       * If TCP is not ECN capable or TCP is ECN capable but the
       * congestion experience bit is not set, increase the tcp_cwnd as
       * usual.
       */
      if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
!         if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
!             EXIT_RECOVERY(tcp->tcp_ccv.flags);
          }
+         cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
      }

      /* See if the latest urgent data has been acknowledged */
      if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
          SEQ_GT(seg_ack, tcp->tcp_urg))
          tcp->tcp_valid_bits &= ~TCP_URG_VALID;

      /*
!      * Update the RTT estimates. Note that we don't use the TCP
!      * timestamp option to calculate RTT even if one is present. This is
!      * because the timestamp option's resolution (CPU tick) is
!      * too coarse to measure modern datacenter networks' microsecond
!      * latencies. The timestamp field's resolution is limited by its
!      * 4-byte width (see RFC1323), and since we always store a
!      * high-resolution nanosecond precision timestamp along with the data,
!      * there is no point in ever using the timestamp option.
       */
!     if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
          /*
           * An ACK sequence we haven't seen before, so get the RTT
           * and update the RTO. But first check if the timestamp is
           * valid to use.
           */
          if ((mp1->b_next != NULL) &&
!             SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
!             tcp_set_rto(tcp, gethrtime() -
!                 (hrtime_t)(intptr_t)mp1->b_prev);
!         } else {
              TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+         }

          /* Remember the last sequence to be ACKed */
          tcp->tcp_csuna = seg_ack;
          if (tcp->tcp_set_timer == 1) {
              TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
*** 4359,4370 ****
--- 4471,4487 ----
               * Set a new timestamp if all the bytes timed by the
               * old timestamp have been ack'ed.
               */
              if (SEQ_GT(seg_ack,
                  (uint32_t)(uintptr_t)(mp1->b_next))) {
!                 mp1->b_prev =
!                     (mblk_t *)(intptr_t)gethrtime();
                  mp1->b_next = NULL;
              }
              break;
          }
          mp1->b_next = NULL;
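Throughout these hunks the send timestamp rides in the mblk's b_prev pointer: gethrtime() is stored at transmit and subtracted when the ACK arrives. A user-level sketch of the same trick follows; msg_sketch and hrtime_now() are stand-ins, and stuffing a 64-bit hrtime_t through a pointer assumes an LP64 environment, which is why the kernel code casts through intptr_t.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

typedef struct msg_sketch {
    struct msg_sketch *b_prev; /* overloaded to hold the send time */
} msg_sketch_t;

/* Monotonic nanosecond clock, standing in for the kernel's gethrtime(). */
static int64_t
hrtime_now(void)
{
    struct timespec ts;

    (void) clock_gettime(CLOCK_MONOTONIC, &ts);
    return ((int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

int
main(void)
{
    msg_sketch_t mp;

    /* On transmit: stash the high-resolution send time. */
    mp.b_prev = (msg_sketch_t *)(intptr_t)hrtime_now();

    /* ... segment travels, ACK comes back ... */

    /* On ACK: recover the timestamp and compute the RTT. */
    int64_t rtt_ns = hrtime_now() - (int64_t)(intptr_t)mp.b_prev;
    printf("measured rtt: %lld ns\n", (long long)rtt_ns);
    return (0);
}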
*** 4836,4851 ****
--- 4953,4975 ----
          mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
              NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, B_TRUE);
          if (mp1 != NULL) {
!             tcp->tcp_xmit_head->b_prev =
!                 (mblk_t *)(intptr_t)gethrtime();
              tcp->tcp_csuna = tcp->tcp_snxt;
              TCPS_BUMP_MIB(tcps, tcpRetransSegs);
              TCPS_UPDATE_MIB(tcps, tcpRetransBytes, snd_size);
+             tcp->tcp_cs.tcp_out_retrans_segs++;
+             tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
              tcp_send_data(tcp, mp1);
          }
      }
      if (flags & TH_NEED_SACK_REXMIT) {
          tcp_sack_rexmit(tcp, &flags);
*** 4871,4884 ****
          /*
           * This will restart the timer. Restarting the
           * timer is used to avoid a timeout before the
           * limited transmitted segment's ACK gets back.
           */
!         if (tcp->tcp_xmit_head != NULL)
!             tcp->tcp_xmit_head->b_prev =
!                 (mblk_t *)LBOLT_FASTPATH;
      }

      /* Anything more to do? */
      if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
          TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
          goto done;
--- 4995,5014 ----
          /*
           * This will restart the timer. Restarting the
           * timer is used to avoid a timeout before the
           * limited transmitted segment's ACK gets back.
           */
!         if (tcp->tcp_xmit_head != NULL) {
!             tcp->tcp_xmit_head->b_prev =
!                 (mblk_t *)(intptr_t)gethrtime();
!         }
      }

      /* Anything more to do? */
      if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
          TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
          goto done;
*** 4916,4926 ****
       */
      mp1 = tcp_ack_mp(tcp);

      if (mp1 != NULL) {
          tcp_send_data(tcp, mp1);
!         BUMP_LOCAL(tcp->tcp_obsegs);
          TCPS_BUMP_MIB(tcps, tcpOutAck);
      }
      if (tcp->tcp_ack_tid != 0) {
          (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
          tcp->tcp_ack_tid = 0;
--- 5046,5056 ----
       */
      mp1 = tcp_ack_mp(tcp);

      if (mp1 != NULL) {
          tcp_send_data(tcp, mp1);
!         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
          TCPS_BUMP_MIB(tcps, tcpOutAck);
      }
      if (tcp->tcp_ack_tid != 0) {
          (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
          tcp->tcp_ack_tid = 0;
*** 5209,5250 ****
      }
      ASSERT(optptr == mp->b_wptr);
      return (mp);
  }

! /* The minimum of smoothed mean deviation in RTO calculation. */
! #define TCP_SD_MIN  400

  /*
!  * Set RTO for this connection. The formula is from Jacobson and Karels'
!  * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
!  * are the same as those in Appendix A.2 of that paper.
   *
   * m = new measurement
   * sa = smoothed RTT average (8 * average estimates).
   * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
   */
  static void
! tcp_set_rto(tcp_t *tcp, clock_t rtt)
  {
!     long m = TICK_TO_MSEC(rtt);
!     clock_t sa = tcp->tcp_rtt_sa;
!     clock_t sv = tcp->tcp_rtt_sd;
!     clock_t rto;
      tcp_stack_t *tcps = tcp->tcp_tcps;

      TCPS_BUMP_MIB(tcps, tcpRttUpdate);
      tcp->tcp_rtt_update++;

      /* A nonzero tcp_rtt_sa means there is an existing estimate to update. */
      if (sa != 0) {
          /*
!          * Update average estimator:
!          *  new rtt = 7/8 old rtt + 1/8 Error
           */
!         /* m is now Error in estimate. */
          m -= sa >> 3;
          if ((sa += m) <= 0) {
              /*
               * Don't allow the smoothed average to be negative.
               * We use 0 to denote reinitialization of the
--- 5339,5392 ----
      }
      ASSERT(optptr == mp->b_wptr);
      return (mp);
  }

! /* The minimum of smoothed mean deviation in RTO calculation (nsec). */
! #define TCP_SD_MIN  400000000

  /*
!  * Set RTO for this connection based on a new round-trip time measurement.
!  * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
!  * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
!  * of that paper.
   *
   * m = new measurement
   * sa = smoothed RTT average (8 * average estimates).
   * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
   */
  static void
! tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
  {
!     hrtime_t m = rtt;
!     hrtime_t sa = tcp->tcp_rtt_sa;
!     hrtime_t sv = tcp->tcp_rtt_sd;
      tcp_stack_t *tcps = tcp->tcp_tcps;

      TCPS_BUMP_MIB(tcps, tcpRttUpdate);
      tcp->tcp_rtt_update++;
+     tcp->tcp_rtt_sum += m;
+     tcp->tcp_rtt_cnt++;

      /* A nonzero tcp_rtt_sa means there is an existing estimate to update. */
      if (sa != 0) {
          /*
!          * Update average estimator (see section 2.3 of RFC6298):
!          *  SRTT = 7/8 SRTT + 1/8 rtt
!          *
!          * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
!          *  tcp_rtt_sa = 7 SRTT + rtt
!          *  tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
!          *  tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
!          *
!          * (rtt - 1/8 tcp_rtt_sa) is simply the difference
!          * between the new rtt measurement and the existing smoothed
!          * RTT average. This is referred to as "Error" in subsequent
!          * calculations.
           */
!         /* m is now Error. */
          m -= sa >> 3;
          if ((sa += m) <= 0) {
              /*
               * Don't allow the smoothed average to be negative.
               * We use 0 to denote reinitialization of the
*** 5253,5263 ****
              sa = 1;
          }

          /*
           * Update deviation estimator:
!          *  new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
           */
          if (m < 0)
              m = -m;
          m -= sv >> 2;
          sv += m;
--- 5395,5409 ----
              sa = 1;
          }

          /*
           * Update deviation estimator:
!          *  mdev = 3/4 mdev + 1/4 abs(Error)
!          *
!          * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
!          *  tcp_rtt_sd = 3 mdev + abs(Error)
!          *  tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
           */
          if (m < 0)
              m = -m;
          m -= sv >> 2;
          sv += m;
*** 5279,5309 ****
           * effect as in a long train of segments, a receiver
           * does not delay its ACKs. So set the minimum of sv
           * to be TCP_SD_MIN, which defaults to 400 ms, twice
           * of BSD DATO. That means the minimum of mean
           * deviation is 100 ms.
-          *
           */
          sv = TCP_SD_MIN;
      }
      tcp->tcp_rtt_sa = sa;
      tcp->tcp_rtt_sd = sv;
-     /*
-      * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
-      *
-      * Add tcp_rexmit_interval extra in case of extreme environment
-      * where the algorithm fails to work. The default value of
-      * tcp_rexmit_interval_extra should be 0.
-      *
-      * As we use a finer grained clock than BSD and update
-      * RTO for every ACK, add in another .25 of RTT to the
-      * deviation of RTO to accommodate burstiness of 1/4 of
-      * window size.
-      */
-     rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
!     TCP_SET_RTO(tcp, rto);

      /* Now, we can reset tcp_timer_backoff to use the new RTO... */
      tcp->tcp_timer_backoff = 0;
  }
--- 5425,5441 ----
           * effect as in a long train of segments, a receiver
           * does not delay its ACKs. So set the minimum of sv
           * to be TCP_SD_MIN, which defaults to 400 ms, twice
           * of BSD DATO. That means the minimum of mean
           * deviation is 100 ms.
           */
          sv = TCP_SD_MIN;
      }
      tcp->tcp_rtt_sa = sa;
      tcp->tcp_rtt_sd = sv;

!     tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);

      /* Now, we can reset tcp_timer_backoff to use the new RTO... */
      tcp->tcp_timer_backoff = 0;
  }
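To make the fixed-point bookkeeping concrete: tcp_rtt_sa carries 8x the smoothed RTT and tcp_rtt_sd carries 4x the mean deviation, so the classic RTO = SRTT + 4*mdev falls out as (sa >> 3) + sv. Below is a hedged standalone sketch of the estimator update and of what a tcp_calculate_rto()-style inline (DLPX-42721) computes at its core; the real inline presumably also applies the stack's min/max clamps and the tcps_rexmit_interval_extra pad, which are omitted here, and rtt_state_t/rtt_update_sketch are illustrative names.

#include <stdint.h>

#define SD_MIN_NSEC 400000000LL /* 400 ms floor on 4 * mdev */

typedef struct rtt_state {
    int64_t sa; /* 8 * smoothed RTT (nsec) */
    int64_t sv; /* 4 * smoothed mean deviation (nsec) */
} rtt_state_t;

static void
rtt_update_sketch(rtt_state_t *st, int64_t m /* measured RTT, nsec */)
{
    if (st->sa != 0) {
        m -= st->sa >> 3;   /* m is now Error */
        if ((st->sa += m) <= 0)
            st->sa = 1;     /* keep the scaled SRTT positive */
        if (m < 0)
            m = -m;
        m -= st->sv >> 2;
        st->sv += m;
    } else {
        /* First measurement: SRTT = m, mdev = m / 2. */
        st->sa = m << 3;
        st->sv = m << 1;
    }
    if (st->sv < SD_MIN_NSEC)
        st->sv = SD_MIN_NSEC;
}

static int64_t
rto_sketch(const rtt_state_t *st)
{
    /* RTO = SRTT + 4 * mdev = sa / 8 + sv. */
    return ((st->sa >> 3) + st->sv);
}

Feeding it a steady 10 ms (10000000 ns) RTT converges sa toward 80000000 (8 * 10 ms) while sv sits at the 400 ms floor, which is exactly why the comment above notes the minimum mean deviation is 100 ms.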
*** 5624,5633 ****
--- 5756,5769 ----
          uint32_t npkt;

          npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
              tcp->tcp_mss;
          tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+         DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+             uint32_t, tcp->tcp_cwnd,
+             uint32_t, tcp->tcp_mss);
          tcp->tcp_cwnd = tcp->tcp_mss;
          tcp->tcp_cwnd_cnt = 0;
      }
      break;
  }