DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics
@@ -21,11 +21,11 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2017 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* This file contains all TCP input processing functions. */
#include <sys/types.h>
@@ -164,14 +164,141 @@
static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void tcp_set_rto(tcp_t *, time_t);
+static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
+ * CC wrapper hook functions
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+ uint16_t type)
+{
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+
+ tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+ if (tcp->tcp_cwnd <= tcp->tcp_swnd)
+ tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+ else
+ tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+
+ if (type == CC_ACK) {
+ if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
+ if (tcp->tcp_ccv.flags & CCF_RTO)
+ tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+ tcp->tcp_ccv.t_bytes_acked +=
+ min(tcp->tcp_ccv.bytes_this_ack,
+ tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+ if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+ tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+ tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+ }
+ } else {
+ tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+ tcp->tcp_ccv.t_bytes_acked = 0;
+ }
+ }
+
+ if (CC_ALGO(tcp)->ack_received != NULL) {
+ /*
+ * The FreeBSD code where this originated had a comment "Find
+ * a way to live without this" in several places where curack
+	 * got set. If they eventually dump curack from the cc
+ * variables, we'll need to adapt our code.
+ */
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+ }
+
+ DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
+ uint32_t, tcp->tcp_cwnd);
+}
+
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+ uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+ switch (type) {
+ case CC_NDUPACK:
+ if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ }
+ break;
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ }
+ break;
+ case CC_RTO:
+ tcp->tcp_ccv.flags |= CCF_RTO;
+ tcp->tcp_dupack_cnt = 0;
+ tcp->tcp_ccv.t_bytes_acked = 0;
+ /*
+ * Give up on fast recovery and congestion recovery if we were
+ * attempting either.
+ */
+ EXIT_RECOVERY(tcp->tcp_ccv.flags);
+ if (CC_ALGO(tcp)->cong_signal == NULL) {
+ /*
+ * RFC5681 Section 3.1
+ * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+ */
+ tcp->tcp_cwnd_ssthresh = max(
+ (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+ 2) * tcp->tcp_mss;
+ tcp->tcp_cwnd = tcp->tcp_mss;
+ }
+
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ break;
+ }
+
+ if (CC_ALGO(tcp)->cong_signal != NULL) {
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+ }
+
+ DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+ uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+ uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+}
+
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+
+ if (CC_ALGO(tcp)->post_recovery != NULL) {
+ tcp->tcp_ccv.curack = seg_ack;
+ CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+ }
+ tcp->tcp_ccv.t_bytes_acked = 0;
+
+ DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+ uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
* Set the MSS associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset other
* state variables that we want to view as multiples of MSS.
*
 * The value of MSS could be either increased or decreased.
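Reviewer aside: the cc_* wrappers above follow the FreeBSD pluggable congestion-control pattern, in which the algorithm is a struct of optional hooks and every caller NULL-checks each hook before dispatching. Below is a minimal, self-contained userland sketch of that pattern and of the RFC 5681 ssthresh computation performed in cc_cong_signal(); all names here (demo_conn, cc_algo_sketch, newreno_like_cong_signal) are illustrative only, not the kernel's interfaces.

    /*
     * Sketch of pluggable congestion-control hooks, userland analogue.
     * Names are hypothetical; this is not the kernel interface.
     */
    #include <stdio.h>
    #include <stdint.h>

    struct demo_conn {
    	uint32_t snxt;		/* highest sequence sent so far */
    	uint32_t suna;		/* oldest unacknowledged sequence */
    	uint32_t mss;		/* sender maximum segment size */
    	uint32_t cwnd;		/* congestion window */
    	uint32_t ssthresh;	/* slow-start threshold */
    };

    struct cc_algo_sketch {
    	/* Hooks are optional; callers NULL-check, as tcp_input.c does. */
    	void (*ack_received)(struct demo_conn *);
    	void (*cong_signal)(struct demo_conn *);
    };

    /* RFC 5681 section 3.1, eq (4): ssthresh = max(FlightSize / 2, 2*SMSS). */
    static void
    newreno_like_cong_signal(struct demo_conn *c)
    {
    	uint32_t pkts = (c->snxt - c->suna) / 2 / c->mss;

    	c->ssthresh = (pkts > 2 ? pkts : 2) * c->mss;
    	c->cwnd = c->mss;	/* RTO case: restart from one segment */
    }

    static struct cc_algo_sketch newreno_sketch = {
    	.ack_received = NULL,	/* unused in this demo */
    	.cong_signal = newreno_like_cong_signal,
    };

    int
    main(void)
    {
    	struct demo_conn c = { .snxt = 200000, .suna = 100000,
    	    .mss = 1460, .cwnd = 100000, .ssthresh = UINT32_MAX };

    	if (newreno_sketch.cong_signal != NULL)	/* NULL-checked dispatch */
    		newreno_sketch.cong_signal(&c);
    	/* FlightSize = 100000, so this prints cwnd=1460 ssthresh=49640. */
    	printf("cwnd=%u ssthresh=%u\n", c.cwnd, c.ssthresh);
    	return (0);
    }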
@@ -546,10 +673,13 @@
/*
* Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
* updated properly.
*/
TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+ if (tcp->tcp_cc_algo->conn_init != NULL)
+ tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
}
/*
* Add a new piece to the tcp reassembly queue. If the gap at the beginning
* is filled, return as much as we can. The message passed in may be
@@ -557,11 +687,11 @@
* number for this piece.
*/
static mblk_t *
tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
{
- uint32_t end;
+ uint32_t end, bytes;
mblk_t *mp1;
mblk_t *mp2;
mblk_t *next_mp;
uint32_t u1;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -576,30 +706,30 @@
if (start == end) {
/* Empty. Blast it. */
freeb(mp);
continue;
}
+ bytes = end - start;
mp->b_cont = NULL;
TCP_REASS_SET_SEQ(mp, start);
TCP_REASS_SET_END(mp, end);
mp1 = tcp->tcp_reass_tail;
- if (!mp1) {
- tcp->tcp_reass_tail = mp;
+ if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
+ if (mp1 != NULL) {
+ /*
+ * New stuff is beyond the tail; link it on the
+ * end.
+ */
+ mp1->b_cont = mp;
+ } else {
tcp->tcp_reass_head = mp;
- TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
- TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
- end - start);
- continue;
}
- /* New stuff completely beyond tail? */
- if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
- /* Link it on end. */
- mp1->b_cont = mp;
tcp->tcp_reass_tail = mp;
TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
- TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
- end - start);
+ TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
+ tcp->tcp_cs.tcp_in_data_unorder_segs++;
+ tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
continue;
}
mp1 = tcp->tcp_reass_head;
u1 = TCP_REASS_SEQ(mp1);
/* New stuff at the front? */
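Note: tcp_reass() orders segments using the modular 32-bit sequence arithmetic of the SEQ_* macros, so comparisons remain correct across sequence-number wraparound. A self-contained illustration of the classic BSD/illumos idiom (demo values made up):

    #include <assert.h>
    #include <stdint.h>

    /* Classic BSD/illumos-style modular sequence comparison. */
    #define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
    #define	SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

    int
    main(void)
    {
    	/*
    	 * Wraparound: sequence 0x00000010 is "after" 0xfffffff0 because
    	 * the unsigned difference, reinterpreted as signed, is a small
    	 * positive value (0x20) rather than a huge negative one.
    	 */
    	assert(SEQ_GT(0x00000010U, 0xfffffff0U));
    	assert(!SEQ_GT(0xfffffff0U, 0x00000010U));
    	assert(SEQ_GEQ(5U, 5U));	/* equality satisfies GEQ */
    	return (0);
    }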
@@ -2322,12 +2452,10 @@
int urp;
tcp_opt_t tcpopt;
ip_pkt_t ipp;
boolean_t ofo_seg = B_FALSE; /* Out of order segment */
uint32_t cwnd;
- uint32_t add;
- int npkt;
int mss;
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = (squeue_t *)arg2;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -2412,11 +2540,11 @@
tcp->tcp_last_recv_time = LBOLT_FASTPATH;
}
flags = (unsigned int)tcpha->tha_flags & 0xFF;
- BUMP_LOCAL(tcp->tcp_ibsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCInSegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
if ((flags & TH_URG) && sqp != NULL) {
/*
* TCP can't handle urgent pointers that arrive before
@@ -2599,10 +2727,13 @@
* Set tcp_cwnd back to 1 MSS, per
* recommendation from
* draft-floyd-incr-init-win-01.txt,
* Increasing TCP's Initial Window.
*/
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
}
tcp->tcp_swl1 = seg_seq;
tcp->tcp_swl2 = seg_ack;
@@ -2657,11 +2788,11 @@
(void) TCP_TIMER_CANCEL(tcp,
tcp->tcp_ack_tid);
tcp->tcp_ack_tid = 0;
}
tcp_send_data(tcp, ack_mp);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
if (!IPCL_IS_NONSTR(connp)) {
/* Send up T_CONN_CON */
if (ira->ira_cred != NULL) {
@@ -3046,10 +3177,11 @@
if (rgap < 0) {
mblk_t *mp2;
if (tcp->tcp_rwnd == 0) {
TCPS_BUMP_MIB(tcps, tcpInWinProbe);
+ tcp->tcp_cs.tcp_in_zwnd_probes++;
} else {
TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
}
@@ -3295,10 +3427,13 @@
}
}
} else if (seg_len > 0) {
TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
+ tcp->tcp_cs.tcp_in_data_inorder_segs++;
+ tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
+
/*
* If an out of order FIN was received before, and the seq
* num and len of the new segment match that of the FIN,
* put the FIN flag back in.
*/
@@ -3360,11 +3495,11 @@
/*
* urp could be -1 when the urp field in the packet is 0
* and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
* byte was at seg_seq - 1, in which case we ignore the urgent flag.
*/
- if (flags & TH_URG && urp >= 0) {
+ if ((flags & TH_URG) && urp >= 0) {
if (!tcp->tcp_urp_last_valid ||
SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
/*
 * Non-STREAMS sockets handle the urgent data a little
* differently from STREAMS based sockets. There is no
@@ -3817,10 +3952,13 @@
if (tcp->tcp_rexmit) {
tcp->tcp_rexmit = B_FALSE;
tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
tcp->tcp_rexmit_max = tcp->tcp_snxt;
tcp->tcp_ms_we_have_waited = 0;
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = mss;
}
/*
* We set the send window to zero here.
@@ -3860,38 +3998,27 @@
* set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
* done once per window (or more loosely, per RTT).
*/
if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
tcp->tcp_cwr = B_FALSE;
- if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
- if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
- tcp->tcp_cwnd = npkt * mss;
+ if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
+ cc_cong_signal(tcp, seg_ack, CC_ECN);
/*
* If the cwnd is 0, use the timer to clock out
* new segments. This is required by the ECN spec.
*/
- if (npkt == 0) {
+ if (tcp->tcp_cwnd == 0)
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- /*
- * This makes sure that when the ACK comes
- * back, we will increase tcp_cwnd by 1 MSS.
- */
- tcp->tcp_cwnd_cnt = 0;
- }
tcp->tcp_cwr = B_TRUE;
/*
* This marks the end of the current window of in
* flight data. That is why we don't use
* tcp_suna + tcp_swnd. Only data in flight can
* provide ECN info.
*/
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
}
- }
mp1 = tcp->tcp_xmit_head;
if (bytes_acked == 0) {
if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
int dupack_cnt;
@@ -3908,10 +4035,12 @@
if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
! tcp->tcp_rexmit) {
/* Do Limited Transmit */
if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
tcps->tcps_dupack_fast_retransmit) {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* RFC 3042
*
* What we need to do is temporarily
* increase tcp_cwnd so that new
@@ -3954,16 +4083,14 @@
* Adjust cwnd since the duplicate
* ack indicates that a packet was
* dropped (due to congestion.)
*/
if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt -
- tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
- mss;
- tcp->tcp_cwnd = (npkt +
- tcp->tcp_dupack_cnt) * mss;
+ cc_cong_signal(tcp, seg_ack,
+ CC_NDUPACK);
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
}
if (tcp->tcp_ecn_ok) {
tcp->tcp_cwr = B_TRUE;
tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
tcp->tcp_ecn_cwr_sent = B_FALSE;
@@ -4021,10 +4148,12 @@
} else {
flags |= TH_REXMIT_NEEDED;
} /* tcp_snd_sack_ok */
} else {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* Here we perform congestion
* avoidance, but NOT slow start.
* This is known as the Fast
* Recovery Algorithm.
@@ -4042,10 +4171,14 @@
* cwnd.
*/
cwnd = tcp->tcp_cwnd + mss;
if (cwnd > tcp->tcp_cwnd_max)
cwnd = tcp->tcp_cwnd_max;
+ DTRACE_PROBE3(cwnd__fast__recovery,
+ tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
tcp->tcp_cwnd = cwnd;
if (tcp->tcp_unsent > 0)
flags |= TH_XMIT_NEEDED;
}
}
@@ -4144,11 +4277,11 @@
}
return;
}
mp = tcp_ack_mp(tcp);
if (mp != NULL) {
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
tcp_send_data(tcp, mp);
}
return;
}
@@ -4174,19 +4307,14 @@
*/
if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
ASSERT(tcp->tcp_rexmit == B_FALSE);
if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
tcp->tcp_dupack_cnt = 0;
- /*
- * Restore the orig tcp_cwnd_ssthresh after
- * fast retransmit phase.
- */
- if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
- }
+
+ cc_post_recovery(tcp, seg_ack);
+
tcp->tcp_rexmit_max = seg_ack;
- tcp->tcp_cwnd_cnt = 0;
/*
* Remove all notsack info to avoid confusion with
 * the next fast retransmit/recovery phase.
*/
@@ -4211,12 +4339,16 @@
* original value when we started fast
* recovery. This is to prevent overly
* aggressive behaviour in sending new
* segments.
*/
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
+ cwnd = tcp->tcp_cwnd_ssthresh +
tcps->tcps_dupack_fast_retransmit * mss;
+ DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
+ tcp->tcp_cwnd = cwnd;
tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
flags |= TH_REXMIT_NEEDED;
}
}
} else {
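The cc_post_recovery() call above replaces the removed hard-coded restore of tcp_cwnd to tcp_cwnd_ssthresh. For a NewReno-style algorithm the hook typically deflates cwnd back to ssthresh once the retransmission hole is filled; a minimal sketch with hypothetical names:

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical connection state; only the fields the hook touches. */
    struct demo_conn {
    	uint32_t cwnd;
    	uint32_t ssthresh;
    };

    /*
     * NewReno-style post_recovery: deflate the inflated fast-recovery
     * cwnd back down to ssthresh so new data is clocked out conservatively.
     */
    static void
    newreno_like_post_recovery(struct demo_conn *c)
    {
    	if (c->cwnd > c->ssthresh)
    		c->cwnd = c->ssthresh;
    }

    int
    main(void)
    {
    	struct demo_conn c = { .cwnd = 60000, .ssthresh = 40000 };

    	newreno_like_post_recovery(&c);
    	printf("cwnd=%u\n", c.cwnd);	/* prints cwnd=40000 */
    	return (0);
    }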
@@ -4273,69 +4405,49 @@
* If TCP is not ECN capable or TCP is ECN capable but the
* congestion experience bit is not set, increase the tcp_cwnd as
* usual.
*/
if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
- cwnd = tcp->tcp_cwnd;
- add = mss;
-
- if (cwnd >= tcp->tcp_cwnd_ssthresh) {
- /*
- * This is to prevent an increase of less than 1 MSS of
- * tcp_cwnd. With partial increase, tcp_wput_data()
- * may send out tinygrams in order to preserve mblk
- * boundaries.
- *
- * By initializing tcp_cwnd_cnt to new tcp_cwnd and
- * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
- * increased by 1 MSS for every RTTs.
- */
- if (tcp->tcp_cwnd_cnt <= 0) {
- tcp->tcp_cwnd_cnt = cwnd + add;
- } else {
- tcp->tcp_cwnd_cnt -= add;
- add = 0;
+ if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
+ EXIT_RECOVERY(tcp->tcp_ccv.flags);
}
+ cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
}
- tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
- }
/* See if the latest urgent data has been acknowledged */
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
- /* Can we update the RTT estimates? */
- if (tcp->tcp_snd_ts_ok) {
- /* Ignore zero timestamp echo-reply. */
- if (tcpopt.tcp_opt_ts_ecr != 0) {
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)tcpopt.tcp_opt_ts_ecr);
- }
-
- /* If needed, restart the timer. */
- if (tcp->tcp_set_timer == 1) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- tcp->tcp_set_timer = 0;
- }
/*
- * Update tcp_csuna in case the other side stops sending
- * us timestamps.
+ * Update the RTT estimates. Note that we don't use the TCP
+ * timestamp option to calculate RTT even if one is present. This is
+ * because the timestamp option's resolution (CPU tick) is
+ * too coarse to measure modern datacenter networks' microsecond
+ * latencies. The timestamp field's resolution is limited by its
+ * 4-byte width (see RFC1323), and since we always store a
+	 * high-resolution nanosecond precision timestamp along with the data,
+	 * there is no point in ever using the timestamp option.
*/
- tcp->tcp_csuna = tcp->tcp_snxt;
- } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+ if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
/*
* An ACK sequence we haven't seen before, so get the RTT
* and update the RTO. But first check if the timestamp is
* valid to use.
*/
if ((mp1->b_next != NULL) &&
- SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
+ SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
-			tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
-			    (int32_t)(intptr_t)mp1->b_prev);
-		else
+			tcp_set_rto(tcp, gethrtime() -
+			    (hrtime_t)(intptr_t)mp1->b_prev);
+ } else {
TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+ }
		/* Remember the last sequence to be ACKed */
tcp->tcp_csuna = seg_ack;
if (tcp->tcp_set_timer == 1) {
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
@@ -4359,12 +4471,17 @@
* Set a new timestamp if all the bytes timed by the
* old timestamp have been ack'ed.
*/
if (SEQ_GT(seg_ack,
(uint32_t)(uintptr_t)(mp1->b_next))) {
-					mp1->b_prev =
-					    (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+					mp1->b_prev =
+					    (mblk_t *)(intptr_t)gethrtime();
mp1->b_next = NULL;
}
break;
}
mp1->b_next = NULL;
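The b_prev stamping above is the heart of the high-resolution RTT change: each transmitted mblk carries its gethrtime() send time, and tcp_set_rto() later receives the ACK-arrival difference. A userland analogue of the scheme, using clock_gettime(CLOCK_MONOTONIC) in place of gethrtime() (all names illustrative):

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Userland stand-in for gethrtime(): monotonic nanoseconds. */
    static int64_t
    hrtime_now(void)
    {
    	struct timespec ts;

    	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
    	return ((int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec);
    }

    struct demo_seg {
    	uint32_t seq_end;	/* sequence just past this segment's data */
    	int64_t sent_at;	/* nanosecond timestamp taken at send time */
    };

    int
    main(void)
    {
    	struct demo_seg seg = { .seq_end = 1000, .sent_at = hrtime_now() };
    	uint32_t seg_ack = 1000;

    	/* ... segment transmitted, time passes, an ACK arrives ... */

    	if (seg_ack >= seg.seq_end) {	/* ACK covers the stamped segment */
    		int64_t rtt = hrtime_now() - seg.sent_at;
    		printf("measured rtt = %lld ns\n", (long long)rtt);
    	}
    	return (0);
    }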
@@ -4836,16 +4953,23 @@
mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
B_TRUE);
if (mp1 != NULL) {
-				tcp->tcp_xmit_head->b_prev =
-				    (mblk_t *)LBOLT_FASTPATH;
+				tcp->tcp_xmit_head->b_prev =
+				    (mblk_t *)(intptr_t)gethrtime();
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
snd_size);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
tcp_send_data(tcp, mp1);
}
}
if (flags & TH_NEED_SACK_REXMIT) {
tcp_sack_rexmit(tcp, &flags);
@@ -4871,14 +4995,20 @@
/*
* This will restart the timer. Restarting the
* timer is used to avoid a timeout before the
* limited transmitted segment's ACK gets back.
*/
-		if (tcp->tcp_xmit_head != NULL)
+		if (tcp->tcp_xmit_head != NULL) {
 			tcp->tcp_xmit_head->b_prev =
-			    (mblk_t *)LBOLT_FASTPATH;
+			    (mblk_t *)(intptr_t)gethrtime();
+		}
 	}
/* Anything more to do? */
if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
goto done;
@@ -4916,11 +5046,11 @@
*/
mp1 = tcp_ack_mp(tcp);
if (mp1 != NULL) {
tcp_send_data(tcp, mp1);
- BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutAck);
}
if (tcp->tcp_ack_tid != 0) {
(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
tcp->tcp_ack_tid = 0;
@@ -5209,42 +5339,54 @@
}
ASSERT(optptr == mp->b_wptr);
return (mp);
}
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN 400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN 400000000
/*
- * Set RTO for this connection. The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
+ * of that paper.
*
* m = new measurement
* sa = smoothed RTT average (8 * average estimates).
* sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
- long m = TICK_TO_MSEC(rtt);
- clock_t sa = tcp->tcp_rtt_sa;
- clock_t sv = tcp->tcp_rtt_sd;
- clock_t rto;
+ hrtime_t m = rtt;
+ hrtime_t sa = tcp->tcp_rtt_sa;
+ hrtime_t sv = tcp->tcp_rtt_sd;
tcp_stack_t *tcps = tcp->tcp_tcps;
TCPS_BUMP_MIB(tcps, tcpRttUpdate);
tcp->tcp_rtt_update++;
+ tcp->tcp_rtt_sum += m;
+ tcp->tcp_rtt_cnt++;
	/* A non-zero tcp_rtt_sa means this is not the first sample. */
if (sa != 0) {
/*
- * Update average estimator:
- * new rtt = 7/8 old rtt + 1/8 Error
+ * Update average estimator (see section 2.3 of RFC6298):
+ * SRTT = 7/8 SRTT + 1/8 rtt
+ *
+ * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+ * tcp_rtt_sa = 7 SRTT + rtt
+ * tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
+ *
+ * (rtt - 1/8 tcp_rtt_sa) is simply the difference
+ * between the new rtt measurement and the existing smoothed
+ * RTT average. This is referred to as "Error" in subsequent
+ * calculations.
*/
- /* m is now Error in estimate. */
+ /* m is now Error. */
m -= sa >> 3;
if ((sa += m) <= 0) {
/*
* Don't allow the smoothed average to be negative.
* We use 0 to denote reinitialization of the
@@ -5253,11 +5395,15 @@
sa = 1;
}
/*
* Update deviation estimator:
- * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+ * mdev = 3/4 mdev + 1/4 abs(Error)
+ *
+ * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+ * tcp_rtt_sd = 3 mdev + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
*/
if (m < 0)
m = -m;
m -= sv >> 2;
sv += m;
@@ -5279,31 +5425,17 @@
* effect as in a long train of segments, a receiver
* does not delay its ACKs. So set the minimum of sv
		 * to be TCP_SD_MIN, which defaults to 400 ms, twice
		 * the BSD DATO. That means the minimum mean
		 * deviation is 100 ms.
- *
*/
sv = TCP_SD_MIN;
}
tcp->tcp_rtt_sa = sa;
tcp->tcp_rtt_sd = sv;
- /*
- * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
- *
- * Add tcp_rexmit_interval extra in case of extreme environment
- * where the algorithm fails to work. The default value of
- * tcp_rexmit_interval_extra should be 0.
- *
- * As we use a finer grained clock than BSD and update
- * RTO for every ACKs, add in another .25 of RTT to the
- * deviation of RTO to accomodate burstiness of 1/4 of
- * window size.
- */
- rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
}
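For reference, the fixed-point bookkeeping in tcp_set_rto() can be exercised in isolation: sa holds 8x the smoothed RTT and sv holds 4x the mean deviation, so both estimators update with shifts only. A worked standalone example (sample values made up; the kernel's tcp_calculate_rto() additionally folds in tcps_rexmit_interval_extra and an sa >> 5 term, per the removed lines above):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	int64_t sa = 0;	/* 8 * smoothed RTT (ns) */
    	int64_t sv = 0;	/* 4 * smoothed mean deviation (ns) */
    	int64_t samples[] = { 100000, 120000, 80000, 110000 };	/* ns */

    	for (int i = 0; i < 4; i++) {
    		int64_t m = samples[i];

    		if (sa != 0) {
    			m -= sa >> 3;	/* m is now Error */
    			sa += m;	/* sa += Error */
    			if (m < 0)
    				m = -m;
    			m -= sv >> 2;
    			sv += m;	/* sv += |Error| - mdev */
    		} else {
    			/* Seed the estimators from the first sample. */
    			sa = m << 3;	/* SRTT = m */
    			sv = m << 1;	/* mdev = m / 2 */
    		}
    		/* RTO = SRTT + 4 * mdev = (sa >> 3) + sv */
    		printf("srtt=%lld ns rto=%lld ns\n",
    		    (long long)(sa >> 3), (long long)((sa >> 3) + sv));
    	}
    	return (0);
    }

The first iteration prints srtt=100000 and rto=300000; subsequent samples pull both estimators toward the new measurements exactly as the kernel code does.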
@@ -5624,10 +5756,14 @@
uint32_t npkt;
npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
tcp->tcp_mss;
tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+ DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
tcp->tcp_cwnd_cnt = 0;
}
break;
}
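Throughout the diff, each stack-wide MIB update is now paired with a per-connection tcp_cs counter so that connstat can report per-connection statistics (tcp_in_data_inorder_segs, tcp_out_retrans_bytes, and so on). A minimal sketch of that bump-both pattern, with hypothetical names:

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical per-connection counters in the spirit of tcp_cs. */
    struct demo_conn_stats {
    	uint64_t in_data_inorder_segs;
    	uint64_t in_data_inorder_bytes;
    };

    /* Hypothetical stack-wide counter standing in for a MIB entry. */
    static uint64_t stack_in_data_inorder_segs;

    static void
    account_inorder_data(struct demo_conn_stats *cs, uint32_t seg_len)
    {
    	/* Bump the stack-wide and per-connection statistics together. */
    	stack_in_data_inorder_segs++;
    	cs->in_data_inorder_segs++;
    	cs->in_data_inorder_bytes += seg_len;
    }

    int
    main(void)
    {
    	struct demo_conn_stats cs = { 0 };

    	account_inorder_data(&cs, 1460);
    	printf("segs=%llu bytes=%llu\n",
    	    (unsigned long long)cs.in_data_inorder_segs,
    	    (unsigned long long)cs.in_data_inorder_bytes);
    	return (0);
    }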