Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

@@ -21,11 +21,11 @@
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2017 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  */
 
 /* This file contains all TCP input processing functions. */
 
 #include <sys/types.h>

@@ -164,14 +164,141 @@
 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 static void     tcp_process_options(tcp_t *, tcpha_t *);
 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void     tcp_set_rto(tcp_t *, time_t);
+static void     tcp_set_rto(tcp_t *, hrtime_t);
 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 
 /*
+ * CC wrapper hook functions
+ */
+/*
+ * CC wrapper: hand a newly arrived ACK to the pluggable congestion
+ * control algorithm.  Updates the shared cc state (tcp_ccv) that the
+ * algorithm consumes: the number of bytes acked by this segment and
+ * whether the sender is currently cwnd-limited.  For an in-sequence
+ * ACK (type == CC_ACK) in congestion avoidance, it also maintains the
+ * byte-counting accumulator (t_bytes_acked, capped per ACK by
+ * tcps_abc_l_var * MSS — Appropriate Byte Counting, RFC 3465).
+ * 'type' is CC_ACK for a normal ACK or CC_DUPACK for a duplicate.
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+    uint16_t type)
+{
+        uint32_t old_cwnd = tcp->tcp_cwnd;
+
+        tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+        if (tcp->tcp_cwnd <= tcp->tcp_swnd)
+                tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+        else
+                tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+
+        if (type == CC_ACK) {
+                if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
+                        /* Congestion avoidance: clear any stale RTO flag. */
+                        if (tcp->tcp_ccv.flags & CCF_RTO)
+                                tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+                        tcp->tcp_ccv.t_bytes_acked +=
+                            min(tcp->tcp_ccv.bytes_this_ack,
+                            tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+                        if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+                                /* A full cwnd's worth of data was acked. */
+                                tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+                                tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+                        }
+                } else {
+                        /* Slow start: byte counting does not apply. */
+                        tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+                        tcp->tcp_ccv.t_bytes_acked = 0;
+                }
+        }
+
+        if (CC_ALGO(tcp)->ack_received != NULL) {
+                /*
+                 * The FreeBSD code where this originated had a comment "Find
+                 * a way to live without this" in several places where curack
+                 * got set.  If they eventually dump curack from the cc
+                 * variables, we'll need to adapt our code.
+                 */
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+        }
+
+        DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
+            uint32_t, tcp->tcp_cwnd);
+}
+
+/*
+ * CC wrapper: notify the congestion control algorithm of a congestion
+ * event.  'type' selects the event: CC_NDUPACK (dup-ACK threshold hit,
+ * entering fast recovery), CC_ECN (ECN congestion echo received), or
+ * CC_RTO (retransmission timeout).  Performs the bookkeeping common to
+ * all algorithms (recovery markers, ECN CWR state), then dispatches to
+ * the algorithm's cong_signal hook if it provides one; for CC_RTO with
+ * no hook, falls back to the RFC 5681 ssthresh/cwnd reduction inline.
+ */
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+        uint32_t old_cwnd = tcp->tcp_cwnd;
+        uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+        switch (type) {
+        case CC_NDUPACK:
+                if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+                        /* Mark the highest sequence sent for recovery exit. */
+                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
+                        if (tcp->tcp_ecn_ok) {
+                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+                                tcp->tcp_cwr = B_TRUE;
+                                tcp->tcp_ecn_cwr_sent = B_FALSE;
+                        }
+                }
+                break;
+        case CC_ECN:
+                if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+                        tcp->tcp_rexmit_max = tcp->tcp_snxt;
+                        if (tcp->tcp_ecn_ok) {
+                                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+                                tcp->tcp_cwr = B_TRUE;
+                                tcp->tcp_ecn_cwr_sent = B_FALSE;
+                        }
+                }
+                break;
+        case CC_RTO:
+                tcp->tcp_ccv.flags |= CCF_RTO;
+                tcp->tcp_dupack_cnt = 0;
+                tcp->tcp_ccv.t_bytes_acked = 0;
+                /*
+                 * Give up on fast recovery and congestion recovery if we were
+                 * attempting either.
+                 */
+                EXIT_RECOVERY(tcp->tcp_ccv.flags);
+                if (CC_ALGO(tcp)->cong_signal == NULL) {
+                        /*
+                         * RFC5681 Section 3.1
+                         * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+                         */
+                        tcp->tcp_cwnd_ssthresh = max(
+                            (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+                            2) * tcp->tcp_mss;
+                        tcp->tcp_cwnd = tcp->tcp_mss;
+                }
+
+                if (tcp->tcp_ecn_ok) {
+                        tcp->tcp_cwr = B_TRUE;
+                        tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+                        tcp->tcp_ecn_cwr_sent = B_FALSE;
+                }
+                break;
+        }
+
+        if (CC_ALGO(tcp)->cong_signal != NULL) {
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+        }
+
+        DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+            uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+            uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+}
+
+/*
+ * CC wrapper: invoked when the connection leaves loss recovery.  Gives
+ * the congestion control algorithm a chance to adjust cwnd via its
+ * post_recovery hook, then resets the byte-counting accumulator so a
+ * fresh congestion-avoidance cycle starts cleanly.
+ */
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+        uint32_t old_cwnd = tcp->tcp_cwnd;
+
+        if (CC_ALGO(tcp)->post_recovery != NULL) {
+                tcp->tcp_ccv.curack = seg_ack;
+                CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+        }
+        tcp->tcp_ccv.t_bytes_acked = 0;
+
+        DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+            uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
  * Set the MSS associated with a particular tcp based on its current value,
  * and a new one passed in. Observe minimums and maximums, and reset other
  * state variables that we want to view as multiples of MSS.
  *
  * The value of MSS could be either increased or decreased.

@@ -546,10 +673,13 @@
         /*
          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
          * updated properly.
          */
         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+        if (tcp->tcp_cc_algo->conn_init != NULL)
+                tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 }
 
 /*
  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
  * is filled, return as much as we can.  The message passed in may be

@@ -557,11 +687,11 @@
  * number for this piece.
  */
 static mblk_t *
 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 {
-        uint32_t        end;
+        uint32_t        end, bytes;
         mblk_t          *mp1;
         mblk_t          *mp2;
         mblk_t          *next_mp;
         uint32_t        u1;
         tcp_stack_t     *tcps = tcp->tcp_tcps;

@@ -576,30 +706,30 @@
                 if (start == end) {
                         /* Empty.  Blast it. */
                         freeb(mp);
                         continue;
                 }
+                bytes = end - start;
                 mp->b_cont = NULL;
                 TCP_REASS_SET_SEQ(mp, start);
                 TCP_REASS_SET_END(mp, end);
                 mp1 = tcp->tcp_reass_tail;
-                if (!mp1) {
-                        tcp->tcp_reass_tail = mp;
+                if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
+                        if (mp1 != NULL) {
+                                /*
+                                 * New stuff is beyond the tail; link it on the
+                                 * end.
+                                 */
+                                mp1->b_cont = mp;
+                        } else {
                         tcp->tcp_reass_head = mp;
-                        TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
-                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
-                            end - start);
-                        continue;
                 }
-                /* New stuff completely beyond tail? */
-                if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
-                        /* Link it on end. */
-                        mp1->b_cont = mp;
                         tcp->tcp_reass_tail = mp;
                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
-                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
-                            end - start);
+                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
+                        tcp->tcp_cs.tcp_in_data_unorder_segs++;
+                        tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
                         continue;
                 }
                 mp1 = tcp->tcp_reass_head;
                 u1 = TCP_REASS_SEQ(mp1);
                 /* New stuff at the front? */

@@ -2322,12 +2452,10 @@
         int             urp;
         tcp_opt_t       tcpopt;
         ip_pkt_t        ipp;
         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
         uint32_t        cwnd;
-        uint32_t        add;
-        int             npkt;
         int             mss;
         conn_t          *connp = (conn_t *)arg;
         squeue_t        *sqp = (squeue_t *)arg2;
         tcp_t           *tcp = connp->conn_tcp;
         tcp_stack_t     *tcps = tcp->tcp_tcps;

@@ -2412,11 +2540,11 @@
                 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
         }
 
         flags = (unsigned int)tcpha->tha_flags & 0xFF;
 
-        BUMP_LOCAL(tcp->tcp_ibsegs);
+        TCPS_BUMP_MIB(tcps, tcpHCInSegs);
         DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
 
         if ((flags & TH_URG) && sqp != NULL) {
                 /*
                  * TCP can't handle urgent pointers that arrive before

@@ -2599,10 +2727,13 @@
                                  * Set tcp_cwnd back to 1 MSS, per
                                  * recommendation from
                                  * draft-floyd-incr-init-win-01.txt,
                                  * Increasing TCP's Initial Window.
                                  */
+                                DTRACE_PROBE3(cwnd__retransmitted__syn,
+                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                                    uint32_t, tcp->tcp_mss);
                                 tcp->tcp_cwnd = tcp->tcp_mss;
                         }
 
                         tcp->tcp_swl1 = seg_seq;
                         tcp->tcp_swl2 = seg_ack;

@@ -2657,11 +2788,11 @@
                                                 (void) TCP_TIMER_CANCEL(tcp,
                                                     tcp->tcp_ack_tid);
                                                 tcp->tcp_ack_tid = 0;
                                         }
                                         tcp_send_data(tcp, ack_mp);
-                                        BUMP_LOCAL(tcp->tcp_obsegs);
+                                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
 
                                         if (!IPCL_IS_NONSTR(connp)) {
                                                 /* Send up T_CONN_CON */
                                                 if (ira->ira_cred != NULL) {

@@ -3046,10 +3177,11 @@
         if (rgap < 0) {
                 mblk_t  *mp2;
 
                 if (tcp->tcp_rwnd == 0) {
                         TCPS_BUMP_MIB(tcps, tcpInWinProbe);
+                        tcp->tcp_cs.tcp_in_zwnd_probes++;
                 } else {
                         TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
                         TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
                 }
 

@@ -3295,10 +3427,13 @@
                         }
                 }
         } else if (seg_len > 0) {
                 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
                 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
+                tcp->tcp_cs.tcp_in_data_inorder_segs++;
+                tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
+
                 /*
                  * If an out of order FIN was received before, and the seq
                  * num and len of the new segment match that of the FIN,
                  * put the FIN flag back in.
                  */

@@ -3360,11 +3495,11 @@
         /*
          * urp could be -1 when the urp field in the packet is 0
          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
          */
-        if (flags & TH_URG && urp >= 0) {
+        if ((flags & TH_URG) && urp >= 0) {
                 if (!tcp->tcp_urp_last_valid ||
                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
                         /*
                          * Non-STREAMS sockets handle the urgent data a little
                          * differently from STREAMS based sockets. There is no

@@ -3817,10 +3952,13 @@
                 if (tcp->tcp_rexmit) {
                         tcp->tcp_rexmit = B_FALSE;
                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
                         tcp->tcp_ms_we_have_waited = 0;
+                        DTRACE_PROBE3(cwnd__retransmitted__syn,
+                            tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                            uint32_t, tcp->tcp_mss);
                         tcp->tcp_cwnd = mss;
                 }
 
                 /*
                  * We set the send window to zero here.

@@ -3860,38 +3998,27 @@
          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
          * done once per window (or more loosely, per RTT).
          */
         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
                 tcp->tcp_cwr = B_FALSE;
-        if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
-                if (!tcp->tcp_cwr) {
-                        npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
-                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
-                        tcp->tcp_cwnd = npkt * mss;
+        if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
+                cc_cong_signal(tcp, seg_ack, CC_ECN);
                         /*
                          * If the cwnd is 0, use the timer to clock out
                          * new segments.  This is required by the ECN spec.
                          */
-                        if (npkt == 0) {
+                if (tcp->tcp_cwnd == 0)
                                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-                                /*
-                                 * This makes sure that when the ACK comes
-                                 * back, we will increase tcp_cwnd by 1 MSS.
-                                 */
-                                tcp->tcp_cwnd_cnt = 0;
-                        }
                         tcp->tcp_cwr = B_TRUE;
                         /*
                          * This marks the end of the current window of in
                          * flight data.  That is why we don't use
                          * tcp_suna + tcp_swnd.  Only data in flight can
                          * provide ECN info.
                          */
                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
-                        tcp->tcp_ecn_cwr_sent = B_FALSE;
                 }
-        }
 
         mp1 = tcp->tcp_xmit_head;
         if (bytes_acked == 0) {
                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
                         int dupack_cnt;

@@ -3908,10 +4035,12 @@
                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
                             ! tcp->tcp_rexmit) {
                                 /* Do Limited Transmit */
                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
                                     tcps->tcps_dupack_fast_retransmit) {
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                         /*
                                          * RFC 3042
                                          *
                                          * What we need to do is temporarily
                                          * increase tcp_cwnd so that new

@@ -3954,16 +4083,14 @@
                                  * Adjust cwnd since the duplicate
                                  * ack indicates that a packet was
                                  * dropped (due to congestion.)
                                  */
                                 if (!tcp->tcp_cwr) {
-                                        npkt = ((tcp->tcp_snxt -
-                                            tcp->tcp_suna) >> 1) / mss;
-                                        tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
-                                            mss;
-                                        tcp->tcp_cwnd = (npkt +
-                                            tcp->tcp_dupack_cnt) * mss;
+                                        cc_cong_signal(tcp, seg_ack,
+                                            CC_NDUPACK);
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                 }
                                 if (tcp->tcp_ecn_ok) {
                                         tcp->tcp_cwr = B_TRUE;
                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                                         tcp->tcp_ecn_cwr_sent = B_FALSE;

@@ -4021,10 +4148,12 @@
                                 } else {
                                         flags |= TH_REXMIT_NEEDED;
                                 } /* tcp_snd_sack_ok */
 
                                 } else {
+                                        cc_ack_received(tcp, seg_ack,
+                                            bytes_acked, CC_DUPACK);
                                         /*
                                          * Here we perform congestion
                                          * avoidance, but NOT slow start.
                                          * This is known as the Fast
                                          * Recovery Algorithm.

@@ -4042,10 +4171,14 @@
                                          * cwnd.
                                          */
                                         cwnd = tcp->tcp_cwnd + mss;
                                         if (cwnd > tcp->tcp_cwnd_max)
                                                 cwnd = tcp->tcp_cwnd_max;
+                                        DTRACE_PROBE3(cwnd__fast__recovery,
+                                            tcp_t *, tcp,
+                                            uint32_t, tcp->tcp_cwnd,
+                                            uint32_t, cwnd);
                                         tcp->tcp_cwnd = cwnd;
                                         if (tcp->tcp_unsent > 0)
                                                 flags |= TH_XMIT_NEEDED;
                                         }
                                 }

@@ -4144,11 +4277,11 @@
                                 }
                                 return;
                         }
                         mp = tcp_ack_mp(tcp);
                         if (mp != NULL) {
-                                BUMP_LOCAL(tcp->tcp_obsegs);
+                                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                                 TCPS_BUMP_MIB(tcps, tcpOutAck);
                                 tcp_send_data(tcp, mp);
                         }
                         return;
                 }

@@ -4174,19 +4307,14 @@
          */
         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
                 ASSERT(tcp->tcp_rexmit == B_FALSE);
                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
                         tcp->tcp_dupack_cnt = 0;
-                        /*
-                         * Restore the orig tcp_cwnd_ssthresh after
-                         * fast retransmit phase.
-                         */
-                        if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
-                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
-                        }
+
+                        cc_post_recovery(tcp, seg_ack);
+
                         tcp->tcp_rexmit_max = seg_ack;
-                        tcp->tcp_cwnd_cnt = 0;
 
                         /*
                          * Remove all notsack info to avoid confusion with
                          * the next fast retransmit/recovery phase.
                          */

@@ -4211,12 +4339,16 @@
                                  * original value when we started fast
                                  * recovery.  This is to prevent overly
                                  * aggressive behaviour in sending new
                                  * segments.
                                  */
-                                tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
+                                cwnd = tcp->tcp_cwnd_ssthresh +
                                     tcps->tcps_dupack_fast_retransmit * mss;
+                                DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+                                    tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+                                    uint32_t, cwnd);
+                                tcp->tcp_cwnd = cwnd;
                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
                                 flags |= TH_REXMIT_NEEDED;
                         }
                 }
         } else {

@@ -4273,69 +4405,49 @@
          * If TCP is not ECN capable or TCP is ECN capable but the
          * congestion experience bit is not set, increase the tcp_cwnd as
          * usual.
          */
         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
-                cwnd = tcp->tcp_cwnd;
-                add = mss;
-
-                if (cwnd >= tcp->tcp_cwnd_ssthresh) {
-                        /*
-                         * This is to prevent an increase of less than 1 MSS of
-                         * tcp_cwnd.  With partial increase, tcp_wput_data()
-                         * may send out tinygrams in order to preserve mblk
-                         * boundaries.
-                         *
-                         * By initializing tcp_cwnd_cnt to new tcp_cwnd and
-                         * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
-                         * increased by 1 MSS for every RTTs.
-                         */
-                        if (tcp->tcp_cwnd_cnt <= 0) {
-                                tcp->tcp_cwnd_cnt = cwnd + add;
-                        } else {
-                                tcp->tcp_cwnd_cnt -= add;
-                                add = 0;
+                if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
+                        EXIT_RECOVERY(tcp->tcp_ccv.flags);
                         }
+                cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
                 }
-                tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
-        }
 
         /* See if the latest urgent data has been acknowledged */
         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
             SEQ_GT(seg_ack, tcp->tcp_urg))
                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
 
-        /* Can we update the RTT estimates? */
-        if (tcp->tcp_snd_ts_ok) {
-                /* Ignore zero timestamp echo-reply. */
-                if (tcpopt.tcp_opt_ts_ecr != 0) {
-                        tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
-                            (int32_t)tcpopt.tcp_opt_ts_ecr);
-                }
-
-                /* If needed, restart the timer. */
-                if (tcp->tcp_set_timer == 1) {
-                        TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
-                        tcp->tcp_set_timer = 0;
-                }
                 /*
-                 * Update tcp_csuna in case the other side stops sending
-                 * us timestamps.
+         * Update the RTT estimates. Note that we don't use the TCP
+         * timestamp option to calculate RTT even if one is present. This is
+         * because the timestamp option's resolution (CPU tick) is
+         * too coarse to measure modern datacenter networks' microsecond
+         * latencies. The timestamp field's resolution is limited by its
+         * 4-byte width (see RFC1323), and since we always store a
+ * high-resolution nanosecond precision timestamp along with the data,
+         * there is no point to ever using the timestamp option.
                  */
-                tcp->tcp_csuna = tcp->tcp_snxt;
-        } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+        if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
                 /*
                  * An ACK sequence we haven't seen before, so get the RTT
                  * and update the RTO. But first check if the timestamp is
                  * valid to use.
                  */
                 if ((mp1->b_next != NULL) &&
-                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
+                    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
+#ifdef KERNEL_32
                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
                             (int32_t)(intptr_t)mp1->b_prev);
-                else
+#else
+                        tcp_set_rto(tcp, gethrtime() -
+                            (hrtime_t)(intptr_t)mp1->b_prev);
+#endif
+                } else {
                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+                }
 
                 /* Remember the last sequence to be ACKed */
                 tcp->tcp_csuna = seg_ack;
                 if (tcp->tcp_set_timer == 1) {
                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);

@@ -4359,12 +4471,17 @@
                          * Set a new timestamp if all the bytes timed by the
                          * old timestamp have been ack'ed.
                          */
                         if (SEQ_GT(seg_ack,
                             (uint32_t)(uintptr_t)(mp1->b_next))) {
+#ifdef KERNEL_32
                                 mp1->b_prev =
                                     (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+#else
+                                mp1->b_prev =
+                                    (mblk_t *)(intptr_t)gethrtime();
+#endif
                                 mp1->b_next = NULL;
                         }
                         break;
                 }
                 mp1->b_next = NULL;

@@ -4836,16 +4953,23 @@
                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
                             B_TRUE);
 
                         if (mp1 != NULL) {
+#ifdef KERNEL_32
                                 tcp->tcp_xmit_head->b_prev =
                                     (mblk_t *)LBOLT_FASTPATH;
+#else
+                                tcp->tcp_xmit_head->b_prev =
+                                    (mblk_t *)(intptr_t)gethrtime();
+#endif
                                 tcp->tcp_csuna = tcp->tcp_snxt;
                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
                                     snd_size);
+                                tcp->tcp_cs.tcp_out_retrans_segs++;
+                                tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
                                 tcp_send_data(tcp, mp1);
                         }
                 }
                 if (flags & TH_NEED_SACK_REXMIT) {
                         tcp_sack_rexmit(tcp, &flags);

@@ -4871,14 +4995,20 @@
                         /*
                          * This will restart the timer.  Restarting the
                          * timer is used to avoid a timeout before the
                          * limited transmitted segment's ACK gets back.
                          */
-                        if (tcp->tcp_xmit_head != NULL)
+                        if (tcp->tcp_xmit_head != NULL) {
+#ifdef KERNEL_32
                                 tcp->tcp_xmit_head->b_prev =
                                     (mblk_t *)LBOLT_FASTPATH;
+#else
+                                tcp->tcp_xmit_head->b_prev =
+                                    (mblk_t *)(intptr_t)gethrtime();
+#endif
                 }
+                }
 
                 /* Anything more to do? */
                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
                         goto done;

@@ -4916,11 +5046,11 @@
                  */
                 mp1 = tcp_ack_mp(tcp);
 
                 if (mp1 != NULL) {
                         tcp_send_data(tcp, mp1);
-                        BUMP_LOCAL(tcp->tcp_obsegs);
+                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                         TCPS_BUMP_MIB(tcps, tcpOutAck);
                 }
                 if (tcp->tcp_ack_tid != 0) {
                         (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
                         tcp->tcp_ack_tid = 0;

@@ -5209,42 +5339,54 @@
         }
         ASSERT(optptr == mp->b_wptr);
         return (mp);
 }
 
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN      400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN      400000000
 
 /*
- * Set RTO for this connection.  The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
+ * of that paper.
  *
  * m = new measurement
  * sa = smoothed RTT average (8 * average estimates).
  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
  */
 static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
 {
-        long m = TICK_TO_MSEC(rtt);
-        clock_t sa = tcp->tcp_rtt_sa;
-        clock_t sv = tcp->tcp_rtt_sd;
-        clock_t rto;
+        hrtime_t m = rtt;
+        hrtime_t sa = tcp->tcp_rtt_sa;
+        hrtime_t sv = tcp->tcp_rtt_sd;
         tcp_stack_t     *tcps = tcp->tcp_tcps;
 
         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
         tcp->tcp_rtt_update++;
+        tcp->tcp_rtt_sum += m;
+        tcp->tcp_rtt_cnt++;
 
         /* tcp_rtt_sa is not 0 means this is a new sample. */
         if (sa != 0) {
                 /*
-                 * Update average estimator:
-                 *      new rtt = 7/8 old rtt + 1/8 Error
+                 * Update average estimator (see section 2.3 of RFC6298):
+                 *      SRTT = 7/8 SRTT + 1/8 rtt
+                 *
+                 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+                 *      tcp_rtt_sa = 7 SRTT + rtt
+                 *      tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
+                 *      tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
+                 *
+                 * (rtt - 1/8 tcp_rtt_sa) is simply the difference
+                 * between the new rtt measurement and the existing smoothed
+                 * RTT average. This is referred to as "Error" in subsequent
+                 * calculations.
                  */
 
-                /* m is now Error in estimate. */
+                /* m is now Error. */
                 m -= sa >> 3;
                 if ((sa += m) <= 0) {
                         /*
                          * Don't allow the smoothed average to be negative.
                          * We use 0 to denote reinitialization of the

@@ -5253,11 +5395,15 @@
                         sa = 1;
                 }
 
                 /*
                  * Update deviation estimator:
-                 *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+                 *      mdev = 3/4 mdev + 1/4 abs(Error)
+                 *
+                 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+                 *      tcp_rtt_sd = 3 mdev + abs(Error)
+                 *      tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
                  */
                 if (m < 0)
                         m = -m;
                 m -= sv >> 2;
                 sv += m;

@@ -5279,31 +5425,17 @@
                  * effect as in a long train of segments, a receiver
                  * does not delay its ACKs.  So set the minimum of sv
                  * to be TCP_SD_MIN, which defaults to 400 ms, twice
                  * of BSD DATO.  That means the minimum of mean
                  * deviation is 100 ms.
-                 *
                  */
                 sv = TCP_SD_MIN;
         }
         tcp->tcp_rtt_sa = sa;
         tcp->tcp_rtt_sd = sv;
-        /*
-         * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
-         *
-         * Add tcp_rexmit_interval extra in case of extreme environment
-         * where the algorithm fails to work.  The default value of
-         * tcp_rexmit_interval_extra should be 0.
-         *
-         * As we use a finer grained clock than BSD and update
-         * RTO for every ACKs, add in another .25 of RTT to the
-         * deviation of RTO to accomodate burstiness of 1/4 of
-         * window size.
-         */
-        rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
 
-        TCP_SET_RTO(tcp, rto);
+        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
 
         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
         tcp->tcp_timer_backoff = 0;
 }
 

@@ -5624,10 +5756,14 @@
                         uint32_t npkt;
 
                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
                             tcp->tcp_mss;
                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+                        DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+                            uint32_t, tcp->tcp_cwnd,
+                            uint32_t, tcp->tcp_mss);
                         tcp->tcp_cwnd = tcp->tcp_mss;
                         tcp->tcp_cwnd_cnt = 0;
                 }
                 break;
         }