Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

@@ -19,11 +19,11 @@
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 /* This file contains all TCP output processing functions. */
 
 #include <sys/types.h>

@@ -61,11 +61,11 @@
                     const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
 static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
-static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
+static void     tcp_fill_header(tcp_t *, uchar_t *, int);
 
 /*
  * Functions called directly via squeue having a prototype of edesc_t.
  */
 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);

@@ -78,10 +78,22 @@
  * speedup is observed for values larger than sixteen.  Zero will
  * disable the optimisation.
  */
 static int tcp_tx_pull_len = 16;
 
+static void
+cc_after_idle(tcp_t *tcp)
+{
+        uint32_t old_cwnd = tcp->tcp_cwnd;      /* cwnd before the hook */
+        /* Run the congestion control algorithm's optional after_idle hook. */
+        if (CC_ALGO(tcp)->after_idle != NULL)
+                CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
+        /* Report cwnd before and after the hook, whether or not it ran. */
+        DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
+            uint32_t, tcp->tcp_cwnd);
+}
+
 void
 tcp_wput(queue_t *q, mblk_t *mp)
 {
         conn_t  *connp = Q_TO_CONN(q);
         tcp_t   *tcp;

@@ -215,11 +227,10 @@
         int32_t         mss;
         int32_t         num_sack_blk = 0;
         int32_t         total_hdr_len;
         int32_t         tcp_hdr_len;
         int             rc;
-        tcp_stack_t     *tcps = tcp->tcp_tcps;
         conn_t          *connp = tcp->tcp_connp;
         clock_t         now = LBOLT_FASTPATH;
 
         tcpstate = tcp->tcp_state;
         if (mp == NULL) {

@@ -370,11 +381,11 @@
                 tcp_hdr_len = connp->conn_ht_ulp_len;
         }
 
         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
-                TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+                cc_after_idle(tcp);
         }
         if (tcpstate == TCPS_SYN_RCVD) {
                 /*
                  * The three-way connection establishment handshake is not
                  * complete yet. We want to queue the data for transmission

@@ -451,11 +462,15 @@
                         /* Bypass all other unnecessary processing. */
                         goto done;
                 }
         }
 
+#ifdef KERNEL_32
         local_time = (mblk_t *)now;
+#else
+        local_time = (mblk_t *)(intptr_t)gethrtime();
+#endif
 
         /*
          * "Our" Nagle Algorithm.  This is not the same as in the old
          * BSD.  This is more in line with the true intent of Nagle.
          *

@@ -1188,11 +1203,11 @@
          * Reinitialize tcp_cwnd after idle.
          */
         now = LBOLT_FASTPATH;
         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
-                TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+                cc_after_idle(tcp);
         }
 
         usable = tcp->tcp_swnd;         /* tcp window size */
         if (usable > tcp->tcp_cwnd)
                 usable = tcp->tcp_cwnd; /* congestion window smaller */

@@ -1251,11 +1266,15 @@
         tcp->tcp_snxt = snxt + len;
         tcp->tcp_rack = tcp->tcp_rnxt;
 
         if ((mp1 = dupb(mp)) == 0)
                 goto no_memory;
+#ifdef KERNEL_32
         mp->b_prev = (mblk_t *)(uintptr_t)now;
+#else
+        mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+#endif
         mp->b_next = (mblk_t *)(uintptr_t)snxt;
 
         /* adjust tcp header information */
         tcpha = tcp->tcp_tcpha;
         tcpha->tha_flags = (TH_ACK|TH_PUSH);

@@ -1266,11 +1285,13 @@
 
         tcpha->tha_seq = htonl(snxt);
 
         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
-        BUMP_LOCAL(tcp->tcp_obsegs);
+        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
+        tcp->tcp_cs.tcp_out_data_segs++;
+        tcp->tcp_cs.tcp_out_data_bytes += len;
 
         /* Update the latest receive window size in TCP header. */
         tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
 
         tcp->tcp_last_sent_len = (ushort_t)len;

@@ -1306,16 +1327,14 @@
         }
         mp1->b_rptr = rptr;
 
         /* Fill in the timestamp option. */
         if (tcp->tcp_snd_ts_ok) {
-                uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
-
-                U32_TO_BE32(llbolt,
-                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
+                U32_TO_BE32(now,
+                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
                 U32_TO_BE32(tcp->tcp_ts_recent,
-                    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
+                    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
         } else {
                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
         }
 
         /* copy header into outgoing packet */

@@ -1955,20 +1974,25 @@
                                     (mblk_t *)(uintptr_t)(*snxt);
                                 mp1 = mp1->b_cont;
                         }
                         *snxt += len;
                         *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
-                        BUMP_LOCAL(tcp->tcp_obsegs);
+                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
                         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+                        tcp->tcp_cs.tcp_out_data_segs++;
+                        tcp->tcp_cs.tcp_out_data_bytes += len;
                         tcp_send_data(tcp, mp);
                         continue;
                 }
 
                 *snxt += len;   /* Adjust later if we don't send all of len */
+                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+                tcp->tcp_cs.tcp_out_data_segs++;
+                tcp->tcp_cs.tcp_out_data_bytes += len;
 
                 if (*tail_unsent) {
                         /* Are the bytes above us in flight? */
                         rptr = (*xmit_tail)->b_wptr - *tail_unsent;
                         if (rptr != (*xmit_tail)->b_rptr) {

@@ -2061,11 +2085,11 @@
 
                 /*
                  * Fill in the header using the template header, and add
                  * options such as time-stamp, ECN and/or SACK, as needed.
                  */
-                tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
+                tcp_fill_header(tcp, rptr, num_sack_blk);
 
                 mp->b_rptr = rptr;
 
                 if (*tail_unsent) {
                         int spill = *tail_unsent;

@@ -2140,10 +2164,11 @@
                                  */
                                 *usable -= spill;
                                 *snxt += spill;
                                 tcp->tcp_last_sent_len += spill;
                                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
+                                tcp->tcp_cs.tcp_out_data_bytes += spill;
                                 /*
                                  * Adjust the checksum
                                  */
                                 tcpha = (tcpha_t *)(rptr +
                                     ixa->ixa_ip_hdr_length);

@@ -2188,11 +2213,11 @@
                         /*
                          * Restore values of ixa_fragsize and ixa_extra_ident.
                          */
                         ixa->ixa_fragsize = ixa->ixa_pmtu;
                         ixa->ixa_extra_ident = 0;
-                        tcp->tcp_obsegs += num_lso_seg;
+                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                         TCP_STAT(tcps, tcp_lso_times);
                         TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
                 } else {
                         /*
                          * Make sure to clean up LSO information. Wherever a

@@ -2199,11 +2224,11 @@
                          * new mp uses the prepended header room after dupb(),
                          * lso_info_cleanup() should be called.
                          */
                         lso_info_cleanup(mp);
                         tcp_send_data(tcp, mp);
-                        BUMP_LOCAL(tcp->tcp_obsegs);
+                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
                 }
         }
 
         return (0);
 }

@@ -2279,12 +2304,12 @@
         /*
          * We do not have a good algorithm to update ssthresh at this time.
          * So don't do any update.
          */
         bzero(&uinfo, sizeof (uinfo));
-        uinfo.iulp_rtt = tcp->tcp_rtt_sa;
-        uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
+        uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
+        uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
 
         /*
          * Note that uinfo is kept for conn_faddr in the DCE. Could update even
          * if source routed but we don't.
          */

@@ -2415,11 +2440,11 @@
                 tcp->tcp_tcpha->tha_win = tcpha->tha_win;
                 tcp->tcp_rack = ack;
                 tcp->tcp_rack_cnt = 0;
                 TCPS_BUMP_MIB(tcps, tcpOutAck);
         }
-        BUMP_LOCAL(tcp->tcp_obsegs);
+        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
         tcpha->tha_seq = htonl(seq);
         tcpha->tha_ack = htonl(ack);
         /*
          * Include the adjustment for a source route if any.
          */

@@ -3384,15 +3409,21 @@
                 tcp_send_data(tcp, xmit_mp);
 
                 /*
                  * Update the send timestamp to avoid false retransmission.
                  */
+#ifdef KERNEL_32
                 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+#else
+                snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+#endif
 
                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
+                tcp->tcp_cs.tcp_out_retrans_segs++;
+                tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
                 /*
                  * Update tcp_rexmit_max to extend this SACK recovery phase.
                  * This happens when new data sent during fast recovery is
                  * also lost.  If TCP retransmits those new data, it needs
                  * to extend SACK recover phase to avoid starting another

@@ -3456,13 +3487,19 @@
                         win -= cnt;
                         /*
                          * Update the send timestamp to avoid false
                          * retransmission.
                          */
+#ifdef KERNEL_32
                         old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+#else
+                        old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+#endif
                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
+                        tcp->tcp_cs.tcp_out_retrans_segs++;
+                        tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
 
                         tcp->tcp_rexmit_nxt = snxt;
                 }
                 /*
                  * If we have transmitted all we have at the time

@@ -3616,11 +3653,11 @@
  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
  * with the template header, as well as other options such as time-stamp,
  * ECN and/or SACK.
  */
 static void
-tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
+tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
 {
         tcpha_t *tcp_tmpl, *tcpha;
         uint32_t *dst, *src;
         int hdrlen;
         conn_t *connp = tcp->tcp_connp;

@@ -3638,11 +3675,11 @@
         src = (uint32_t *)connp->conn_ht_iphc;
         hdrlen = connp->conn_ht_iphc_len;
 
         /* Fill time-stamp option if needed */
         if (tcp->tcp_snd_ts_ok) {
-                U32_TO_BE32((uint32_t)now,
+                U32_TO_BE32(LBOLT_FASTPATH,
                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
                 U32_TO_BE32(tcp->tcp_ts_recent,
                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
         } else {
                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);