Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

@@ -21,11 +21,11 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013,2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
 #include <sys/types.h>

@@ -72,10 +72,11 @@
 #include <net/if.h>
 #include <net/route.h>
 #include <inet/ipsec_impl.h>
 
 #include <inet/common.h>
+#include <inet/cc.h>
 #include <inet/ip.h>
 #include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ndp.h>
 #include <inet/proto_set.h>

@@ -264,12 +265,10 @@
 } tcpt_t;
 
 /*
  * Functions called directly via squeue having a prototype of edesc_t.
  */
-void            tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
-    ip_recv_attr_t *ira);
 void            tcp_input_data(void *arg, mblk_t *mp, void *arg2,
     ip_recv_attr_t *ira);
 static void     tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
     ip_recv_attr_t *dummy);
 

@@ -574,11 +573,41 @@
         connp->conn_state_flags = CONN_INCIPIENT;
         ASSERT(connp->conn_proto == IPPROTO_TCP);
         ASSERT(connp->conn_ref == 1);
 }
 
+#pragma inline(tcp_calculate_rto)
+
 /*
+ * Return the RTO for tcp, clamped to [tcp_rto_min, tcp_rto_max]:
+ * RTO = average estimates (sa / 8) + 4 * deviation estimates (sd)
+ *
+ * tcp_rtt_sa and tcp_rtt_sd pass through NSEC2MSEC() before
+ * tcps_rexmit_interval_extra and tcps_conn_grace_period are added.  The extra
+ * interval is for extreme environments where the algorithm fails to work; its
+ * default value should be 0.  As we use a finer grained clock than BSD and
+ * update RTO for every ACK, another .25 of RTT (sa >> 5) is added to the
+ * deviation of RTO to accommodate burstiness of 1/4 of window size.
+ */
+clock_t
+tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps)
+{
+        clock_t rto;
+
+        rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
+            tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra +
+            tcps->tcps_conn_grace_period;
+
+        if (rto < tcp->tcp_rto_min)
+                rto = tcp->tcp_rto_min;
+        else if (rto > tcp->tcp_rto_max)
+                rto = tcp->tcp_rto_max;
+
+        return (rto);
+}
+
+/*
  * Adapt to the information, such as rtt and rtt_sd, provided from the
  * DCE and IRE maintained by IP.
  *
  * Checks for multicast and broadcast destination address.
  * Returns zero if ok; an errno on failure.

@@ -638,19 +667,13 @@
                 return (error);
 
         tcp->tcp_localnet = uinfo.iulp_localnet;
 
         if (uinfo.iulp_rtt != 0) {
-                clock_t rto;
-
-                tcp->tcp_rtt_sa = uinfo.iulp_rtt;
-                tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
-                rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
-                    tcps->tcps_rexmit_interval_extra +
-                    (tcp->tcp_rtt_sa >> 5);
-
-                TCP_SET_RTO(tcp, rto);
+                tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
+                tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
+                tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
         }
         if (uinfo.iulp_ssthresh != 0)
                 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
         else
                 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;

@@ -1236,15 +1259,10 @@
         int32_t         oldstate;
 
         if (!TCP_IS_SOCKET(tcp))
                 tcp_acceptor_hash_remove(tcp);
 
-        TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
-        tcp->tcp_ibsegs = 0;
-        TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
-        tcp->tcp_obsegs = 0;
-
         /*
          * This can be called via tcp_time_wait_processing() if TCP gets a
          * SYN with sequence number outside the TIME-WAIT connection's
          * window.  So we need to check for TIME-WAIT state here as the
          * connection counter is already decremented.  See SET_TIME_WAIT()

@@ -1419,10 +1437,14 @@
          * It happens to have exactly two members of identical size
          * the following code is enough.
          */
         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
 
+        /* Allow the CC algorithm to clean up after itself. */
+        if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
         /*
          * Destroy any association with SO_REUSEPORT group.
          */
         if (tcp->tcp_rg_bind != NULL) {
                 /*

@@ -1480,11 +1502,11 @@
  * from which it was allocated. Best results are obtained if conn is
  * allocated from listener's squeue and freed to the same. Time wait
  * collector will free up the freelist if the connection ends up sitting
  * there for too long.
  */
-void *
+conn_t *
 tcp_get_conn(void *arg, tcp_stack_t *tcps)
 {
         tcp_t                   *tcp = NULL;
         conn_t                  *connp = NULL;
         squeue_t                *sqp = (squeue_t *)arg;

@@ -1519,11 +1541,11 @@
                 connp->conn_ixa->ixa_notify_cookie = tcp;
                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
                 connp->conn_recv = tcp_input_data;
                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
-                return ((void *)connp);
+                return (connp);
         }
         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
         /*
          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
          * this conn_t/tcp_t is freed at ipcl_conn_destroy().

@@ -1554,11 +1576,11 @@
          * thus it is inside the squeue.
          */
         connp->conn_ixa->ixa_notify = tcp_notify;
         connp->conn_ixa->ixa_notify_cookie = tcp;
 
-        return ((void *)connp);
+        return (connp);
 }
 
 /*
  * Handle connect to IPv4 destinations, including connections for AF_INET6
  * sockets connecting to IPv4 mapped IPv6 destinations.

@@ -1924,19 +1946,10 @@
             connp->conn_ipversion == IPV6_VERSION)));
 
         /* Cancel outstanding timers */
         tcp_timers_stop(tcp);
 
-        /*
-         * Reset everything in the state vector, after updating global
-         * MIB data from instance counters.
-         */
-        TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
-        tcp->tcp_ibsegs = 0;
-        TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
-        tcp->tcp_obsegs = 0;
-
         tcp_close_mpp(&tcp->tcp_xmit_head);
         if (tcp->tcp_snd_zcopy_aware)
                 tcp_zcopy_notify(tcp);
         tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
         tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;

@@ -2104,13 +2117,10 @@
         tcp->tcp_snxt = 0;                      /* Displayed in mib */
         tcp->tcp_suna = 0;                      /* Displayed in mib */
         tcp->tcp_swnd = 0;
         DONTCARE(tcp->tcp_cwnd);        /* Init in tcp_process_options */
 
-        ASSERT(tcp->tcp_ibsegs == 0);
-        ASSERT(tcp->tcp_obsegs == 0);
-
         if (connp->conn_ht_iphc != NULL) {
                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
                 connp->conn_ht_iphc = NULL;
                 connp->conn_ht_iphc_allocated = 0;
                 connp->conn_ht_iphc_len = 0;

@@ -2198,10 +2208,12 @@
 
         tcp->tcp_rto = 0;                       /* Displayed in MIB */
         DONTCARE(tcp->tcp_rtt_sa);              /* Init in tcp_init_values */
         DONTCARE(tcp->tcp_rtt_sd);              /* Init in tcp_init_values */
         tcp->tcp_rtt_update = 0;
+        tcp->tcp_rtt_sum = 0;
+        tcp->tcp_rtt_cnt = 0;
 
         DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
         DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
 
         tcp->tcp_rack = 0;                      /* Displayed in mib */

@@ -2333,10 +2345,15 @@
         PRESERVE(tcp->tcp_connid);
 
         ASSERT(tcp->tcp_listen_cnt == NULL);
         ASSERT(tcp->tcp_reass_tid == 0);
 
+        /* Allow the CC algorithm to clean up after itself. */
+        if (tcp->tcp_cc_algo->cb_destroy != NULL)
+                tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+        tcp->tcp_cc_algo = NULL;
+
 #undef  DONTCARE
 #undef  PRESERVE
 }
 
 /*

@@ -2346,19 +2363,23 @@
 void
 tcp_init_values(tcp_t *tcp, tcp_t *parent)
 {
         tcp_stack_t     *tcps = tcp->tcp_tcps;
         conn_t          *connp = tcp->tcp_connp;
-        clock_t         rto;
 
         ASSERT((connp->conn_family == AF_INET &&
             connp->conn_ipversion == IPV4_VERSION) ||
             (connp->conn_family == AF_INET6 &&
             (connp->conn_ipversion == IPV4_VERSION ||
             connp->conn_ipversion == IPV6_VERSION)));
 
+        tcp->tcp_ccv.type = IPPROTO_TCP;
+        tcp->tcp_ccv.ccvc.tcp = tcp;
+
         if (parent == NULL) {
+                tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
                 tcp->tcp_naglim = tcps->tcps_naglim_def;
 
                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;

@@ -2382,10 +2403,12 @@
                  * Default value of tcp_init_cwnd is 0, so no need to set here
                  * if parent is NULL.  But we need to inherit it from parent.
                  */
         } else {
                 /* Inherit various TCP parameters from the parent. */
+                tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
                 tcp->tcp_naglim = parent->tcp_naglim;
 
                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
                 tcp->tcp_rto_min = parent->tcp_rto_min;
                 tcp->tcp_rto_max = parent->tcp_rto_max;

@@ -2408,23 +2431,23 @@
                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
 
                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
         }
 
+        if (tcp->tcp_cc_algo->cb_init != NULL)
+                VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
         /*
          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
          * will be close to tcp_rexmit_interval_initial.  By doing this, we
          * allow the algorithm to adjust slowly to large fluctuations of RTT
          * during first few transmissions of a connection as seen in slow
          * links.
          */
-        tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
-        tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
-        rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
-            tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
-            tcps->tcps_conn_grace_period;
-        TCP_SET_RTO(tcp, rto);
+        tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+        tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
 
         tcp->tcp_timer_backoff = 0;
         tcp->tcp_ms_we_have_waited = 0;
         tcp->tcp_last_recv_time = ddi_get_lbolt();
         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;

@@ -2657,11 +2680,11 @@
                 else
                         zoneid = crgetzoneid(credp);
         }
 
         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
-        connp = (conn_t *)tcp_get_conn(sqp, tcps);
+        connp = tcp_get_conn(sqp, tcps);
         /*
          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
          * so we drop it by one.
          */
         netstack_rele(tcps->tcps_netstack);

@@ -3845,10 +3868,13 @@
 
         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
             offsetof(tcp_listener_t, tl_link));
 
+        tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+        ASSERT3P(tcps->tcps_default_cc_algo, !=, NULL);
+
         return (tcps);
 }
 
 /*
  * Called when the IP module is about to be unloaded.