Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics
@@ -21,11 +21,11 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013,2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/types.h>
@@ -72,10 +72,11 @@
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
+#include <inet/cc.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
@@ -264,12 +265,10 @@
} tcpt_t;
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
-void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
- ip_recv_attr_t *ira);
void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *ira);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
@@ -574,11 +573,41 @@
connp->conn_state_flags = CONN_INCIPIENT;
ASSERT(connp->conn_proto == IPPROTO_TCP);
ASSERT(connp->conn_ref == 1);
}
+#pragma inline(tcp_calculate_rto)
+
/*
+ * Compute the retransmission timeout (RTO) for a connection from its
+ * tcp_rtt_sa and tcp_rtt_sd estimates.
+ *
+ * RTO = average estimates (sa / 8) + 4 * deviation estimates (sd)
+ *
+ * Add tcp_rexmit_interval_extra in case of extreme environment where the
+ * algorithm fails to work. The default value of tcp_rexmit_interval_extra
+ * should be 0.
+ *
+ * As we use a finer grained clock than BSD and update RTO for every ACK, add
+ * in another .25 of RTT to the deviation of RTO to accommodate burstiness of
+ * 1/4 of window size.
+ *
+ * The sum of the tcp_rtt_sa and tcp_rtt_sd terms is passed through
+ * NSEC2MSEC() before tcps_rexmit_interval_extra and tcps_conn_grace_period
+ * are added. The result is raised to tcp_rto_min if it falls below it,
+ * otherwise it is capped at tcp_rto_max.
+ */
+clock_t
+tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps)
+{
+ clock_t rto;
+
+ rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
+ tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra +
+ tcps->tcps_conn_grace_period;
+
+ if (rto < tcp->tcp_rto_min)
+ rto = tcp->tcp_rto_min;
+ else if (rto > tcp->tcp_rto_max)
+ rto = tcp->tcp_rto_max;
+
+ return (rto);
+}
+
+/*
* Adapt to the information, such as rtt and rtt_sd, provided from the
* DCE and IRE maintained by IP.
*
* Checks for multicast and broadcast destination address.
* Returns zero if ok; an errno on failure.
@@ -638,19 +667,13 @@
return (error);
tcp->tcp_localnet = uinfo.iulp_localnet;
if (uinfo.iulp_rtt != 0) {
- clock_t rto;
-
- tcp->tcp_rtt_sa = uinfo.iulp_rtt;
- tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5);
-
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
+ tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
}
if (uinfo.iulp_ssthresh != 0)
tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
else
tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
@@ -1236,15 +1259,10 @@
int32_t oldstate;
if (!TCP_IS_SOCKET(tcp))
tcp_acceptor_hash_remove(tcp);
- TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
- tcp->tcp_ibsegs = 0;
- TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
- tcp->tcp_obsegs = 0;
-
/*
* This can be called via tcp_time_wait_processing() if TCP gets a
* SYN with sequence number outside the TIME-WAIT connection's
* window. So we need to check for TIME-WAIT state here as the
* connection counter is already decremented. See SET_TIME_WAIT()
@@ -1419,10 +1437,14 @@
* It happens to have exactly two members of identical size, so
* the following code is enough.
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
/*
* Destroy any association with SO_REUSEPORT group.
*/
if (tcp->tcp_rg_bind != NULL) {
/*
@@ -1480,11 +1502,11 @@
* from which it was allocated. Best results are obtained if conn is
* allocated from listener's squeue and freed to the same. Time wait
* collector will free up the freelist if the connection ends up sitting
* there for too long.
*/
-void *
+conn_t *
tcp_get_conn(void *arg, tcp_stack_t *tcps)
{
tcp_t *tcp = NULL;
conn_t *connp = NULL;
squeue_t *sqp = (squeue_t *)arg;
@@ -1519,11 +1541,11 @@
connp->conn_ixa->ixa_notify_cookie = tcp;
ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
connp->conn_recv = tcp_input_data;
ASSERT(connp->conn_recvicmp == tcp_icmp_input);
ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
- return ((void *)connp);
+ return (connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
/*
* Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
* this conn_t/tcp_t is freed at ipcl_conn_destroy().
@@ -1554,11 +1576,11 @@
* thus it is inside the squeue.
*/
connp->conn_ixa->ixa_notify = tcp_notify;
connp->conn_ixa->ixa_notify_cookie = tcp;
- return ((void *)connp);
+ return (connp);
}
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
@@ -1924,19 +1946,10 @@
connp->conn_ipversion == IPV6_VERSION)));
/* Cancel outstanding timers */
tcp_timers_stop(tcp);
- /*
- * Reset everything in the state vector, after updating global
- * MIB data from instance counters.
- */
- TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
- tcp->tcp_ibsegs = 0;
- TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
- tcp->tcp_obsegs = 0;
-
tcp_close_mpp(&tcp->tcp_xmit_head);
if (tcp->tcp_snd_zcopy_aware)
tcp_zcopy_notify(tcp);
tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
@@ -2104,13 +2117,10 @@
tcp->tcp_snxt = 0; /* Displayed in mib */
tcp->tcp_suna = 0; /* Displayed in mib */
tcp->tcp_swnd = 0;
DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */
- ASSERT(tcp->tcp_ibsegs == 0);
- ASSERT(tcp->tcp_obsegs == 0);
-
if (connp->conn_ht_iphc != NULL) {
kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
connp->conn_ht_iphc = NULL;
connp->conn_ht_iphc_allocated = 0;
connp->conn_ht_iphc_len = 0;
@@ -2198,10 +2208,12 @@
tcp->tcp_rto = 0; /* Displayed in MIB */
DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
tcp->tcp_rtt_update = 0;
+ tcp->tcp_rtt_sum = 0;
+ tcp->tcp_rtt_cnt = 0;
DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
tcp->tcp_rack = 0; /* Displayed in mib */
@@ -2333,10 +2345,15 @@
PRESERVE(tcp->tcp_connid);
ASSERT(tcp->tcp_listen_cnt == NULL);
ASSERT(tcp->tcp_reass_tid == 0);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+ tcp->tcp_cc_algo = NULL;
+
#undef DONTCARE
#undef PRESERVE
}
/*
@@ -2346,19 +2363,23 @@
void
tcp_init_values(tcp_t *tcp, tcp_t *parent)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
- clock_t rto;
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
(connp->conn_family == AF_INET6 &&
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
+ tcp->tcp_ccv.type = IPPROTO_TCP;
+ tcp->tcp_ccv.ccvc.tcp = tcp;
+
if (parent == NULL) {
+ tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
tcp->tcp_naglim = tcps->tcps_naglim_def;
tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
@@ -2382,10 +2403,12 @@
* Default value of tcp_init_cwnd is 0, so no need to set here
* if parent is NULL. But we need to inherit it from parent.
*/
} else {
/* Inherit various TCP parameters from the parent. */
+ tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
tcp->tcp_naglim = parent->tcp_naglim;
tcp->tcp_rto_initial = parent->tcp_rto_initial;
tcp->tcp_rto_min = parent->tcp_rto_min;
tcp->tcp_rto_max = parent->tcp_rto_max;
@@ -2408,23 +2431,23 @@
tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
}
+ if (tcp->tcp_cc_algo->cb_init != NULL)
+ VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
* allow the algorithm to adjust slowly to large fluctuations of RTT
* during first few transmissions of a connection as seen in slow
* links.
*/
- tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
- tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
- tcps->tcps_conn_grace_period;
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+ tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
tcp->tcp_last_recv_time = ddi_get_lbolt();
tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
@@ -2657,11 +2680,11 @@
else
zoneid = crgetzoneid(credp);
}
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
- connp = (conn_t *)tcp_get_conn(sqp, tcps);
+ connp = tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
*/
netstack_rele(tcps->tcps_netstack);
@@ -3845,10 +3868,13 @@
mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
offsetof(tcp_listener_t, tl_link));
+ tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+ ASSERT3P(tcps->tcps_default_cc_algo, !=, NULL);
+
return (tcps);
}
/*
* Called when the IP module is about to be unloaded.