Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics

*** 21,31 **** /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright (c) 2013,2014 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ #include <sys/types.h> --- 21,31 ---- /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. ! * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ #include <sys/types.h>
*** 72,81 **** --- 72,82 ---- #include <net/if.h> #include <net/route.h> #include <inet/ipsec_impl.h> #include <inet/common.h> + #include <inet/cc.h> #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> #include <inet/proto_set.h>
*** 264,275 **** } tcpt_t; /* * Functions called directly via squeue having a prototype of edesc_t. */ - void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *ira); void tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira); static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); --- 265,274 ----
*** 574,584 **** --- 573,613 ---- connp->conn_state_flags = CONN_INCIPIENT; ASSERT(connp->conn_proto == IPPROTO_TCP); ASSERT(connp->conn_ref == 1); } + #pragma inline(tcp_calculate_rto) + /* + * RTO = average estimates (sa / 8) + 4 * deviation estimates (sd) + * + * Add tcp_rexmit_interval_extra in case of extreme environment where the + * algorithm fails to work. The default value of tcp_rexmit_interval_extra + * should be 0. + * + * As we use a finer grained clock than BSD and update RTO for every ACK, add + * in another .25 of RTT to the deviation of RTO to accommodate burstiness of + * 1/4 of window size. + */ + clock_t + tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps) + { + clock_t rto; + + rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) + + tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + + tcps->tcps_conn_grace_period; + + if (rto < tcp->tcp_rto_min) + rto = tcp->tcp_rto_min; + else if (rto > tcp->tcp_rto_max) + rto = tcp->tcp_rto_max; + + return (rto); + } + + /* * Adapt to the information, such as rtt and rtt_sd, provided from the * DCE and IRE maintained by IP. * * Checks for multicast and broadcast destination address. * Returns zero if ok; an errno on failure.
*** 638,656 **** return (error); tcp->tcp_localnet = uinfo.iulp_localnet; if (uinfo.iulp_rtt != 0) { ! clock_t rto; ! ! tcp->tcp_rtt_sa = uinfo.iulp_rtt; ! tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; ! rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + ! tcps->tcps_rexmit_interval_extra + ! (tcp->tcp_rtt_sa >> 5); ! ! TCP_SET_RTO(tcp, rto); } if (uinfo.iulp_ssthresh != 0) tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; else tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; --- 667,679 ---- return (error); tcp->tcp_localnet = uinfo.iulp_localnet; if (uinfo.iulp_rtt != 0) { ! tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt); ! tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd); ! tcp->tcp_rto = tcp_calculate_rto(tcp, tcps); } if (uinfo.iulp_ssthresh != 0) tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; else tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
*** 1236,1250 **** int32_t oldstate; if (!TCP_IS_SOCKET(tcp)) tcp_acceptor_hash_remove(tcp); - TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - /* * This can be called via tcp_time_wait_processing() if TCP gets a * SYN with sequence number outside the TIME-WAIT connection's * window. So we need to check for TIME-WAIT state here as the * connection counter is already decremented. See SET_TIME_WAIT() --- 1259,1268 ----
*** 1419,1428 **** --- 1437,1450 ---- * It happens to have exactly two members of identical size * the following code is enough. */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); + /* Allow the CC algorithm to clean up after itself. */ + if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + /* * Destroy any association with SO_REUSEPORT group. */ if (tcp->tcp_rg_bind != NULL) { /*
*** 1480,1490 **** * from which it was allocated. Best results are obtained if conn is * allocated from listener's squeue and freed to the same. Time wait * collector will free up the freelist is the connection ends up sitting * there for too long. */ ! void * tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; conn_t *connp = NULL; squeue_t *sqp = (squeue_t *)arg; --- 1502,1512 ---- * from which it was allocated. Best results are obtained if conn is * allocated from listener's squeue and freed to the same. Time wait * collector will free up the freelist is the connection ends up sitting * there for too long. */ ! conn_t * tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; conn_t *connp = NULL; squeue_t *sqp = (squeue_t *)arg;
*** 1519,1529 **** connp->conn_ixa->ixa_notify_cookie = tcp; ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); connp->conn_recv = tcp_input_data; ASSERT(connp->conn_recvicmp == tcp_icmp_input); ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); ! return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); /* * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until * this conn_t/tcp_t is freed at ipcl_conn_destroy(). --- 1541,1551 ---- connp->conn_ixa->ixa_notify_cookie = tcp; ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); connp->conn_recv = tcp_input_data; ASSERT(connp->conn_recvicmp == tcp_icmp_input); ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); ! return (connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); /* * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until * this conn_t/tcp_t is freed at ipcl_conn_destroy().
*** 1554,1564 **** * thus it is inside the squeue. */ connp->conn_ixa->ixa_notify = tcp_notify; connp->conn_ixa->ixa_notify_cookie = tcp; ! return ((void *)connp); } /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. --- 1576,1586 ---- * thus it is inside the squeue. */ connp->conn_ixa->ixa_notify = tcp_notify; connp->conn_ixa->ixa_notify_cookie = tcp; ! return (connp); } /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations.
*** 1924,1942 **** connp->conn_ipversion == IPV6_VERSION))); /* Cancel outstanding timers */ tcp_timers_stop(tcp); - /* - * Reset everything in the state vector, after updating global - * MIB data from instance counters. - */ - TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - tcp_close_mpp(&tcp->tcp_xmit_head); if (tcp->tcp_snd_zcopy_aware) tcp_zcopy_notify(tcp); tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; --- 1946,1955 ----
*** 2104,2116 **** tcp->tcp_snxt = 0; /* Displayed in mib */ tcp->tcp_suna = 0; /* Displayed in mib */ tcp->tcp_swnd = 0; DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ - ASSERT(tcp->tcp_ibsegs == 0); - ASSERT(tcp->tcp_obsegs == 0); - if (connp->conn_ht_iphc != NULL) { kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); connp->conn_ht_iphc = NULL; connp->conn_ht_iphc_allocated = 0; connp->conn_ht_iphc_len = 0; --- 2117,2126 ----
*** 2198,2207 **** --- 2208,2219 ---- tcp->tcp_rto = 0; /* Displayed in MIB */ DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ tcp->tcp_rtt_update = 0; + tcp->tcp_rtt_sum = 0; + tcp->tcp_rtt_cnt = 0; DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ tcp->tcp_rack = 0; /* Displayed in mib */
*** 2333,2342 **** --- 2345,2359 ---- PRESERVE(tcp->tcp_connid); ASSERT(tcp->tcp_listen_cnt == NULL); ASSERT(tcp->tcp_reass_tid == 0); + /* Allow the CC algorithm to clean up after itself. */ + if (tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + tcp->tcp_cc_algo = NULL; + #undef DONTCARE #undef PRESERVE } /*
*** 2346,2364 **** void tcp_init_values(tcp_t *tcp, tcp_t *parent) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; - clock_t rto; ASSERT((connp->conn_family == AF_INET && connp->conn_ipversion == IPV4_VERSION) || (connp->conn_family == AF_INET6 && (connp->conn_ipversion == IPV4_VERSION || connp->conn_ipversion == IPV6_VERSION))); if (parent == NULL) { tcp->tcp_naglim = tcps->tcps_naglim_def; tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min; tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max; --- 2363,2385 ---- void tcp_init_values(tcp_t *tcp, tcp_t *parent) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; ASSERT((connp->conn_family == AF_INET && connp->conn_ipversion == IPV4_VERSION) || (connp->conn_family == AF_INET6 && (connp->conn_ipversion == IPV4_VERSION || connp->conn_ipversion == IPV6_VERSION))); + tcp->tcp_ccv.type = IPPROTO_TCP; + tcp->tcp_ccv.ccvc.tcp = tcp; + if (parent == NULL) { + tcp->tcp_cc_algo = tcps->tcps_default_cc_algo; + tcp->tcp_naglim = tcps->tcps_naglim_def; tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min; tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
*** 2382,2391 **** --- 2403,2414 ---- * Default value of tcp_init_cwnd is 0, so no need to set here * if parent is NULL. But we need to inherit it from parent. */ } else { /* Inherit various TCP parameters from the parent. */ + tcp->tcp_cc_algo = parent->tcp_cc_algo; + tcp->tcp_naglim = parent->tcp_naglim; tcp->tcp_rto_initial = parent->tcp_rto_initial; tcp->tcp_rto_min = parent->tcp_rto_min; tcp->tcp_rto_max = parent->tcp_rto_max;
*** 2408,2430 **** tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval; tcp->tcp_init_cwnd = parent->tcp_init_cwnd; } /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO * will be close to tcp_rexmit_interval_initial. By doing this, we * allow the algorithm to adjust slowly to large fluctuations of RTT * during first few transmissions of a connection as seen in slow * links. */ ! tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; ! tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; ! rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + ! tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + ! tcps->tcps_conn_grace_period; ! TCP_SET_RTO(tcp, rto); tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; tcp->tcp_last_recv_time = ddi_get_lbolt(); tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_; --- 2431,2453 ---- tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval; tcp->tcp_init_cwnd = parent->tcp_init_cwnd; } + if (tcp->tcp_cc_algo->cb_init != NULL) + VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0); + /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO * will be close to tcp_rexmit_interval_initial. By doing this, we * allow the algorithm to adjust slowly to large fluctuations of RTT * during first few transmissions of a connection as seen in slow * links. */ ! tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; ! tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; ! tcp->tcp_rto = tcp_calculate_rto(tcp, tcps); tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; tcp->tcp_last_recv_time = ddi_get_lbolt(); tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
*** 2657,2667 **** else zoneid = crgetzoneid(credp); } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); ! connp = (conn_t *)tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. */ netstack_rele(tcps->tcps_netstack); --- 2680,2690 ---- else zoneid = crgetzoneid(credp); } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); ! connp = tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. */ netstack_rele(tcps->tcps_netstack);
*** 3845,3854 **** --- 3868,3880 ---- mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), offsetof(tcp_listener_t, tl_link)); + tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME); + ASSERT3P(tcps->tcps_default_cc_algo, !=, NULL); + return (tcps); } /* * Called when the IP module is about to be unloaded.