Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics
*** 19,29 ****
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
! * Copyright (c) 2014 by Delphix. All rights reserved.
*/
/* This file contains all TCP output processing functions. */
#include <sys/types.h>
--- 19,29 ----
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
! * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
/* This file contains all TCP output processing functions. */
#include <sys/types.h>
*** 61,71 ****
const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
! static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
--- 61,71 ----
const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
! static void tcp_fill_header(tcp_t *, uchar_t *, int);
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
*** 78,87 ****
--- 78,99 ----
* speedup is observed for values larger than sixteen. Zero will
* disable the optimisation.
*/
static int tcp_tx_pull_len = 16;
+ static void
+ cc_after_idle(tcp_t *tcp)
+ {
+ uint32_t old_cwnd = tcp->tcp_cwnd;
+
+ if (CC_ALGO(tcp)->after_idle != NULL)
+ CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
+
+ DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
+ uint32_t, tcp->tcp_cwnd);
+ }
+
void
tcp_wput(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
tcp_t *tcp;
*** 215,225 ****
int32_t mss;
int32_t num_sack_blk = 0;
int32_t total_hdr_len;
int32_t tcp_hdr_len;
int rc;
- tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
clock_t now = LBOLT_FASTPATH;
tcpstate = tcp->tcp_state;
if (mp == NULL) {
--- 227,236 ----
*** 370,380 ****
tcp_hdr_len = connp->conn_ht_ulp_len;
}
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
! TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
}
if (tcpstate == TCPS_SYN_RCVD) {
/*
* The three-way connection establishment handshake is not
* complete yet. We want to queue the data for transmission
--- 381,391 ----
tcp_hdr_len = connp->conn_ht_ulp_len;
}
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
! cc_after_idle(tcp);
}
if (tcpstate == TCPS_SYN_RCVD) {
/*
* The three-way connection establishment handshake is not
* complete yet. We want to queue the data for transmission
*** 451,461 ****
--- 462,476 ----
/* Bypass all other unnecessary processing. */
goto done;
}
}
+ #ifdef KERNEL_32
local_time = (mblk_t *)now;
+ #else
+ local_time = (mblk_t *)(intptr_t)gethrtime();
+ #endif
/*
* "Our" Nagle Algorithm. This is not the same as in the old
* BSD. This is more in line with the true intent of Nagle.
*
*** 1188,1198 ****
* Reinitialize tcp_cwnd after idle.
*/
now = LBOLT_FASTPATH;
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
! TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
}
usable = tcp->tcp_swnd; /* tcp window size */
if (usable > tcp->tcp_cwnd)
usable = tcp->tcp_cwnd; /* congestion window smaller */
--- 1203,1213 ----
* Reinitialize tcp_cwnd after idle.
*/
now = LBOLT_FASTPATH;
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
! cc_after_idle(tcp);
}
usable = tcp->tcp_swnd; /* tcp window size */
if (usable > tcp->tcp_cwnd)
usable = tcp->tcp_cwnd; /* congestion window smaller */
*** 1251,1261 ****
--- 1266,1280 ----
tcp->tcp_snxt = snxt + len;
tcp->tcp_rack = tcp->tcp_rnxt;
if ((mp1 = dupb(mp)) == 0)
goto no_memory;
+ #ifdef KERNEL_32
mp->b_prev = (mblk_t *)(uintptr_t)now;
+ #else
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ #endif
mp->b_next = (mblk_t *)(uintptr_t)snxt;
/* adjust tcp header information */
tcpha = tcp->tcp_tcpha;
tcpha->tha_flags = (TH_ACK|TH_PUSH);
*** 1266,1276 ****
tcpha->tha_seq = htonl(snxt);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
! BUMP_LOCAL(tcp->tcp_obsegs);
/* Update the latest receive window size in TCP header. */
tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
tcp->tcp_last_sent_len = (ushort_t)len;
--- 1285,1297 ----
tcpha->tha_seq = htonl(snxt);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
! tcp->tcp_cs.tcp_out_data_segs++;
! tcp->tcp_cs.tcp_out_data_bytes += len;
/* Update the latest receive window size in TCP header. */
tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
tcp->tcp_last_sent_len = (ushort_t)len;
*** 1306,1321 ****
}
mp1->b_rptr = rptr;
/* Fill in the timestamp option. */
if (tcp->tcp_snd_ts_ok) {
! uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
!
! U32_TO_BE32(llbolt,
! (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
U32_TO_BE32(tcp->tcp_ts_recent,
! (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
/* copy header into outgoing packet */
--- 1327,1340 ----
}
mp1->b_rptr = rptr;
/* Fill in the timestamp option. */
if (tcp->tcp_snd_ts_ok) {
! U32_TO_BE32(now,
! (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
! (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
/* copy header into outgoing packet */
*** 1955,1974 ****
(mblk_t *)(uintptr_t)(*snxt);
mp1 = mp1->b_cont;
}
*snxt += len;
*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
! BUMP_LOCAL(tcp->tcp_obsegs);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
tcp_send_data(tcp, mp);
continue;
}
*snxt += len; /* Adjust later if we don't send all of len */
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
if (*tail_unsent) {
/* Are the bytes above us in flight? */
rptr = (*xmit_tail)->b_wptr - *tail_unsent;
if (rptr != (*xmit_tail)->b_rptr) {
--- 1974,1998 ----
(mblk_t *)(uintptr_t)(*snxt);
mp1 = mp1->b_cont;
}
*snxt += len;
*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+ tcp->tcp_cs.tcp_out_data_segs++;
+ tcp->tcp_cs.tcp_out_data_bytes += len;
tcp_send_data(tcp, mp);
continue;
}
*snxt += len; /* Adjust later if we don't send all of len */
+ TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
+ tcp->tcp_cs.tcp_out_data_segs++;
+ tcp->tcp_cs.tcp_out_data_bytes += len;
if (*tail_unsent) {
/* Are the bytes above us in flight? */
rptr = (*xmit_tail)->b_wptr - *tail_unsent;
if (rptr != (*xmit_tail)->b_rptr) {
*** 2061,2071 ****
/*
* Fill in the header using the template header, and add
* options such as time-stamp, ECN and/or SACK, as needed.
*/
! tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
mp->b_rptr = rptr;
if (*tail_unsent) {
int spill = *tail_unsent;
--- 2085,2095 ----
/*
* Fill in the header using the template header, and add
* options such as time-stamp, ECN and/or SACK, as needed.
*/
! tcp_fill_header(tcp, rptr, num_sack_blk);
mp->b_rptr = rptr;
if (*tail_unsent) {
int spill = *tail_unsent;
*** 2140,2149 ****
--- 2164,2174 ----
*/
*usable -= spill;
*snxt += spill;
tcp->tcp_last_sent_len += spill;
TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
+ tcp->tcp_cs.tcp_out_data_bytes += spill;
/*
* Adjust the checksum
*/
tcpha = (tcpha_t *)(rptr +
ixa->ixa_ip_hdr_length);
*** 2188,2198 ****
/*
* Restore values of ixa_fragsize and ixa_extra_ident.
*/
ixa->ixa_fragsize = ixa->ixa_pmtu;
ixa->ixa_extra_ident = 0;
! tcp->tcp_obsegs += num_lso_seg;
TCP_STAT(tcps, tcp_lso_times);
TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
} else {
/*
* Make sure to clean up LSO information. Wherever a
--- 2213,2223 ----
/*
* Restore values of ixa_fragsize and ixa_extra_ident.
*/
ixa->ixa_fragsize = ixa->ixa_pmtu;
ixa->ixa_extra_ident = 0;
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
TCP_STAT(tcps, tcp_lso_times);
TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
} else {
/*
* Make sure to clean up LSO information. Wherever a
*** 2199,2209 ****
* new mp uses the prepended header room after dupb(),
* lso_info_cleanup() should be called.
*/
lso_info_cleanup(mp);
tcp_send_data(tcp, mp);
! BUMP_LOCAL(tcp->tcp_obsegs);
}
}
return (0);
}
--- 2224,2234 ----
* new mp uses the prepended header room after dupb(),
* lso_info_cleanup() should be called.
*/
lso_info_cleanup(mp);
tcp_send_data(tcp, mp);
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
}
}
return (0);
}
*** 2279,2290 ****
/*
* We do not have a good algorithm to update ssthresh at this time.
* So don't do any update.
*/
bzero(&uinfo, sizeof (uinfo));
! uinfo.iulp_rtt = tcp->tcp_rtt_sa;
! uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
/*
* Note that uinfo is kept for conn_faddr in the DCE. Could update even
* if source routed but we don't.
*/
--- 2304,2315 ----
/*
* We do not have a good algorithm to update ssthresh at this time.
* So don't do any update.
*/
bzero(&uinfo, sizeof (uinfo));
! uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
! uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
/*
* Note that uinfo is kept for conn_faddr in the DCE. Could update even
* if source routed but we don't.
*/
*** 2415,2425 ****
tcp->tcp_tcpha->tha_win = tcpha->tha_win;
tcp->tcp_rack = ack;
tcp->tcp_rack_cnt = 0;
TCPS_BUMP_MIB(tcps, tcpOutAck);
}
! BUMP_LOCAL(tcp->tcp_obsegs);
tcpha->tha_seq = htonl(seq);
tcpha->tha_ack = htonl(ack);
/*
* Include the adjustment for a source route if any.
*/
--- 2440,2450 ----
tcp->tcp_tcpha->tha_win = tcpha->tha_win;
tcp->tcp_rack = ack;
tcp->tcp_rack_cnt = 0;
TCPS_BUMP_MIB(tcps, tcpOutAck);
}
! TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
tcpha->tha_seq = htonl(seq);
tcpha->tha_ack = htonl(ack);
/*
* Include the adjustment for a source route if any.
*/
*** 3384,3398 ****
--- 3409,3429 ----
tcp_send_data(tcp, xmit_mp);
/*
* Update the send timestamp to avoid false retransmission.
*/
+ #ifdef KERNEL_32
snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ #else
+ snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ #endif
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
/*
* Update tcp_rexmit_max to extend this SACK recovery phase.
* This happens when new data sent during fast recovery is
* also lost. If TCP retransmits those new data, it needs
* to extend SACK recover phase to avoid starting another
*** 3456,3468 ****
--- 3487,3505 ----
win -= cnt;
/*
* Update the send timestamp to avoid false
* retransmission.
*/
+ #ifdef KERNEL_32
old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ #else
+ old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ #endif
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
+ tcp->tcp_cs.tcp_out_retrans_segs++;
+ tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
tcp->tcp_rexmit_nxt = snxt;
}
/*
* If we have transmitted all we have at the time
*** 3616,3626 ****
* tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
* with the template header, as well as other options such as time-stamp,
* ECN and/or SACK.
*/
static void
! tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
{
tcpha_t *tcp_tmpl, *tcpha;
uint32_t *dst, *src;
int hdrlen;
conn_t *connp = tcp->tcp_connp;
--- 3653,3663 ----
* tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
* with the template header, as well as other options such as time-stamp,
* ECN and/or SACK.
*/
static void
! tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
{
tcpha_t *tcp_tmpl, *tcpha;
uint32_t *dst, *src;
int hdrlen;
conn_t *connp = tcp->tcp_connp;
*** 3638,3648 ****
src = (uint32_t *)connp->conn_ht_iphc;
hdrlen = connp->conn_ht_iphc_len;
/* Fill time-stamp option if needed */
if (tcp->tcp_snd_ts_ok) {
! U32_TO_BE32((uint32_t)now,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
--- 3675,3685 ----
src = (uint32_t *)connp->conn_ht_iphc;
hdrlen = connp->conn_ht_iphc_len;
/* Fill time-stamp option if needed */
if (tcp->tcp_snd_ts_ok) {
! U32_TO_BE32(LBOLT_FASTPATH,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);