Print this page
5295 remove maxburst logic from TCP's send algorithm Reviewed by: Dan McDonald <danmcd@omniti.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2011 Joyent, Inc. All rights reserved.

  26  */
  27 
  28 /* This file contains all TCP input processing functions. */
  29 
  30 #include <sys/types.h>
  31 #include <sys/stream.h>
  32 #include <sys/strsun.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/stropts.h>
  35 #include <sys/strlog.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/suntpi.h>
  39 #include <sys/xti_inet.h>
  40 #include <sys/squeue_impl.h>
  41 #include <sys/squeue.h>
  42 #include <sys/tsol/tnet.h>
  43 
  44 #include <inet/common.h>
  45 #include <inet/ip.h>


2627                                 freemsg(mp);
2628                                 return;
2629                         }
2630                         TCPS_CONN_INC(tcps);
2631                         /* SYN was acked - making progress */
2632                         tcp->tcp_ip_forward_progress = B_TRUE;
2633 
2634                         /* One for the SYN */
2635                         tcp->tcp_suna = tcp->tcp_iss + 1;
2636                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2637 
2638                         /*
2639                          * If SYN was retransmitted, need to reset all
2640                          * retransmission info.  This is because this
2641                          * segment will be treated as a dup ACK.
2642                          */
2643                         if (tcp->tcp_rexmit) {
2644                                 tcp->tcp_rexmit = B_FALSE;
2645                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2646                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2647                                 tcp->tcp_snd_burst = tcp->tcp_localnet ?
2648                                     TCP_CWND_INFINITE : TCP_CWND_NORMAL;
2649                                 tcp->tcp_ms_we_have_waited = 0;
2650 
2651                                 /*
2652                                  * Set tcp_cwnd back to 1 MSS, per
2653                                  * recommendation from
2654                                  * draft-floyd-incr-init-win-01.txt,
2655                                  * Increasing TCP's Initial Window.
2656                                  */
2657                                 tcp->tcp_cwnd = tcp->tcp_mss;
2658                         }
2659 
2660                         tcp->tcp_swl1 = seg_seq;
2661                         tcp->tcp_swl2 = seg_ack;
2662 
2663                         new_swnd = ntohs(tcpha->tha_win);
2664                         tcp->tcp_swnd = new_swnd;
2665                         if (new_swnd > tcp->tcp_max_swnd)
2666                                 tcp->tcp_max_swnd = new_swnd;
2667 
2668                         /*


3823                         DTRACE_TCP5(accept__established, mlbk_t *, NULL,
3824                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3825                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3826                 }
3827                 TCPS_CONN_INC(tcps);
3828 
3829                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3830                 bytes_acked--;
3831                 /* SYN was acked - making progress */
3832                 tcp->tcp_ip_forward_progress = B_TRUE;
3833 
3834                 /*
3835                  * If SYN was retransmitted, need to reset all
3836                  * retransmission info as this segment will be
3837                  * treated as a dup ACK.
3838                  */
3839                 if (tcp->tcp_rexmit) {
3840                         tcp->tcp_rexmit = B_FALSE;
3841                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3842                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
3843                         tcp->tcp_snd_burst = tcp->tcp_localnet ?
3844                             TCP_CWND_INFINITE : TCP_CWND_NORMAL;
3845                         tcp->tcp_ms_we_have_waited = 0;
3846                         tcp->tcp_cwnd = mss;
3847                 }
3848 
3849                 /*
3850                  * We set the send window to zero here.
3851                  * This is needed if there is data to be
3852                  * processed already on the queue.
3853                  * Later (at swnd_update label), the
3854                  * "new_swnd > tcp_swnd" condition is satisfied and
3855                  * the XMIT_NEEDED flag is set in the current
3856                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3857                  * called if there is already data on queue in
3858                  * this state.
3859                  */
3860                 tcp->tcp_swnd = 0;
3861 
3862                 if (new_swnd > tcp->tcp_max_swnd)
3863                         tcp->tcp_max_swnd = new_swnd;
3864                 tcp->tcp_swl1 = seg_seq;


3993                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3994                                 }
3995 
3996                                 /*
3997                                  * We do Hoe's algorithm.  Refer to her
3998                                  * paper "Improving the Start-up Behavior
3999                                  * of a Congestion Control Scheme for TCP,"
4000                                  * appeared in SIGCOMM'96.
4001                                  *
4002                                  * Save highest seq no we have sent so far.
4003                                  * Be careful about the invisible FIN byte.
4004                                  */
4005                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4006                                     (tcp->tcp_unsent == 0)) {
4007                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
4008                                 } else {
4009                                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
4010                                 }
4011 
4012                                 /*
4013                                  * Do not allow bursty traffic during
4014                                  * fast recovery.  Refer to Fall and Floyd's
4015                                  * paper "Simulation-based Comparisons of
4016                                  * Tahoe, Reno and SACK TCP" (in CCR?)
4017                                  * This is a best current practice.
4018                                  */
4019                                 tcp->tcp_snd_burst = TCP_CWND_SS;
4020 
4021                                 /*
4022                                  * For SACK:
4023                                  * Calculate tcp_pipe, which is the
4024                                  * estimated number of bytes in
4025                                  * network.
4026                                  *
4027                                  * tcp_fack is the highest sack'ed seq num
4028                                  * TCP has received.
4029                                  *
4030                                  * tcp_pipe is explained in the above quoted
4031                                  * Fall and Floyd's paper.  tcp_fack is
4032                                  * explained in Mathis and Mahdavi's
4033                                  * "Forward Acknowledgment: Refining TCP
4034                                  * Congestion Control" in SIGCOMM '96.
4035                                  */
4036                                 if (tcp->tcp_snd_sack_ok) {
4037                                         if (tcp->tcp_notsack_list != NULL) {
4038                                                 tcp->tcp_pipe = tcp->tcp_snxt -
4039                                                     tcp->tcp_fack;
4040                                                 tcp->tcp_sack_snxt = seg_ack;
4041                                                 flags |= TH_NEED_SACK_REXMIT;


4201 
4202         /*
4203          * If we got an ACK after fast retransmit, check to see
4204          * if it is a partial ACK.  If it is not and the congestion
4205          * window was inflated to account for the other side's
4206          * cached packets, retract it.  If it is, do Hoe's algorithm.
4207          */
4208         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4209                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4210                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4211                         tcp->tcp_dupack_cnt = 0;
4212                         /*
4213                          * Restore the orig tcp_cwnd_ssthresh after
4214                          * fast retransmit phase.
4215                          */
4216                         if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4217                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4218                         }
4219                         tcp->tcp_rexmit_max = seg_ack;
4220                         tcp->tcp_cwnd_cnt = 0;
4221                         tcp->tcp_snd_burst = tcp->tcp_localnet ?
4222                             TCP_CWND_INFINITE : TCP_CWND_NORMAL;
4223 
4224                         /*
4225                          * Remove all notsack info to avoid confusion with
4226                          * the next fast retransmit/recovery phase.
4227                          */
4228                         if (tcp->tcp_snd_sack_ok) {
4229                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4230                                     tcp);
4231                         }
4232                 } else {
4233                         if (tcp->tcp_snd_sack_ok &&
4234                             tcp->tcp_notsack_list != NULL) {
4235                                 flags |= TH_NEED_SACK_REXMIT;
4236                                 tcp->tcp_pipe -= mss;
4237                                 if (tcp->tcp_pipe < 0)
4238                                         tcp->tcp_pipe = 0;
4239                         } else {
4240                                 /*
4241                                  * Hoe's algorithm:
4242                                  *


4263                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4264                          * to the correct value.
4265                          *
4266                          * Note that SEQ_LEQ() is used.  This is to avoid
4267                          * unnecessary fast retransmit caused by dup ACKs
4268                          * received when TCP does slow start retransmission
4269                          * after a time out.  During this phase, TCP may
4270                          * send out segments which are already received.
4271                          * This causes dup ACKs to be sent back.
4272                          */
4273                         if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
4274                                 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
4275                                         tcp->tcp_rexmit_nxt = seg_ack;
4276                                 }
4277                                 if (seg_ack != tcp->tcp_rexmit_max) {
4278                                         flags |= TH_XMIT_NEEDED;
4279                                 }
4280                         } else {
4281                                 tcp->tcp_rexmit = B_FALSE;
4282                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
4283                                 tcp->tcp_snd_burst = tcp->tcp_localnet ?
4284                                     TCP_CWND_INFINITE : TCP_CWND_NORMAL;
4285                         }
4286                         tcp->tcp_ms_we_have_waited = 0;
4287                 }
4288         }
4289 
4290         TCPS_BUMP_MIB(tcps, tcpInAckSegs);
4291         TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked);
4292         tcp->tcp_suna = seg_ack;
4293         if (tcp->tcp_zero_win_probe != 0) {
4294                 tcp->tcp_zero_win_probe = 0;
4295                 tcp->tcp_timer_backoff = 0;
4296         }
4297 
4298         /*
4299          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4300          * Note that it cannot be the SYN being ack'ed.  The code flow
4301          * will not reach here.
4302          */
4303         if (mp1 == NULL) {
4304                 goto fin_acked;




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2011 Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 by Delphix. All rights reserved.
  27  */
  28 
  29 /* This file contains all TCP input processing functions. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/suntpi.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/tsol/tnet.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>


2628                                 freemsg(mp);
2629                                 return;
2630                         }
2631                         TCPS_CONN_INC(tcps);
2632                         /* SYN was acked - making progress */
2633                         tcp->tcp_ip_forward_progress = B_TRUE;
2634 
2635                         /* One for the SYN */
2636                         tcp->tcp_suna = tcp->tcp_iss + 1;
2637                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2638 
2639                         /*
2640                          * If SYN was retransmitted, need to reset all
2641                          * retransmission info.  This is because this
2642                          * segment will be treated as a dup ACK.
2643                          */
2644                         if (tcp->tcp_rexmit) {
2645                                 tcp->tcp_rexmit = B_FALSE;
2646                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2647                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;


2648                                 tcp->tcp_ms_we_have_waited = 0;
2649 
2650                                 /*
2651                                  * Set tcp_cwnd back to 1 MSS, per
2652                                  * recommendation from
2653                                  * draft-floyd-incr-init-win-01.txt,
2654                                  * Increasing TCP's Initial Window.
2655                                  */
2656                                 tcp->tcp_cwnd = tcp->tcp_mss;
2657                         }
2658 
2659                         tcp->tcp_swl1 = seg_seq;
2660                         tcp->tcp_swl2 = seg_ack;
2661 
2662                         new_swnd = ntohs(tcpha->tha_win);
2663                         tcp->tcp_swnd = new_swnd;
2664                         if (new_swnd > tcp->tcp_max_swnd)
2665                                 tcp->tcp_max_swnd = new_swnd;
2666 
2667                         /*


3822                         DTRACE_TCP5(accept__established, mlbk_t *, NULL,
3823                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3824                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3825                 }
3826                 TCPS_CONN_INC(tcps);
3827 
3828                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3829                 bytes_acked--;
3830                 /* SYN was acked - making progress */
3831                 tcp->tcp_ip_forward_progress = B_TRUE;
3832 
3833                 /*
3834                  * If SYN was retransmitted, need to reset all
3835                  * retransmission info as this segment will be
3836                  * treated as a dup ACK.
3837                  */
3838                 if (tcp->tcp_rexmit) {
3839                         tcp->tcp_rexmit = B_FALSE;
3840                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3841                         tcp->tcp_rexmit_max = tcp->tcp_snxt;


3842                         tcp->tcp_ms_we_have_waited = 0;
3843                         tcp->tcp_cwnd = mss;
3844                 }
3845 
3846                 /*
3847                  * We set the send window to zero here.
3848                  * This is needed if there is data to be
3849                  * processed already on the queue.
3850                  * Later (at swnd_update label), the
3851                  * "new_swnd > tcp_swnd" condition is satisfied and
3852                  * the XMIT_NEEDED flag is set in the current
3853                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3854                  * called if there is already data on queue in
3855                  * this state.
3856                  */
3857                 tcp->tcp_swnd = 0;
3858 
3859                 if (new_swnd > tcp->tcp_max_swnd)
3860                         tcp->tcp_max_swnd = new_swnd;
3861                 tcp->tcp_swl1 = seg_seq;


3990                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3991                                 }
3992 
3993                                 /*
3994                                  * We do Hoe's algorithm.  Refer to her
3995                                  * paper "Improving the Start-up Behavior
3996                                  * of a Congestion Control Scheme for TCP,"
3997                                  * appeared in SIGCOMM'96.
3998                                  *
3999                                  * Save highest seq no we have sent so far.
4000                                  * Be careful about the invisible FIN byte.
4001                                  */
4002                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4003                                     (tcp->tcp_unsent == 0)) {
4004                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
4005                                 } else {
4006                                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
4007                                 }
4008 
4009                                 /*









4010                                  * For SACK:
4011                                  * Calculate tcp_pipe, which is the
4012                                  * estimated number of bytes in
4013                                  * network.
4014                                  *
4015                                  * tcp_fack is the highest sack'ed seq num
4016                                  * TCP has received.
4017                                  *
4018                                  * tcp_pipe is explained in the above quoted
4019                                  * Fall and Floyd's paper.  tcp_fack is
4020                                  * explained in Mathis and Mahdavi's
4021                                  * "Forward Acknowledgment: Refining TCP
4022                                  * Congestion Control" in SIGCOMM '96.
4023                                  */
4024                                 if (tcp->tcp_snd_sack_ok) {
4025                                         if (tcp->tcp_notsack_list != NULL) {
4026                                                 tcp->tcp_pipe = tcp->tcp_snxt -
4027                                                     tcp->tcp_fack;
4028                                                 tcp->tcp_sack_snxt = seg_ack;
4029                                                 flags |= TH_NEED_SACK_REXMIT;


4189 
4190         /*
4191          * If we got an ACK after fast retransmit, check to see
4192          * if it is a partial ACK.  If it is not and the congestion
4193          * window was inflated to account for the other side's
4194          * cached packets, retract it.  If it is, do Hoe's algorithm.
4195          */
4196         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4197                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4198                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4199                         tcp->tcp_dupack_cnt = 0;
4200                         /*
4201                          * Restore the orig tcp_cwnd_ssthresh after
4202                          * fast retransmit phase.
4203                          */
4204                         if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4205                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4206                         }
4207                         tcp->tcp_rexmit_max = seg_ack;
4208                         tcp->tcp_cwnd_cnt = 0;


4209 
4210                         /*
4211                          * Remove all notsack info to avoid confusion with
4212                          * the next fast retransmit/recovery phase.
4213                          */
4214                         if (tcp->tcp_snd_sack_ok) {
4215                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4216                                     tcp);
4217                         }
4218                 } else {
4219                         if (tcp->tcp_snd_sack_ok &&
4220                             tcp->tcp_notsack_list != NULL) {
4221                                 flags |= TH_NEED_SACK_REXMIT;
4222                                 tcp->tcp_pipe -= mss;
4223                                 if (tcp->tcp_pipe < 0)
4224                                         tcp->tcp_pipe = 0;
4225                         } else {
4226                                 /*
4227                                  * Hoe's algorithm:
4228                                  *


4249                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4250                          * to the correct value.
4251                          *
4252                          * Note that SEQ_LEQ() is used.  This is to avoid
4253                          * unnecessary fast retransmit caused by dup ACKs
4254                          * received when TCP does slow start retransmission
4255                          * after a time out.  During this phase, TCP may
4256                          * send out segments which are already received.
4257                          * This causes dup ACKs to be sent back.
4258                          */
4259                         if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
4260                                 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
4261                                         tcp->tcp_rexmit_nxt = seg_ack;
4262                                 }
4263                                 if (seg_ack != tcp->tcp_rexmit_max) {
4264                                         flags |= TH_XMIT_NEEDED;
4265                                 }
4266                         } else {
4267                                 tcp->tcp_rexmit = B_FALSE;
4268                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;


4269                         }
4270                         tcp->tcp_ms_we_have_waited = 0;
4271                 }
4272         }
4273 
4274         TCPS_BUMP_MIB(tcps, tcpInAckSegs);
4275         TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked);
4276         tcp->tcp_suna = seg_ack;
4277         if (tcp->tcp_zero_win_probe != 0) {
4278                 tcp->tcp_zero_win_probe = 0;
4279                 tcp->tcp_timer_backoff = 0;
4280         }
4281 
4282         /*
4283          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4284          * Note that it cannot be the SYN being ack'ed.  The code flow
4285          * will not reach here.
4286          */
4287         if (mp1 == NULL) {
4288                 goto fin_acked;