Print this page
5295 remove maxburst logic from TCP's send algorithm Reviewed by: Dan McDonald <danmcd@omniti.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.

  24  */
  25 
  26 /* This file contains all TCP output processing functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/suntpi.h>
  37 #include <sys/xti_inet.h>
  38 #include <sys/timod.h>
  39 #include <sys/pattr.h>
  40 #include <sys/squeue_impl.h>
  41 #include <sys/squeue.h>
  42 #include <sys/sockio.h>
  43 #include <sys/tsol/tnet.h>


1744         /* Guard against a RST having blown it away while on the squeue */
1745         if (tcp->tcp_state == TCPS_CLOSED) {
1746                 freemsg(mp);
1747                 return;
1748         }
1749 
1750         /*
1751          * In the off-chance that the eager received and responded to
1752          * some other packet while the SYN|ACK was queued, we recalculate
1753          * the ixa_pktlen. It would be better to fix the SYN/accept
1754          * multithreading scheme to avoid this complexity.
1755          */
1756         ixa->ixa_pktlen = msgdsize(mp);
1757         (void) conn_ip_output(mp, ixa);
1758 }
1759 
1760 /*
1761  * tcp_send() is called by tcp_wput_data() and returns one of the following:
1762  *
1763  * -1 = failed allocation.
1764  *  0 = success; either the burst count was reached, or the usable send
1765  *      window is too small and we'd rather wait until later to send again.
1766  */
1767 static int
1768 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1769     const int tcp_hdr_len, const int num_sack_blk, int *usable,
1770     uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1771 {
1772         int             num_burst_seg = tcp->tcp_snd_burst;
1773         int             num_lso_seg = 1;
1774         uint_t          lso_usable;
1775         boolean_t       do_lso_send = B_FALSE;
1776         tcp_stack_t     *tcps = tcp->tcp_tcps;
1777         conn_t          *connp = tcp->tcp_connp;
1778         ip_xmit_attr_t  *ixa = connp->conn_ixa;
1779 
1780         /*
1781          * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782          * the underlying connection is LSO capable. Will check whether having
1783          * enough available data to initiate LSO transmission in the for(){}
1784          * loops.
1785          */
1786         if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1787                 do_lso_send = B_TRUE;
1788 
1789         for (;;) {
1790                 struct datab    *db;
1791                 tcpha_t         *tcpha;
1792                 uint32_t        sum;
1793                 mblk_t          *mp, *mp1;
1794                 uchar_t         *rptr;
1795                 int             len;
1796 
1797                 /*
1798                  * Burst count reached, return successfully.
1799                  */
1800                 if (num_burst_seg == 0)
1801                         break;
1802 
1803                 /*
1804                  * Calculate the maximum payload length we can send at one
1805                  * time.
1806                  */
1807                 if (do_lso_send) {
1808                         /*
1809                          * Check whether we are able to do LSO for the
1810                          * currently available data.
1811                          */
1812                         if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
1813                                 lso_usable = MIN(tcp->tcp_lso_max, *usable);
1814                                 lso_usable = MIN(lso_usable,
1815                                     num_burst_seg * mss);
1816 
1817                                 num_lso_seg = lso_usable / mss;
1818                                 if (lso_usable % mss) {
1819                                         num_lso_seg++;
1820                                         tcp->tcp_last_sent_len = (ushort_t)
1821                                             (lso_usable % mss);
1822                                 } else {
1823                                         tcp->tcp_last_sent_len = (ushort_t)mss;
1824                                 }
1825                         } else {
1826                                 do_lso_send = B_FALSE;
1827                                 num_lso_seg = 1;
1828                                 lso_usable = mss;
1829                         }
1830                 }
1831 
1832                 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1833 #ifdef DEBUG
1834                 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
1835                     do_lso_send);
1836 #endif
1837                 /*
1838                  * Adjust num_burst_seg here.
1839                  */
1840                 num_burst_seg -= num_lso_seg;
1841 
1842                 len = mss;
1843                 if (len > *usable) {
1844                         ASSERT(do_lso_send == B_FALSE);
1845 
1846                         len = *usable;
1847                         if (len <= 0) {
1848                                 /* Terminate the loop */
1849                                 break;  /* success; too small */
1850                         }
1851                         /*
1852                          * Sender silly-window avoidance.
1853                          * Ignore this if we are going to send a
1854                          * zero window probe out.
1855                          *
1856                          * TODO: force data into microscopic window?
1857                          *      ==> (!pushed || (unsent > usable))
1858                          */
1859                         if (len < (tcp->tcp_max_swnd >> 1) &&
1860                             (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&


3407 
3408                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3409                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3410                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3411                 /*
3412                  * Update tcp_rexmit_max to extend this SACK recovery phase.
3413                  * This happens when new data sent during fast recovery is
3414                  * also lost.  If TCP retransmits that new data, it needs
3415                  * to extend the SACK recovery phase to avoid starting another
3416                  * fast retransmit/recovery unnecessarily.
3417                  */
3418                 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3419                         tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3420                 }
3421         }
3422 }
3423 
3424 /*
3425  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3426  * or ICMP errors.
3427  *
3428  * To limit the number of duplicate segments, we limit the number of segments
3429  * sent in one call to tcp_snd_burst (held in the local variable burst).
      *
      * The range retransmitted is [tcp_rexmit_nxt, tcp_rexmit_max), starting no
      * earlier than tcp_suna.  Segments of at most tcp_mss bytes are sent while
      * the window (the smaller of tcp_cwnd and tcp_swnd, less the bytes between
      * tcp_suna and the starting point) is positive and the burst budget is not
      * used up; tcp_rexmit_nxt is advanced past each segment sent.  When the
      * whole range has been covered and the remaining window is nonzero, or
      * when there was nothing to retransmit, any unsent data (tcp_unsent != 0)
      * is handed to tcp_wput_data().
      *
      * Parameters:
      *      tcp_t *tcp: the tcp instance pointer.
3430  */
3431 void
3432 tcp_ss_rexmit(tcp_t *tcp)
3433 {
3434         uint32_t        snxt;   /* next sequence number to retransmit */
3435         uint32_t        smax;   /* end (exclusive) of the range */
3436         int32_t         win;    /* bytes the window still allows */
3437         int32_t         mss;
3438         int32_t         off;    /* offset into snxt_mp, from tcp_get_seg_mp() */
3439         int32_t         burst = tcp->tcp_snd_burst;
3440         mblk_t          *snxt_mp;
3441         tcp_stack_t     *tcps = tcp->tcp_tcps;
3442 
3443         /*
3444          * Note that tcp_rexmit can be set even though TCP has retransmitted
3445          * all unack'ed segments.
3446          */
3447         if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3448                 smax = tcp->tcp_rexmit_max;
3449                 snxt = tcp->tcp_rexmit_nxt;
                     /* Never start before the first unacknowledged byte. */
3450                 if (SEQ_LT(snxt, tcp->tcp_suna)) {
3451                         snxt = tcp->tcp_suna;
3452                 }
                     /*
                      * Bytes we may still send: the smaller of tcp_cwnd and
                      * tcp_swnd, less the bytes between tcp_suna and the
                      * point where retransmission starts.
                      */
3453                 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3454                 win -= snxt - tcp->tcp_suna;
3455                 mss = tcp->tcp_mss;
3456                 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3457 
3458                 while (SEQ_LT(snxt, smax) && (win > 0) &&
3459                     (burst > 0) && (snxt_mp != NULL)) {
3460                         mblk_t  *xmit_mp;
3461                         mblk_t  *old_snxt_mp = snxt_mp;
3462                         uint32_t cnt = mss;
3463 
                             /*
                              * Clamp the segment length to the remaining
                              * window and to the end of the range.
                              */
3464                         if (win < cnt) {
3465                                 cnt = win;
3466                         }
3467                         if (SEQ_GT(snxt + cnt, smax)) {
3468                                 cnt = smax - snxt;
3469                         }
                             /*
                              * off, snxt_mp and cnt are passed by reference;
                              * NOTE(review): presumably tcp_xmit_mp() updates
                              * them to describe what was actually packaged --
                              * confirm against tcp_xmit_mp().
                              */
3470                         xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3471                             &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
                             /*
                              * Give up for now if no segment could be built.
                              * NOTE(review): NULL presumably means an
                              * allocation failure -- confirm.
                              */
3472                         if (xmit_mp == NULL)
3473                                 return;
3474 
3475                         tcp_send_data(tcp, xmit_mp);
3476 
3477                         snxt += cnt;
3478                         win -= cnt;
3479                         /*
3480                          * Update the send timestamp to avoid false
3481                          * retransmission.
3482                          */
3483                         old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3484                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3485                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3486 
3487                         tcp->tcp_rexmit_nxt = snxt;
3488                         burst--;
3489                 }
3490                 /*
3491                  * If we have transmitted all we have at the time
3492                  * we started the retransmission, we can leave
3493                  * the rest of the job to tcp_wput_data().  But we
3494                  * need to check the send window first.  If the
3495                  * win is not 0, go on with tcp_wput_data().
3496                  */
3497                 if (SEQ_LT(snxt, smax) || win == 0) {
3498                         return;
3499                 }
3500         }
3501         /* Only call tcp_wput_data() if there is data to be sent. */
3502         if (tcp->tcp_unsent) {
3503                 tcp_wput_data(tcp, NULL, B_FALSE);
3504         }
3505 }
3506 
3507 /*
3508  * Do slow start retransmission after ICMP errors of PMTU changes.
      *
      * Returns without doing anything when there is no unacknowledged data
      * (tcp_suna is not before tcp_snxt) or the transmit list (tcp_xmit_head)
      * is empty.  Otherwise the retransmission state is reset so that
      * everything from tcp_suna onward is eligible again, tcp_snd_burst is
      * set to TCP_CWND_SS, and tcp_ss_rexmit() is called to do the sending.
      *
      * Parameters:
      *      tcp_t *tcp: the tcp instance pointer.
3509  */
3510 void
3511 tcp_rexmit_after_error(tcp_t *tcp)
3512 {
3513         /*
3514          * If all sent data has been acknowledged, or there is no data left
3515          * to send, just return.
3516          */
3517         if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3518             (tcp->tcp_xmit_head == NULL))
3519                 return;
3520 
             /*
              * The range to retransmit ends at tcp_fss when TCP_FSS_VALID is
              * set and nothing is left unsent; otherwise it ends at tcp_snxt.
              */
3521         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3522                 tcp->tcp_rexmit_max = tcp->tcp_fss;
3523         else
3524                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3525 
             /* Restart from the first unacknowledged byte. */
3526         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3527         tcp->tcp_rexmit = B_TRUE;
3528         tcp->tcp_dupack_cnt = 0;
             /* tcp_ss_rexmit() reads tcp_snd_burst as its per-call segment limit. */
3529         tcp->tcp_snd_burst = TCP_CWND_SS;
3530         tcp_ss_rexmit(tcp);
3531 }
3532 
3533 /*
3534  * tcp_get_seg_mp() is called to get the pointer to a segment in the
3535  * send queue which starts at the given sequence number. If the given
3536  * sequence number is equal to last valid sequence number (tcp_snxt), the
3537  * returned mblk is the last valid mblk, and off is set to the length of
3538  * that mblk.
3539  *
3540  * send queue which starts at the given seq. no.
3541  *
3542  * Parameters:
3543  *      tcp_t *tcp: the tcp instance pointer.
3544  *      uint32_t seq: the starting seq. no of the requested segment.
3545  *      int32_t *off: after the execution, *off will be the offset to
3546  *              the returned mblk which points to the requested seq no.
3547  *              It is the caller's responsibility to send in a non-null off.
3548  *
3549  * Return:




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014 by Delphix. All rights reserved.
  25  */
  26 
  27 /* This file contains all TCP output processing functions. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsun.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/stropts.h>
  34 #include <sys/strlog.h>
  35 #define _SUN_TPI_VERSION 2
  36 #include <sys/tihdr.h>
  37 #include <sys/suntpi.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/timod.h>
  40 #include <sys/pattr.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/sockio.h>
  44 #include <sys/tsol/tnet.h>


1745         /* Guard against a RST having blown it away while on the squeue */
1746         if (tcp->tcp_state == TCPS_CLOSED) {
1747                 freemsg(mp);
1748                 return;
1749         }
1750 
1751         /*
1752          * In the off-chance that the eager received and responded to
1753          * some other packet while the SYN|ACK was queued, we recalculate
1754          * the ixa_pktlen. It would be better to fix the SYN/accept
1755          * multithreading scheme to avoid this complexity.
1756          */
1757         ixa->ixa_pktlen = msgdsize(mp);
1758         (void) conn_ip_output(mp, ixa);
1759 }
1760 
1761 /*
1762  * tcp_send() is called by tcp_wput_data() and returns one of the following:
1763  *
1764  * -1 = failed allocation.
1765  *  0 = We've either successfully sent data, or our usable send window is too
1766  *      small and we'd rather wait until later before sending again.
1767  */
1768 static int
1769 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1770     const int tcp_hdr_len, const int num_sack_blk, int *usable,
1771     uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1772 {

1773         int             num_lso_seg = 1;
1774         uint_t          lso_usable;
1775         boolean_t       do_lso_send = B_FALSE;
1776         tcp_stack_t     *tcps = tcp->tcp_tcps;
1777         conn_t          *connp = tcp->tcp_connp;
1778         ip_xmit_attr_t  *ixa = connp->conn_ixa;
1779 
1780         /*
1781          * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782          * the underlying connection is LSO capable. Will check whether having
1783          * enough available data to initiate LSO transmission in the for(){}
1784          * loops.
1785          */
1786         if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1787                 do_lso_send = B_TRUE;
1788 
1789         for (;;) {
1790                 struct datab    *db;
1791                 tcpha_t         *tcpha;
1792                 uint32_t        sum;
1793                 mblk_t          *mp, *mp1;
1794                 uchar_t         *rptr;
1795                 int             len;
1796 
1797                 /*






1798                  * Calculate the maximum payload length we can send at one
1799                  * time.
1800                  */
1801                 if (do_lso_send) {
1802                         /*
1803                          * Determine whether or not it's possible to do LSO,
1804                          * and if so, how much data we can send.
1805                          */
1806                         if ((*usable - 1) / mss >= 1) {
1807                                 lso_usable = MIN(tcp->tcp_lso_max, *usable);



1808                                 num_lso_seg = lso_usable / mss;
1809                                 if (lso_usable % mss) {
1810                                         num_lso_seg++;
1811                                         tcp->tcp_last_sent_len = (ushort_t)
1812                                             (lso_usable % mss);
1813                                 } else {
1814                                         tcp->tcp_last_sent_len = (ushort_t)mss;
1815                                 }
1816                         } else {
1817                                 do_lso_send = B_FALSE;
1818                                 num_lso_seg = 1;
1819                                 lso_usable = mss;
1820                         }
1821                 }
1822 
1823                 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);








1824 
1825                 len = mss;
1826                 if (len > *usable) {
1827                         ASSERT(do_lso_send == B_FALSE);
1828 
1829                         len = *usable;
1830                         if (len <= 0) {
1831                                 /* Terminate the loop */
1832                                 break;  /* success; too small */
1833                         }
1834                         /*
1835                          * Sender silly-window avoidance.
1836                          * Ignore this if we are going to send a
1837                          * zero window probe out.
1838                          *
1839                          * TODO: force data into microscopic window?
1840                          *      ==> (!pushed || (unsent > usable))
1841                          */
1842                         if (len < (tcp->tcp_max_swnd >> 1) &&
1843                             (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&


3390 
3391                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3392                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3393                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3394                 /*
3395                  * Update tcp_rexmit_max to extend this SACK recovery phase.
3396                  * This happens when new data sent during fast recovery is
3397                  * also lost.  If TCP retransmits that new data, it needs
3398                  * to extend the SACK recovery phase to avoid starting another
3399                  * fast retransmit/recovery unnecessarily.
3400                  */
3401                 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3402                         tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3403                 }
3404         }
3405 }
3406 
3407 /*
3408  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3409  * or ICMP errors.
      *
      * The range retransmitted is [tcp_rexmit_nxt, tcp_rexmit_max), starting no
      * earlier than tcp_suna.  Segments of at most tcp_mss bytes are sent while
      * the window (the smaller of tcp_cwnd and tcp_swnd, less the bytes between
      * tcp_suna and the starting point) is positive; tcp_rexmit_nxt is advanced
      * past each segment sent.  When the whole range has been covered and the
      * remaining window is nonzero, or when there was nothing to retransmit,
      * any unsent data (tcp_unsent != 0) is handed to tcp_wput_data().
      *
      * Parameters:
      *      tcp_t *tcp: the tcp instance pointer.
3410  */
3411 void
3412 tcp_ss_rexmit(tcp_t *tcp)
3413 {
3414         uint32_t        snxt;   /* next sequence number to retransmit */
3415         uint32_t        smax;   /* end (exclusive) of the range */
3416         int32_t         win;    /* bytes the window still allows */
3417         int32_t         mss;
3418         int32_t         off;    /* offset into snxt_mp, from tcp_get_seg_mp() */

3419         mblk_t          *snxt_mp;
3420         tcp_stack_t     *tcps = tcp->tcp_tcps;
3421 
3422         /*
3423          * Note that tcp_rexmit can be set even though TCP has retransmitted
3424          * all unack'ed segments.
3425          */
3426         if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3427                 smax = tcp->tcp_rexmit_max;
3428                 snxt = tcp->tcp_rexmit_nxt;
                     /* Never start before the first unacknowledged byte. */
3429                 if (SEQ_LT(snxt, tcp->tcp_suna)) {
3430                         snxt = tcp->tcp_suna;
3431                 }
                     /*
                      * Bytes we may still send: the smaller of tcp_cwnd and
                      * tcp_swnd, less the bytes between tcp_suna and the
                      * point where retransmission starts.
                      */
3432                 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3433                 win -= snxt - tcp->tcp_suna;
3434                 mss = tcp->tcp_mss;
3435                 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3436 
3437                 while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) {

3438                         mblk_t  *xmit_mp;
3439                         mblk_t  *old_snxt_mp = snxt_mp;
3440                         uint32_t cnt = mss;
3441 
                             /*
                              * Clamp the segment length to the remaining
                              * window and to the end of the range.
                              */
3442                         if (win < cnt) {
3443                                 cnt = win;
3444                         }
3445                         if (SEQ_GT(snxt + cnt, smax)) {
3446                                 cnt = smax - snxt;
3447                         }
                             /*
                              * off, snxt_mp and cnt are passed by reference;
                              * NOTE(review): presumably tcp_xmit_mp() updates
                              * them to describe what was actually packaged --
                              * confirm against tcp_xmit_mp().
                              */
3448                         xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3449                             &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
                             /*
                              * Give up for now if no segment could be built.
                              * NOTE(review): NULL presumably means an
                              * allocation failure -- confirm.
                              */
3450                         if (xmit_mp == NULL)
3451                                 return;
3452 
3453                         tcp_send_data(tcp, xmit_mp);
3454 
3455                         snxt += cnt;
3456                         win -= cnt;
3457                         /*
3458                          * Update the send timestamp to avoid false
3459                          * retransmission.
3460                          */
3461                         old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3462                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3463                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3464 
3465                         tcp->tcp_rexmit_nxt = snxt;

3466                 }
3467                 /*
3468                  * If we have transmitted all we have at the time
3469                  * we started the retransmission, we can leave
3470                  * the rest of the job to tcp_wput_data().  But we
3471                  * need to check the send window first.  If the
3472                  * win is not 0, go on with tcp_wput_data().
3473                  */
3474                 if (SEQ_LT(snxt, smax) || win == 0) {
3475                         return;
3476                 }
3477         }
3478         /* Only call tcp_wput_data() if there is data to be sent. */
3479         if (tcp->tcp_unsent) {
3480                 tcp_wput_data(tcp, NULL, B_FALSE);
3481         }
3482 }
3483 
3484 /*
3485  * Do slow start retransmission after ICMP errors of PMTU changes.
      *
      * Returns without doing anything when there is no unacknowledged data
      * (tcp_suna is not before tcp_snxt) or the transmit list (tcp_xmit_head)
      * is empty.  Otherwise the retransmission state is reset so that
      * everything from tcp_suna onward is eligible again, and tcp_ss_rexmit()
      * is called to do the sending.
      *
      * Parameters:
      *      tcp_t *tcp: the tcp instance pointer.
3486  */
3487 void
3488 tcp_rexmit_after_error(tcp_t *tcp)
3489 {
3490         /*
3491          * If all sent data has been acknowledged, or there is no data left
3492          * to send, just return.
3493          */
3494         if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3495             (tcp->tcp_xmit_head == NULL))
3496                 return;
3497 
             /*
              * The range to retransmit ends at tcp_fss when TCP_FSS_VALID is
              * set and nothing is left unsent; otherwise it ends at tcp_snxt.
              */
3498         if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3499                 tcp->tcp_rexmit_max = tcp->tcp_fss;
3500         else
3501                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3502 
             /* Restart from the first unacknowledged byte. */
3503         tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3504         tcp->tcp_rexmit = B_TRUE;
3505         tcp->tcp_dupack_cnt = 0;

3506         tcp_ss_rexmit(tcp);
3507 }
3508 
3509 /*
3510  * tcp_get_seg_mp() is called to get the pointer to a segment in the
3511  * send queue which starts at the given sequence number. If the given
3512  * sequence number is equal to last valid sequence number (tcp_snxt), the
3513  * returned mblk is the last valid mblk, and off is set to the length of
3514  * that mblk.
3515  *
3516  * send queue which starts at the given seq. no.
3517  *
3518  * Parameters:
3519  *      tcp_t *tcp: the tcp instance pointer.
3520  *      uint32_t seq: the starting seq. no of the requested segment.
3521  *      int32_t *off: after the execution, *off will be the offset to
3522  *              the returned mblk which points to the requested seq no.
3523  *              It is the caller's responsibility to send in a non-null off.
3524  *
3525  * Return: