4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* This file contains all TCP output processing functions. */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/timod.h>
39 #include <sys/pattr.h>
40 #include <sys/squeue_impl.h>
41 #include <sys/squeue.h>
42 #include <sys/sockio.h>
43 #include <sys/tsol/tnet.h>
1744 /* Guard against a RST having blown it away while on the squeue */
1745 if (tcp->tcp_state == TCPS_CLOSED) {
1746 freemsg(mp);
1747 return;
1748 }
1749
1750 /*
1751 * In the off-chance that the eager received and responded to
1752 * some other packet while the SYN|ACK was queued, we recalculate
1753 * the ixa_pktlen. It would be better to fix the SYN/accept
1754 * multithreading scheme to avoid this complexity.
1755 */
1756 ixa->ixa_pktlen = msgdsize(mp);
1757 (void) conn_ip_output(mp, ixa);
1758 }
1759
1760 /*
1761 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1762 *
1763 * -1 = failed allocation.
1764 * 0 = success; burst count reached, or usable send window is too small,
1765 * and that we'd rather wait until later before sending again.
1766 */
1767 static int
1768 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1769 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1770 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1771 {
1772 int num_burst_seg = tcp->tcp_snd_burst;
1773 int num_lso_seg = 1;
1774 uint_t lso_usable;
1775 boolean_t do_lso_send = B_FALSE;
1776 tcp_stack_t *tcps = tcp->tcp_tcps;
1777 conn_t *connp = tcp->tcp_connp;
1778 ip_xmit_attr_t *ixa = connp->conn_ixa;
1779
1780 /*
1781 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782 * the underlying connection is LSO capable. Will check whether having
1783 * enough available data to initiate LSO transmission in the for(){}
1784 * loops.
1785 */
1786 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1787 do_lso_send = B_TRUE;
1788
1789 for (;;) {
1790 struct datab *db;
1791 tcpha_t *tcpha;
1792 uint32_t sum;
1793 mblk_t *mp, *mp1;
1794 uchar_t *rptr;
1795 int len;
1796
1797 /*
1798 * Burst count reached, return successfully.
1799 */
1800 if (num_burst_seg == 0)
1801 break;
1802
1803 /*
1804 * Calculate the maximum payload length we can send at one
1805 * time.
1806 */
1807 if (do_lso_send) {
1808 /*
1809 * Check whether we are able to do LSO for the currently
1810 * available data.
1811 */
1812 if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
1813 lso_usable = MIN(tcp->tcp_lso_max, *usable);
1814 lso_usable = MIN(lso_usable,
1815 num_burst_seg * mss);
1816
1817 num_lso_seg = lso_usable / mss;
1818 if (lso_usable % mss) {
1819 num_lso_seg++;
1820 tcp->tcp_last_sent_len = (ushort_t)
1821 (lso_usable % mss);
1822 } else {
1823 tcp->tcp_last_sent_len = (ushort_t)mss;
1824 }
1825 } else {
1826 do_lso_send = B_FALSE;
1827 num_lso_seg = 1;
1828 lso_usable = mss;
1829 }
1830 }
1831
1832 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1833 #ifdef DEBUG
1834 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
1835 do_lso_send);
1836 #endif
1837 /*
1838 * Adjust num_burst_seg here.
1839 */
1840 num_burst_seg -= num_lso_seg;
1841
1842 len = mss;
1843 if (len > *usable) {
1844 ASSERT(do_lso_send == B_FALSE);
1845
1846 len = *usable;
1847 if (len <= 0) {
1848 /* Terminate the loop */
1849 break; /* success; too small */
1850 }
1851 /*
1852 * Sender silly-window avoidance.
1853 * Ignore this if we are going to send a
1854 * zero window probe out.
1855 *
1856 * TODO: force data into microscopic window?
1857 * ==> (!pushed || (unsent > usable))
1858 */
1859 if (len < (tcp->tcp_max_swnd >> 1) &&
1860 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
3407
3408 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3409 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3410 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3411 /*
3412 * Update tcp_rexmit_max to extend this SACK recovery phase.
3413 * This happens when new data sent during fast recovery is
3414 * also lost. If TCP retransmits those new data, it needs
3415 * to extend SACK recover phase to avoid starting another
3416 * fast retransmit/recovery unnecessarily.
3417 */
3418 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3419 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3420 }
3421 }
3422 }
3423
3424 /*
3425 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3426 * or ICMP errors.
3427 *
3428 * To limit the number of duplicate segments, we limit the number of segment
3429 * to be sent in one time to tcp_snd_burst, the burst variable.
3430 */
3431 void
3432 tcp_ss_rexmit(tcp_t *tcp)
3433 {
3434 uint32_t snxt;
3435 uint32_t smax;
3436 int32_t win;
3437 int32_t mss;
3438 int32_t off;
3439 int32_t burst = tcp->tcp_snd_burst;
3440 mblk_t *snxt_mp;
3441 tcp_stack_t *tcps = tcp->tcp_tcps;
3442
3443 /*
3444 * Note that tcp_rexmit can be set even though TCP has retransmitted
3445 * all unack'ed segments.
3446 */
3447 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3448 smax = tcp->tcp_rexmit_max;
3449 snxt = tcp->tcp_rexmit_nxt;
3450 if (SEQ_LT(snxt, tcp->tcp_suna)) {
3451 snxt = tcp->tcp_suna;
3452 }
3453 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3454 win -= snxt - tcp->tcp_suna;
3455 mss = tcp->tcp_mss;
3456 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3457
3458 while (SEQ_LT(snxt, smax) && (win > 0) &&
3459 (burst > 0) && (snxt_mp != NULL)) {
3460 mblk_t *xmit_mp;
3461 mblk_t *old_snxt_mp = snxt_mp;
3462 uint32_t cnt = mss;
3463
3464 if (win < cnt) {
3465 cnt = win;
3466 }
3467 if (SEQ_GT(snxt + cnt, smax)) {
3468 cnt = smax - snxt;
3469 }
3470 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3471 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3472 if (xmit_mp == NULL)
3473 return;
3474
3475 tcp_send_data(tcp, xmit_mp);
3476
3477 snxt += cnt;
3478 win -= cnt;
3479 /*
3480 * Update the send timestamp to avoid false
3481 * retransmission.
3482 */
3483 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3484 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3485 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3486
3487 tcp->tcp_rexmit_nxt = snxt;
3488 burst--;
3489 }
3490 /*
3491 * If we have transmitted all we have at the time
3492 * we started the retranmission, we can leave
3493 * the rest of the job to tcp_wput_data(). But we
3494 * need to check the send window first. If the
3495 * win is not 0, go on with tcp_wput_data().
3496 */
3497 if (SEQ_LT(snxt, smax) || win == 0) {
3498 return;
3499 }
3500 }
3501 /* Only call tcp_wput_data() if there is data to be sent. */
3502 if (tcp->tcp_unsent) {
3503 tcp_wput_data(tcp, NULL, B_FALSE);
3504 }
3505 }
3506
3507 /*
3508 * Do slow start retransmission after ICMP errors of PMTU changes.
3509 */
3510 void
3511 tcp_rexmit_after_error(tcp_t *tcp)
3512 {
3513 /*
3514 * All sent data has been acknowledged or no data left to send, just
3515 * to return.
3516 */
3517 if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3518 (tcp->tcp_xmit_head == NULL))
3519 return;
3520
3521 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3522 tcp->tcp_rexmit_max = tcp->tcp_fss;
3523 else
3524 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3525
3526 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3527 tcp->tcp_rexmit = B_TRUE;
3528 tcp->tcp_dupack_cnt = 0;
3529 tcp->tcp_snd_burst = TCP_CWND_SS;
3530 tcp_ss_rexmit(tcp);
3531 }
3532
3533 /*
3534 * tcp_get_seg_mp() is called to get the pointer to a segment in the
3535 * send queue which starts at the given sequence number. If the given
3536 * sequence number is equal to last valid sequence number (tcp_snxt), the
3537 * returned mblk is the last valid mblk, and off is set to the length of
3538 * that mblk.
3539 *
3540 * send queue which starts at the given seq. no.
3541 *
3542 * Parameters:
3543 * tcp_t *tcp: the tcp instance pointer.
3544 * uint32_t seq: the starting seq. no of the requested segment.
3545 * int32_t *off: after the execution, *off will be the offset to
3546 * the returned mblk which points to the requested seq no.
3547 * It is the caller's responsibility to send in a non-null off.
3548 *
3549 * Return:
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
25 */
26
27 /* This file contains all TCP output processing functions. */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
1745 /* Guard against a RST having blown it away while on the squeue */
1746 if (tcp->tcp_state == TCPS_CLOSED) {
1747 freemsg(mp);
1748 return;
1749 }
1750
1751 /*
1752 * In the off-chance that the eager received and responded to
1753 * some other packet while the SYN|ACK was queued, we recalculate
1754 * the ixa_pktlen. It would be better to fix the SYN/accept
1755 * multithreading scheme to avoid this complexity.
1756 */
1757 ixa->ixa_pktlen = msgdsize(mp);
1758 (void) conn_ip_output(mp, ixa);
1759 }
1760
1761 /*
1762 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1763 *
1764 * -1 = failed allocation.
1765 * 0 = We've either successfully sent data, or our usable send window is too
1766 * small and we'd rather wait until later before sending again.
1767 */
1768 static int
1769 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1770 const int tcp_hdr_len, const int num_sack_blk, int *usable,
1771 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1772 {
1773 int num_lso_seg = 1;
1774 uint_t lso_usable;
1775 boolean_t do_lso_send = B_FALSE;
1776 tcp_stack_t *tcps = tcp->tcp_tcps;
1777 conn_t *connp = tcp->tcp_connp;
1778 ip_xmit_attr_t *ixa = connp->conn_ixa;
1779
1780 /*
1781 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782 * the underlying connection is LSO capable. Will check whether having
1783 * enough available data to initiate LSO transmission in the for(){}
1784 * loops.
1785 */
1786 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1787 do_lso_send = B_TRUE;
1788
1789 for (;;) {
1790 struct datab *db;
1791 tcpha_t *tcpha;
1792 uint32_t sum;
1793 mblk_t *mp, *mp1;
1794 uchar_t *rptr;
1795 int len;
1796
1797 /*
1798 * Calculate the maximum payload length we can send at one
1799 * time.
1800 */
1801 if (do_lso_send) {
1802 /*
1803 * Determine whether or not it's possible to do LSO,
1804 * and if so, how much data we can send.
1805 */
1806 if ((*usable - 1) / mss >= 1) {
1807 lso_usable = MIN(tcp->tcp_lso_max, *usable);
1808 num_lso_seg = lso_usable / mss;
1809 if (lso_usable % mss) {
1810 num_lso_seg++;
1811 tcp->tcp_last_sent_len = (ushort_t)
1812 (lso_usable % mss);
1813 } else {
1814 tcp->tcp_last_sent_len = (ushort_t)mss;
1815 }
1816 } else {
1817 do_lso_send = B_FALSE;
1818 num_lso_seg = 1;
1819 lso_usable = mss;
1820 }
1821 }
1822
1823 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1824
1825 len = mss;
1826 if (len > *usable) {
1827 ASSERT(do_lso_send == B_FALSE);
1828
1829 len = *usable;
1830 if (len <= 0) {
1831 /* Terminate the loop */
1832 break; /* success; too small */
1833 }
1834 /*
1835 * Sender silly-window avoidance.
1836 * Ignore this if we are going to send a
1837 * zero window probe out.
1838 *
1839 * TODO: force data into microscopic window?
1840 * ==> (!pushed || (unsent > usable))
1841 */
1842 if (len < (tcp->tcp_max_swnd >> 1) &&
1843 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
3390
3391 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3392 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3393 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3394 /*
3395 * Update tcp_rexmit_max to extend this SACK recovery phase.
3396 * This happens when new data sent during fast recovery is
3397 * also lost. If TCP retransmits those new data, it needs
3398 * to extend SACK recover phase to avoid starting another
3399 * fast retransmit/recovery unnecessarily.
3400 */
3401 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3402 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3403 }
3404 }
3405 }
3406
3407 /*
3408 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3409 * or ICMP errors.
3410 */
3411 void
3412 tcp_ss_rexmit(tcp_t *tcp)
3413 {
3414 uint32_t snxt;
3415 uint32_t smax;
3416 int32_t win;
3417 int32_t mss;
3418 int32_t off;
3419 mblk_t *snxt_mp;
3420 tcp_stack_t *tcps = tcp->tcp_tcps;
3421
3422 /*
3423 * Note that tcp_rexmit can be set even though TCP has retransmitted
3424 * all unack'ed segments.
3425 */
3426 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3427 smax = tcp->tcp_rexmit_max;
3428 snxt = tcp->tcp_rexmit_nxt;
3429 if (SEQ_LT(snxt, tcp->tcp_suna)) {
3430 snxt = tcp->tcp_suna;
3431 }
3432 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3433 win -= snxt - tcp->tcp_suna;
3434 mss = tcp->tcp_mss;
3435 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3436
3437 while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) {
3438 mblk_t *xmit_mp;
3439 mblk_t *old_snxt_mp = snxt_mp;
3440 uint32_t cnt = mss;
3441
3442 if (win < cnt) {
3443 cnt = win;
3444 }
3445 if (SEQ_GT(snxt + cnt, smax)) {
3446 cnt = smax - snxt;
3447 }
3448 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3449 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3450 if (xmit_mp == NULL)
3451 return;
3452
3453 tcp_send_data(tcp, xmit_mp);
3454
3455 snxt += cnt;
3456 win -= cnt;
3457 /*
3458 * Update the send timestamp to avoid false
3459 * retransmission.
3460 */
3461 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3462 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3463 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3464
3465 tcp->tcp_rexmit_nxt = snxt;
3466 }
3467 /*
3468 * If we have transmitted all we have at the time
3469 * we started the retranmission, we can leave
3470 * the rest of the job to tcp_wput_data(). But we
3471 * need to check the send window first. If the
3472 * win is not 0, go on with tcp_wput_data().
3473 */
3474 if (SEQ_LT(snxt, smax) || win == 0) {
3475 return;
3476 }
3477 }
3478 /* Only call tcp_wput_data() if there is data to be sent. */
3479 if (tcp->tcp_unsent) {
3480 tcp_wput_data(tcp, NULL, B_FALSE);
3481 }
3482 }
3483
3484 /*
3485 * Do slow start retransmission after ICMP errors of PMTU changes.
3486 */
3487 void
3488 tcp_rexmit_after_error(tcp_t *tcp)
3489 {
3490 /*
3491 * All sent data has been acknowledged or no data left to send, just
3492 * to return.
3493 */
3494 if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3495 (tcp->tcp_xmit_head == NULL))
3496 return;
3497
3498 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3499 tcp->tcp_rexmit_max = tcp->tcp_fss;
3500 else
3501 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3502
3503 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3504 tcp->tcp_rexmit = B_TRUE;
3505 tcp->tcp_dupack_cnt = 0;
3506 tcp_ss_rexmit(tcp);
3507 }
3508
3509 /*
3510 * tcp_get_seg_mp() is called to get the pointer to a segment in the
3511 * send queue which starts at the given sequence number. If the given
3512 * sequence number is equal to last valid sequence number (tcp_snxt), the
3513 * returned mblk is the last valid mblk, and off is set to the length of
3514 * that mblk.
3515 *
3516 * send queue which starts at the given seq. no.
3517 *
3518 * Parameters:
3519 * tcp_t *tcp: the tcp instance pointer.
3520 * uint32_t seq: the starting seq. no of the requested segment.
3521 * int32_t *off: after the execution, *off will be the offset to
3522 * the returned mblk which points to the requested seq no.
3523 * It is the caller's responsibility to send in a non-null off.
3524 *
3525 * Return:
|