6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2017 Joyent, Inc.
26 * Copyright (c) 2014 by Delphix. All rights reserved.
27 */
28
29 /* This file contains all TCP input processing functions. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/suntpi.h>
40 #include <sys/xti_inet.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/tsol/tnet.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
149 static uint32_t tcp_init_wnd_chk = 4096;
150
151 /* Process ICMP source quench message or not. */
152 static boolean_t tcp_icmp_source_quench = B_FALSE;
153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, time_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * Set the MSS associated with a particular tcp based on its current value,
174 * and a new one passed in. Observe minimums and maximums, and reset other
175 * state variables that we want to view as multiples of MSS.
176 *
177 * The value of MSS could be either increased or decreased.
178 */
179 void
180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
181 {
182 uint32_t mss_max;
183 tcp_stack_t *tcps = tcp->tcp_tcps;
184 conn_t *connp = tcp->tcp_connp;
185
186 if (connp->conn_ipversion == IPV4_VERSION)
187 mss_max = tcps->tcps_mss_max_ipv4;
188 else
189 mss_max = tcps->tcps_mss_max_ipv6;
190
191 if (mss < tcps->tcps_mss_min)
192 mss = tcps->tcps_mss_min;
531 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
532
533 /*
534 * Set MSS to the smaller of the two ends of the connection.
535 * We should not have called tcp_mss_set() before, but our
536 * side of the MSS should have been set to a proper value
537 * by tcp_set_destination(). tcp_mss_set() will also set up the
538 * STREAM head parameters properly.
539 *
540 * If we have a larger-than-16-bit window but the other side
541 * didn't want to do window scale, tcp_rwnd_set() will take
542 * care of that.
543 */
544 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
545
546 /*
547 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
548 * updated properly.
549 */
550 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
551 }
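/*
 * Worked example (editor's addition; tunable values assumed, not
 * taken from this stack): with tcps_slow_start_initial = 4 and an
 * effective MSS of 1460, an RFC 3390 style initial window would be
 * min(4 * MSS, max(2 * MSS, 4380)) = 4380 bytes, i.e. three full
 * segments in flight before the first ACK returns.
 */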
552
553 /*
554 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
555 * is filled, return as much as we can. The message passed in may be
556 * multi-part, chained using b_cont. "start" is the starting sequence
557 * number for this piece.
558 */
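/*
 * Worked example (editor's addition, sequence numbers assumed): with
 * tcp_rnxt = 1000, a segment for [1100, 1200) is queued on
 * tcp_reass_head while the gap [1000, 1100) remains; once the missing
 * piece arrives, the queue is drained and returned to the caller in
 * sequence order.
 */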
559 static mblk_t *
560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
561 {
562 uint32_t end;
563 mblk_t *mp1;
564 mblk_t *mp2;
565 mblk_t *next_mp;
566 uint32_t u1;
567 tcp_stack_t *tcps = tcp->tcp_tcps;
568
569
570 /* Walk through all the new pieces. */
571 do {
572 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
573 (uintptr_t)INT_MAX);
574 end = start + (int)(mp->b_wptr - mp->b_rptr);
575 next_mp = mp->b_cont;
576 if (start == end) {
577 /* Empty. Blast it. */
578 freeb(mp);
579 continue;
580 }
581 mp->b_cont = NULL;
582 TCP_REASS_SET_SEQ(mp, start);
583 TCP_REASS_SET_END(mp, end);
584 mp1 = tcp->tcp_reass_tail;
585 if (!mp1) {
586 tcp->tcp_reass_tail = mp;
587 tcp->tcp_reass_head = mp;
588 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
589 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
590 end - start);
591 continue;
592 }
593 /* New stuff completely beyond tail? */
594 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
595 /* Link it on end. */
596 mp1->b_cont = mp;
597 tcp->tcp_reass_tail = mp;
598 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
599 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
600 end - start);
601 continue;
602 }
603 mp1 = tcp->tcp_reass_head;
604 u1 = TCP_REASS_SEQ(mp1);
605 /* New stuff at the front? */
606 if (SEQ_LT(start, u1)) {
607 /* Yes... Check for overlap. */
608 mp->b_cont = mp1;
609 tcp->tcp_reass_head = mp;
610 tcp_reass_elim_overlap(tcp, mp);
611 continue;
612 }
613 /*
614 * The new piece fits somewhere between the head and tail.
615 * We find our slot, where mp1 precedes us and mp2 trails.
616 */
617 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
618 u1 = TCP_REASS_SEQ(mp2);
619 if (SEQ_LEQ(start, u1))
620 break;
2307 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2308 {
2309 int32_t bytes_acked;
2310 int32_t gap;
2311 mblk_t *mp1;
2312 uint_t flags;
2313 uint32_t new_swnd = 0;
2314 uchar_t *iphdr;
2315 uchar_t *rptr;
2316 int32_t rgap;
2317 uint32_t seg_ack;
2318 int seg_len;
2319 uint_t ip_hdr_len;
2320 uint32_t seg_seq;
2321 tcpha_t *tcpha;
2322 int urp;
2323 tcp_opt_t tcpopt;
2324 ip_pkt_t ipp;
2325 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2326 uint32_t cwnd;
2327 uint32_t add;
2328 int npkt;
2329 int mss;
2330 conn_t *connp = (conn_t *)arg;
2331 squeue_t *sqp = (squeue_t *)arg2;
2332 tcp_t *tcp = connp->conn_tcp;
2333 tcp_stack_t *tcps = tcp->tcp_tcps;
2334 sock_upcalls_t *sockupcalls;
2335
2336 /*
2337 * RST from fused tcp loopback peer should trigger an unfuse.
2338 */
2339 if (tcp->tcp_fused) {
2340 TCP_STAT(tcps, tcp_fusion_aborted);
2341 tcp_unfuse(tcp);
2342 }
2343
2344 iphdr = mp->b_rptr;
2345 rptr = mp->b_rptr;
2346 ASSERT(OK_32PTR(rptr));
2347
2348 ip_hdr_len = ira->ira_ip_hdr_length;
2397
2398 if (tcp->tcp_state == TCPS_TIME_WAIT) {
2399 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2400 seg_len, tcpha, ira);
2401 return;
2402 }
2403
2404 if (sqp != NULL) {
2405 /*
2406 * This is the correct place to update tcp_last_recv_time. Note
2407 * that it is also updated for tcp structures that belong to the
2408 * global and listener queues, which do not really need updating,
2409 * but that should not cause any harm. It is updated for all kinds
2410 * of incoming segments, not only for data segments.
2411 */
2412 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413 }
2414
2415 flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416
2417 BUMP_LOCAL(tcp->tcp_ibsegs);
2418 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419
2420 if ((flags & TH_URG) && sqp != NULL) {
2421 /*
2422 * TCP can't handle urgent pointers that arrive before
2423 * the connection has been accept()ed since it can't
2424 * buffer OOB data. Discard segment if this happens.
2425 *
2426 * We can't just rely on a non-null tcp_listener to indicate
2427 * that the accept() has completed since unlinking of the
2428 * eager and completion of the accept are not atomic.
2429 * tcp_detached, when it is not set (B_FALSE), indicates
2430 * that the accept() has completed.
2431 *
2432 * Nor can it reassemble urgent pointers, so discard
2433 * if it's not the next segment expected.
2434 *
2435 * Otherwise, collapse chain into one mblk (discard if
2436 * that fails). This makes sure the headers, retransmitted
2437 * data, and new data all are in the same mblk.
2584 tcp->tcp_suna = tcp->tcp_iss + 1;
2585 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2586
2587 /*
2588 * If SYN was retransmitted, need to reset all
2589 * retransmission info. This is because this
2590 * segment will be treated as a dup ACK.
2591 */
2592 if (tcp->tcp_rexmit) {
2593 tcp->tcp_rexmit = B_FALSE;
2594 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596 tcp->tcp_ms_we_have_waited = 0;
2597
2598 /*
2599 * Set tcp_cwnd back to 1 MSS, per
2600 * recommendation from
2601 * draft-floyd-incr-init-win-01.txt,
2602 * Increasing TCP's Initial Window.
2603 */
2604 tcp->tcp_cwnd = tcp->tcp_mss;
2605 }
2606
2607 tcp->tcp_swl1 = seg_seq;
2608 tcp->tcp_swl2 = seg_ack;
2609
2610 new_swnd = ntohs(tcpha->tha_win);
2611 tcp->tcp_swnd = new_swnd;
2612 if (new_swnd > tcp->tcp_max_swnd)
2613 tcp->tcp_max_swnd = new_swnd;
2614
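/*
 * Editor's note: tcp_swl1/tcp_swl2 track SND.WL1/SND.WL2 from
 * RFC 793. A later segment may update the send window only if
 * SEQ_LT(tcp_swl1, seg_seq), or seg_seq equals tcp_swl1 and
 * SEQ_LEQ(tcp_swl2, seg_ack); this keeps stale segments from
 * rolling the window back.
 */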
2615 /*
2616 * Always send the three-way handshake ack immediately
2617 * in order to make the connection complete as soon as
2618 * possible on the accepting host.
2619 */
2620 flags |= TH_ACK_NEEDED;
2621
2622 /*
2623 * Trace connect-established here.
2642 if (tcp->tcp_loopback) {
2643 mblk_t *ack_mp;
2644
2645 ASSERT(!tcp->tcp_unfusable);
2646 ASSERT(mp1 != NULL);
2647 /*
2648 * For loopback, we always get a pure SYN-ACK
2649 * and only need to send back the final ACK
2650 * with no data (this is because the other
2651 * tcp is ours and we don't do T/TCP). This
2652 * final ACK triggers the passive side to
2653 * perform fusion in ESTABLISHED state.
2654 */
2655 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656 if (tcp->tcp_ack_tid != 0) {
2657 (void) TCP_TIMER_CANCEL(tcp,
2658 tcp->tcp_ack_tid);
2659 tcp->tcp_ack_tid = 0;
2660 }
2661 tcp_send_data(tcp, ack_mp);
2662 BUMP_LOCAL(tcp->tcp_obsegs);
2663 TCPS_BUMP_MIB(tcps, tcpOutAck);
2664
2665 if (!IPCL_IS_NONSTR(connp)) {
2666 /* Send up T_CONN_CON */
2667 if (ira->ira_cred != NULL) {
2668 mblk_setcred(mp1,
2669 ira->ira_cred,
2670 ira->ira_cpid);
2671 }
2672 putnext(connp->conn_rq, mp1);
2673 } else {
2674 (*sockupcalls->su_connected)
2675 (connp->conn_upper_handle,
2676 tcp->tcp_connid,
2677 ira->ira_cred,
2678 ira->ira_cpid);
2679 freemsg(mp1);
2680 }
2681
2682 freemsg(mp);
3031 mp2 = mp;
3032 mp = mp->b_cont;
3033 freeb(mp2);
3034 } while (gap < 0);
3035 /*
3036 * If the urgent data has already been acknowledged, we
3037 * should ignore TH_URG below
3038 */
3039 if (urp < 0)
3040 flags &= ~TH_URG;
3041 }
3042 /*
3043 * rgap measures how the segment fits within our receive window; a
3044 * negative value means that many bytes arrived beyond the window.
3045 */
3046 if (rgap < 0) {
3047 mblk_t *mp2;
3048
3049 if (tcp->tcp_rwnd == 0) {
3050 TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3051 } else {
3052 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3053 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3054 }
3055
3056 /*
3057 * seg_len does not include the FIN, so if more than
3058 * just the FIN is out of window, we act like we don't
3059 * see it. (If just the FIN is out of window, rgap
3060 * will be zero and we will go ahead and acknowledge
3061 * the FIN.)
3062 */
3063 flags &= ~TH_FIN;
3064
3065 /* Fix seg_len and make sure there is something left. */
3066 seg_len += rgap;
3067 if (seg_len <= 0) {
3068 /*
3069 * Resets are only valid if they lie within our offered
3070 * window. If the RST bit is set, we just ignore this
3280 *
3281 * But TCP should not perform fast retransmit
3282 * because of the ack number. TCP uses
3283 * seg_len == 0 to determine if it is a pure
3284 * ACK. And this is not a pure ACK.
3285 */
3286 seg_len = 0;
3287 ofo_seg = B_TRUE;
3288
3289 if (tcps->tcps_reass_timeout != 0 &&
3290 tcp->tcp_reass_tid == 0) {
3291 tcp->tcp_reass_tid = TCP_TIMER(tcp,
3292 tcp_reass_timer,
3293 tcps->tcps_reass_timeout);
3294 }
3295 }
3296 }
3297 } else if (seg_len > 0) {
3298 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3299 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3300 /*
3301 * If an out of order FIN was received before, and the seq
3302 * num and len of the new segment match that of the FIN,
3303 * put the FIN flag back in.
3304 */
3305 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3306 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3307 flags |= TH_FIN;
3308 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3309 }
3310 }
3311 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3312 if (flags & TH_RST) {
3313 freemsg(mp);
3314 switch (tcp->tcp_state) {
3315 case TCPS_SYN_RCVD:
3316 (void) tcp_clean_death(tcp, ECONNREFUSED);
3317 break;
3318 case TCPS_ESTABLISHED:
3319 case TCPS_FIN_WAIT_1:
3345 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346 freemsg(mp);
3347 /*
3348 * If the ACK flag is not set, just use our snxt as the
3349 * seq number of the RST segment.
3350 */
3351 if (!(flags & TH_ACK)) {
3352 seg_ack = tcp->tcp_snxt;
3353 }
3354 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355 TH_RST|TH_ACK);
3356 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357 (void) tcp_clean_death(tcp, ECONNRESET);
3358 return;
3359 }
3360 /*
3361 * urp could be -1 when the urp field in the packet is 0
3362 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364 */
3365 if ((flags & TH_URG) && urp >= 0) {
3366 if (!tcp->tcp_urp_last_valid ||
3367 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368 /*
3369 * Non-STREAMS sockets handle the urgent data a little
3370 * differently from STREAMS based sockets. There is no
3371 * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372 * flags to keep SIOCATMARK happy. Instead a
3373 * su_signal_oob upcall is made to update the mark.
3374 * Nor does a T_EXDATA_IND mblk need to be
3375 * prepended to the urgent data. The urgent data is
3376 * delivered using the su_recv upcall, where we set
3377 * the MSG_OOB flag to indicate that it is urg data.
3378 *
3379 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380 * are used by non-STREAMS sockets.
3381 */
3382 if (IPCL_IS_NONSTR(connp)) {
3383 if (!TCP_IS_DETACHED(tcp)) {
3384 (*sockupcalls->su_signal_oob)
3385 (connp->conn_upper_handle, urp);
3802 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3803 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3804 }
3805 TCPS_CONN_INC(tcps);
3806
3807 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3808 bytes_acked--;
3809 /* SYN was acked - making progress */
3810 tcp->tcp_ip_forward_progress = B_TRUE;
3811
3812 /*
3813 * If SYN was retransmitted, need to reset all
3814 * retransmission info as this segment will be
3815 * treated as a dup ACK.
3816 */
3817 if (tcp->tcp_rexmit) {
3818 tcp->tcp_rexmit = B_FALSE;
3819 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3820 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3821 tcp->tcp_ms_we_have_waited = 0;
3822 tcp->tcp_cwnd = mss;
3823 }
3824
3825 /*
3826 * We set the send window to zero here.
3827 * This is needed if there is data to be
3828 * processed already on the queue.
3829 * Later (at the swnd_update label), when the
3830 * "new_swnd > tcp_swnd" condition is satisfied,
3831 * the XMIT_NEEDED flag is set in the current
3832 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3833 * called if there is already data on queue in
3834 * this state.
3835 */
3836 tcp->tcp_swnd = 0;
3837
3838 if (new_swnd > tcp->tcp_max_swnd)
3839 tcp->tcp_max_swnd = new_swnd;
3840 tcp->tcp_swl1 = seg_seq;
3841 tcp->tcp_swl2 = seg_ack;
3845 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3846 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3847 int32_t, TCPS_SYN_RCVD);
3848
3849 /* Fuse when both sides are in ESTABLISHED state */
3850 if (tcp->tcp_loopback && do_tcp_fusion)
3851 tcp_fuse(tcp, iphdr, tcpha);
3852
3853 }
3854 /* This code follows 4.4BSD-Lite2 mostly. */
3855 if (bytes_acked < 0)
3856 goto est;
3857
3858 /*
3859 * If TCP is ECN capable and the congestion experience bit is
3860 * set, reduce tcp_cwnd and tcp_cwnd_ssthresh. But this should only be
3861 * done once per window (or more loosely, per RTT).
3862 */
3863 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3864 tcp->tcp_cwr = B_FALSE;
3865 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3866 if (!tcp->tcp_cwr) {
3867 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3868 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3869 tcp->tcp_cwnd = npkt * mss;
3870 /*
3871 * If the cwnd is 0, use the timer to clock out
3872 * new segments. This is required by the ECN spec.
3873 */
3874 if (npkt == 0) {
3875 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3876 /*
3877 * This makes sure that when the ACK comes
3878 * back, we will increase tcp_cwnd by 1 MSS.
3879 */
3880 tcp->tcp_cwnd_cnt = 0;
3881 }
3882 tcp->tcp_cwr = B_TRUE;
3883 /*
3884 * This marks the end of the current window of in
3885 * flight data. That is why we don't use
3886 * tcp_suna + tcp_swnd. Only data in flight can
3887 * provide ECN info.
3888 */
3889 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3890 tcp->tcp_ecn_cwr_sent = B_FALSE;
3891 }
3892 }
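/*
 * Worked example (editor's addition, numbers assumed): with
 * mss = 1460 and 14600 bytes in flight, npkt is
 * (14600 >> 1) / 1460 = 5, so both tcp_cwnd_ssthresh and tcp_cwnd
 * drop to 5 * 1460 = 7300 bytes, i.e. half the flight size.
 */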
3893
3894 mp1 = tcp->tcp_xmit_head;
3895 if (bytes_acked == 0) {
3896 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3897 int dupack_cnt;
3898
3899 TCPS_BUMP_MIB(tcps, tcpInDupAck);
3900 /*
3901 * Fast retransmit. When we have seen exactly three
3902 * identical ACKs while we have unacked data
3903 * outstanding, we take it as a hint that our peer
3904 * dropped something.
3905 *
3906 * If TCP is retransmitting, don't do fast retransmit.
3907 */
3908 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3909 !tcp->tcp_rexmit) {
3910 /* Do Limited Transmit */
3911 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3912 tcps->tcps_dupack_fast_retransmit) {
3913 /*
3914 * RFC 3042
3915 *
3916 * What we need to do is temporarily
3917 * increase tcp_cwnd so that new
3918 * data can be sent if it is allowed
3919 * by the receive window (tcp_rwnd).
3920 * tcp_wput_data() will take care of
3921 * the rest.
3922 *
3923 * If the connection is SACK capable,
3924 * only do limited xmit when there
3925 * is SACK info.
3926 *
3927 * Note how tcp_cwnd is incremented.
3928 * The first dup ACK will increase
3929 * it by 1 MSS. The second dup ACK
3930 * will increase it by 2 MSS. This
3931 * means that only 1 new segment will
3932 * be sent for each dup ACK.
3939 (tcp->tcp_dupack_cnt - 1);
3940 flags |= TH_LIMIT_XMIT;
3941 }
3942 } else if (dupack_cnt ==
3943 tcps->tcps_dupack_fast_retransmit) {
3944
3945 /*
3946 * If we have reduced tcp_cwnd_ssthresh
3947 * because of ECN, do not reduce it again
3948 * unless it is already one window of data
3949 * away. After one window of data, tcp_cwr
3950 * should then be cleared. Note that
3951 * for non ECN capable connection, tcp_cwr
3952 * should always be false.
3953 *
3954 * Adjust cwnd since the duplicate
3955 * ack indicates that a packet was
3956 * dropped (due to congestion).
3957 */
3958 if (!tcp->tcp_cwr) {
3959 npkt = ((tcp->tcp_snxt -
3960 tcp->tcp_suna) >> 1) / mss;
3961 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3962 mss;
3963 tcp->tcp_cwnd = (npkt +
3964 tcp->tcp_dupack_cnt) * mss;
3965 }
3966 if (tcp->tcp_ecn_ok) {
3967 tcp->tcp_cwr = B_TRUE;
3968 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3969 tcp->tcp_ecn_cwr_sent = B_FALSE;
3970 }
3971
3972 /*
3973 * We do Hoe's algorithm. Refer to her
3974 * paper "Improving the Start-up Behavior
3975 * of a Congestion Control Scheme for TCP,"
3976 * which appeared in SIGCOMM '96.
3977 *
3978 * Save highest seq no we have sent so far.
3979 * Be careful about the invisible FIN byte.
3980 */
3981 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3982 (tcp->tcp_unsent == 0)) {
3983 tcp->tcp_rexmit_max = tcp->tcp_fss;
3984 } else {
4006 tcp->tcp_fack;
4007 tcp->tcp_sack_snxt = seg_ack;
4008 flags |= TH_NEED_SACK_REXMIT;
4009 } else {
4010 /*
4011 * Always initialize tcp_pipe
4012 * even though we don't have
4013 * any SACK info. If later
4014 * we get SACK info and
4015 * tcp_pipe is not initialized,
4016 * funny things will happen.
4017 */
4018 tcp->tcp_pipe =
4019 tcp->tcp_cwnd_ssthresh;
4020 }
4021 } else {
4022 flags |= TH_REXMIT_NEEDED;
4023 } /* tcp_snd_sack_ok */
4024
4025 } else {
4026 /*
4027 * Here we perform congestion
4028 * avoidance, but NOT slow start.
4029 * This is known as the Fast
4030 * Recovery Algorithm.
4031 */
4032 if (tcp->tcp_snd_sack_ok &&
4033 tcp->tcp_notsack_list != NULL) {
4034 flags |= TH_NEED_SACK_REXMIT;
4035 tcp->tcp_pipe -= mss;
4036 if (tcp->tcp_pipe < 0)
4037 tcp->tcp_pipe = 0;
4038 } else {
4039 /*
4040 * We know that one more packet has
4041 * left the pipe, thus we can update
4042 * cwnd.
4043 */
4044 cwnd = tcp->tcp_cwnd + mss;
4045 if (cwnd > tcp->tcp_cwnd_max)
4046 cwnd = tcp->tcp_cwnd_max;
4047 tcp->tcp_cwnd = cwnd;
4048 if (tcp->tcp_unsent > 0)
4049 flags |= TH_XMIT_NEEDED;
4050 }
4051 }
4052 }
4053 } else if (tcp->tcp_zero_win_probe) {
4054 /*
4055 * If the window has opened, need to arrange
4056 * to send additional data.
4057 */
4058 if (new_swnd != 0) {
4059 /* tcp_suna != tcp_snxt */
4060 /* Packet contains a window update */
4061 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4062 tcp->tcp_zero_win_probe = 0;
4063 tcp->tcp_timer_backoff = 0;
4064 tcp->tcp_ms_we_have_waited = 0;
4065
4066 /*
4129 * greater than 0, check if the number of such
4130 * bogus ACKs is greater than that count. If so,
4131 * don't send back any ACK. This prevents TCP from
4132 * getting into an ACK storm if somehow an attacker
4133 * successfully spoofs an acceptable segment to our
4134 * peer. If this continues (count > 2 X threshold),
4135 * we should abort this connection.
4136 */
4137 if (tcp_drop_ack_unsent_cnt > 0 &&
4138 ++tcp->tcp_in_ack_unsent >
4139 tcp_drop_ack_unsent_cnt) {
4140 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4141 if (tcp->tcp_in_ack_unsent > 2 *
4142 tcp_drop_ack_unsent_cnt) {
4143 (void) tcp_clean_death(tcp, EPROTO);
4144 }
4145 return;
4146 }
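/*
 * Example (editor's addition, threshold assumed): with
 * tcp_drop_ack_unsent_cnt = 10, TCP stops ACKing such bogus
 * segments once the counter passes 10, and once it exceeds
 * 2 * 10 = 20 the connection is aborted with EPROTO.
 */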
4147 mp = tcp_ack_mp(tcp);
4148 if (mp != NULL) {
4149 BUMP_LOCAL(tcp->tcp_obsegs);
4150 TCPS_BUMP_MIB(tcps, tcpOutAck);
4151 tcp_send_data(tcp, mp);
4152 }
4153 return;
4154 }
4155 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4156 tcp->tcp_snxt_shrunk)) {
4157 tcp->tcp_is_wnd_shrnk = B_FALSE;
4158 }
4159
4160 /*
4161 * When TCP gets a new ACK, update the notsack'ed list to delete those
4162 * blocks that are covered by this ACK.
4163 */
4164 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4165 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4166 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4167 }
4168
4169 /*
4170 * If we got an ACK after fast retransmit, check to see
4171 * if it is a partial ACK. If it is not and the congestion
4172 * window was inflated to account for the other side's
4173 * cached packets, retract it. If it is, do Hoe's algorithm.
4174 */
4175 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4176 ASSERT(tcp->tcp_rexmit == B_FALSE);
4177 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4178 tcp->tcp_dupack_cnt = 0;
4179 /*
4180 * Restore the orig tcp_cwnd_ssthresh after
4181 * fast retransmit phase.
4182 */
4183 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4184 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4185 }
4186 tcp->tcp_rexmit_max = seg_ack;
4187 tcp->tcp_cwnd_cnt = 0;
4188
4189 /*
4190 * Remove all notsack info to avoid confusion with
4191 * the next fast retransmit/recovery phase.
4192 */
4193 if (tcp->tcp_snd_sack_ok) {
4194 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4195 tcp);
4196 }
4197 } else {
4198 if (tcp->tcp_snd_sack_ok &&
4199 tcp->tcp_notsack_list != NULL) {
4200 flags |= TH_NEED_SACK_REXMIT;
4201 tcp->tcp_pipe -= mss;
4202 if (tcp->tcp_pipe < 0)
4203 tcp->tcp_pipe = 0;
4204 } else {
4205 /*
4206 * Hoe's algorithm:
4207 *
4208 * Retransmit the unack'ed segment and
4209 * restart fast recovery. Note that we
4210 * need to scale back tcp_cwnd to the
4211 * original value when we started fast
4212 * recovery. This is to prevent overly
4213 * aggressive behaviour in sending new
4214 * segments.
4215 */
4216 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
4217 tcps->tcps_dupack_fast_retransmit * mss;
4218 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4219 flags |= TH_REXMIT_NEEDED;
4220 }
4221 }
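/*
 * Worked example (editor's addition, numbers assumed): on a partial
 * ACK with tcp_cwnd_ssthresh = 7300, tcps_dupack_fast_retransmit = 3
 * and mss = 1460, tcp_cwnd is reset to 7300 + 3 * 1460 = 11680, so
 * one segment per further dup ACK can still be clocked out while the
 * hole is refilled.
 */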
4222 } else {
4223 tcp->tcp_dupack_cnt = 0;
4224 if (tcp->tcp_rexmit) {
4225 /*
4226 * TCP is retransmitting. If the ACK acks all
4227 * outstanding data, update tcp_rexmit_max and
4228 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
4229 * to the correct value.
4230 *
4231 * Note that SEQ_LEQ() is used. This is to avoid
4232 * unnecessary fast retransmit caused by dup ACKs
4233 * received when TCP does slow start retransmission
4234 * after a time out. During this phase, TCP may
4235 * send out segments which are already received.
4236 * This causes dup ACKs to be sent back.
4237 */
4258 tcp->tcp_timer_backoff = 0;
4259 }
4260
4261 /*
4262 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4263 * Note that it cannot be the SYN being ack'ed; the code
4264 * flow for that case never reaches here.
4265 */
4266 if (mp1 == NULL) {
4267 goto fin_acked;
4268 }
4269
4270 /*
4271 * Update the congestion window.
4272 *
4273 * If TCP is not ECN capable or TCP is ECN capable but the
4274 * congestion experience bit is not set, increase the tcp_cwnd as
4275 * usual.
4276 */
4277 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4278 cwnd = tcp->tcp_cwnd;
4279 add = mss;
4280
4281 if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4282 /*
4283 * This is to prevent an increase of less than 1 MSS of
4284 * tcp_cwnd. With partial increase, tcp_wput_data()
4285 * may send out tinygrams in order to preserve mblk
4286 * boundaries.
4287 *
4288 * By initializing tcp_cwnd_cnt to the new tcp_cwnd and
4289 * decrementing it by 1 MSS for every ACK, tcp_cwnd is
4290 * increased by 1 MSS per RTT.
4291 */
4292 if (tcp->tcp_cwnd_cnt <= 0) {
4293 tcp->tcp_cwnd_cnt = cwnd + add;
4294 } else {
4295 tcp->tcp_cwnd_cnt -= add;
4296 add = 0;
4297 }
4298 }
4299 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300 }
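/*
 * Worked example (editor's addition, numbers assumed): with
 * mss = 1460, cwnd = 5840 and tcp_cwnd_ssthresh = 4380, the first
 * ACK grows cwnd to 7300 and charges tcp_cwnd_cnt = 7300; the next
 * five ACKs each only pay 1460 off that count, so cwnd grows by one
 * MSS roughly once per RTT instead of once per ACK.
 */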
4301
4302 /* See if the latest urgent data has been acknowledged */
4303 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304 SEQ_GT(seg_ack, tcp->tcp_urg))
4305 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306
4307 /* Can we update the RTT estimates? */
4308 if (tcp->tcp_snd_ts_ok) {
4309 /* Ignore zero timestamp echo-reply. */
4310 if (tcpopt.tcp_opt_ts_ecr != 0) {
4311 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312 (int32_t)tcpopt.tcp_opt_ts_ecr);
4313 }
4314
4315 /* If needed, restart the timer. */
4316 if (tcp->tcp_set_timer == 1) {
4317 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318 tcp->tcp_set_timer = 0;
4319 }
4320 /*
4321 * Update tcp_csuna in case the other side stops sending
4322 * us timestamps.
4323 */
4324 tcp->tcp_csuna = tcp->tcp_snxt;
4325 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326 /*
4327 * An ACK sequence we haven't seen before, so get the RTT
4328 * and update the RTO. But first check if the timestamp is
4329 * valid to use.
4330 */
4331 if ((mp1->b_next != NULL) &&
4332 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
4333 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334 (int32_t)(intptr_t)mp1->b_prev);
4335 else
4336 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4337
4338 /* Remember the last sequence to be ACKed */
4339 tcp->tcp_csuna = seg_ack;
4340 if (tcp->tcp_set_timer == 1) {
4341 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342 tcp->tcp_set_timer = 0;
4343 }
4344 } else {
4345 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346 }
4347
4348 /* Eat acknowledged bytes off the xmit queue. */
4349 for (;;) {
4350 mblk_t *mp2;
4351 uchar_t *wptr;
4352
4353 wptr = mp1->b_wptr;
4354 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4355 bytes_acked -= (int)(wptr - mp1->b_rptr);
4356 if (bytes_acked < 0) {
4357 mp1->b_rptr = wptr + bytes_acked;
4358 /*
4359 * Set a new timestamp if all the bytes timed by the
4360 * old timestamp have been ack'ed.
4361 */
4362 if (SEQ_GT(seg_ack,
4363 (uint32_t)(uintptr_t)(mp1->b_next))) {
4364 mp1->b_prev =
4365 (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
4366 mp1->b_next = NULL;
4367 }
4368 break;
4369 }
4370 mp1->b_next = NULL;
4371 mp1->b_prev = NULL;
4372 mp2 = mp1;
4373 mp1 = mp1->b_cont;
4374
4375 /*
4376 * This notification is required for some zero-copy
4377 * clients to maintain a copy semantic. After the data
4378 * is ack'ed, the client is safe to modify or reuse the buffer.
4379 */
4380 if (tcp->tcp_snd_zcopy_aware &&
4381 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4382 tcp_zcopy_notify(tcp);
4383 freeb(mp2);
4384 if (bytes_acked == 0) {
4385 if (mp1 == NULL) {
4821 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
4822 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4823 goto done;
4824
4825 /* Any transmit work to do and a non-zero window? */
4826 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4827 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4828 if (flags & TH_REXMIT_NEEDED) {
4829 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4830
4831 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4832 if (snd_size > mss)
4833 snd_size = mss;
4834 if (snd_size > tcp->tcp_swnd)
4835 snd_size = tcp->tcp_swnd;
4836 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838 B_TRUE);
4839
4840 if (mp1 != NULL) {
4841 tcp->tcp_xmit_head->b_prev =
4842 (mblk_t *)LBOLT_FASTPATH;
4843 tcp->tcp_csuna = tcp->tcp_snxt;
4844 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846 snd_size);
4847 tcp_send_data(tcp, mp1);
4848 }
4849 }
4850 if (flags & TH_NEED_SACK_REXMIT) {
4851 tcp_sack_rexmit(tcp, &flags);
4852 }
4853 /*
4854 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4855 * out a new segment. Note that tcp_rexmit should not be
4856 * set, otherwise TH_LIMIT_XMIT should not be set.
4857 */
4858 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4859 if (!tcp->tcp_rexmit) {
4860 tcp_wput_data(tcp, NULL, B_FALSE);
4861 } else {
4862 tcp_ss_rexmit(tcp);
4863 }
4864 }
4865 /*
4866 * Adjust tcp_cwnd back to its normal value after sending
4867 * new data segments.
4868 */
4869 if (flags & TH_LIMIT_XMIT) {
4870 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871 /*
4872 * This will restart the timer. Restarting the
4873 * timer is used to avoid a timeout before the
4874 * limited transmitted segment's ACK gets back.
4875 */
4876 if (tcp->tcp_xmit_head != NULL)
4877 tcp->tcp_xmit_head->b_prev =
4878 (mblk_t *)LBOLT_FASTPATH;
4879 }
4880
4881 /* Anything more to do? */
4882 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884 goto done;
4885 }
4886 ack_check:
4887 if (flags & TH_SEND_URP_MARK) {
4888 ASSERT(tcp->tcp_urp_mark_mp);
4889 ASSERT(!IPCL_IS_NONSTR(connp));
4890 /*
4891 * Send up any queued data and then send the mark message
4892 */
4893 if (tcp->tcp_rcv_list != NULL) {
4894 flags |= tcp_rcv_drain(tcp);
4895
4896 }
4897 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4898 mp1 = tcp->tcp_urp_mark_mp;
4899 tcp->tcp_urp_mark_mp = NULL;
4901 tcp_setcred_data(mp1, ira);
4902
4903 putnext(connp->conn_rq, mp1);
4904 #ifdef DEBUG
4905 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4906 "tcp_rput: sending zero-length %s %s",
4907 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
4908 "MSGNOTMARKNEXT"),
4909 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4910 #endif /* DEBUG */
4911 flags &= ~TH_SEND_URP_MARK;
4912 }
4913 if (flags & TH_ACK_NEEDED) {
4914 /*
4915 * Time to send an ack for some reason.
4916 */
4917 mp1 = tcp_ack_mp(tcp);
4918
4919 if (mp1 != NULL) {
4920 tcp_send_data(tcp, mp1);
4921 BUMP_LOCAL(tcp->tcp_obsegs);
4922 TCPS_BUMP_MIB(tcps, tcpOutAck);
4923 }
4924 if (tcp->tcp_ack_tid != 0) {
4925 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4926 tcp->tcp_ack_tid = 0;
4927 }
4928 }
4929 if (flags & TH_ACK_TIMER_NEEDED) {
4930 /*
4931 * Arrange for deferred ACK or push wait timeout.
4932 * Start timer if it is not already running.
4933 */
4934 if (tcp->tcp_ack_tid == 0) {
4935 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
4936 tcp->tcp_localnet ?
4937 tcps->tcps_local_dack_interval :
4938 tcps->tcps_deferred_ack_interval);
4939 }
4940 }
4941 if (flags & TH_ORDREL_NEEDED) {
5194 }
5195 if (addflag.crb_ipv6_recvdstopts) {
5196 toh = (struct T_opthdr *)optptr;
5197 toh->level = IPPROTO_IPV6;
5198 toh->name = IPV6_DSTOPTS;
5199 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5200 toh->status = 0;
5201 optptr += sizeof (*toh);
5202 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5203 optptr += ipp->ipp_dstoptslen;
5204 ASSERT(OK_32PTR(optptr));
5205 /* Save as last value */
5206 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207 (ipp->ipp_fields & IPPF_DSTOPTS),
5208 ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209 }
5210 ASSERT(optptr == mp->b_wptr);
5211 return (mp);
5212 }
5213
5214 /* The minimum smoothed mean deviation used in the RTO calculation. */
5215 #define TCP_SD_MIN 400
5216
5217 /*
5218 * Set RTO for this connection. The formula is from Jacobson and Karels'
5219 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
5220 * are the same as those in Appendix A.2 of that paper.
5221 *
5222 * m = new measurement
5223 * sa = smoothed RTT average (8 * average estimates).
5224 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225 */
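/*
 * Worked example (editor's addition; tcps_rexmit_interval_extra
 * assumed 0): with sa = 820 (srtt about 102 ms) and a raw sv of 80,
 * the TCP_SD_MIN clamp below lifts sv to 400, so
 * RTO = (820 >> 3) + 400 + (820 >> 5) = 102 + 400 + 25 = 527 ms.
 */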
5226 static void
5227 tcp_set_rto(tcp_t *tcp, clock_t rtt)
5228 {
5229 long m = TICK_TO_MSEC(rtt);
5230 clock_t sa = tcp->tcp_rtt_sa;
5231 clock_t sv = tcp->tcp_rtt_sd;
5232 clock_t rto;
5233 tcp_stack_t *tcps = tcp->tcp_tcps;
5234
5235 TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236 tcp->tcp_rtt_update++;
5237
5238 /* A nonzero tcp_rtt_sa means we already have an estimate to update. */
5239 if (sa != 0) {
5240 /*
5241 * Update average estimator:
5242 * new rtt = old rtt + 1/8 Error
5243 */
5244
5245 /* m is now Error in estimate. */
5246 m -= sa >> 3;
5247 if ((sa += m) <= 0) {
5248 /*
5249 * Don't allow the smoothed average to be negative.
5250 * We use 0 to denote reinitialization of the
5251 * variables.
5252 */
5253 sa = 1;
5254 }
5255
5256 /*
5257 * Update deviation estimator:
5258 * new mdev = 3/4 old mdev + 1/4 abs(Error)
5259 */
5260 if (m < 0)
5261 m = -m;
5262 m -= sv >> 2;
5263 sv += m;
5264 } else {
5265 /*
5266 * This follows BSD's implementation. So the reinitialized
5267 * RTO is 3 * m. We cannot go less than 2 because if the
5268 * link is bandwidth dominated, doubling the window size
5269 * during slow start means doubling the RTT. We want to be
5270 * more conservative when we reinitialize our estimates. 3
5271 * is just a convenient number.
5272 */
5273 sa = m << 3;
5274 sv = m << 1;
5275 }
5276 if (sv < TCP_SD_MIN) {
5277 /*
5278 * We do not know whether sa captures the delayed ACK
5279 * effect; in a long train of segments, a receiver
5280 * may not delay its ACKs at all. So set the minimum
5281 * of sv to TCP_SD_MIN, which defaults to 400 ms,
5282 * twice the BSD DATO. That puts the minimum mean
5283 * deviation at 100 ms.
5285 */
5286 sv = TCP_SD_MIN;
5287 }
5288 tcp->tcp_rtt_sa = sa;
5289 tcp->tcp_rtt_sd = sv;
5290 /*
5291 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292 *
5293 * Add tcp_rexmit_interval_extra in case of an extreme environment
5294 * where the algorithm fails to work. The default value of
5295 * tcp_rexmit_interval_extra should be 0.
5296 *
5297 * As we use a finer grained clock than BSD and update the
5298 * RTO for every ACK, add in another 1/4 of the RTT to the
5299 * deviation term to accommodate burstiness of 1/4 of the
5300 * window size.
5301 */
5302 rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303
5304 TCP_SET_RTO(tcp, rto);
5305
5306 /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307 tcp->tcp_timer_backoff = 0;
5308 }
5309
5310 /*
5311 * On a labeled system we have some protocols above TCP, such as RPC, which
5312 * appear to assume that every mblk in a chain has a db_credp.
5313 */
5314 static void
5315 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5316 {
5317 ASSERT(is_system_labeled());
5318 ASSERT(ira->ira_cred != NULL);
5319
5320 while (mp != NULL) {
5321 mblk_setcred(mp, ira->ira_cred, NOPID);
5322 mp = mp->b_cont;
5323 }
5324 }
5609 default:
5610 break;
5611 }
5612 break;
5613 case ICMP_SOURCE_QUENCH:
5614 /*
5615 * Use a global boolean to control
5616 * whether TCP should respond to ICMP_SOURCE_QUENCH.
5617 * The default is false.
5618 */
5619 if (tcp_icmp_source_quench) {
5620 /*
5621 * Reduce the sending rate as if we got a
5622 * retransmit timeout
5623 */
5624 uint32_t npkt;
5625
5626 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5627 tcp->tcp_mss;
5628 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5629 tcp->tcp_cwnd = tcp->tcp_mss;
5630 tcp->tcp_cwnd_cnt = 0;
5631 }
5632 break;
5633 }
5634 freemsg(mp);
5635 }
5636
5637 /*
5638 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5639 * error messages passed up by IP.
5640 * Assumes that IP has pulled up all the extension headers as well
5641 * as the ICMPv6 header.
5642 */
5643 static void
5644 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5645 {
5646 icmp6_t *icmp6;
5647 ip6_t *ip6h;
5648 uint16_t iph_hdr_length = ira->ira_ip_hdr_length;
|
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2017 Joyent, Inc.
26 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
27 */
28
29 /* This file contains all TCP input processing functions. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/suntpi.h>
40 #include <sys/xti_inet.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/tsol/tnet.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
149 static uint32_t tcp_init_wnd_chk = 4096;
150
151 /* Process ICMP source quench message or not. */
152 static boolean_t tcp_icmp_source_quench = B_FALSE;
153
154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
155
156 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
157 ip_recv_attr_t *);
158 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
159 ip_recv_attr_t *);
160 static boolean_t tcp_drop_q0(tcp_t *);
161 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
162 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
163 ip_recv_attr_t *);
164 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void tcp_process_options(tcp_t *, tcpha_t *);
166 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
167 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
168 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
169 static void tcp_set_rto(tcp_t *, hrtime_t);
170 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
171
172 /*
173 * CC wrapper hook functions
174 */
175 static void
176 cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
177 uint16_t type)
178 {
179 uint32_t old_cwnd = tcp->tcp_cwnd;
180
181 tcp->tcp_ccv.bytes_this_ack = bytes_acked;
182 if (tcp->tcp_cwnd <= tcp->tcp_swnd)
183 tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
184 else
185 tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
186
187 if (type == CC_ACK) {
188 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
189 if (tcp->tcp_ccv.flags & CCF_RTO)
190 tcp->tcp_ccv.flags &= ~CCF_RTO;
191
192 tcp->tcp_ccv.t_bytes_acked +=
193 min(tcp->tcp_ccv.bytes_this_ack,
194 tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
195 if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
196 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
197 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
198 }
199 } else {
200 tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
201 tcp->tcp_ccv.t_bytes_acked = 0;
202 }
203 }
204
205 if (CC_ALGO(tcp)->ack_received != NULL) {
206 /*
207 * The FreeBSD code where this originated had a comment "Find
208 * a way to live without this" in several places where curack
209 * got set. If they eventually dump curack from the cc
210 * variables, we'll need to adapt our code.
211 */
212 tcp->tcp_ccv.curack = seg_ack;
213 CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
214 }
215
216 DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
217 uint32_t, tcp->tcp_cwnd);
218 }
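/*
 * Editor's note: this mirrors FreeBSD's modular congestion control
 * entry point. A newreno-style ack_received() would, for instance,
 * grow tcp_cwnd by one MSS per ACK below tcp_cwnd_ssthresh and
 * roughly once per RTT above it, and would consult the
 * CCF_CWND_LIMITED flag set above so cwnd only grows while cwnd,
 * not the peer's window, is the limiting factor.
 */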
219
220 void
221 cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
222 {
223 uint32_t old_cwnd = tcp->tcp_cwnd;
224 uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
225 switch (type) {
226 case CC_NDUPACK:
227 if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
228 tcp->tcp_rexmit_max = tcp->tcp_snxt;
229 if (tcp->tcp_ecn_ok) {
230 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
231 tcp->tcp_cwr = B_TRUE;
232 tcp->tcp_ecn_cwr_sent = B_FALSE;
233 }
234 }
235 break;
236 case CC_ECN:
237 if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
238 tcp->tcp_rexmit_max = tcp->tcp_snxt;
239 if (tcp->tcp_ecn_ok) {
240 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
241 tcp->tcp_cwr = B_TRUE;
242 tcp->tcp_ecn_cwr_sent = B_FALSE;
243 }
244 }
245 break;
246 case CC_RTO:
247 tcp->tcp_ccv.flags |= CCF_RTO;
248 tcp->tcp_dupack_cnt = 0;
249 tcp->tcp_ccv.t_bytes_acked = 0;
250 /*
251 * Give up on fast recovery and congestion recovery if we were
252 * attempting either.
253 */
254 EXIT_RECOVERY(tcp->tcp_ccv.flags);
255 if (CC_ALGO(tcp)->cong_signal == NULL) {
256 /*
257 * RFC5681 Section 3.1
258 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
259 */
260 tcp->tcp_cwnd_ssthresh = max(
261 (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
262 2) * tcp->tcp_mss;
263 tcp->tcp_cwnd = tcp->tcp_mss;
264 }
265
266 if (tcp->tcp_ecn_ok) {
267 tcp->tcp_cwr = B_TRUE;
268 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
269 tcp->tcp_ecn_cwr_sent = B_FALSE;
270 }
271 break;
272 }
273
274 if (CC_ALGO(tcp)->cong_signal != NULL) {
275 tcp->tcp_ccv.curack = seg_ack;
276 CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
277 }
278
279 DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
280 uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
281 uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
282 }
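/*
 * Worked example (editor's addition, numbers assumed): on CC_RTO with
 * mss = 1460, 11680 bytes in flight and no algorithm-specific
 * cong_signal, the RFC 5681 fallback above sets tcp_cwnd_ssthresh to
 * max(11680 / 2 / 1460, 2) * 1460 = 5840 and collapses tcp_cwnd to a
 * single MSS before slow-start retransmission begins.
 */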
283
284 static void
285 cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
286 {
287 uint32_t old_cwnd = tcp->tcp_cwnd;
288
289 if (CC_ALGO(tcp)->post_recovery != NULL) {
290 tcp->tcp_ccv.curack = seg_ack;
291 CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
292 }
293 tcp->tcp_ccv.t_bytes_acked = 0;
294
295 DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
296 uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
297 }
298
299 /*
300 * Set the MSS associated with a particular tcp based on its current value,
301 * and a new one passed in. Observe minimums and maximums, and reset other
302 * state variables that we want to view as multiples of MSS.
303 *
304 * The value of MSS could be either increased or decreased.
305 */
306 void
307 tcp_mss_set(tcp_t *tcp, uint32_t mss)
308 {
309 uint32_t mss_max;
310 tcp_stack_t *tcps = tcp->tcp_tcps;
311 conn_t *connp = tcp->tcp_connp;
312
313 if (connp->conn_ipversion == IPV4_VERSION)
314 mss_max = tcps->tcps_mss_max_ipv4;
315 else
316 mss_max = tcps->tcps_mss_max_ipv6;
317
318 if (mss < tcps->tcps_mss_min)
319 mss = tcps->tcps_mss_min;
658 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
659
660 /*
661 * Set MSS to the smaller of the two ends of the connection.
662 * We should not have called tcp_mss_set() before, but our
663 * side of the MSS should have been set to a proper value
664 * by tcp_set_destination(). tcp_mss_set() will also set up the
665 * STREAM head parameters properly.
666 *
667 * If we have a larger-than-16-bit window but the other side
668 * didn't want to do window scale, tcp_rwnd_set() will take
669 * care of that.
670 */
671 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
672
673 /*
674 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
675 * updated properly.
676 */
677 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
678
679 if (tcp->tcp_cc_algo->conn_init != NULL)
680 tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
681 }
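/*
 * Editor's note: conn_init is the modular-CC hook that lets the
 * selected algorithm (a CUBIC module, for example) seed its
 * per-connection state in tcp_ccv once the final MSS and the
 * initial cwnd are known.
 */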
682
683 /*
684 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
685 * is filled, return as much as we can. The message passed in may be
686 * multi-part, chained using b_cont. "start" is the starting sequence
687 * number for this piece.
688 */
689 static mblk_t *
690 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
691 {
692 uint32_t end, bytes;
693 mblk_t *mp1;
694 mblk_t *mp2;
695 mblk_t *next_mp;
696 uint32_t u1;
697 tcp_stack_t *tcps = tcp->tcp_tcps;
698
699
700 /* Walk through all the new pieces. */
701 do {
702 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
703 (uintptr_t)INT_MAX);
704 end = start + (int)(mp->b_wptr - mp->b_rptr);
705 next_mp = mp->b_cont;
706 if (start == end) {
707 /* Empty. Blast it. */
708 freeb(mp);
709 continue;
710 }
711 bytes = end - start;
712 mp->b_cont = NULL;
713 TCP_REASS_SET_SEQ(mp, start);
714 TCP_REASS_SET_END(mp, end);
715 mp1 = tcp->tcp_reass_tail;
716 if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
717 if (mp1 != NULL) {
718 /*
719 * New stuff is beyond the tail; link it on the
720 * end.
721 */
722 mp1->b_cont = mp;
723 } else {
724 tcp->tcp_reass_head = mp;
725 }
726 tcp->tcp_reass_tail = mp;
727 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
728 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
729 tcp->tcp_cs.tcp_in_data_unorder_segs++;
730 tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
731 continue;
732 }
733 mp1 = tcp->tcp_reass_head;
734 u1 = TCP_REASS_SEQ(mp1);
735 /* New stuff at the front? */
736 if (SEQ_LT(start, u1)) {
737 /* Yes... Check for overlap. */
738 mp->b_cont = mp1;
739 tcp->tcp_reass_head = mp;
740 tcp_reass_elim_overlap(tcp, mp);
741 continue;
742 }
743 /*
744 * The new piece fits somewhere between the head and tail.
745 * We find our slot, where mp1 precedes us and mp2 trails.
746 */
747 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
748 u1 = TCP_REASS_SEQ(mp2);
749 if (SEQ_LEQ(start, u1))
750 break;
2437 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2438 {
2439 int32_t bytes_acked;
2440 int32_t gap;
2441 mblk_t *mp1;
2442 uint_t flags;
2443 uint32_t new_swnd = 0;
2444 uchar_t *iphdr;
2445 uchar_t *rptr;
2446 int32_t rgap;
2447 uint32_t seg_ack;
2448 int seg_len;
2449 uint_t ip_hdr_len;
2450 uint32_t seg_seq;
2451 tcpha_t *tcpha;
2452 int urp;
2453 tcp_opt_t tcpopt;
2454 ip_pkt_t ipp;
2455 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2456 uint32_t cwnd;
2457 int mss;
2458 conn_t *connp = (conn_t *)arg;
2459 squeue_t *sqp = (squeue_t *)arg2;
2460 tcp_t *tcp = connp->conn_tcp;
2461 tcp_stack_t *tcps = tcp->tcp_tcps;
2462 sock_upcalls_t *sockupcalls;
2463
2464 /*
2465 * RST from fused tcp loopback peer should trigger an unfuse.
2466 */
2467 if (tcp->tcp_fused) {
2468 TCP_STAT(tcps, tcp_fusion_aborted);
2469 tcp_unfuse(tcp);
2470 }
2471
2472 iphdr = mp->b_rptr;
2473 rptr = mp->b_rptr;
2474 ASSERT(OK_32PTR(rptr));
2475
2476 ip_hdr_len = ira->ira_ip_hdr_length;
2525
2526 if (tcp->tcp_state == TCPS_TIME_WAIT) {
2527 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2528 seg_len, tcpha, ira);
2529 return;
2530 }
2531
2532 if (sqp != NULL) {
2533 /*
2534 * This is the correct place to update tcp_last_recv_time. Note
2535 * that it is also updated for tcp structure that belongs to
2536 * global and listener queues which do not really need updating.
2537 * But that should not cause any harm. And it is updated for
2538 * all kinds of incoming segments, not only for data segments.
2539 */
2540 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2541 }
2542
2543 flags = (unsigned int)tcpha->tha_flags & 0xFF;
2544
2545 TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2546 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2547
2548 if ((flags & TH_URG) && sqp != NULL) {
2549 /*
2550 * TCP can't handle urgent pointers that arrive before
2551 * the connection has been accept()ed since it can't
2552 * buffer OOB data. Discard segment if this happens.
2553 *
2554 * We can't just rely on a non-null tcp_listener to indicate
2555 * that the accept() has completed since unlinking of the
2556 * eager and completion of the accept are not atomic.
2557 * tcp_detached, when it is not set (B_FALSE), indicates
2558 * that the accept() has completed.
2559 *
2560 * Nor can it reassemble urgent pointers, so discard
2561 * if it's not the next segment expected.
2562 *
2563 * Otherwise, collapse chain into one mblk (discard if
2564 * that fails). This makes sure the headers, retransmitted
2565 * data, and new data all are in the same mblk.
2712 tcp->tcp_suna = tcp->tcp_iss + 1;
2713 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2714
2715 /*
2716 * If SYN was retransmitted, need to reset all
2717 * retransmission info. This is because this
2718 * segment will be treated as a dup ACK.
2719 */
2720 if (tcp->tcp_rexmit) {
2721 tcp->tcp_rexmit = B_FALSE;
2722 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2723 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2724 tcp->tcp_ms_we_have_waited = 0;
2725
2726 /*
2727 * Set tcp_cwnd back to 1 MSS, per
2728 * recommendation from
2729 * draft-floyd-incr-init-win-01.txt,
2730 * Increasing TCP's Initial Window.
2731 */
2732 DTRACE_PROBE3(cwnd__retransmitted__syn,
2733 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
2734 uint32_t, tcp->tcp_mss);
2735 tcp->tcp_cwnd = tcp->tcp_mss;
2736 }
2737
2738 tcp->tcp_swl1 = seg_seq;
2739 tcp->tcp_swl2 = seg_ack;
2740
2741 new_swnd = ntohs(tcpha->tha_win);
2742 tcp->tcp_swnd = new_swnd;
2743 if (new_swnd > tcp->tcp_max_swnd)
2744 tcp->tcp_max_swnd = new_swnd;
2745
2746 /*
2747 * Always send the three-way handshake ack immediately
2748 * in order to make the connection complete as soon as
2749 * possible on the accepting host.
2750 */
2751 flags |= TH_ACK_NEEDED;
2752
2753 /*
2754 * Trace connect-established here.
2773 if (tcp->tcp_loopback) {
2774 mblk_t *ack_mp;
2775
2776 ASSERT(!tcp->tcp_unfusable);
2777 ASSERT(mp1 != NULL);
2778 /*
2779 * For loopback, we always get a pure SYN-ACK
2780 * and only need to send back the final ACK
2781 * with no data (this is because the other
2782 * tcp is ours and we don't do T/TCP). This
2783 * final ACK triggers the passive side to
2784 * perform fusion in ESTABLISHED state.
2785 */
2786 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2787 if (tcp->tcp_ack_tid != 0) {
2788 (void) TCP_TIMER_CANCEL(tcp,
2789 tcp->tcp_ack_tid);
2790 tcp->tcp_ack_tid = 0;
2791 }
2792 tcp_send_data(tcp, ack_mp);
2793 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2794 TCPS_BUMP_MIB(tcps, tcpOutAck);
2795
2796 if (!IPCL_IS_NONSTR(connp)) {
2797 /* Send up T_CONN_CON */
2798 if (ira->ira_cred != NULL) {
2799 mblk_setcred(mp1,
2800 ira->ira_cred,
2801 ira->ira_cpid);
2802 }
2803 putnext(connp->conn_rq, mp1);
2804 } else {
2805 (*sockupcalls->su_connected)
2806 (connp->conn_upper_handle,
2807 tcp->tcp_connid,
2808 ira->ira_cred,
2809 ira->ira_cpid);
2810 freemsg(mp1);
2811 }
2812
2813 freemsg(mp);
3162 mp2 = mp;
3163 mp = mp->b_cont;
3164 freeb(mp2);
3165 } while (gap < 0);
3166 /*
3167 * If the urgent data has already been acknowledged, we
3168 * should ignore TH_URG below
3169 */
3170 if (urp < 0)
3171 flags &= ~TH_URG;
3172 }
3173 /*
3174 * rgap measures how the segment fits within our receive window; a
3175 * negative value means that many bytes arrived beyond the window.
3176 */
3177 if (rgap < 0) {
3178 mblk_t *mp2;
3179
3180 if (tcp->tcp_rwnd == 0) {
3181 TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3182 tcp->tcp_cs.tcp_in_zwnd_probes++;
3183 } else {
3184 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3185 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3186 }
3187
3188 /*
3189 * seg_len does not include the FIN, so if more than
3190 * just the FIN is out of window, we act like we don't
3191 * see it. (If just the FIN is out of window, rgap
3192 * will be zero and we will go ahead and acknowledge
3193 * the FIN.)
3194 */
3195 flags &= ~TH_FIN;
3196
3197 /* Fix seg_len and make sure there is something left. */
3198 seg_len += rgap;
3199 if (seg_len <= 0) {
3200 /*
3201 * Resets are only valid if they lie within our offered
3202 * window. If the RST bit is set, we just ignore this
3412 *
3413 * But TCP should not perform fast retransmit
3414 * because of the ack number. TCP uses
3415 * seg_len == 0 to determine if it is a pure
3416 * ACK. And this is not a pure ACK.
3417 */
3418 seg_len = 0;
3419 ofo_seg = B_TRUE;
3420
3421 if (tcps->tcps_reass_timeout != 0 &&
3422 tcp->tcp_reass_tid == 0) {
3423 tcp->tcp_reass_tid = TCP_TIMER(tcp,
3424 tcp_reass_timer,
3425 tcps->tcps_reass_timeout);
3426 }
3427 }
3428 }
3429 } else if (seg_len > 0) {
3430 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3431 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3432 tcp->tcp_cs.tcp_in_data_inorder_segs++;
3433 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
3434
3435 /*
3436 * If an out of order FIN was received before, and the seq
3437 * num and len of the new segment match that of the FIN,
3438 * put the FIN flag back in.
3439 */
3440 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3441 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3442 flags |= TH_FIN;
3443 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3444 }
3445 }
3446 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3447 if (flags & TH_RST) {
3448 freemsg(mp);
3449 switch (tcp->tcp_state) {
3450 case TCPS_SYN_RCVD:
3451 (void) tcp_clean_death(tcp, ECONNREFUSED);
3452 break;
3453 case TCPS_ESTABLISHED:
3454 case TCPS_FIN_WAIT_1:
3480 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3481 freemsg(mp);
3482 /*
3483 * If the ACK flag is not set, just use our snxt as the
3484 * seq number of the RST segment.
3485 */
3486 if (!(flags & TH_ACK)) {
3487 seg_ack = tcp->tcp_snxt;
3488 }
3489 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3490 TH_RST|TH_ACK);
3491 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3492 (void) tcp_clean_death(tcp, ECONNRESET);
3493 return;
3494 }
3495 /*
3496 * urp could be -1 when the urp field in the packet is 0
3497 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3498 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3499 */
3500 if ((flags & TH_URG) && urp >= 0) {
3501 if (!tcp->tcp_urp_last_valid ||
3502 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3503 /*
3504 * Non-STREAMS sockets handle the urgent data a little
3505 * differently from STREAMS based sockets. There is no
3506 * need to mark any mblks with the MSG{NOT,}MARKNEXT
3507 * flags to keep SIOCATMARK happy. Instead a
3508 * su_signal_oob upcall is made to update the mark.
3509 * Nor does a T_EXDATA_IND mblk need to be
3510 * prepended to the urgent data. The urgent data is
3511 * delivered using the su_recv upcall, where we set
3512 * the MSG_OOB flag to indicate that it is urg data.
3513 *
3514 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3515 * are used by non-STREAMS sockets.
3516 */
3517 if (IPCL_IS_NONSTR(connp)) {
3518 if (!TCP_IS_DETACHED(tcp)) {
3519 (*sockupcalls->su_signal_oob)
3520 (connp->conn_upper_handle, urp);
3937 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3938 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3939 }
3940 TCPS_CONN_INC(tcps);
3941
3942 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3943 bytes_acked--;
3944 /* SYN was acked - making progress */
3945 tcp->tcp_ip_forward_progress = B_TRUE;
3946
3947 /*
3948 * If SYN was retransmitted, need to reset all
3949 * retransmission info as this segment will be
3950 * treated as a dup ACK.
3951 */
3952 if (tcp->tcp_rexmit) {
3953 tcp->tcp_rexmit = B_FALSE;
3954 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3955 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3956 tcp->tcp_ms_we_have_waited = 0;
3957 DTRACE_PROBE3(cwnd__retransmitted__syn,
3958 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
3959 uint32_t, tcp->tcp_mss);
3960 tcp->tcp_cwnd = mss;
3961 }
3962
3963 /*
3964 * We set the send window to zero here.
3965 * This is needed if there is data to be
3966 * processed already on the queue.
3967 * Later (at the swnd_update label), when the
3968 * "new_swnd > tcp_swnd" condition is satisfied,
3969 * the XMIT_NEEDED flag is set in the current
3970 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3971 * called if there is already data on the queue in
3972 * this state.
3973 */
3974 tcp->tcp_swnd = 0;
3975
3976 if (new_swnd > tcp->tcp_max_swnd)
3977 tcp->tcp_max_swnd = new_swnd;
3978 tcp->tcp_swl1 = seg_seq;
3979 tcp->tcp_swl2 = seg_ack;
3983 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3984 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3985 int32_t, TCPS_SYN_RCVD);
3986
3987 /* Fuse when both sides are in ESTABLISHED state */
3988 if (tcp->tcp_loopback && do_tcp_fusion)
3989 tcp_fuse(tcp, iphdr, tcpha);
3990
3991 }
3992 /* This code mostly follows 4.4BSD-Lite2. */
3993 if (bytes_acked < 0)
3994 goto est;
3995
3996 /*
3997 * If TCP is ECN capable and the ECN-Echo (ECE) bit is
3998 * set, reduce tcp_cwnd and tcp_ssthresh. This should only be
3999 * done once per window (or more loosely, once per RTT).
4000 */
4001 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
4002 tcp->tcp_cwr = B_FALSE;
4003 if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
4004 cc_cong_signal(tcp, seg_ack, CC_ECN);
4005 /*
4006 * If the cwnd is 0, use the timer to clock out
4007 * new segments. This is required by the ECN spec.
4008 */
4009 if (tcp->tcp_cwnd == 0)
4010 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4011 tcp->tcp_cwr = B_TRUE;
4012 /*
4013 * This marks the end of the current window of in
4014 * flight data. That is why we don't use
4015 * tcp_suna + tcp_swnd. Only data in flight can
4016 * provide ECN info.
4017 */
4018 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4019 }
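/*
 * Example (hypothetical sequence numbers): if tcp_snxt is
 * 15000 when the first ECE-marked ACK arrives, tcp_cwr_snd_max
 * becomes 15000 and further ECE bits are ignored while tcp_cwr
 * is set. Only once an ACK beyond 15000 clears tcp_cwr above
 * can another ECE reduce cwnd/ssthresh again, giving the
 * once-per-window behavior described above.
 */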
4020
4021 mp1 = tcp->tcp_xmit_head;
4022 if (bytes_acked == 0) {
4023 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
4024 int dupack_cnt;
4025
4026 TCPS_BUMP_MIB(tcps, tcpInDupAck);
4027 /*
4028 * Fast retransmit. When we have seen exactly three
4029 * identical ACKs while we have unacked data
4030 * outstanding, we take it as a hint that our peer
4031 * dropped something.
4032 *
4033 * If TCP is retransmitting, don't do fast retransmit.
4034 */
4035 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
4036 !tcp->tcp_rexmit) {
4037 /* Do Limited Transmit */
4038 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
4039 tcps->tcps_dupack_fast_retransmit) {
4040 cc_ack_received(tcp, seg_ack,
4041 bytes_acked, CC_DUPACK);
4042 /*
4043 * RFC 3042
4044 *
4045 * What we need to do is temporarily
4046 * increase tcp_cwnd so that new
4047 * data can be sent if it is allowed
4048 * by the receive window (tcp_rwnd).
4049 * tcp_wput_data() will take care of
4050 * the rest.
4051 *
4052 * If the connection is SACK capable,
4053 * only do limited xmit when there
4054 * is SACK info.
4055 *
4056 * Note how tcp_cwnd is incremented.
4057 * The first dup ACK will increase
4058 * it by 1 MSS. The second dup ACK
4059 * will increase it by 2 MSS. This
4060 * means that only 1 new segment will
4061 * be sent for each dup ACK.
4068 (tcp->tcp_dupack_cnt - 1);
4069 flags |= TH_LIMIT_XMIT;
4070 }
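/*
 * Worked example for the limited transmit arithmetic above
 * (illustrative; assumes mss = 1460 and the usual dup ACK
 * threshold of 3): the first dup ACK inflates cwnd by
 * 1460 (mss << 0), the second by 2920 (mss << 1). Since
 * tcp_snxt already advanced by one segment after the first
 * limited transmit, each dup ACK nets exactly one new
 * segment on the wire, per RFC 3042. The inflation is
 * undone in the TH_LIMIT_XMIT handling further below.
 */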
4071 } else if (dupack_cnt ==
4072 tcps->tcps_dupack_fast_retransmit) {
4073
4074 /*
4075 * If we have reduced tcp_ssthresh
4076 * because of ECN, do not reduce it again
4077 * unless it is already one window of data
4078 * away. After one window of data, tcp_cwr
4079 * should then be cleared. Note that
4080 * for a non-ECN-capable connection, tcp_cwr
4081 * should always be false.
4082 *
4083 * Adjust cwnd since the duplicate
4084 * ack indicates that a packet was
4085 * dropped (due to congestion).
4086 */
4087 if (!tcp->tcp_cwr) {
4088 cc_cong_signal(tcp, seg_ack,
4089 CC_NDUPACK);
4090 cc_ack_received(tcp, seg_ack,
4091 bytes_acked, CC_DUPACK);
4092 }
4093 if (tcp->tcp_ecn_ok) {
4094 tcp->tcp_cwr = B_TRUE;
4095 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4096 tcp->tcp_ecn_cwr_sent = B_FALSE;
4097 }
4098
4099 /*
4100 * We use Hoe's algorithm. Refer to her
4101 * paper "Improving the Start-up Behavior
4102 * of a Congestion Control Scheme for TCP,"
4103 * which appeared in SIGCOMM '96.
4104 *
4105 * Save highest seq no we have sent so far.
4106 * Be careful about the invisible FIN byte.
4107 */
4108 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4109 (tcp->tcp_unsent == 0)) {
4110 tcp->tcp_rexmit_max = tcp->tcp_fss;
4111 } else {
4133 tcp->tcp_fack;
4134 tcp->tcp_sack_snxt = seg_ack;
4135 flags |= TH_NEED_SACK_REXMIT;
4136 } else {
4137 /*
4138 * Always initialize tcp_pipe
4139 * even though we don't have
4140 * any SACK info. If we later
4141 * get SACK info while tcp_pipe
4142 * is uninitialized, the pipe
4143 * estimate will be inconsistent.
4144 */
4145 tcp->tcp_pipe =
4146 tcp->tcp_cwnd_ssthresh;
4147 }
4148 } else {
4149 flags |= TH_REXMIT_NEEDED;
4150 } /* tcp_snd_sack_ok */
4151
4152 } else {
4153 cc_ack_received(tcp, seg_ack,
4154 bytes_acked, CC_DUPACK);
4155 /*
4156 * Here we perform congestion
4157 * avoidance, but NOT slow start.
4158 * This is known as the Fast
4159 * Recovery Algorithm.
4160 */
4161 if (tcp->tcp_snd_sack_ok &&
4162 tcp->tcp_notsack_list != NULL) {
4163 flags |= TH_NEED_SACK_REXMIT;
4164 tcp->tcp_pipe -= mss;
4165 if (tcp->tcp_pipe < 0)
4166 tcp->tcp_pipe = 0;
4167 } else {
4168 /*
4169 * We know that one more packet has
4170 * left the pipe, so we can update
4171 * cwnd.
4172 */
4173 cwnd = tcp->tcp_cwnd + mss;
4174 if (cwnd > tcp->tcp_cwnd_max)
4175 cwnd = tcp->tcp_cwnd_max;
4176 DTRACE_PROBE3(cwnd__fast__recovery,
4177 tcp_t *, tcp,
4178 uint32_t, tcp->tcp_cwnd,
4179 uint32_t, cwnd);
4180 tcp->tcp_cwnd = cwnd;
4181 if (tcp->tcp_unsent > 0)
4182 flags |= TH_XMIT_NEEDED;
4183 }
4184 }
4185 }
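/*
 * In other words, past the third dup ACK each additional
 * dup ACK either shrinks tcp_pipe by one mss (SACK) or
 * grows cwnd by one mss (non-SACK): every dup ACK signals
 * that one segment has left the network, so one new
 * segment may enter it.
 */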
4186 } else if (tcp->tcp_zero_win_probe) {
4187 /*
4188 * If the window has opened, we need to arrange
4189 * to send additional data.
4190 */
4191 if (new_swnd != 0) {
4192 /* tcp_suna != tcp_snxt */
4193 /* Packet contains a window update */
4194 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4195 tcp->tcp_zero_win_probe = 0;
4196 tcp->tcp_timer_backoff = 0;
4197 tcp->tcp_ms_we_have_waited = 0;
4198
4199 /*
4262 * greater than 0, check if the number of such
4263 * bogus ACKs is greater than that count. If yes,
4264 * don't send back any ACK. This prevents TCP from
4265 * getting into an ACK storm if somehow an attacker
4266 * successfully spoofs an acceptable segment to our
4267 * peer. If this continues (count > 2 x the threshold),
4268 * we should abort this connection.
4269 */
4270 if (tcp_drop_ack_unsent_cnt > 0 &&
4271 ++tcp->tcp_in_ack_unsent >
4272 tcp_drop_ack_unsent_cnt) {
4273 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4274 if (tcp->tcp_in_ack_unsent > 2 *
4275 tcp_drop_ack_unsent_cnt) {
4276 (void) tcp_clean_death(tcp, EPROTO);
4277 }
4278 return;
4279 }
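/*
 * For example, with a hypothetical threshold of
 * tcp_drop_ack_unsent_cnt = 10, the 11th consecutive ACK
 * of unsent data is dropped without a reply, and the
 * connection is aborted once the count exceeds 20
 * (2 x the threshold).
 */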
4280 mp = tcp_ack_mp(tcp);
4281 if (mp != NULL) {
4282 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4283 TCPS_BUMP_MIB(tcps, tcpOutAck);
4284 tcp_send_data(tcp, mp);
4285 }
4286 return;
4287 }
4288 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4289 tcp->tcp_snxt_shrunk)) {
4290 tcp->tcp_is_wnd_shrnk = B_FALSE;
4291 }
4292
4293 /*
4294 * TCP got a new ACK. Update the notsack'ed list to delete those
4295 * blocks that are covered by this ACK.
4296 */
4297 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4298 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4299 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4300 }
4301
4302 /*
4303 * If we got an ACK after fast retransmit, check to see
4304 * if it is a partial ACK. If it is not and the congestion
4305 * window was inflated to account for the other side's
4306 * cached packets, retract it. If it is, do Hoe's algorithm.
4307 */
4308 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4309 ASSERT(tcp->tcp_rexmit == B_FALSE);
4310 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4311 tcp->tcp_dupack_cnt = 0;
4312
4313 cc_post_recovery(tcp, seg_ack);
4314
4315 tcp->tcp_rexmit_max = seg_ack;
4316
4317 /*
4318 * Remove all notsack info to avoid confusion with
4319 * the next fast retransmit/recovery phase.
4320 */
4321 if (tcp->tcp_snd_sack_ok) {
4322 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4323 tcp);
4324 }
4325 } else {
4326 if (tcp->tcp_snd_sack_ok &&
4327 tcp->tcp_notsack_list != NULL) {
4328 flags |= TH_NEED_SACK_REXMIT;
4329 tcp->tcp_pipe -= mss;
4330 if (tcp->tcp_pipe < 0)
4331 tcp->tcp_pipe = 0;
4332 } else {
4333 /*
4334 * Hoe's algorithm:
4335 *
4336 * Retransmit the unack'ed segment and
4337 * restart fast recovery. Note that we
4338 * need to scale back tcp_cwnd to the
4339 * original value when we started fast
4340 * recovery. This is to prevent overly
4341 * aggressive behavior in sending new
4342 * segments.
4343 */
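/*
 * A sketch with hypothetical values: if ssthresh was set
 * to 6 * mss when recovery began and
 * tcps_dupack_fast_retransmit is 3, a partial ACK resets
 * cwnd to (6 + 3) * mss here instead of leaving the fully
 * inflated fast-recovery window in place.
 */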
4344 cwnd = tcp->tcp_cwnd_ssthresh +
4345 tcps->tcps_dupack_fast_retransmit * mss;
4346 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
4347 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
4348 uint32_t, cwnd);
4349 tcp->tcp_cwnd = cwnd;
4350 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4351 flags |= TH_REXMIT_NEEDED;
4352 }
4353 }
4354 } else {
4355 tcp->tcp_dupack_cnt = 0;
4356 if (tcp->tcp_rexmit) {
4357 /*
4358 * TCP is retransmitting. If the ACK acks all
4359 * outstanding data, update tcp_rexmit_max and
4360 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
4361 * to the correct value.
4362 *
4363 * Note that SEQ_LEQ() is used. This is to avoid
4364 * unnecessary fast retransmit caused by dup ACKs
4365 * received when TCP does slow start retransmission
4366 * after a timeout. During this phase, TCP may
4367 * send out segments that have already been received.
4368 * This causes dup ACKs to be sent back.
4369 */
4390 tcp->tcp_timer_backoff = 0;
4391 }
4392
4393 /*
4394 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed;
4395 * it cannot be the SYN being ack'ed, since the code flow
4396 * would not reach here in that case.
4397 */
4398 if (mp1 == NULL) {
4399 goto fin_acked;
4400 }
4401
4402 /*
4403 * Update the congestion window.
4404 *
4405 * If TCP is not ECN capable or TCP is ECN capable but the
4406 * congestion experience bit is not set, increase the tcp_cwnd as
4407 * usual.
4408 */
4409 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4410 if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
4411 EXIT_RECOVERY(tcp->tcp_ccv.flags);
4412 }
4413 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4414 }
4415
4416 /* See if the latest urgent data has been acknowledged */
4417 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4418 SEQ_GT(seg_ack, tcp->tcp_urg))
4419 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4420
4421 /*
4422 * Update the RTT estimates. Note that we don't use the TCP
4423 * timestamp option to calculate RTT even if one is present. This is
4424 * because the timestamp option's resolution (CPU tick) is
4425 * too coarse to measure modern datacenter networks' microsecond
4426 * latencies. The timestamp field's resolution is limited by its
4427 * 4-byte width (see RFC 1323), and since we always store a
4428 * high-resolution nanosecond-precision timestamp along with the data,
4429 * there is no point in ever using the timestamp option.
4430 */
4431 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4432 /*
4433 * An ACK sequence we haven't seen before, so get the RTT
4434 * and update the RTO. But first check if the timestamp is
4435 * valid to use.
4436 */
4437 if ((mp1->b_next != NULL) &&
4438 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
4439 #ifdef KERNEL_32
4440 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4441 (int32_t)(intptr_t)mp1->b_prev);
4442 #else
4443 tcp_set_rto(tcp, gethrtime() -
4444 (hrtime_t)(intptr_t)mp1->b_prev);
4445 #endif
4446 } else {
4447 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4448 }
4449
4450 /* Remember the last sequence to be ACKed */
4451 tcp->tcp_csuna = seg_ack;
4452 if (tcp->tcp_set_timer == 1) {
4453 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4454 tcp->tcp_set_timer = 0;
4455 }
4456 } else {
4457 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4458 }
4459
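/*
 * Note on the loop below: on the transmit queue each mblk
 * overloads two pointer fields, b_prev holding the
 * high-resolution send timestamp and b_next the sequence
 * number that timestamp covers (both consumed by the RTT
 * code above). Walking the chain therefore frees acked
 * data and maintains the timing state at the same time.
 */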
4460 /* Eat acknowledged bytes off the xmit queue. */
4461 for (;;) {
4462 mblk_t *mp2;
4463 uchar_t *wptr;
4464
4465 wptr = mp1->b_wptr;
4466 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4467 bytes_acked -= (int)(wptr - mp1->b_rptr);
4468 if (bytes_acked < 0) {
4469 mp1->b_rptr = wptr + bytes_acked;
4470 /*
4471 * Set a new timestamp if all the bytes timed by the
4472 * old timestamp have been ack'ed.
4473 */
4474 if (SEQ_GT(seg_ack,
4475 (uint32_t)(uintptr_t)(mp1->b_next))) {
4476 #ifdef KERNEL_32
4477 mp1->b_prev =
4478 (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
4479 #else
4480 mp1->b_prev =
4481 (mblk_t *)(intptr_t)gethrtime();
4482 #endif
4483 mp1->b_next = NULL;
4484 }
4485 break;
4486 }
4487 mp1->b_next = NULL;
4488 mp1->b_prev = NULL;
4489 mp2 = mp1;
4490 mp1 = mp1->b_cont;
4491
4492 /*
4493 * This notification is required for some zero-copy
4494 * clients to maintain copy semantics. After the data
4495 * is ack'ed, the client is free to modify or reuse the buffer.
4496 */
4497 if (tcp->tcp_snd_zcopy_aware &&
4498 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4499 tcp_zcopy_notify(tcp);
4500 freeb(mp2);
4501 if (bytes_acked == 0) {
4502 if (mp1 == NULL) {
4938 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
4939 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4940 goto done;
4941
4942 /* Any transmit work to do and a non-zero window? */
4943 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4944 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4945 if (flags & TH_REXMIT_NEEDED) {
4946 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4947
4948 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4949 if (snd_size > mss)
4950 snd_size = mss;
4951 if (snd_size > tcp->tcp_swnd)
4952 snd_size = tcp->tcp_swnd;
4953 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4954 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4955 B_TRUE);
4956
4957 if (mp1 != NULL) {
4958 #ifdef KERNEL_32
4959 tcp->tcp_xmit_head->b_prev =
4960 (mblk_t *)LBOLT_FASTPATH;
4961 #else
4962 tcp->tcp_xmit_head->b_prev =
4963 (mblk_t *)(intptr_t)gethrtime();
4964 #endif
4965 tcp->tcp_csuna = tcp->tcp_snxt;
4966 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4967 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4968 snd_size);
4969 tcp->tcp_cs.tcp_out_retrans_segs++;
4970 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4971 tcp_send_data(tcp, mp1);
4972 }
4973 }
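/*
 * Note the clamping above: a fast retransmit sends at most
 * one mss and never more than the peer's advertised window.
 * E.g. with 8760 bytes unacked, mss = 1460 and a
 * hypothetical swnd of 1000, only 1000 bytes starting at
 * tcp_suna are retransmitted.
 */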
4974 if (flags & TH_NEED_SACK_REXMIT) {
4975 tcp_sack_rexmit(tcp, &flags);
4976 }
4977 /*
4978 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4979 * out a new segment. Note that tcp_rexmit should not be
4980 * set; otherwise TH_LIMIT_XMIT would not have been set.
4981 */
4982 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4983 if (!tcp->tcp_rexmit) {
4984 tcp_wput_data(tcp, NULL, B_FALSE);
4985 } else {
4986 tcp_ss_rexmit(tcp);
4987 }
4988 }
4989 /*
4990 * Adjust tcp_cwnd back to normal value after sending
4991 * new data segments.
4992 */
4993 if (flags & TH_LIMIT_XMIT) {
4994 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4995 /*
4996 * This will restart the timer. Restarting the
4997 * timer avoids a timeout before the
4998 * limited-transmit segment's ACK gets back.
4999 */
5000 if (tcp->tcp_xmit_head != NULL) {
5001 #ifdef KERNEL_32
5002 tcp->tcp_xmit_head->b_prev =
5003 (mblk_t *)LBOLT_FASTPATH;
5004 #else
5005 tcp->tcp_xmit_head->b_prev =
5006 (mblk_t *)(intptr_t)gethrtime();
5007 #endif
5008 }
5009 }
5010
5011 /* Anything more to do? */
5012 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
5013 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
5014 goto done;
5015 }
5016 ack_check:
5017 if (flags & TH_SEND_URP_MARK) {
5018 ASSERT(tcp->tcp_urp_mark_mp);
5019 ASSERT(!IPCL_IS_NONSTR(connp));
5020 /*
5021 * Send up any queued data and then send the mark message.
5022 */
5023 if (tcp->tcp_rcv_list != NULL) {
5024 flags |= tcp_rcv_drain(tcp);
5026 }
5027 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
5028 mp1 = tcp->tcp_urp_mark_mp;
5029 tcp->tcp_urp_mark_mp = NULL;
5031 tcp_setcred_data(mp1, ira);
5032
5033 putnext(connp->conn_rq, mp1);
5034 #ifdef DEBUG
5035 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
5036 "tcp_rput: sending zero-length %s %s",
5037 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
5038 "MSGNOTMARKNEXT"),
5039 tcp_display(tcp, NULL, DISP_PORT_ONLY));
5040 #endif /* DEBUG */
5041 flags &= ~TH_SEND_URP_MARK;
5042 }
5043 if (flags & TH_ACK_NEEDED) {
5044 /*
5045 * One of the checks above has determined that an ACK must be sent now.
5046 */
5047 mp1 = tcp_ack_mp(tcp);
5048
5049 if (mp1 != NULL) {
5050 tcp_send_data(tcp, mp1);
5051 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
5052 TCPS_BUMP_MIB(tcps, tcpOutAck);
5053 }
5054 if (tcp->tcp_ack_tid != 0) {
5055 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
5056 tcp->tcp_ack_tid = 0;
5057 }
5058 }
5059 if (flags & TH_ACK_TIMER_NEEDED) {
5060 /*
5061 * Arrange for a deferred ACK or push-wait timeout.
5062 * Start the timer if it is not already running.
5063 */
5064 if (tcp->tcp_ack_tid == 0) {
5065 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
5066 tcp->tcp_localnet ?
5067 tcps->tcps_local_dack_interval :
5068 tcps->tcps_deferred_ack_interval);
5069 }
5070 }
5071 if (flags & TH_ORDREL_NEEDED) {
5324 }
5325 if (addflag.crb_ipv6_recvdstopts) {
5326 toh = (struct T_opthdr *)optptr;
5327 toh->level = IPPROTO_IPV6;
5328 toh->name = IPV6_DSTOPTS;
5329 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5330 toh->status = 0;
5331 optptr += sizeof (*toh);
5332 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5333 optptr += ipp->ipp_dstoptslen;
5334 ASSERT(OK_32PTR(optptr));
5335 /* Save as last value */
5336 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5337 (ipp->ipp_fields & IPPF_DSTOPTS),
5338 ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5339 }
5340 ASSERT(optptr == mp->b_wptr);
5341 return (mp);
5342 }
5343
5344 /* The minimum allowed smoothed mean deviation in the RTO calculation (nsec). */
5345 #define TCP_SD_MIN 400000000
5346
5347 /*
5348 * Set RTO for this connection based on a new round-trip time measurement.
5349 * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5350 * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
5351 * of that paper.
5352 *
5353 * m = new measurement
5354 * sa = smoothed RTT average, maintained scaled by 8.
5355 * sv = smoothed mean deviation (mdev) of RTT, maintained scaled by 4.
5356 */
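/*
 * Worked example (illustrative; values in msec for
 * readability, though the code operates in nsec): a first
 * sample of m = 100 yields sa = 100 << 3 = 800 and
 * sv = 100 << 1 = 200, which the TCP_SD_MIN clamp below
 * raises to 400. A second sample of m = 150 gives
 * Error = 150 - 800/8 = 50, so sa becomes 850
 * (SRTT ~ 106) and sv becomes 400 + (50 - 400/4) = 350,
 * again clamped to 400. tcp_calculate_rto() (defined
 * elsewhere) then derives the RTO from sa and sv;
 * classically this is sa/8 + sv.
 */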
5357 static void
5358 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5359 {
5360 hrtime_t m = rtt;
5361 hrtime_t sa = tcp->tcp_rtt_sa;
5362 hrtime_t sv = tcp->tcp_rtt_sd;
5363 tcp_stack_t *tcps = tcp->tcp_tcps;
5364
5365 TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5366 tcp->tcp_rtt_update++;
5367 tcp->tcp_rtt_sum += m;
5368 tcp->tcp_rtt_cnt++;
5369
5370 /* A nonzero tcp_rtt_sa means we already have an estimate to update. */
5371 if (sa != 0) {
5372 /*
5373 * Update average estimator (see section 2.3 of RFC6298):
5374 * SRTT = 7/8 SRTT + 1/8 rtt
5375 *
5376 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5377 * tcp_rtt_sa = 7 SRTT + rtt
5378 * tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
5379 * tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
5380 *
5381 * (rtt - 1/8 tcp_rtt_sa) is simply the difference
5382 * between the new rtt measurement and the existing smoothed
5383 * RTT average. This is referred to as "Error" in subsequent
5384 * calculations.
5385 */
5386
5387 /* m is now Error. */
5388 m -= sa >> 3;
5389 if ((sa += m) <= 0) {
5390 /*
5391 * Don't allow the smoothed average to be negative.
5392 * We use 0 to denote reinitialization of the
5393 * variables.
5394 */
5395 sa = 1;
5396 }
5397
5398 /*
5399 * Update deviation estimator:
5400 * mdev = 3/4 mdev + 1/4 abs(Error)
5401 *
5402 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
5403 * tcp_rtt_sd = 3 mdev + abs(Error)
5404 * tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
5405 */
5406 if (m < 0)
5407 m = -m;
5408 m -= sv >> 2;
5409 sv += m;
5410 } else {
5411 /*
5412 * This follows BSD's implementation, so the reinitialized
5413 * RTO is 3 * m. We cannot use a multiplier less than 2,
5414 * because if the link is bandwidth dominated, doubling the
5415 * window size during slow start means doubling the RTT. We
5416 * want to be more conservative when we reinitialize our
5417 * estimates, and 3 is just a convenient number.
5418 */
5419 sa = m << 3;
5420 sv = m << 1;
5421 }
5422 if (sv < TCP_SD_MIN) {
5423 /*
5424 * We do not know whether sa captures the delayed-ACK
5425 * effect; in a long train of segments, a receiver may
5426 * not delay its ACKs at all. So enforce a minimum sv of
5427 * TCP_SD_MIN, which is 400 ms, twice BSD's DATO. Since
5428 * sv is maintained as 4 * mdev, the minimum mean
5429 * deviation is 100 ms.
5430 */
5431 sv = TCP_SD_MIN;
5432 }
5433 tcp->tcp_rtt_sa = sa;
5434 tcp->tcp_rtt_sd = sv;
5435
5436 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
5437
5438 /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5439 tcp->tcp_timer_backoff = 0;
5440 }
5441
5442 /*
5443 * On a labeled system we have some protocols above TCP, such as RPC, which
5444 * appear to assume that every mblk in a chain has a db_credp.
5445 */
5446 static void
5447 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5448 {
5449 ASSERT(is_system_labeled());
5450 ASSERT(ira->ira_cred != NULL);
5451
5452 while (mp != NULL) {
5453 mblk_setcred(mp, ira->ira_cred, NOPID);
5454 mp = mp->b_cont;
5455 }
5456 }
5741 default:
5742 break;
5743 }
5744 break;
5745 case ICMP_SOURCE_QUENCH:
5746 /*
5747 * Use a global boolean to control whether TCP
5748 * should respond to ICMP_SOURCE_QUENCH.
5749 * The default is false.
5750 */
5751 if (tcp_icmp_source_quench) {
5752 /*
5753 * Reduce the sending rate as if we got a
5754 * retransmit timeout.
5755 */
5756 uint32_t npkt;
5757
5758 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5759 tcp->tcp_mss;
5760 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5761
5762 DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
5763 uint32_t, tcp->tcp_cwnd,
5764 uint32_t, tcp->tcp_mss);
5765 tcp->tcp_cwnd = tcp->tcp_mss;
5766 tcp->tcp_cwnd_cnt = 0;
5767 }
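/*
 * For instance, with mss = 1460 and 20 segments in flight
 * (snxt - suna = 29200), npkt = 14600 / 1460 = 10, so
 * ssthresh becomes 10 * 1460 = 14600 while cwnd restarts
 * at a single mss, mimicking the reaction to a retransmit
 * timeout.
 */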
5768 break;
5769 }
5770 freemsg(mp);
5771 }
5772
5773 /*
5774 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5775 * error messages passed up by IP.
5776 * Assumes that IP has pulled up all the extension headers as well
5777 * as the ICMPv6 header.
5778 */
5779 static void
5780 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5781 {
5782 icmp6_t *icmp6;
5783 ip6_t *ip6h;
5784 uint16_t iph_hdr_length = ira->ira_ip_hdr_length;