Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2017 Joyent, Inc.
  26  * Copyright (c) 2014 by Delphix. All rights reserved.
  27  */
  28 
  29 /* This file contains all TCP input processing functions. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/suntpi.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/tsol/tnet.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>


/* NOTE(review): presumably a sanity bound (bytes) used when validating the initial window — confirm at use site. */
 149 static uint32_t tcp_init_wnd_chk = 4096;
 150 
 151 /* Process ICMP source quench message or not. */
 152 static boolean_t tcp_icmp_source_quench = B_FALSE;
 153 
/* NOTE(review): looks like a tunable enabling squeue switching on the outbound path; off by default — verify consumers. */
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
/*
 * Forward declarations for static functions defined later in this file
 * (e.g. tcp_reass() and tcp_reass_elim_overlap() appear below).
 */
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, time_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*































































































































 173  * Set the MSS associated with a particular tcp based on its current value,
 174  * and a new one passed in. Observe minimums and maximums, and reset other
 175  * state variables that we want to view as multiples of MSS.
 176  *
 177  * The value of MSS could be either increased or decreased.
 178  */
 179 void
 180 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 181 {
 182         uint32_t        mss_max;
 183         tcp_stack_t     *tcps = tcp->tcp_tcps;
 184         conn_t          *connp = tcp->tcp_connp;
 185 
 186         if (connp->conn_ipversion == IPV4_VERSION)
 187                 mss_max = tcps->tcps_mss_max_ipv4;
 188         else
 189                 mss_max = tcps->tcps_mss_max_ipv6;
 190 
 191         if (mss < tcps->tcps_mss_min)
 192                 mss = tcps->tcps_mss_min;


 531             IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
 532 
 533         /*
 534          * Set MSS to the smaller one of both ends of the connection.
 535          * We should not have called tcp_mss_set() before, but our
 536          * side of the MSS should have been set to a proper value
 537          * by tcp_set_destination().  tcp_mss_set() will also set up the
 538          * STREAM head parameters properly.
 539          *
 540          * If we have a larger-than-16-bit window but the other side
 541          * didn't want to do window scale, tcp_rwnd_set() will take
 542          * care of that.
 543          */
 544         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 545 
 546         /*
 547          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 548          * updated properly.
 549          */
 550         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);



 551 }
 552 
 553 /*
 554  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  * is filled, return as much as we can.  The message passed in may be
 556  * multi-part, chained using b_cont.  "start" is the starting sequence
 557  * number for this piece.
 558  */
 559 static mblk_t *
 560 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561 {
 562         uint32_t        end;
 563         mblk_t          *mp1;
 564         mblk_t          *mp2;
 565         mblk_t          *next_mp;
 566         uint32_t        u1;
 567         tcp_stack_t     *tcps = tcp->tcp_tcps;
 568 
 569 
 570         /* Walk through all the new pieces. */
 571         do {
 572                 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 573                     (uintptr_t)INT_MAX);
 574                 end = start + (int)(mp->b_wptr - mp->b_rptr);
 575                 next_mp = mp->b_cont;
 576                 if (start == end) {
 577                         /* Empty.  Blast it. */
 578                         freeb(mp);
 579                         continue;
 580                 }

 581                 mp->b_cont = NULL;
 582                 TCP_REASS_SET_SEQ(mp, start);
 583                 TCP_REASS_SET_END(mp, end);
 584                 mp1 = tcp->tcp_reass_tail;
 585                 if (!mp1) {
 586                         tcp->tcp_reass_tail = mp;






 587                         tcp->tcp_reass_head = mp;
 588                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 589                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 590                             end - start);
 591                         continue;
 592                 }
 593                 /* New stuff completely beyond tail? */
 594                 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 595                         /* Link it on end. */
 596                         mp1->b_cont = mp;
 597                         tcp->tcp_reass_tail = mp;
 598                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 599                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 600                             end - start);

 601                         continue;
 602                 }
 603                 mp1 = tcp->tcp_reass_head;
 604                 u1 = TCP_REASS_SEQ(mp1);
 605                 /* New stuff at the front? */
 606                 if (SEQ_LT(start, u1)) {
 607                         /* Yes... Check for overlap. */
 608                         mp->b_cont = mp1;
 609                         tcp->tcp_reass_head = mp;
 610                         tcp_reass_elim_overlap(tcp, mp);
 611                         continue;
 612                 }
 613                 /*
 614                  * The new piece fits somewhere between the head and tail.
 615                  * We find our slot, where mp1 precedes us and mp2 trails.
 616                  */
 617                 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
 618                         u1 = TCP_REASS_SEQ(mp2);
 619                         if (SEQ_LEQ(start, u1))
 620                                 break;


2307 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2308 {
2309         int32_t         bytes_acked;
2310         int32_t         gap;
2311         mblk_t          *mp1;
2312         uint_t          flags;
2313         uint32_t        new_swnd = 0;
2314         uchar_t         *iphdr;
2315         uchar_t         *rptr;
2316         int32_t         rgap;
2317         uint32_t        seg_ack;
2318         int             seg_len;
2319         uint_t          ip_hdr_len;
2320         uint32_t        seg_seq;
2321         tcpha_t         *tcpha;
2322         int             urp;
2323         tcp_opt_t       tcpopt;
2324         ip_pkt_t        ipp;
2325         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2326         uint32_t        cwnd;
2327         uint32_t        add;
2328         int             npkt;
2329         int             mss;
2330         conn_t          *connp = (conn_t *)arg;
2331         squeue_t        *sqp = (squeue_t *)arg2;
2332         tcp_t           *tcp = connp->conn_tcp;
2333         tcp_stack_t     *tcps = tcp->tcp_tcps;
2334         sock_upcalls_t  *sockupcalls;
2335 
2336         /*
2337          * RST from fused tcp loopback peer should trigger an unfuse.
2338          */
2339         if (tcp->tcp_fused) {
2340                 TCP_STAT(tcps, tcp_fusion_aborted);
2341                 tcp_unfuse(tcp);
2342         }
2343 
2344         iphdr = mp->b_rptr;
2345         rptr = mp->b_rptr;
2346         ASSERT(OK_32PTR(rptr));
2347 
2348         ip_hdr_len = ira->ira_ip_hdr_length;


2397 
2398         if (tcp->tcp_state == TCPS_TIME_WAIT) {
2399                 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2400                     seg_len, tcpha, ira);
2401                 return;
2402         }
2403 
2404         if (sqp != NULL) {
2405                 /*
2406                  * This is the correct place to update tcp_last_recv_time. Note
2407                  * that it is also updated for tcp structure that belongs to
2408                  * global and listener queues which do not really need updating.
2409                  * But that should not cause any harm.  And it is updated for
2410                  * all kinds of incoming segments, not only for data segments.
2411                  */
2412                 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413         }
2414 
2415         flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416 
2417         BUMP_LOCAL(tcp->tcp_ibsegs);
2418         DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419 
2420         if ((flags & TH_URG) && sqp != NULL) {
2421                 /*
2422                  * TCP can't handle urgent pointers that arrive before
2423                  * the connection has been accept()ed since it can't
2424                  * buffer OOB data.  Discard segment if this happens.
2425                  *
2426                  * We can't just rely on a non-null tcp_listener to indicate
2427                  * that the accept() has completed since unlinking of the
2428                  * eager and completion of the accept are not atomic.
 2429                  * tcp_detached, when it is not set (B_FALSE), indicates
2430                  * that the accept() has completed.
2431                  *
2432                  * Nor can it reassemble urgent pointers, so discard
2433                  * if it's not the next segment expected.
2434                  *
2435                  * Otherwise, collapse chain into one mblk (discard if
2436                  * that fails).  This makes sure the headers, retransmitted
2437                  * data, and new data all are in the same mblk.


2584                         tcp->tcp_suna = tcp->tcp_iss + 1;
2585                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2586 
2587                         /*
2588                          * If SYN was retransmitted, need to reset all
2589                          * retransmission info.  This is because this
2590                          * segment will be treated as a dup ACK.
2591                          */
2592                         if (tcp->tcp_rexmit) {
2593                                 tcp->tcp_rexmit = B_FALSE;
2594                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2595                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2596                                 tcp->tcp_ms_we_have_waited = 0;
2597 
2598                                 /*
2599                                  * Set tcp_cwnd back to 1 MSS, per
2600                                  * recommendation from
2601                                  * draft-floyd-incr-init-win-01.txt,
2602                                  * Increasing TCP's Initial Window.
2603                                  */



2604                                 tcp->tcp_cwnd = tcp->tcp_mss;
2605                         }
2606 
2607                         tcp->tcp_swl1 = seg_seq;
2608                         tcp->tcp_swl2 = seg_ack;
2609 
2610                         new_swnd = ntohs(tcpha->tha_win);
2611                         tcp->tcp_swnd = new_swnd;
2612                         if (new_swnd > tcp->tcp_max_swnd)
2613                                 tcp->tcp_max_swnd = new_swnd;
2614 
2615                         /*
2616                          * Always send the three-way handshake ack immediately
2617                          * in order to make the connection complete as soon as
2618                          * possible on the accepting host.
2619                          */
2620                         flags |= TH_ACK_NEEDED;
2621 
2622                         /*
2623                          * Trace connect-established here.


2642                         if (tcp->tcp_loopback) {
2643                                 mblk_t *ack_mp;
2644 
2645                                 ASSERT(!tcp->tcp_unfusable);
2646                                 ASSERT(mp1 != NULL);
2647                                 /*
2648                                  * For loopback, we always get a pure SYN-ACK
2649                                  * and only need to send back the final ACK
2650                                  * with no data (this is because the other
2651                                  * tcp is ours and we don't do T/TCP).  This
2652                                  * final ACK triggers the passive side to
2653                                  * perform fusion in ESTABLISHED state.
2654                                  */
2655                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656                                         if (tcp->tcp_ack_tid != 0) {
2657                                                 (void) TCP_TIMER_CANCEL(tcp,
2658                                                     tcp->tcp_ack_tid);
2659                                                 tcp->tcp_ack_tid = 0;
2660                                         }
2661                                         tcp_send_data(tcp, ack_mp);
2662                                         BUMP_LOCAL(tcp->tcp_obsegs);
2663                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2664 
2665                                         if (!IPCL_IS_NONSTR(connp)) {
2666                                                 /* Send up T_CONN_CON */
2667                                                 if (ira->ira_cred != NULL) {
2668                                                         mblk_setcred(mp1,
2669                                                             ira->ira_cred,
2670                                                             ira->ira_cpid);
2671                                                 }
2672                                                 putnext(connp->conn_rq, mp1);
2673                                         } else {
2674                                                 (*sockupcalls->su_connected)
2675                                                     (connp->conn_upper_handle,
2676                                                     tcp->tcp_connid,
2677                                                     ira->ira_cred,
2678                                                     ira->ira_cpid);
2679                                                 freemsg(mp1);
2680                                         }
2681 
2682                                         freemsg(mp);


3031                         mp2 = mp;
3032                         mp = mp->b_cont;
3033                         freeb(mp2);
3034                 } while (gap < 0);
3035                 /*
3036                  * If the urgent data has already been acknowledged, we
3037                  * should ignore TH_URG below
3038                  */
3039                 if (urp < 0)
3040                         flags &= ~TH_URG;
3041         }
3042         /*
3043          * rgap is the amount of stuff received out of window.  A negative
3044          * value is the amount out of window.
3045          */
3046         if (rgap < 0) {
3047                 mblk_t  *mp2;
3048 
3049                 if (tcp->tcp_rwnd == 0) {
3050                         TCPS_BUMP_MIB(tcps, tcpInWinProbe);

3051                 } else {
3052                         TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3053                         TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3054                 }
3055 
3056                 /*
3057                  * seg_len does not include the FIN, so if more than
3058                  * just the FIN is out of window, we act like we don't
3059                  * see it.  (If just the FIN is out of window, rgap
3060                  * will be zero and we will go ahead and acknowledge
3061                  * the FIN.)
3062                  */
3063                 flags &= ~TH_FIN;
3064 
3065                 /* Fix seg_len and make sure there is something left. */
3066                 seg_len += rgap;
3067                 if (seg_len <= 0) {
3068                         /*
3069                          * Resets are only valid if they lie within our offered
3070                          * window.  If the RST bit is set, we just ignore this


3280                                  *
3281                                  * But TCP should not perform fast retransmit
3282                                  * because of the ack number.  TCP uses
3283                                  * seg_len == 0 to determine if it is a pure
3284                                  * ACK.  And this is not a pure ACK.
3285                                  */
3286                                 seg_len = 0;
3287                                 ofo_seg = B_TRUE;
3288 
3289                                 if (tcps->tcps_reass_timeout != 0 &&
3290                                     tcp->tcp_reass_tid == 0) {
3291                                         tcp->tcp_reass_tid = TCP_TIMER(tcp,
3292                                             tcp_reass_timer,
3293                                             tcps->tcps_reass_timeout);
3294                                 }
3295                         }
3296                 }
3297         } else if (seg_len > 0) {
3298                 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3299                 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);



3300                 /*
3301                  * If an out of order FIN was received before, and the seq
3302                  * num and len of the new segment match that of the FIN,
3303                  * put the FIN flag back in.
3304                  */
3305                 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3306                     seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3307                         flags |= TH_FIN;
3308                         tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3309                 }
3310         }
3311         if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3312         if (flags & TH_RST) {
3313                 freemsg(mp);
3314                 switch (tcp->tcp_state) {
3315                 case TCPS_SYN_RCVD:
3316                         (void) tcp_clean_death(tcp, ECONNREFUSED);
3317                         break;
3318                 case TCPS_ESTABLISHED:
3319                 case TCPS_FIN_WAIT_1:


3345                     SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3346                 freemsg(mp);
3347                 /*
3348                  * If the ACK flag is not set, just use our snxt as the
3349                  * seq number of the RST segment.
3350                  */
3351                 if (!(flags & TH_ACK)) {
3352                         seg_ack = tcp->tcp_snxt;
3353                 }
3354                 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3355                     TH_RST|TH_ACK);
3356                 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3357                 (void) tcp_clean_death(tcp, ECONNRESET);
3358                 return;
3359         }
3360         /*
3361          * urp could be -1 when the urp field in the packet is 0
3362          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3363          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3364          */
3365         if (flags & TH_URG && urp >= 0) {
3366                 if (!tcp->tcp_urp_last_valid ||
3367                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3368                         /*
 3369                          * Non-STREAMS sockets handle the urgent data a little
3370                          * differently from STREAMS based sockets. There is no
3371                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3372                          * flags to keep SIOCATMARK happy. Instead a
3373                          * su_signal_oob upcall is made to update the mark.
3374                          * Neither is a T_EXDATA_IND mblk needed to be
3375                          * prepended to the urgent data. The urgent data is
3376                          * delivered using the su_recv upcall, where we set
3377                          * the MSG_OOB flag to indicate that it is urg data.
3378                          *
3379                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3380                          * are used by non-STREAMS sockets.
3381                          */
3382                         if (IPCL_IS_NONSTR(connp)) {
3383                                 if (!TCP_IS_DETACHED(tcp)) {
3384                                         (*sockupcalls->su_signal_oob)
3385                                             (connp->conn_upper_handle, urp);


3802                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3803                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3804                 }
3805                 TCPS_CONN_INC(tcps);
3806 
3807                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3808                 bytes_acked--;
3809                 /* SYN was acked - making progress */
3810                 tcp->tcp_ip_forward_progress = B_TRUE;
3811 
3812                 /*
3813                  * If SYN was retransmitted, need to reset all
3814                  * retransmission info as this segment will be
3815                  * treated as a dup ACK.
3816                  */
3817                 if (tcp->tcp_rexmit) {
3818                         tcp->tcp_rexmit = B_FALSE;
3819                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3820                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
3821                         tcp->tcp_ms_we_have_waited = 0;



3822                         tcp->tcp_cwnd = mss;
3823                 }
3824 
3825                 /*
3826                  * We set the send window to zero here.
3827                  * This is needed if there is data to be
3828                  * processed already on the queue.
 3829                  * Later (at swnd_update label), when the
 3830                  * "new_swnd > tcp_swnd" condition is satisfied,
 3831                  * the XMIT_NEEDED flag is set in the current
3832                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3833                  * called if there is already data on queue in
3834                  * this state.
3835                  */
3836                 tcp->tcp_swnd = 0;
3837 
3838                 if (new_swnd > tcp->tcp_max_swnd)
3839                         tcp->tcp_max_swnd = new_swnd;
3840                 tcp->tcp_swl1 = seg_seq;
3841                 tcp->tcp_swl2 = seg_ack;


3845                 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3846                     connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3847                     int32_t, TCPS_SYN_RCVD);
3848 
3849                 /* Fuse when both sides are in ESTABLISHED state */
3850                 if (tcp->tcp_loopback && do_tcp_fusion)
3851                         tcp_fuse(tcp, iphdr, tcpha);
3852 
3853         }
3854         /* This code follows 4.4BSD-Lite2 mostly. */
3855         if (bytes_acked < 0)
3856                 goto est;
3857 
3858         /*
3859          * If TCP is ECN capable and the congestion experience bit is
3860          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3861          * done once per window (or more loosely, per RTT).
3862          */
3863         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
3864                 tcp->tcp_cwr = B_FALSE;
3865         if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
3866                 if (!tcp->tcp_cwr) {
3867                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
3868                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
3869                         tcp->tcp_cwnd = npkt * mss;
3870                         /*
3871                          * If the cwnd is 0, use the timer to clock out
3872                          * new segments.  This is required by the ECN spec.
3873                          */
3874                         if (npkt == 0) {
3875                                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3876                                 /*
3877                                  * This makes sure that when the ACK comes
3878                                  * back, we will increase tcp_cwnd by 1 MSS.
3879                                  */
3880                                 tcp->tcp_cwnd_cnt = 0;
3881                         }
3882                         tcp->tcp_cwr = B_TRUE;
3883                         /*
3884                          * This marks the end of the current window of in
3885                          * flight data.  That is why we don't use
3886                          * tcp_suna + tcp_swnd.  Only data in flight can
3887                          * provide ECN info.
3888                          */
3889                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3890                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3891                 }
3892         }
3893 
3894         mp1 = tcp->tcp_xmit_head;
3895         if (bytes_acked == 0) {
3896                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
3897                         int dupack_cnt;
3898 
3899                         TCPS_BUMP_MIB(tcps, tcpInDupAck);
3900                         /*
3901                          * Fast retransmit.  When we have seen exactly three
3902                          * identical ACKs while we have unacked data
3903                          * outstanding we take it as a hint that our peer
3904                          * dropped something.
3905                          *
3906                          * If TCP is retransmitting, don't do fast retransmit.
3907                          */
3908                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
3909                             ! tcp->tcp_rexmit) {
3910                                 /* Do Limited Transmit */
3911                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
3912                                     tcps->tcps_dupack_fast_retransmit) {


3913                                         /*
3914                                          * RFC 3042
3915                                          *
3916                                          * What we need to do is temporarily
3917                                          * increase tcp_cwnd so that new
3918                                          * data can be sent if it is allowed
3919                                          * by the receive window (tcp_rwnd).
3920                                          * tcp_wput_data() will take care of
3921                                          * the rest.
3922                                          *
3923                                          * If the connection is SACK capable,
3924                                          * only do limited xmit when there
3925                                          * is SACK info.
3926                                          *
3927                                          * Note how tcp_cwnd is incremented.
3928                                          * The first dup ACK will increase
3929                                          * it by 1 MSS.  The second dup ACK
3930                                          * will increase it by 2 MSS.  This
3931                                          * means that only 1 new segment will
3932                                          * be sent for each dup ACK.


3939                                                     (tcp->tcp_dupack_cnt - 1);
3940                                                 flags |= TH_LIMIT_XMIT;
3941                                         }
3942                                 } else if (dupack_cnt ==
3943                                     tcps->tcps_dupack_fast_retransmit) {
3944 
3945                                 /*
3946                                  * If we have reduced tcp_ssthresh
3947                                  * because of ECN, do not reduce it again
3948                                  * unless it is already one window of data
3949                                  * away.  After one window of data, tcp_cwr
3950                                  * should then be cleared.  Note that
3951                                  * for non ECN capable connection, tcp_cwr
3952                                  * should always be false.
3953                                  *
3954                                  * Adjust cwnd since the duplicate
3955                                  * ack indicates that a packet was
3956                                  * dropped (due to congestion.)
3957                                  */
3958                                 if (!tcp->tcp_cwr) {
3959                                         npkt = ((tcp->tcp_snxt -
3960                                             tcp->tcp_suna) >> 1) / mss;
3961                                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
3962                                             mss;
3963                                         tcp->tcp_cwnd = (npkt +
3964                                             tcp->tcp_dupack_cnt) * mss;
3965                                 }
3966                                 if (tcp->tcp_ecn_ok) {
3967                                         tcp->tcp_cwr = B_TRUE;
3968                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
3969                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
3970                                 }
3971 
3972                                 /*
3973                                  * We do Hoe's algorithm.  Refer to her
3974                                  * paper "Improving the Start-up Behavior
3975                                  * of a Congestion Control Scheme for TCP,"
3976                                  * appeared in SIGCOMM'96.
3977                                  *
3978                                  * Save highest seq no we have sent so far.
3979                                  * Be careful about the invisible FIN byte.
3980                                  */
3981                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3982                                     (tcp->tcp_unsent == 0)) {
3983                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
3984                                 } else {


4006                                                     tcp->tcp_fack;
4007                                                 tcp->tcp_sack_snxt = seg_ack;
4008                                                 flags |= TH_NEED_SACK_REXMIT;
4009                                         } else {
4010                                                 /*
4011                                                  * Always initialize tcp_pipe
4012                                                  * even though we don't have
4013                                                  * any SACK info.  If later
4014                                                  * we get SACK info and
4015                                                  * tcp_pipe is not initialized,
4016                                                  * funny things will happen.
4017                                                  */
4018                                                 tcp->tcp_pipe =
4019                                                     tcp->tcp_cwnd_ssthresh;
4020                                         }
4021                                 } else {
4022                                         flags |= TH_REXMIT_NEEDED;
4023                                 } /* tcp_snd_sack_ok */
4024 
4025                                 } else {


4026                                         /*
4027                                          * Here we perform congestion
4028                                          * avoidance, but NOT slow start.
4029                                          * This is known as the Fast
4030                                          * Recovery Algorithm.
4031                                          */
4032                                         if (tcp->tcp_snd_sack_ok &&
4033                                             tcp->tcp_notsack_list != NULL) {
4034                                                 flags |= TH_NEED_SACK_REXMIT;
4035                                                 tcp->tcp_pipe -= mss;
4036                                                 if (tcp->tcp_pipe < 0)
4037                                                         tcp->tcp_pipe = 0;
4038                                         } else {
4039                                         /*
4040                                          * We know that one more packet has
4041                                          * left the pipe thus we can update
4042                                          * cwnd.
4043                                          */
4044                                         cwnd = tcp->tcp_cwnd + mss;
4045                                         if (cwnd > tcp->tcp_cwnd_max)
4046                                                 cwnd = tcp->tcp_cwnd_max;




4047                                         tcp->tcp_cwnd = cwnd;
4048                                         if (tcp->tcp_unsent > 0)
4049                                                 flags |= TH_XMIT_NEEDED;
4050                                         }
4051                                 }
4052                         }
4053                 } else if (tcp->tcp_zero_win_probe) {
4054                         /*
4055                          * If the window has opened, need to arrange
4056                          * to send additional data.
4057                          */
4058                         if (new_swnd != 0) {
4059                                 /* tcp_suna != tcp_snxt */
4060                                 /* Packet contains a window update */
4061                                 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4062                                 tcp->tcp_zero_win_probe = 0;
4063                                 tcp->tcp_timer_backoff = 0;
4064                                 tcp->tcp_ms_we_have_waited = 0;
4065 
4066                                 /*


4129                          * greater than 0, check if the number of such
4130                          * bogus ACks is greater than that count.  If yes,
4131                          * don't send back any ACK.  This prevents TCP from
4132                          * getting into an ACK storm if somehow an attacker
4133                          * successfully spoofs an acceptable segment to our
4134                          * peer.  If this continues (count > 2 X threshold),
4135                          * we should abort this connection.
4136                          */
4137                         if (tcp_drop_ack_unsent_cnt > 0 &&
4138                             ++tcp->tcp_in_ack_unsent >
4139                             tcp_drop_ack_unsent_cnt) {
4140                                 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4141                                 if (tcp->tcp_in_ack_unsent > 2 *
4142                                     tcp_drop_ack_unsent_cnt) {
4143                                         (void) tcp_clean_death(tcp, EPROTO);
4144                                 }
4145                                 return;
4146                         }
4147                         mp = tcp_ack_mp(tcp);
4148                         if (mp != NULL) {
4149                                 BUMP_LOCAL(tcp->tcp_obsegs);
4150                                 TCPS_BUMP_MIB(tcps, tcpOutAck);
4151                                 tcp_send_data(tcp, mp);
4152                         }
4153                         return;
4154                 }
4155         } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4156             tcp->tcp_snxt_shrunk)) {
4157                         tcp->tcp_is_wnd_shrnk = B_FALSE;
4158         }
4159 
4160         /*
4161          * TCP gets a new ACK, update the notsack'ed list to delete those
4162          * blocks that are covered by this ACK.
4163          */
4164         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4165                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4166                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4167         }
4168 
4169         /*
4170          * If we got an ACK after fast retransmit, check to see
4171          * if it is a partial ACK.  If it is not and the congestion
4172          * window was inflated to account for the other side's
4173          * cached packets, retract it.  If it is, do Hoe's algorithm.
4174          */
4175         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4176                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4177                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4178                         tcp->tcp_dupack_cnt = 0;
4179                         /*
4180                          * Restore the orig tcp_cwnd_ssthresh after
4181                          * fast retransmit phase.
4182                          */
4183                         if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
4184                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
4185                         }
4186                         tcp->tcp_rexmit_max = seg_ack;
4187                         tcp->tcp_cwnd_cnt = 0;
4188 
4189                         /*
4190                          * Remove all notsack info to avoid confusion with
4191                          * the next fast retransmit/recovery phase.
4192                          */
4193                         if (tcp->tcp_snd_sack_ok) {
4194                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4195                                     tcp);
4196                         }
4197                 } else {
4198                         if (tcp->tcp_snd_sack_ok &&
4199                             tcp->tcp_notsack_list != NULL) {
4200                                 flags |= TH_NEED_SACK_REXMIT;
4201                                 tcp->tcp_pipe -= mss;
4202                                 if (tcp->tcp_pipe < 0)
4203                                         tcp->tcp_pipe = 0;
4204                         } else {
4205                                 /*
4206                                  * Hoe's algorithm:
4207                                  *
4208                                  * Retransmit the unack'ed segment and
4209                                  * restart fast recovery.  Note that we
4210                                  * need to scale back tcp_cwnd to the
4211                                  * original value when we started fast
4212                                  * recovery.  This is to prevent overly
4213                                  * aggressive behaviour in sending new
4214                                  * segments.
4215                                  */
4216                                 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
4217                                     tcps->tcps_dupack_fast_retransmit * mss;




4218                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4219                                 flags |= TH_REXMIT_NEEDED;
4220                         }
4221                 }
4222         } else {
4223                 tcp->tcp_dupack_cnt = 0;
4224                 if (tcp->tcp_rexmit) {
4225                         /*
4226                          * TCP is retransmitting.  If the ACK ack's all
4227                          * outstanding data, update tcp_rexmit_max and
4228                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4229                          * to the correct value.
4230                          *
4231                          * Note that SEQ_LEQ() is used.  This is to avoid
4232                          * unnecessary fast retransmit caused by dup ACKs
4233                          * received when TCP does slow start retransmission
4234                          * after a time out.  During this phase, TCP may
4235                          * send out segments which are already received.
4236                          * This causes dup ACKs to be sent back.
4237                          */


4258                 tcp->tcp_timer_backoff = 0;
4259         }
4260 
4261         /*
4262          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4263          * Note that it cannot be the SYN being ack'ed.  The code flow
4264          * will not reach here.
4265          */
4266         if (mp1 == NULL) {
4267                 goto fin_acked;
4268         }
4269 
4270         /*
4271          * Update the congestion window.
4272          *
4273          * If TCP is not ECN capable or TCP is ECN capable but the
4274          * congestion experience bit is not set, increase the tcp_cwnd as
4275          * usual.
4276          */
4277         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4278                 cwnd = tcp->tcp_cwnd;
4279                 add = mss;
4280 
4281                 if (cwnd >= tcp->tcp_cwnd_ssthresh) {
4282                         /*
4283                          * This is to prevent an increase of less than 1 MSS of
4284                          * tcp_cwnd.  With partial increase, tcp_wput_data()
4285                          * may send out tinygrams in order to preserve mblk
4286                          * boundaries.
4287                          *
4288                          * By initializing tcp_cwnd_cnt to new tcp_cwnd and
4289                          * decrementing it by 1 MSS for every ACK, tcp_cwnd is
4290                          * increased by 1 MSS for every RTT.
4291                          */
4292                         if (tcp->tcp_cwnd_cnt <= 0) {
4293                                 tcp->tcp_cwnd_cnt = cwnd + add;
4294                         } else {
4295                                 tcp->tcp_cwnd_cnt -= add;
4296                                 add = 0;
4297                         }

4298                 }
4299                 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
4300         }
4301 
4302         /* See if the latest urgent data has been acknowledged */
4303         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4304             SEQ_GT(seg_ack, tcp->tcp_urg))
4305                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4306 
4307         /* Can we update the RTT estimates? */
4308         if (tcp->tcp_snd_ts_ok) {
4309                 /* Ignore zero timestamp echo-reply. */
4310                 if (tcpopt.tcp_opt_ts_ecr != 0) {
4311                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4312                             (int32_t)tcpopt.tcp_opt_ts_ecr);
4313                 }
4314 
4315                 /* If needed, restart the timer. */
4316                 if (tcp->tcp_set_timer == 1) {
4317                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4318                         tcp->tcp_set_timer = 0;
4319                 }
4320                 /*
4321                  * Update tcp_csuna in case the other side stops sending
4322                  * us timestamps.






4323                  */
4324                 tcp->tcp_csuna = tcp->tcp_snxt;
4325         } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4326                 /*
4327                  * An ACK sequence we haven't seen before, so get the RTT
4328                  * and update the RTO. But first check if the timestamp is
4329                  * valid to use.
4330                  */
4331                 if ((mp1->b_next != NULL) &&
4332                     SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))

4333                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4334                             (int32_t)(intptr_t)mp1->b_prev);
4335                 else




4336                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);

4337 
4338                 /* Remember the last sequence to be ACKed */
4339                 tcp->tcp_csuna = seg_ack;
4340                 if (tcp->tcp_set_timer == 1) {
4341                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4342                         tcp->tcp_set_timer = 0;
4343                 }
4344         } else {
4345                 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4346         }
4347 
4348         /* Eat acknowledged bytes off the xmit queue. */
4349         for (;;) {
4350                 mblk_t  *mp2;
4351                 uchar_t *wptr;
4352 
4353                 wptr = mp1->b_wptr;
4354                 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4355                 bytes_acked -= (int)(wptr - mp1->b_rptr);
4356                 if (bytes_acked < 0) {
4357                         mp1->b_rptr = wptr + bytes_acked;
4358                         /*
4359                          * Set a new timestamp if all the bytes timed by the
4360                          * old timestamp have been ack'ed.
4361                          */
4362                         if (SEQ_GT(seg_ack,
4363                             (uint32_t)(uintptr_t)(mp1->b_next))) {

4364                                 mp1->b_prev =
4365                                     (mblk_t *)(uintptr_t)LBOLT_FASTPATH;




4366                                 mp1->b_next = NULL;
4367                         }
4368                         break;
4369                 }
4370                 mp1->b_next = NULL;
4371                 mp1->b_prev = NULL;
4372                 mp2 = mp1;
4373                 mp1 = mp1->b_cont;
4374 
4375                 /*
4376                  * This notification is required for some zero-copy
4377                  * clients to maintain a copy semantic. After the data
4378                  * is ack'ed, client is safe to modify or reuse the buffer.
4379                  */
4380                 if (tcp->tcp_snd_zcopy_aware &&
4381                     (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4382                         tcp_zcopy_notify(tcp);
4383                 freeb(mp2);
4384                 if (bytes_acked == 0) {
4385                         if (mp1 == NULL) {


4821             TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
4822             TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4823                 goto done;
4824 
4825         /* Any transmit work to do and a non-zero window? */
4826         if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4827             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4828                 if (flags & TH_REXMIT_NEEDED) {
4829                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4830 
4831                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4832                         if (snd_size > mss)
4833                                 snd_size = mss;
4834                         if (snd_size > tcp->tcp_swnd)
4835                                 snd_size = tcp->tcp_swnd;
4836                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4837                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4838                             B_TRUE);
4839 
4840                         if (mp1 != NULL) {

4841                                 tcp->tcp_xmit_head->b_prev =
4842                                     (mblk_t *)LBOLT_FASTPATH;




4843                                 tcp->tcp_csuna = tcp->tcp_snxt;
4844                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4845                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4846                                     snd_size);


4847                                 tcp_send_data(tcp, mp1);
4848                         }
4849                 }
4850                 if (flags & TH_NEED_SACK_REXMIT) {
4851                         tcp_sack_rexmit(tcp, &flags);
4852                 }
4853                 /*
4854                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4855                  * out new segment.  Note that tcp_rexmit should not be
4856                  * set, otherwise TH_LIMIT_XMIT should not be set.
4857                  */
4858                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4859                         if (!tcp->tcp_rexmit) {
4860                                 tcp_wput_data(tcp, NULL, B_FALSE);
4861                         } else {
4862                                 tcp_ss_rexmit(tcp);
4863                         }
4864                 }
4865                 /*
4866                  * Adjust tcp_cwnd back to normal value after sending
4867                  * new data segments.
4868                  */
4869                 if (flags & TH_LIMIT_XMIT) {
4870                         tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4871                         /*
4872                          * This will restart the timer.  Restarting the
4873                          * timer is used to avoid a timeout before the
4874                          * limited transmitted segment's ACK gets back.
4875                          */
4876                         if (tcp->tcp_xmit_head != NULL)

4877                                 tcp->tcp_xmit_head->b_prev =
4878                                     (mblk_t *)LBOLT_FASTPATH;




4879                 }

4880 
4881                 /* Anything more to do? */
4882                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4883                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4884                         goto done;
4885         }
4886 ack_check:
4887         if (flags & TH_SEND_URP_MARK) {
4888                 ASSERT(tcp->tcp_urp_mark_mp);
4889                 ASSERT(!IPCL_IS_NONSTR(connp));
4890                 /*
4891                  * Send up any queued data and then send the mark message
4892                  */
4893                 if (tcp->tcp_rcv_list != NULL) {
4894                         flags |= tcp_rcv_drain(tcp);
4895 
4896                 }
4897                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
4898                 mp1 = tcp->tcp_urp_mark_mp;
4899                 tcp->tcp_urp_mark_mp = NULL;


4901                         tcp_setcred_data(mp1, ira);
4902 
4903                 putnext(connp->conn_rq, mp1);
4904 #ifdef DEBUG
4905                 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4906                     "tcp_rput: sending zero-length %s %s",
4907                     ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
4908                     "MSGNOTMARKNEXT"),
4909                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
4910 #endif /* DEBUG */
4911                 flags &= ~TH_SEND_URP_MARK;
4912         }
4913         if (flags & TH_ACK_NEEDED) {
4914                 /*
4915                  * Time to send an ack for some reason.
4916                  */
4917                 mp1 = tcp_ack_mp(tcp);
4918 
4919                 if (mp1 != NULL) {
4920                         tcp_send_data(tcp, mp1);
4921                         BUMP_LOCAL(tcp->tcp_obsegs);
4922                         TCPS_BUMP_MIB(tcps, tcpOutAck);
4923                 }
4924                 if (tcp->tcp_ack_tid != 0) {
4925                         (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4926                         tcp->tcp_ack_tid = 0;
4927                 }
4928         }
4929         if (flags & TH_ACK_TIMER_NEEDED) {
4930                 /*
4931                  * Arrange for deferred ACK or push wait timeout.
4932                  * Start timer if it is not already running.
4933                  */
4934                 if (tcp->tcp_ack_tid == 0) {
4935                         tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
4936                             tcp->tcp_localnet ?
4937                             tcps->tcps_local_dack_interval :
4938                             tcps->tcps_deferred_ack_interval);
4939                 }
4940         }
4941         if (flags & TH_ORDREL_NEEDED) {


5194         }
5195         if (addflag.crb_ipv6_recvdstopts) {
5196                 toh = (struct T_opthdr *)optptr;
5197                 toh->level = IPPROTO_IPV6;
5198                 toh->name = IPV6_DSTOPTS;
5199                 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5200                 toh->status = 0;
5201                 optptr += sizeof (*toh);
5202                 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5203                 optptr += ipp->ipp_dstoptslen;
5204                 ASSERT(OK_32PTR(optptr));
5205                 /* Save as last value */
5206                 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5207                     (ipp->ipp_fields & IPPF_DSTOPTS),
5208                     ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5209         }
5210         ASSERT(optptr == mp->b_wptr);
5211         return (mp);
5212 }
5213 
5214 /* The minimum of smoothed mean deviation in RTO calculation. */
5215 #define TCP_SD_MIN      400
5216 
5217 /*
5218  * Set RTO for this connection.  The formula is from Jacobson and Karels'
5219  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
5220  * are the same as those in Appendix A.2 of that paper.

5221  *
5222  * m = new measurement
5223  * sa = smoothed RTT average (8 * average estimates).
5224  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5225  */
5226 static void
5227 tcp_set_rto(tcp_t *tcp, clock_t rtt)
5228 {
5229         long m = TICK_TO_MSEC(rtt);
5230         clock_t sa = tcp->tcp_rtt_sa;
5231         clock_t sv = tcp->tcp_rtt_sd;
5232         clock_t rto;
5233         tcp_stack_t     *tcps = tcp->tcp_tcps;
5234 
5235         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5236         tcp->tcp_rtt_update++;


5237 
5238         /*
5238          * A non-zero tcp_rtt_sa means an estimate already exists and we
5238          * update it; sa == 0 denotes (re)initialization of the estimators.
5238          */
5239         if (sa != 0) {
5240                 /*
5241                  * Update average estimator:
5242                  *      new rtt = 7/8 old rtt + 1/8 Error










5243                  */
5244 
5245                 /* m is now Error in estimate. */
5246                 m -= sa >> 3;
5247                 if ((sa += m) <= 0) {
5248                         /*
5249                          * Don't allow the smoothed average to be negative.
5250                          * We use 0 to denote reinitialization of the
5251                          * variables.
5252                          */
5253                         sa = 1;
5254                 }
5255 
5256                 /*
5257                  * Update deviation estimator:
5258                  *      new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)




5259                  */
5260                 if (m < 0)
5261                         m = -m;
5262                 m -= sv >> 2;
5263                 sv += m;
5264         } else {
5265                 /*
5266                  * This follows BSD's implementation.  So the reinitialized
5267                  * RTO is 3 * m.  We cannot go less than 2 because if the
5268                  * link is bandwidth dominated, doubling the window size
5269                  * during slow start means doubling the RTT.  We want to be
5270                  * more conservative when we reinitialize our estimates.  3
5271                  * is just a convenient number.
5272                  */
5273                 sa = m << 3;
5274                 sv = m << 1;
5275         }
5276         if (sv < TCP_SD_MIN) {
5277                 /*
5278                  * We do not know whether sa captures the delayed ACK
5279                  * effect as in a long train of segments, a receiver
5280                  * does not delay its ACKs.  So set the minimum of sv
5281                  * to be TCP_SD_MIN, which defaults to 400 ms, twice
5282                  * of BSD DATO.  That means the minimum of mean
5283                  * deviation is 100 ms.
5284                  *
5285                  */
5286                 sv = TCP_SD_MIN;
5287         }
5288         tcp->tcp_rtt_sa = sa;
5289         tcp->tcp_rtt_sd = sv;
5290         /*
5291          * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
5292          *
5293          * Add tcp_rexmit_interval_extra in case of extreme environment
5294          * where the algorithm fails to work.  The default value of
5295          * tcp_rexmit_interval_extra should be 0.
5296          *
5297          * As we use a finer grained clock than BSD and update
5298          * RTO for every ACK, add in another .25 of RTT to the
5299          * deviation of RTO to accommodate burstiness of 1/4 of
5300          * window size.
5301          */
5302         rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
5303 
5304         TCP_SET_RTO(tcp, rto);
5305 
5306         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5307         tcp->tcp_timer_backoff = 0;
5308 }
5309 
5310 /*
5311  * On a labeled system we have some protocols above TCP, such as RPC, which
5312  * appear to assume that every mblk in a chain has a db_credp.
5313  */
5314 static void
5315 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5316 {
5317         mblk_t *mb;
5318 
5319         ASSERT(is_system_labeled());
5320         ASSERT(ira->ira_cred != NULL);
5321 
5322         /* Stamp the receive credential onto every mblk in the chain. */
5323         for (mb = mp; mb != NULL; mb = mb->b_cont)
5324                 mblk_setcred(mb, ira->ira_cred, NOPID);
5325 }


5609                 default:
5610                         break;
5611                 }
5612                 break;
5613         case ICMP_SOURCE_QUENCH:
5614                 /*
5615                  * use a global boolean to control
5616                  * whether TCP should respond to ICMP_SOURCE_QUENCH.
5617                  * The default is false.
5618                  */
5619                 if (tcp_icmp_source_quench) {
5620                         /*
5621                          * Reduce the sending rate as if we got a
5622                          * retransmit timeout
5623                          */
5624                         uint32_t npkt;
5625 
5626                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5627                             tcp->tcp_mss;
5628                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;




5629                         tcp->tcp_cwnd = tcp->tcp_mss;
5630                         tcp->tcp_cwnd_cnt = 0;
5631                 }
5632                 break;
5633         }
5634         freemsg(mp);
5635 }
5636 
5637 /*
5638  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5639  * error messages passed up by IP.
5640  * Assumes that IP has pulled up all the extension headers as well
5641  * as the ICMPv6 header.
5642  */
5643 static void
5644 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5645 {
5646         icmp6_t         *icmp6;
5647         ip6_t           *ip6h;
5648         uint16_t        iph_hdr_length = ira->ira_ip_hdr_length;




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright 2017 Joyent, Inc.
  26  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  27  */
  28 
  29 /* This file contains all TCP input processing functions. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/suntpi.h>
  40 #include <sys/xti_inet.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/tsol/tnet.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>


 149 static uint32_t tcp_init_wnd_chk = 4096;
 150 
 151 /* Process ICMP source quench message or not. */
 152 static boolean_t tcp_icmp_source_quench = B_FALSE;
 153 
 154 static boolean_t tcp_outbound_squeue_switch = B_FALSE;
 155 
 156 static mblk_t   *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
 157                     ip_recv_attr_t *);
 158 static mblk_t   *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
 159                     ip_recv_attr_t *);
 160 static boolean_t        tcp_drop_q0(tcp_t *);
 161 static void     tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
 162 static mblk_t   *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
 163                     ip_recv_attr_t *);
 164 static void     tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
 165 static void     tcp_process_options(tcp_t *, tcpha_t *);
 166 static mblk_t   *tcp_reass(tcp_t *, mblk_t *, uint32_t);
 167 static void     tcp_reass_elim_overlap(tcp_t *, mblk_t *);
 168 static void     tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
 169 static void     tcp_set_rto(tcp_t *, hrtime_t);
 170 static void     tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 171 
 172 /*
 173  * CC wrapper hook functions
 174  */
 175 static void
 176 cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
 177     uint16_t type)
 178 {
 179         uint32_t old_cwnd = tcp->tcp_cwnd;
 180 
 181         tcp->tcp_ccv.bytes_this_ack = bytes_acked;
 182         if (tcp->tcp_cwnd <= tcp->tcp_swnd)
 183                 tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
 184         else
 185                 tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
 186 
 187         if (type == CC_ACK) {
 188                 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
 189                         if (tcp->tcp_ccv.flags & CCF_RTO)
 190                                 tcp->tcp_ccv.flags &= ~CCF_RTO;
 191 
 192                         tcp->tcp_ccv.t_bytes_acked +=
 193                             min(tcp->tcp_ccv.bytes_this_ack,
 194                             tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
 195                         if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
 196                                 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
 197                                 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
 198                         }
 199                 } else {
 200                         tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
 201                         tcp->tcp_ccv.t_bytes_acked = 0;
 202                 }
 203         }
 204 
 205         if (CC_ALGO(tcp)->ack_received != NULL) {
 206                 /*
 207                  * The FreeBSD code where this originated had a comment "Find
 208                  * a way to live without this" in several places where curack
 209                  * got set.  If they eventually dump curack from from the cc
 210                  * variables, we'll need to adapt our code.
 211                  */
 212                 tcp->tcp_ccv.curack = seg_ack;
 213                 CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
 214         }
 215 
 216         DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
 217             uint32_t, tcp->tcp_cwnd);
 218 }
 219 
 220 void
 221 cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
 222 {
 223         uint32_t old_cwnd = tcp->tcp_cwnd;
 224         uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
 225         switch (type) {
 226         case CC_NDUPACK:
 227                 if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
 228                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
 229                         if (tcp->tcp_ecn_ok) {
 230                                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 231                                 tcp->tcp_cwr = B_TRUE;
 232                                 tcp->tcp_ecn_cwr_sent = B_FALSE;
 233                         }
 234                 }
 235                 break;
 236         case CC_ECN:
 237                 if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
 238                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
 239                         if (tcp->tcp_ecn_ok) {
 240                                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 241                                 tcp->tcp_cwr = B_TRUE;
 242                                 tcp->tcp_ecn_cwr_sent = B_FALSE;
 243                         }
 244                 }
 245                 break;
 246         case CC_RTO:
 247                 tcp->tcp_ccv.flags |= CCF_RTO;
 248                 tcp->tcp_dupack_cnt = 0;
 249                 tcp->tcp_ccv.t_bytes_acked = 0;
 250                 /*
 251                  * Give up on fast recovery and congestion recovery if we were
 252                  * attempting either.
 253                  */
 254                 EXIT_RECOVERY(tcp->tcp_ccv.flags);
 255                 if (CC_ALGO(tcp)->cong_signal == NULL) {
 256                         /*
 257                          * RFC5681 Section 3.1
 258                          * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
 259                          */
 260                         tcp->tcp_cwnd_ssthresh = max(
 261                             (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
 262                             2) * tcp->tcp_mss;
 263                         tcp->tcp_cwnd = tcp->tcp_mss;
 264                 }
 265 
 266                 if (tcp->tcp_ecn_ok) {
 267                         tcp->tcp_cwr = B_TRUE;
 268                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
 269                         tcp->tcp_ecn_cwr_sent = B_FALSE;
 270                 }
 271                 break;
 272         }
 273 
 274         if (CC_ALGO(tcp)->cong_signal != NULL) {
 275                 tcp->tcp_ccv.curack = seg_ack;
 276                 CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
 277         }
 278 
 279         DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
 280             uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
 281             uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
 282 }
 283 
 284 static void
 285 cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
 286 {
 287         uint32_t old_cwnd = tcp->tcp_cwnd;
 288 
 289         if (CC_ALGO(tcp)->post_recovery != NULL) {
 290                 tcp->tcp_ccv.curack = seg_ack;
 291                 CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
 292         }
 293         tcp->tcp_ccv.t_bytes_acked = 0;
 294 
 295         DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
 296             uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
 297 }
 298 
 299 /*
 300  * Set the MSS associated with a particular tcp based on its current value,
 301  * and a new one passed in. Observe minimums and maximums, and reset other
 302  * state variables that we want to view as multiples of MSS.
 303  *
 304  * The value of MSS could be either increased or decreased.
 305  */
 306 void
 307 tcp_mss_set(tcp_t *tcp, uint32_t mss)
 308 {
 309         uint32_t        mss_max;
 310         tcp_stack_t     *tcps = tcp->tcp_tcps;
 311         conn_t          *connp = tcp->tcp_connp;
 312 
 313         if (connp->conn_ipversion == IPV4_VERSION)
 314                 mss_max = tcps->tcps_mss_max_ipv4;
 315         else
 316                 mss_max = tcps->tcps_mss_max_ipv6;
 317 
 318         if (mss < tcps->tcps_mss_min)
 319                 mss = tcps->tcps_mss_min;


 658             IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
 659 
 660         /*
 661          * Set MSS to the smaller one of both ends of the connection.
 662          * We should not have called tcp_mss_set() before, but our
 663          * side of the MSS should have been set to a proper value
 664          * by tcp_set_destination().  tcp_mss_set() will also set up the
 665          * STREAM head parameters properly.
 666          *
 667          * If we have a larger-than-16-bit window but the other side
 668          * didn't want to do window scale, tcp_rwnd_set() will take
 669          * care of that.
 670          */
 671         tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
 672 
 673         /*
 674          * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
 675          * updated properly.
 676          */
 677         TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
 678 
 679         if (tcp->tcp_cc_algo->conn_init != NULL)
 680                 tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
 681 }
 682 
 683 /*
 684  * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 685  * is filled, return as much as we can.  The message passed in may be
 686  * multi-part, chained using b_cont.  "start" is the starting sequence
 687  * number for this piece.
 688  */
 689 static mblk_t *
 690 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 691 {
 692         uint32_t        end, bytes;
 693         mblk_t          *mp1;
 694         mblk_t          *mp2;
 695         mblk_t          *next_mp;
 696         uint32_t        u1;
 697         tcp_stack_t     *tcps = tcp->tcp_tcps;
 698 
 699 
 700         /* Walk through all the new pieces. */
 701         do {
 702                 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 703                     (uintptr_t)INT_MAX);
 704                 end = start + (int)(mp->b_wptr - mp->b_rptr);
 705                 next_mp = mp->b_cont;
 706                 if (start == end) {
 707                         /* Empty.  Blast it. */
 708                         freeb(mp);
 709                         continue;
 710                 }
 711                 bytes = end - start;
 712                 mp->b_cont = NULL;
 713                 TCP_REASS_SET_SEQ(mp, start);
 714                 TCP_REASS_SET_END(mp, end);
 715                 mp1 = tcp->tcp_reass_tail;
 716                 if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 717                         if (mp1 != NULL) {
 718                                 /*
 719                                  * New stuff is beyond the tail; link it on the
 720                                  * end.
 721                                  */
 722                                 mp1->b_cont = mp;
 723                         } else {
 724                                 tcp->tcp_reass_head = mp;




 725                         }




 726                         tcp->tcp_reass_tail = mp;
 727                         TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 728                         TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
 729                         tcp->tcp_cs.tcp_in_data_unorder_segs++;
 730                         tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
 731                         continue;
 732                 }
 733                 mp1 = tcp->tcp_reass_head;
 734                 u1 = TCP_REASS_SEQ(mp1);
 735                 /* New stuff at the front? */
 736                 if (SEQ_LT(start, u1)) {
 737                         /* Yes... Check for overlap. */
 738                         mp->b_cont = mp1;
 739                         tcp->tcp_reass_head = mp;
 740                         tcp_reass_elim_overlap(tcp, mp);
 741                         continue;
 742                 }
 743                 /*
 744                  * The new piece fits somewhere between the head and tail.
 745                  * We find our slot, where mp1 precedes us and mp2 trails.
 746                  */
 747                 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
 748                         u1 = TCP_REASS_SEQ(mp2);
 749                         if (SEQ_LEQ(start, u1))
 750                                 break;


2437 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2438 {
2439         int32_t         bytes_acked;
2440         int32_t         gap;
2441         mblk_t          *mp1;
2442         uint_t          flags;
2443         uint32_t        new_swnd = 0;
2444         uchar_t         *iphdr;
2445         uchar_t         *rptr;
2446         int32_t         rgap;
2447         uint32_t        seg_ack;
2448         int             seg_len;
2449         uint_t          ip_hdr_len;
2450         uint32_t        seg_seq;
2451         tcpha_t         *tcpha;
2452         int             urp;
2453         tcp_opt_t       tcpopt;
2454         ip_pkt_t        ipp;
2455         boolean_t       ofo_seg = B_FALSE; /* Out of order segment */
2456         uint32_t        cwnd;


2457         int             mss;
2458         conn_t          *connp = (conn_t *)arg;
2459         squeue_t        *sqp = (squeue_t *)arg2;
2460         tcp_t           *tcp = connp->conn_tcp;
2461         tcp_stack_t     *tcps = tcp->tcp_tcps;
2462         sock_upcalls_t  *sockupcalls;
2463 
2464         /*
2465          * RST from fused tcp loopback peer should trigger an unfuse.
2466          */
2467         if (tcp->tcp_fused) {
2468                 TCP_STAT(tcps, tcp_fusion_aborted);
2469                 tcp_unfuse(tcp);
2470         }
2471 
2472         iphdr = mp->b_rptr;
2473         rptr = mp->b_rptr;
2474         ASSERT(OK_32PTR(rptr));
2475 
2476         ip_hdr_len = ira->ira_ip_hdr_length;


2525 
2526         if (tcp->tcp_state == TCPS_TIME_WAIT) {
2527                 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2528                     seg_len, tcpha, ira);
2529                 return;
2530         }
2531 
2532         if (sqp != NULL) {
2533                 /*
2534                  * This is the correct place to update tcp_last_recv_time. Note
2535                  * that it is also updated for tcp structure that belongs to
2536                  * global and listener queues which do not really need updating.
2537                  * But that should not cause any harm.  And it is updated for
2538                  * all kinds of incoming segments, not only for data segments.
2539                  */
2540                 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2541         }
2542 
2543         flags = (unsigned int)tcpha->tha_flags & 0xFF;
2544 
2545         TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2546         DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2547 
2548         if ((flags & TH_URG) && sqp != NULL) {
2549                 /*
2550                  * TCP can't handle urgent pointers that arrive before
2551                  * the connection has been accept()ed since it can't
2552                  * buffer OOB data.  Discard segment if this happens.
2553                  *
2554                  * We can't just rely on a non-null tcp_listener to indicate
2555                  * that the accept() has completed since unlinking of the
2556                  * eager and completion of the accept are not atomic.
2557                  * tcp_detached, when it is not set (B_FALSE) indicates
2558                  * that the accept() has completed.
2559                  *
2560                  * Nor can it reassemble urgent pointers, so discard
2561                  * if it's not the next segment expected.
2562                  *
2563                  * Otherwise, collapse chain into one mblk (discard if
2564                  * that fails).  This makes sure the headers, retransmitted
2565                  * data, and new data all are in the same mblk.


2712                         tcp->tcp_suna = tcp->tcp_iss + 1;
2713                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2714 
2715                         /*
2716                          * If SYN was retransmitted, need to reset all
2717                          * retransmission info.  This is because this
2718                          * segment will be treated as a dup ACK.
2719                          */
2720                         if (tcp->tcp_rexmit) {
2721                                 tcp->tcp_rexmit = B_FALSE;
2722                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2723                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2724                                 tcp->tcp_ms_we_have_waited = 0;
2725 
2726                                 /*
2727                                  * Set tcp_cwnd back to 1 MSS, per
2728                                  * recommendation from
2729                                  * draft-floyd-incr-init-win-01.txt,
2730                                  * Increasing TCP's Initial Window.
2731                                  */
2732                                 DTRACE_PROBE3(cwnd__retransmitted__syn,
2733                                     tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
2734                                     uint32_t, tcp->tcp_mss);
2735                                 tcp->tcp_cwnd = tcp->tcp_mss;
2736                         }
2737 
2738                         tcp->tcp_swl1 = seg_seq;
2739                         tcp->tcp_swl2 = seg_ack;
2740 
2741                         new_swnd = ntohs(tcpha->tha_win);
2742                         tcp->tcp_swnd = new_swnd;
2743                         if (new_swnd > tcp->tcp_max_swnd)
2744                                 tcp->tcp_max_swnd = new_swnd;
2745 
2746                         /*
2747                          * Always send the three-way handshake ack immediately
2748                          * in order to make the connection complete as soon as
2749                          * possible on the accepting host.
2750                          */
2751                         flags |= TH_ACK_NEEDED;
2752 
2753                         /*
2754                          * Trace connect-established here.


2773                         if (tcp->tcp_loopback) {
2774                                 mblk_t *ack_mp;
2775 
2776                                 ASSERT(!tcp->tcp_unfusable);
2777                                 ASSERT(mp1 != NULL);
2778                                 /*
2779                                  * For loopback, we always get a pure SYN-ACK
2780                                  * and only need to send back the final ACK
2781                                  * with no data (this is because the other
2782                                  * tcp is ours and we don't do T/TCP).  This
2783                                  * final ACK triggers the passive side to
2784                                  * perform fusion in ESTABLISHED state.
2785                                  */
2786                                 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2787                                         if (tcp->tcp_ack_tid != 0) {
2788                                                 (void) TCP_TIMER_CANCEL(tcp,
2789                                                     tcp->tcp_ack_tid);
2790                                                 tcp->tcp_ack_tid = 0;
2791                                         }
2792                                         tcp_send_data(tcp, ack_mp);
2793                                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2794                                         TCPS_BUMP_MIB(tcps, tcpOutAck);
2795 
2796                                         if (!IPCL_IS_NONSTR(connp)) {
2797                                                 /* Send up T_CONN_CON */
2798                                                 if (ira->ira_cred != NULL) {
2799                                                         mblk_setcred(mp1,
2800                                                             ira->ira_cred,
2801                                                             ira->ira_cpid);
2802                                                 }
2803                                                 putnext(connp->conn_rq, mp1);
2804                                         } else {
2805                                                 (*sockupcalls->su_connected)
2806                                                     (connp->conn_upper_handle,
2807                                                     tcp->tcp_connid,
2808                                                     ira->ira_cred,
2809                                                     ira->ira_cpid);
2810                                                 freemsg(mp1);
2811                                         }
2812 
2813                                         freemsg(mp);


3162                         mp2 = mp;
3163                         mp = mp->b_cont;
3164                         freeb(mp2);
3165                 } while (gap < 0);
3166                 /*
3167                  * If the urgent data has already been acknowledged, we
3168                  * should ignore TH_URG below
3169                  */
3170                 if (urp < 0)
3171                         flags &= ~TH_URG;
3172         }
3173         /*
3174          * rgap is the amount of stuff received out of window.  A negative
3175          * value is the amount out of window.
3176          */
3177         if (rgap < 0) {
3178                 mblk_t  *mp2;
3179 
3180                 if (tcp->tcp_rwnd == 0) {
3181                         TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3182                         tcp->tcp_cs.tcp_in_zwnd_probes++;
3183                 } else {
3184                         TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3185                         TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3186                 }
3187 
3188                 /*
3189                  * seg_len does not include the FIN, so if more than
3190                  * just the FIN is out of window, we act like we don't
3191                  * see it.  (If just the FIN is out of window, rgap
3192                  * will be zero and we will go ahead and acknowledge
3193                  * the FIN.)
3194                  */
3195                 flags &= ~TH_FIN;
3196 
3197                 /* Fix seg_len and make sure there is something left. */
3198                 seg_len += rgap;
3199                 if (seg_len <= 0) {
3200                         /*
3201                          * Resets are only valid if they lie within our offered
3202                          * window.  If the RST bit is set, we just ignore this


3412                                  *
3413                                  * But TCP should not perform fast retransmit
3414                                  * because of the ack number.  TCP uses
3415                                  * seg_len == 0 to determine if it is a pure
3416                                  * ACK.  And this is not a pure ACK.
3417                                  */
3418                                 seg_len = 0;
3419                                 ofo_seg = B_TRUE;
3420 
3421                                 if (tcps->tcps_reass_timeout != 0 &&
3422                                     tcp->tcp_reass_tid == 0) {
3423                                         tcp->tcp_reass_tid = TCP_TIMER(tcp,
3424                                             tcp_reass_timer,
3425                                             tcps->tcps_reass_timeout);
3426                                 }
3427                         }
3428                 }
3429         } else if (seg_len > 0) {
3430                 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3431                 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3432                 tcp->tcp_cs.tcp_in_data_inorder_segs++;
3433                 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
3434 
3435                 /*
3436                  * If an out of order FIN was received before, and the seq
3437                  * num and len of the new segment match that of the FIN,
3438                  * put the FIN flag back in.
3439                  */
3440                 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3441                     seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3442                         flags |= TH_FIN;
3443                         tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3444                 }
3445         }
3446         if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3447         if (flags & TH_RST) {
3448                 freemsg(mp);
3449                 switch (tcp->tcp_state) {
3450                 case TCPS_SYN_RCVD:
3451                         (void) tcp_clean_death(tcp, ECONNREFUSED);
3452                         break;
3453                 case TCPS_ESTABLISHED:
3454                 case TCPS_FIN_WAIT_1:


3480                     SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3481                 freemsg(mp);
3482                 /*
3483                  * If the ACK flag is not set, just use our snxt as the
3484                  * seq number of the RST segment.
3485                  */
3486                 if (!(flags & TH_ACK)) {
3487                         seg_ack = tcp->tcp_snxt;
3488                 }
3489                 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3490                     TH_RST|TH_ACK);
3491                 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3492                 (void) tcp_clean_death(tcp, ECONNRESET);
3493                 return;
3494         }
3495         /*
3496          * urp could be -1 when the urp field in the packet is 0
3497          * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3498          * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3499          */
3500         if ((flags & TH_URG) && urp >= 0) {
3501                 if (!tcp->tcp_urp_last_valid ||
3502                     SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3503                         /*
3504                          * Non-STREAMS sockets handle the urgent data a little
3505                          * differently from STREAMS based sockets. There is no
3506                          * need to mark any mblks with the MSG{NOT,}MARKNEXT
3507                          * flags to keep SIOCATMARK happy. Instead a
3508                          * su_signal_oob upcall is made to update the mark.
3509                          * Neither is a T_EXDATA_IND mblk needed to be
3510                          * prepended to the urgent data. The urgent data is
3511                          * delivered using the su_recv upcall, where we set
3512                          * the MSG_OOB flag to indicate that it is urg data.
3513                          *
3514                          * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3515                          * are used by non-STREAMS sockets.
3516                          */
3517                         if (IPCL_IS_NONSTR(connp)) {
3518                                 if (!TCP_IS_DETACHED(tcp)) {
3519                                         (*sockupcalls->su_signal_oob)
3520                                             (connp->conn_upper_handle, urp);


3937                             ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3938                             iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3939                 }
3940                 TCPS_CONN_INC(tcps);
3941 
3942                 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3943                 bytes_acked--;
3944                 /* SYN was acked - making progress */
3945                 tcp->tcp_ip_forward_progress = B_TRUE;
3946 
3947                 /*
3948                  * If SYN was retransmitted, need to reset all
3949                  * retransmission info as this segment will be
3950                  * treated as a dup ACK.
3951                  */
3952                 if (tcp->tcp_rexmit) {
3953                         tcp->tcp_rexmit = B_FALSE;
3954                         tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3955                         tcp->tcp_rexmit_max = tcp->tcp_snxt;
3956                         tcp->tcp_ms_we_have_waited = 0;
3957                         DTRACE_PROBE3(cwnd__retransmitted__syn,
3958                             tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
3959                             uint32_t, tcp->tcp_mss);
3960                         tcp->tcp_cwnd = mss;
3961                 }
3962 
3963                 /*
3964                  * We set the send window to zero here.
3965                  * This is needed if there is data to be
3966                  * processed already on the queue.
3967                  * Later (at swnd_update label), the
3968                  * "new_swnd > tcp_swnd" condition is satisfied
3969                  * the XMIT_NEEDED flag is set in the current
3970                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
3971                  * called if there is already data on queue in
3972                  * this state.
3973                  */
3974                 tcp->tcp_swnd = 0;
3975 
3976                 if (new_swnd > tcp->tcp_max_swnd)
3977                         tcp->tcp_max_swnd = new_swnd;
3978                 tcp->tcp_swl1 = seg_seq;
3979                 tcp->tcp_swl2 = seg_ack;


3983                 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3984                     connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3985                     int32_t, TCPS_SYN_RCVD);
3986 
3987                 /* Fuse when both sides are in ESTABLISHED state */
3988                 if (tcp->tcp_loopback && do_tcp_fusion)
3989                         tcp_fuse(tcp, iphdr, tcpha);
3990 
3991         }
3992         /* This code follows 4.4BSD-Lite2 mostly. */
3993         if (bytes_acked < 0)
3994                 goto est;
3995 
3996         /*
3997          * If TCP is ECN capable and the congestion experience bit is
3998          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
3999          * done once per window (or more loosely, per RTT).
4000          */
4001         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
4002                 tcp->tcp_cwr = B_FALSE;
4003         if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
4004                 cc_cong_signal(tcp, seg_ack, CC_ECN);



4005                 /*
4006                  * If the cwnd is 0, use the timer to clock out
4007                  * new segments.  This is required by the ECN spec.
4008                  */
4009                 if (tcp->tcp_cwnd == 0)
4010                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);






4011                 tcp->tcp_cwr = B_TRUE;
4012                 /*
4013                  * This marks the end of the current window of in
4014                  * flight data.  That is why we don't use
4015                  * tcp_suna + tcp_swnd.  Only data in flight can
4016                  * provide ECN info.
4017                  */
4018                 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;

4019         }

4020 
4021         mp1 = tcp->tcp_xmit_head;
4022         if (bytes_acked == 0) {
4023                 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
4024                         int dupack_cnt;
4025 
4026                         TCPS_BUMP_MIB(tcps, tcpInDupAck);
4027                         /*
4028                          * Fast retransmit.  When we have seen exactly three
4029                          * identical ACKs while we have unacked data
4030                          * outstanding we take it as a hint that our peer
4031                          * dropped something.
4032                          *
4033                          * If TCP is retransmitting, don't do fast retransmit.
4034                          */
4035                         if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
4036                             ! tcp->tcp_rexmit) {
4037                                 /* Do Limited Transmit */
4038                                 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
4039                                     tcps->tcps_dupack_fast_retransmit) {
4040                                         cc_ack_received(tcp, seg_ack,
4041                                             bytes_acked, CC_DUPACK);
4042                                         /*
4043                                          * RFC 3042
4044                                          *
4045                                          * What we need to do is temporarily
4046                                          * increase tcp_cwnd so that new
4047                                          * data can be sent if it is allowed
4048                                          * by the receive window (tcp_rwnd).
4049                                          * tcp_wput_data() will take care of
4050                                          * the rest.
4051                                          *
4052                                          * If the connection is SACK capable,
4053                                          * only do limited xmit when there
4054                                          * is SACK info.
4055                                          *
4056                                          * Note how tcp_cwnd is incremented.
4057                                          * The first dup ACK will increase
4058                                          * it by 1 MSS.  The second dup ACK
4059                                          * will increase it by 2 MSS.  This
4060                                          * means that only 1 new segment will
4061                                          * be sent for each dup ACK.


4068                                                     (tcp->tcp_dupack_cnt - 1);
4069                                                 flags |= TH_LIMIT_XMIT;
4070                                         }
4071                                 } else if (dupack_cnt ==
4072                                     tcps->tcps_dupack_fast_retransmit) {
4073 
4074                                 /*
4075                                  * If we have reduced tcp_ssthresh
4076                                  * because of ECN, do not reduce it again
4077                                  * unless it is already one window of data
4078                                  * away.  After one window of data, tcp_cwr
4079                                  * should then be cleared.  Note that
4080                                  * for non ECN capable connection, tcp_cwr
4081                                  * should always be false.
4082                                  *
4083                                  * Adjust cwnd since the duplicate
4084                                  * ack indicates that a packet was
4085                                  * dropped (due to congestion.)
4086                                  */
4087                                 if (!tcp->tcp_cwr) {
4088                                         cc_cong_signal(tcp, seg_ack,
4089                                             CC_NDUPACK);
4090                                         cc_ack_received(tcp, seg_ack,
4091                                             bytes_acked, CC_DUPACK);


4092                                 }
4093                                 if (tcp->tcp_ecn_ok) {
4094                                         tcp->tcp_cwr = B_TRUE;
4095                                         tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4096                                         tcp->tcp_ecn_cwr_sent = B_FALSE;
4097                                 }
4098 
4099                                 /*
4100                                  * We do Hoe's algorithm.  Refer to her
4101                                  * paper "Improving the Start-up Behavior
4102                                  * of a Congestion Control Scheme for TCP,"
4103                                  * appeared in SIGCOMM'96.
4104                                  *
4105                                  * Save highest seq no we have sent so far.
4106                                  * Be careful about the invisible FIN byte.
4107                                  */
4108                                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4109                                     (tcp->tcp_unsent == 0)) {
4110                                         tcp->tcp_rexmit_max = tcp->tcp_fss;
4111                                 } else {


4133                                                     tcp->tcp_fack;
4134                                                 tcp->tcp_sack_snxt = seg_ack;
4135                                                 flags |= TH_NEED_SACK_REXMIT;
4136                                         } else {
4137                                                 /*
4138                                                  * Always initialize tcp_pipe
4139                                                  * even though we don't have
4140                                                  * any SACK info.  If later
4141                                                  * we get SACK info and
4142                                                  * tcp_pipe is not initialized,
4143                                                  * funny things will happen.
4144                                                  */
4145                                                 tcp->tcp_pipe =
4146                                                     tcp->tcp_cwnd_ssthresh;
4147                                         }
4148                                 } else {
4149                                         flags |= TH_REXMIT_NEEDED;
4150                                 } /* tcp_snd_sack_ok */
4151 
4152                                 } else {
4153                                         cc_ack_received(tcp, seg_ack,
4154                                             bytes_acked, CC_DUPACK);
4155                                         /*
4156                                          * Here we perform congestion
4157                                          * avoidance, but NOT slow start.
4158                                          * This is known as the Fast
4159                                          * Recovery Algorithm.
4160                                          */
4161                                         if (tcp->tcp_snd_sack_ok &&
4162                                             tcp->tcp_notsack_list != NULL) {
4163                                                 flags |= TH_NEED_SACK_REXMIT;
4164                                                 tcp->tcp_pipe -= mss;
4165                                                 if (tcp->tcp_pipe < 0)
4166                                                         tcp->tcp_pipe = 0;
4167                                         } else {
4168                                         /*
4169                                          * We know that one more packet has
4170                                          * left the pipe thus we can update
4171                                          * cwnd.
4172                                          */
4173                                         cwnd = tcp->tcp_cwnd + mss;
4174                                         if (cwnd > tcp->tcp_cwnd_max)
4175                                                 cwnd = tcp->tcp_cwnd_max;
4176                                         DTRACE_PROBE3(cwnd__fast__recovery,
4177                                             tcp_t *, tcp,
4178                                             uint32_t, tcp->tcp_cwnd,
4179                                             uint32_t, cwnd);
4180                                         tcp->tcp_cwnd = cwnd;
4181                                         if (tcp->tcp_unsent > 0)
4182                                                 flags |= TH_XMIT_NEEDED;
4183                                         }
4184                                 }
4185                         }
4186                 } else if (tcp->tcp_zero_win_probe) {
4187                         /*
4188                          * If the window has opened, need to arrange
4189                          * to send additional data.
4190                          */
4191                         if (new_swnd != 0) {
4192                                 /* tcp_suna != tcp_snxt */
4193                                 /* Packet contains a window update */
4194                                 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4195                                 tcp->tcp_zero_win_probe = 0;
4196                                 tcp->tcp_timer_backoff = 0;
4197                                 tcp->tcp_ms_we_have_waited = 0;
4198 
4199                                 /*


4262                          * greater than 0, check if the number of such
4263                          * bogus ACks is greater than that count.  If yes,
4264                          * don't send back any ACK.  This prevents TCP from
4265                          * getting into an ACK storm if somehow an attacker
4266                          * successfully spoofs an acceptable segment to our
4267                          * peer.  If this continues (count > 2 X threshold),
4268                          * we should abort this connection.
4269                          */
4270                         if (tcp_drop_ack_unsent_cnt > 0 &&
4271                             ++tcp->tcp_in_ack_unsent >
4272                             tcp_drop_ack_unsent_cnt) {
4273                                 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4274                                 if (tcp->tcp_in_ack_unsent > 2 *
4275                                     tcp_drop_ack_unsent_cnt) {
4276                                         (void) tcp_clean_death(tcp, EPROTO);
4277                                 }
4278                                 return;
4279                         }
4280                         mp = tcp_ack_mp(tcp);
4281                         if (mp != NULL) {
4282                                 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4283                                 TCPS_BUMP_MIB(tcps, tcpOutAck);
4284                                 tcp_send_data(tcp, mp);
4285                         }
4286                         return;
4287                 }
4288         } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4289             tcp->tcp_snxt_shrunk)) {
4290                         tcp->tcp_is_wnd_shrnk = B_FALSE;
4291         }
4292 
4293         /*
4294          * TCP gets a new ACK, update the notsack'ed list to delete those
4295          * blocks that are covered by this ACK.
4296          */
4297         if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4298                 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4299                     &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4300         }
4301 
4302         /*
4303          * If we got an ACK after fast retransmit, check to see
4304          * if it is a partial ACK.  If it is not and the congestion
4305          * window was inflated to account for the other side's
4306          * cached packets, retract it.  If it is, do Hoe's algorithm.
4307          */
4308         if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4309                 ASSERT(tcp->tcp_rexmit == B_FALSE);
4310                 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4311                         tcp->tcp_dupack_cnt = 0;
4312 
4313                         cc_post_recovery(tcp, seg_ack);
4314 




4315                         tcp->tcp_rexmit_max = seg_ack;

4316 
4317                         /*
4318                          * Remove all notsack info to avoid confusion with
4319                          * the next fast retransmit/recovery phase.
4320                          */
4321                         if (tcp->tcp_snd_sack_ok) {
4322                                 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4323                                     tcp);
4324                         }
4325                 } else {
4326                         if (tcp->tcp_snd_sack_ok &&
4327                             tcp->tcp_notsack_list != NULL) {
4328                                 flags |= TH_NEED_SACK_REXMIT;
4329                                 tcp->tcp_pipe -= mss;
4330                                 if (tcp->tcp_pipe < 0)
4331                                         tcp->tcp_pipe = 0;
4332                         } else {
4333                                 /*
4334                                  * Hoe's algorithm:
4335                                  *
4336                                  * Retransmit the unack'ed segment and
4337                                  * restart fast recovery.  Note that we
4338                                  * need to scale back tcp_cwnd to the
4339                                  * original value when we started fast
4340                                  * recovery.  This is to prevent overly
4341                                  * aggressive behaviour in sending new
4342                                  * segments.
4343                                  */
4344                                 cwnd = tcp->tcp_cwnd_ssthresh +
4345                                     tcps->tcps_dupack_fast_retransmit * mss;
4346                                 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
4347                                     tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
4348                                     uint32_t, cwnd);
4349                                 tcp->tcp_cwnd = cwnd;
4350                                 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4351                                 flags |= TH_REXMIT_NEEDED;
4352                         }
4353                 }
4354         } else {
4355                 tcp->tcp_dupack_cnt = 0;
4356                 if (tcp->tcp_rexmit) {
4357                         /*
4358                          * TCP is retransmitting.  If the ACK ack's all
4359                          * outstanding data, update tcp_rexmit_max and
4360                          * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
4361                          * to the correct value.
4362                          *
4363                          * Note that SEQ_LEQ() is used.  This is to avoid
4364                          * unnecessary fast retransmit caused by dup ACKs
4365                          * received when TCP does slow start retransmission
4366                          * after a time out.  During this phase, TCP may
4367                          * send out segments which are already received.
4368                          * This causes dup ACKs to be sent back.
4369                          */


4390                 tcp->tcp_timer_backoff = 0;
4391         }
4392 
4393         /*
4394          * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4395          * Note that it cannot be the SYN being ack'ed.  The code flow
4396          * will not reach here.
4397          */
4398         if (mp1 == NULL) {
4399                 goto fin_acked;
4400         }
4401 
4402         /*
4403          * Update the congestion window.
4404          *
4405          * If TCP is not ECN capable or TCP is ECN capable but the
4406          * congestion experience bit is not set, increase the tcp_cwnd as
4407          * usual.
4408          */
4409         if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4410                 if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
4411                         EXIT_RECOVERY(tcp->tcp_ccv.flags);

















4412                 }
4413                 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4414         }


4415 
4416         /* See if the latest urgent data has been acknowledged */
4417         if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4418             SEQ_GT(seg_ack, tcp->tcp_urg))
4419                 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4420 













4421         /*
4422          * Update the RTT estimates. Note that we don't use the TCP
4423          * timestamp option to calculate RTT even if one is present. This is
4424          * because the timestamp option's resolution (CPU tick) is
4425          * too coarse to measure modern datacenter networks' microsecond
4426          * latencies. The timestamp field's resolution is limited by its
4427          * 4-byte width (see RFC1323), and since we always store a
4428          * high-resolution nanosecond precision timestamp along with the data,
4429          * there is no point to ever using the timestamp option.
4430          */
4431         if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {

4432                 /*
4433                  * An ACK sequence we haven't seen before, so get the RTT
4434                  * and update the RTO. But first check if the timestamp is
4435                  * valid to use.
4436                  */
4437                 if ((mp1->b_next != NULL) &&
4438                     SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
4439 #ifdef KERNEL_32
4440                         tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
4441                             (int32_t)(intptr_t)mp1->b_prev);
4442 #else
4443                         tcp_set_rto(tcp, gethrtime() -
4444                             (hrtime_t)(intptr_t)mp1->b_prev);
4445 #endif
4446                 } else {
4447                         TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4448                 }
4449 
4450                 /* Remember the last sequence to be ACKed */
4451                 tcp->tcp_csuna = seg_ack;
4452                 if (tcp->tcp_set_timer == 1) {
4453                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4454                         tcp->tcp_set_timer = 0;
4455                 }
4456         } else {
4457                 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4458         }
4459 
4460         /* Eat acknowledged bytes off the xmit queue. */
4461         for (;;) {
4462                 mblk_t  *mp2;
4463                 uchar_t *wptr;
4464 
4465                 wptr = mp1->b_wptr;
4466                 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4467                 bytes_acked -= (int)(wptr - mp1->b_rptr);
4468                 if (bytes_acked < 0) {
4469                         mp1->b_rptr = wptr + bytes_acked;
4470                         /*
4471                          * Set a new timestamp if all the bytes timed by the
4472                          * old timestamp have been ack'ed.
4473                          */
4474                         if (SEQ_GT(seg_ack,
4475                             (uint32_t)(uintptr_t)(mp1->b_next))) {
4476 #ifdef KERNEL_32
4477                                 mp1->b_prev =
4478                                     (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
4479 #else
4480                                 mp1->b_prev =
4481                                     (mblk_t *)(intptr_t)gethrtime();
4482 #endif
4483                                 mp1->b_next = NULL;
4484                         }
4485                         break;
4486                 }
4487                 mp1->b_next = NULL;
4488                 mp1->b_prev = NULL;
4489                 mp2 = mp1;
4490                 mp1 = mp1->b_cont;
4491 
4492                 /*
4493                  * This notification is required for some zero-copy
4494                  * clients to maintain a copy semantic. After the data
4495                  * is ack'ed, client is safe to modify or reuse the buffer.
4496                  */
4497                 if (tcp->tcp_snd_zcopy_aware &&
4498                     (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4499                         tcp_zcopy_notify(tcp);
4500                 freeb(mp2);
4501                 if (bytes_acked == 0) {
4502                         if (mp1 == NULL) {


4938             TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
4939             TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4940                 goto done;
4941 
4942         /* Any transmit work to do and a non-zero window? */
4943         if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4944             TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4945                 if (flags & TH_REXMIT_NEEDED) {
4946                         uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4947 
4948                         TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4949                         if (snd_size > mss)
4950                                 snd_size = mss;
4951                         if (snd_size > tcp->tcp_swnd)
4952                                 snd_size = tcp->tcp_swnd;
4953                         mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4954                             NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4955                             B_TRUE);
4956 
4957                         if (mp1 != NULL) {
4958 #ifdef KERNEL_32
4959                                 tcp->tcp_xmit_head->b_prev =
4960                                     (mblk_t *)LBOLT_FASTPATH;
4961 #else
4962                                 tcp->tcp_xmit_head->b_prev =
4963                                     (mblk_t *)(intptr_t)gethrtime();
4964 #endif
4965                                 tcp->tcp_csuna = tcp->tcp_snxt;
4966                                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4967                                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4968                                     snd_size);
4969                                 tcp->tcp_cs.tcp_out_retrans_segs++;
4970                                 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4971                                 tcp_send_data(tcp, mp1);
4972                         }
4973                 }
4974                 if (flags & TH_NEED_SACK_REXMIT) {
4975                         tcp_sack_rexmit(tcp, &flags);
4976                 }
4977                 /*
4978                  * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4979                  * out new segment.  Note that tcp_rexmit should not be
4980                  * set, otherwise TH_LIMIT_XMIT should not be set.
4981                  */
4982                 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4983                         if (!tcp->tcp_rexmit) {
4984                                 tcp_wput_data(tcp, NULL, B_FALSE);
4985                         } else {
4986                                 tcp_ss_rexmit(tcp);
4987                         }
4988                 }
4989                 /*
4990                  * Adjust tcp_cwnd back to normal value after sending
4991                  * new data segments.
4992                  */
4993                 if (flags & TH_LIMIT_XMIT) {
4994                         tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4995                         /*
4996                          * This will restart the timer.  Restarting the
4997                          * timer is used to avoid a timeout before the
4998                          * limited transmitted segment's ACK gets back.
4999                          */
5000                         if (tcp->tcp_xmit_head != NULL) {
5001 #ifdef KERNEL_32
5002                                 tcp->tcp_xmit_head->b_prev =
5003                                     (mblk_t *)LBOLT_FASTPATH;
5004 #else
5005                                 tcp->tcp_xmit_head->b_prev =
5006                                     (mblk_t *)(intptr_t)gethrtime();
5007 #endif
5008                         }
5009                 }
5010 
5011                 /* Anything more to do? */
5012                 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
5013                     TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
5014                         goto done;
5015         }
5016 ack_check:
5017         if (flags & TH_SEND_URP_MARK) {
5018                 ASSERT(tcp->tcp_urp_mark_mp);
5019                 ASSERT(!IPCL_IS_NONSTR(connp));
5020                 /*
5021                  * Send up any queued data and then send the mark message
5022                  */
5023                 if (tcp->tcp_rcv_list != NULL) {
5024                         flags |= tcp_rcv_drain(tcp);
5025 
5026                 }
5027                 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
5028                 mp1 = tcp->tcp_urp_mark_mp;
5029                 tcp->tcp_urp_mark_mp = NULL;


5031                         tcp_setcred_data(mp1, ira);
5032 
5033                 putnext(connp->conn_rq, mp1);
5034 #ifdef DEBUG
5035                 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
5036                     "tcp_rput: sending zero-length %s %s",
5037                     ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
5038                     "MSGNOTMARKNEXT"),
5039                     tcp_display(tcp, NULL, DISP_PORT_ONLY));
5040 #endif /* DEBUG */
5041                 flags &= ~TH_SEND_URP_MARK;
5042         }
5043         if (flags & TH_ACK_NEEDED) {
5044                 /*
5045                  * Time to send an ack for some reason.
5046                  */
5047                 mp1 = tcp_ack_mp(tcp);
5048 
5049                 if (mp1 != NULL) {
5050                         tcp_send_data(tcp, mp1);
5051                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
5052                         TCPS_BUMP_MIB(tcps, tcpOutAck);
5053                 }
5054                 if (tcp->tcp_ack_tid != 0) {
5055                         (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
5056                         tcp->tcp_ack_tid = 0;
5057                 }
5058         }
5059         if (flags & TH_ACK_TIMER_NEEDED) {
5060                 /*
5061                  * Arrange for deferred ACK or push wait timeout.
5062                  * Start timer if it is not already running.
5063                  */
5064                 if (tcp->tcp_ack_tid == 0) {
5065                         tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
5066                             tcp->tcp_localnet ?
5067                             tcps->tcps_local_dack_interval :
5068                             tcps->tcps_deferred_ack_interval);
5069                 }
5070         }
5071         if (flags & TH_ORDREL_NEEDED) {


5324         }
5325         if (addflag.crb_ipv6_recvdstopts) {
5326                 toh = (struct T_opthdr *)optptr;
5327                 toh->level = IPPROTO_IPV6;
5328                 toh->name = IPV6_DSTOPTS;
5329                 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5330                 toh->status = 0;
5331                 optptr += sizeof (*toh);
5332                 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5333                 optptr += ipp->ipp_dstoptslen;
5334                 ASSERT(OK_32PTR(optptr));
5335                 /* Save as last value */
5336                 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5337                     (ipp->ipp_fields & IPPF_DSTOPTS),
5338                     ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5339         }
5340         ASSERT(optptr == mp->b_wptr);
5341         return (mp);
5342 }
5343 
5344 /* The minimum of smoothed mean deviation in RTO calculation (nsec). */
5345 #define TCP_SD_MIN      400000000
5346 
5347 /*
5348  * Set RTO for this connection based on a new round-trip time measurement.
5349  * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
5350  * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
5351  * of that paper.
5352  *
5353  * m = new measurement
5354  * sa = smoothed RTT average (8 * average estimates).
5355  * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
5356  */
5357 static void
5358 tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5359 {
5360         hrtime_t m = rtt;
5361         hrtime_t sa = tcp->tcp_rtt_sa;
5362         hrtime_t sv = tcp->tcp_rtt_sd;

5363         tcp_stack_t *tcps = tcp->tcp_tcps;
5364 
5365         TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5366         tcp->tcp_rtt_update++;
5367         tcp->tcp_rtt_sum += m;
5368         tcp->tcp_rtt_cnt++;
5369 
5370         /* tcp_rtt_sa is not 0 means this is a new sample. */
5371         if (sa != 0) {
5372                 /*
5373                  * Update average estimator (see section 2.3 of RFC6298):
5374                  *      SRTT = 7/8 SRTT + 1/8 rtt
5375                  *
5376                  * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5377                  *      tcp_rtt_sa = 7 SRTT + rtt
5378                  *      tcp_rtt_sa = tcp_rtt_sa - 1/8 tcp_rtt_sa + rtt
5379                  *      tcp_rtt_sa = tcp_rtt_sa + (rtt - 1/8 tcp_rtt_sa)
5380                  *
5381                  * (rtt - 1/8 tcp_rtt_sa) is simply the difference
5382                  * between the new rtt measurement and the existing smoothed
5383                  * RTT average. This is referred to as "Error" in subsequent
5384                  * calculations.
5385                  */
5386 
5387                 /* m is now Error. */
5388                 m -= sa >> 3;
5389                 if ((sa += m) <= 0) {
5390                         /*
5391                          * Don't allow the smoothed average to be negative.
5392                          * We use 0 to denote reinitialization of the
5393                          * variables.
5394                          */
5395                         sa = 1;
5396                 }
5397 
5398                 /*
5399                  * Update deviation estimator:
5400                  *      mdev = 3/4 mdev + 1/4 abs(Error)
5401                  *
5402                  * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
5403                  *      tcp_rtt_sd = 3 mdev + abs(Error)
5404                  *      tcp_rtt_sd = tcp_rtt_sd - 1/4 tcp_rtt_sd + abs(Error)
5405                  */
5406                 if (m < 0)
5407                         m = -m;
5408                 m -= sv >> 2;
5409                 sv += m;
5410         } else {
5411                 /*
5412                  * This follows BSD's implementation.  So the reinitialized
5413                  * RTO is 3 * m.  We cannot go less than 2 because if the
5414                  * link is bandwidth dominated, doubling the window size
5415                  * during slow start means doubling the RTT.  We want to be
5416                  * more conservative when we reinitialize our estimates.  3
5417                  * is just a convenient number.
5418                  */
5419                 sa = m << 3;
5420                 sv = m << 1;
5421         }
5422         if (sv < TCP_SD_MIN) {
5423                 /*
5424                  * We do not know that if sa captures the delay ACK
5425                  * effect as in a long train of segments, a receiver
5426                  * does not delay its ACKs.  So set the minimum of sv
5427                  * to be TCP_SD_MIN, which is default to 400 ms, twice
5428                  * of BSD DATO.  That means the minimum of mean
5429                  * deviation is 100 ms.

5430                  */
5431                 sv = TCP_SD_MIN;
5432         }
5433         tcp->tcp_rtt_sa = sa;
5434         tcp->tcp_rtt_sd = sv;













5435 
5436         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
5437 
5438         /* Now, we can reset tcp_timer_backoff to use the new RTO... */
5439         tcp->tcp_timer_backoff = 0;
5440 }
5441 
5442 /*
5443  * On a labeled system we have some protocols above TCP, such as RPC, which
5444  * appear to assume that every mblk in a chain has a db_credp.
5445  */
5446 static void
5447 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
5448 {
5449         ASSERT(is_system_labeled());
5450         ASSERT(ira->ira_cred != NULL);
5451 
5452         while (mp != NULL) {
5453                 mblk_setcred(mp, ira->ira_cred, NOPID);
5454                 mp = mp->b_cont;
5455         }
5456 }


5741                 default:
5742                         break;
5743                 }
5744                 break;
5745         case ICMP_SOURCE_QUENCH:
5746                 /*
5747                  * use a global boolean to control
5748                  * whether TCP should respond to ICMP_SOURCE_QUENCH.
5749                  * The default is false.
5750                  */
5751                 if (tcp_icmp_source_quench) {
5752                         /*
5753                          * Reduce the sending rate as if we got a
5754                          * retransmit timeout
5755                          */
5756                         uint32_t npkt;
5757 
5758                         npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
5759                             tcp->tcp_mss;
5760                         tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
5761 
5762                         DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
5763                             uint32_t, tcp->tcp_cwnd,
5764                             uint32_t, tcp->tcp_mss);
5765                         tcp->tcp_cwnd = tcp->tcp_mss;
5766                         tcp->tcp_cwnd_cnt = 0;
5767                 }
5768                 break;
5769         }
5770         freemsg(mp);
5771 }
5772 
5773 /*
5774  * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
5775  * error messages passed up by IP.
5776  * Assumes that IP has pulled up all the extension headers as well
5777  * as the ICMPv6 header.
5778  */
5779 static void
5780 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
5781 {
5782         icmp6_t         *icmp6;
5783         ip6_t           *ip6h;
5784         uint16_t        iph_hdr_length = ira->ira_ip_hdr_length;