DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-37540 TCP per-connection kernel statistics
DLPX-37544 connstat command to display per-connection TCP statistics


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014 by Delphix. All rights reserved.
  25  */
  26 
  27 /* This file contains all TCP output processing functions. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsun.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/stropts.h>
  34 #include <sys/strlog.h>
  35 #define _SUN_TPI_VERSION 2
  36 #include <sys/tihdr.h>
  37 #include <sys/suntpi.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/timod.h>
  40 #include <sys/pattr.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/sockio.h>
  44 #include <sys/tsol/tnet.h>


  46 #include <inet/common.h>
  47 #include <inet/ip.h>
  48 #include <inet/tcp.h>
  49 #include <inet/tcp_impl.h>
  50 #include <inet/snmpcom.h>
  51 #include <inet/proto_set.h>
  52 #include <inet/ipsec_impl.h>
  53 #include <inet/ip_ndp.h>
  54 
  55 static mblk_t   *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
  56 static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
  57 static void     tcp_wput_flush(tcp_t *, mblk_t *);
  58 static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
  59 static int      tcp_xmit_end(tcp_t *);
  60 static int      tcp_send(tcp_t *, const int, const int, const int,
  61                     const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
  62 static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
  63                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  64 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  65 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  66 static void     tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
  67 
  68 /*
  69  * Functions called directly via squeue having a prototype of edesc_t.
  70  */
  71 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  72 static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  73 static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  74 
  75 /*
  76  * This controls how tiny a write must be before we try to copy it
  77  * into the mblk on the tail of the transmit queue.  Not much
  78  * speedup is observed for values larger than sixteen.  Zero will
  79  * disable the optimisation.
  80  */
  81 static int tcp_tx_pull_len = 16;
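
To make the tunable concrete, here is a minimal sketch of the coalescing decision it controls (the helper name and its caller are hypothetical; the real check lives in the transmit-queue append path):

static boolean_t
tiny_write_coalesce(mblk_t *tail, mblk_t *mp)
{
	size_t len = MBLKL(mp);			/* payload bytes in mp */

	/*
	 * Only tiny writes qualify; the tail dblk must be unshared
	 * and have room for the copy.
	 */
	if (len >= tcp_tx_pull_len || tail->b_datap->db_ref > 1 ||
	    MBLKTAIL(tail) < len)
		return (B_FALSE);

	bcopy(mp->b_rptr, tail->b_wptr, len);	/* append into tail mblk */
	tail->b_wptr += len;
	freeb(mp);				/* tiny mblk no longer needed */
	return (B_TRUE);
}
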
  82 












  83 void
  84 tcp_wput(queue_t *q, mblk_t *mp)
  85 {
  86         conn_t  *connp = Q_TO_CONN(q);
  87         tcp_t   *tcp;
  88         void (*output_proc)();
  89         t_scalar_t type;
  90         uchar_t *rptr;
  91         struct iocblk   *iocp;
  92         size_t size;
  93 
  94         ASSERT(connp->conn_ref >= 2);
  95 
  96         switch (DB_TYPE(mp)) {
  97         case M_DATA:
  98                 tcp = connp->conn_tcp;
  99                 ASSERT(tcp != NULL);
 100 
 101                 size = msgdsize(mp);
 102 


 200 /*
 201  * The TCP normal data output path.
 202  * NOTE: the logic of the fast path is duplicated from this function.
 203  */
 204 void
 205 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 206 {
 207         int             len;
 208         mblk_t          *local_time;
 209         mblk_t          *mp1;
 210         uint32_t        snxt;
 211         int             tail_unsent;
 212         int             tcpstate;
 213         int             usable = 0;
 214         mblk_t          *xmit_tail;
 215         int32_t         mss;
 216         int32_t         num_sack_blk = 0;
 217         int32_t         total_hdr_len;
 218         int32_t         tcp_hdr_len;
 219         int             rc;
 220         tcp_stack_t     *tcps = tcp->tcp_tcps;
 221         conn_t          *connp = tcp->tcp_connp;
 222         clock_t         now = LBOLT_FASTPATH;
 223 
 224         tcpstate = tcp->tcp_state;
 225         if (mp == NULL) {
 226                 /*
 227                  * tcp_wput_data() with NULL mp should only be called when
 228                  * there is unsent data.
 229                  */
 230                 ASSERT(tcp->tcp_unsent > 0);
 231                 /* Really tacky... but we need this for detached closes. */
 232                 len = tcp->tcp_unsent;
 233                 goto data_null;
 234         }
 235 
 236         ASSERT(mp->b_datap->db_type == M_DATA);
 237         /*
 238          * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 239          * or before a connection attempt has begun.
 240          */


 355          * includes SACK options.
 356          */
 357         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 358                 int32_t opt_len;
 359 
 360                 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 361                     tcp->tcp_num_sack_blk);
 362                 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 363                     2 + TCPOPT_HEADER_LEN;
 364                 mss = tcp->tcp_mss - opt_len;
 365                 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 366                 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 367         } else {
 368                 mss = tcp->tcp_mss;
 369                 total_hdr_len = connp->conn_ht_iphc_len;
 370                 tcp_hdr_len = connp->conn_ht_ulp_len;
 371         }
 372 
 373         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 374             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 375                 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
 376         }
 377         if (tcpstate == TCPS_SYN_RCVD) {
 378                 /*
 379                  * The three-way connection establishment handshake is not
 380                  * complete yet. We want to queue the data for transmission
 381                  * after entering ESTABLISHED state (RFC 793). A jump to
 382                  * the "done" label effectively leaves data on the queue.
 383                  */
 384                 goto done;
 385         } else {
 386                 int usable_r;
 387 
 388                 /*
 389                  * In the special case when cwnd is zero, which can only
 390                  * happen if the connection is ECN capable, return now.
 391                  * New segments are sent using tcp_timer().  The timer
 392                  * is set in tcp_input_data().
 393                  */
 394                 if (tcp->tcp_cwnd == 0) {
 395                         /*


 436                         return;
 437                 }
 438 
 439                 /* usable = MIN(swnd, cwnd) - unacked_bytes */
 440                 if (tcp->tcp_swnd > tcp->tcp_cwnd)
 441                         usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
 442 
 443                 /* usable = MIN(usable, unsent) */
 444                 if (usable_r > len)
 445                         usable_r = len;
 446 
 447                 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
 448                 if (usable_r > 0) {
 449                         usable = usable_r;
 450                 } else {
 451                         /* Bypass all other unnecessary processing. */
 452                         goto done;
 453                 }
 454         }
 455 

 456         local_time = (mblk_t *)now;



 457 
 458         /*
 459          * "Our" Nagle Algorithm.  This is not the same as in the old
 460          * BSD.  This is more in line with the true intent of Nagle.
 461          *
 462          * The conditions are:
 463          * 1. The amount of unsent data (or amount of data which can be
 464          *    sent, whichever is smaller) is less than Nagle limit.
 465          * 2. The last sent size is also less than Nagle limit.
 466          * 3. There is unack'ed data.
 467          * 4. Urgent pointer is not set.  Send urgent data ignoring the
 468          *    Nagle algorithm.  This reduces the probability that urgent
 469          *    bytes get "merged" together.
 470          * 5. The app has not closed the connection.  This eliminates the
 471          *    wait time of the receiving side waiting for the last piece of
 472          *    (small) data.
 473          *
 474          * If all are satisfied, exit without sending anything.  Note
 475          * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
 476          * the smaller of 1 MSS and global tcp_naglim_def (default to be


1173          *    un-acked     usable
1174          *  |--------------|-----------------|
1175          *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1176          */
1177         /* END CSTYLED */
1178 
1179         /* start sending from tcp_snxt */
1180         snxt = tcp->tcp_snxt;
1181 
1182         /*
1183          * Check to see if this connection has been idled for some
1184          * time and no ACK is expected.  If it is, we need to slow
1185          * start again to get back the connection's "self-clock" as
1186          * described in VJ's paper.
1187          *
1188          * Reinitialize tcp_cwnd after idle.
1189          */
1190         now = LBOLT_FASTPATH;
1191         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1192             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1193                 TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1194         }
1195 
1196         usable = tcp->tcp_swnd;              /* tcp window size */
1197         if (usable > tcp->tcp_cwnd)
1198                 usable = tcp->tcp_cwnd;      /* congestion window smaller */
1199         usable -= snxt;         /* subtract stuff already sent */
1200         suna = tcp->tcp_suna;
1201         usable += suna;
1202         /* usable can be < 0 if the congestion window is smaller */
1203         if (len > usable) {
1204                 /* Can't send complete M_DATA in one shot */
1205                 goto slow;
1206         }
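
As a worked instance of the arithmetic above (hypothetical numbers): with swnd = 65536, cwnd = 8192, suna = 1000, and snxt = 5000, there are 4000 bytes in flight, so usable = min(65536, 8192) - 4000 = 4192. A standalone version:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t swnd = 65536, cwnd = 8192;	/* send vs. congestion window */
	uint32_t suna = 1000, snxt = 5000;	/* oldest unacked, next to send */
	int64_t usable;

	usable = (swnd < cwnd) ? swnd : cwnd;	/* min(swnd, cwnd) */
	usable -= snxt;				/* subtract what was sent... */
	usable += suna;				/* ...net of what was acked */
	printf("usable = %lld\n", (long long)usable);	/* prints 4192 */
	return (0);
}
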
1207 
1208         mutex_enter(&tcp->tcp_non_sq_lock);
1209         if (tcp->tcp_flow_stopped &&
1210             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1211                 tcp_clrqfull(tcp);
1212         }
1213         mutex_exit(&tcp->tcp_non_sq_lock);


1236                 return;
1237         }
1238 
1239         /*
1240          * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
1241          * send now.
1242          */
1243 
1244         if (snxt == suna) {
1245                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1246         }
1247 
1248         /* we have always sent something */
1249         tcp->tcp_rack_cnt = 0;
1250 
1251         tcp->tcp_snxt = snxt + len;
1252         tcp->tcp_rack = tcp->tcp_rnxt;
1253 
1254         if ((mp1 = dupb(mp)) == 0)
1255                 goto no_memory;

1256         mp->b_prev = (mblk_t *)(uintptr_t)now;



1257         mp->b_next = (mblk_t *)(uintptr_t)snxt;
1258 
1259         /* adjust tcp header information */
1260         tcpha = tcp->tcp_tcpha;
1261         tcpha->tha_flags = (TH_ACK|TH_PUSH);
1262 
1263         sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1264         sum = (sum >> 16) + (sum & 0xFFFF);
1265         tcpha->tha_sum = htons(sum);
1266 
1267         tcpha->tha_seq = htonl(snxt);
1268 
1269         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1270         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1271         BUMP_LOCAL(tcp->tcp_obsegs);


1272 
1273         /* Update the latest receive window size in TCP header. */
1274         tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1275 
1276         tcp->tcp_last_sent_len = (ushort_t)len;
1277 
1278         plen = len + connp->conn_ht_iphc_len;
1279 
1280         ixa = connp->conn_ixa;
1281         ixa->ixa_pktlen = plen;
1282 
1283         if (ixa->ixa_flags & IXAF_IS_IPV4) {
1284                 tcp->tcp_ipha->ipha_length = htons(plen);
1285         } else {
1286                 tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1287         }
1288 
1289         /* see if we need to allocate an mblk for the headers */
1290         hdrlen = connp->conn_ht_iphc_len;
1291         rptr = mp1->b_rptr - hdrlen;
1292         db = mp1->b_datap;
1293         if ((db->db_ref != 2) || rptr < db->db_base ||
1294             (!OK_32PTR(rptr))) {
1295                 /* NOTE: we assume allocb returns an OK_32PTR */
1296                 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1297                 if (!mp) {
1298                         freemsg(mp1);
1299                         goto no_memory;
1300                 }
1301                 mp->b_cont = mp1;
1302                 mp1 = mp;
1303                 /* Leave room for Link Level header */
1304                 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1305                 mp1->b_wptr = &rptr[hdrlen];
1306         }
1307         mp1->b_rptr = rptr;
1308 
1309         /* Fill in the timestamp option. */
1310         if (tcp->tcp_snd_ts_ok) {
1311                 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1312 
1313                 U32_TO_BE32(llbolt,
1314                     (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
1315                 U32_TO_BE32(tcp->tcp_ts_recent,
1316                     (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
1317         } else {
1318                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1319         }
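
The +4 and +8 offsets come from the standard on-the-wire layout of the timestamp option as carried in the template header (RFC 1323): two NOP pad bytes, kind 8, length 10, then TSval and TSecr. A reference sketch of those 12 bytes:

/* Reference layout of the timestamp option after the 20-byte header. */
typedef struct tcp_ts_opt_sketch {
	uint8_t		nop1;	/* 0x01, alignment padding */
	uint8_t		nop2;	/* 0x01 */
	uint8_t		kind;	/* 8 (timestamp) */
	uint8_t		len;	/* 10 */
	uint32_t	tsval;	/* our clock; the "+4" store above */
	uint32_t	tsecr;	/* echoed peer value; the "+8" store */
} tcp_ts_opt_sketch_t;
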
1320 
1321         /* copy header into outgoing packet */
1322         dst = (ipaddr_t *)rptr;
1323         src = (ipaddr_t *)connp->conn_ht_iphc;
1324         dst[0] = src[0];
1325         dst[1] = src[1];
1326         dst[2] = src[2];
1327         dst[3] = src[3];
1328         dst[4] = src[4];
1329         dst[5] = src[5];
1330         dst[6] = src[6];
1331         dst[7] = src[7];
1332         dst[8] = src[8];
1333         dst[9] = src[9];
1334         if (hdrlen -= 40) {
1335                 hdrlen >>= 2;
1336                 dst += 10;


1940                                 }
1941                         } else
1942                                 (*xmit_tail)->b_rptr = prev_rptr;
1943 
1944                         if (mp == NULL) {
1945                                 return (-1);
1946                         }
1947                         mp1 = mp->b_cont;
1948 
1949                         if (len <= mss) /* LSO is unusable (!do_lso_send) */
1950                                 tcp->tcp_last_sent_len = (ushort_t)len;
1951                         while (mp1->b_cont) {
1952                                 *xmit_tail = (*xmit_tail)->b_cont;
1953                                 (*xmit_tail)->b_prev = local_time;
1954                                 (*xmit_tail)->b_next =
1955                                     (mblk_t *)(uintptr_t)(*snxt);
1956                                 mp1 = mp1->b_cont;
1957                         }
1958                         *snxt += len;
1959                         *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1960                         BUMP_LOCAL(tcp->tcp_obsegs);
1961                         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1962                         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);


1963                         tcp_send_data(tcp, mp);
1964                         continue;
1965                 }
1966 
1967                 *snxt += len;   /* Adjust later if we don't send all of len */

1968                 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1969                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);


1970 
1971                 if (*tail_unsent) {
1972                         /* Are the bytes above us in flight? */
1973                         rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1974                         if (rptr != (*xmit_tail)->b_rptr) {
1975                                 *tail_unsent -= len;
1976                                 if (len <= mss) /* LSO is unusable */
1977                                         tcp->tcp_last_sent_len = (ushort_t)len;
1978                                 len += total_hdr_len;
1979                                 ixa->ixa_pktlen = len;
1980 
1981                                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
1982                                         tcp->tcp_ipha->ipha_length = htons(len);
1983                                 } else {
1984                                         tcp->tcp_ip6h->ip6_plen =
1985                                             htons(len - IPV6_HDR_LEN);
1986                                 }
1987 
1988                                 mp = dupb(*xmit_tail);
1989                                 if (mp == NULL) {


2046 
2047                 must_alloc:;
2048                         mp1 = allocb(connp->conn_ht_iphc_allocated +
2049                             tcps->tcps_wroff_xtra, BPRI_MED);
2050                         if (mp1 == NULL) {
2051                                 freemsg(mp);
2052                                 return (-1);    /* out_of_mem */
2053                         }
2054                         mp1->b_cont = mp;
2055                         mp = mp1;
2056                         /* Leave room for Link Level header */
2057                         len = total_hdr_len;
2058                         rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2059                         mp->b_wptr = &rptr[len];
2060                 }
2061 
2062                 /*
2063                  * Fill in the header using the template header, and add
2064                  * options such as time-stamp, ECN and/or SACK, as needed.
2065                  */
2066                 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
2067 
2068                 mp->b_rptr = rptr;
2069 
2070                 if (*tail_unsent) {
2071                         int spill = *tail_unsent;
2072 
2073                         mp1 = mp->b_cont;
2074                         if (mp1 == NULL)
2075                                 mp1 = mp;
2076 
2077                         /*
2078                          * If we're a little short, tack on more mblks until
2079                          * there is no more spillover.
2080                          */
2081                         while (spill < 0) {
2082                                 mblk_t *nmp;
2083                                 int nmpsz;
2084 
2085                                 nmp = (*xmit_tail)->b_cont;
2086                                 nmpsz = MBLKL(nmp);


2125                                 if (mp1 == NULL) {
2126                                         *tail_unsent = spill;
2127                                         freemsg(mp);
2128                                         return (-1);    /* out_of_mem */
2129                                 }
2130                         }
2131 
2132                         /* Trim back any surplus on the last mblk */
2133                         if (spill >= 0) {
2134                                 mp1->b_wptr -= spill;
2135                                 *tail_unsent = spill;
2136                         } else {
2137                                 /*
2138                                  * We did not send everything we could in
2139                                  * order to remain within the b_cont limit.
2140                                  */
2141                                 *usable -= spill;
2142                                 *snxt += spill;
2143                                 tcp->tcp_last_sent_len += spill;
2144                                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);

2145                                 /*
2146                                  * Adjust the checksum
2147                                  */
2148                                 tcpha = (tcpha_t *)(rptr +
2149                                     ixa->ixa_ip_hdr_length);
2150                                 sum += spill;
2151                                 sum = (sum >> 16) + (sum & 0xFFFF);
2152                                 tcpha->tha_sum = htons(sum);
2153                                 if (connp->conn_ipversion == IPV4_VERSION) {
2154                                         sum = ntohs(
2155                                             ((ipha_t *)rptr)->ipha_length) +
2156                                             spill;
2157                                         ((ipha_t *)rptr)->ipha_length =
2158                                             htons(sum);
2159                                 } else {
2160                                         sum = ntohs(
2161                                             ((ip6_t *)rptr)->ip6_plen) +
2162                                             spill;
2163                                         ((ip6_t *)rptr)->ip6_plen =
2164                                             htons(sum);
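
The adjustment works because the Internet checksum is a ones'-complement sum: growing the segment by spill bytes only requires adding spill to the stored partial sum (the pseudo-header covers the TCP length) and folding the carry back in, per RFC 1624. A standalone illustration of the fold:

#include <stdint.h>

/* Fold a 32-bit partial sum down to 16 bits, wrapping carries. */
static uint16_t
cksum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum >> 16) + (sum & 0xFFFF);
	return ((uint16_t)sum);
}

/* Extend a partial checksum to cover "extra" additional bytes of length. */
static uint16_t
cksum_extend(uint32_t partial, uint32_t extra)
{
	return (cksum_fold(partial + extra));
}
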


2173                 } else {
2174                         ixa->ixa_flags &= ~IXAF_REACH_CONF;
2175                 }
2176 
2177                 if (do_lso_send) {
2178                         /* Append LSO information to the mp. */
2179                         lso_info_set(mp, mss, HW_LSO);
2180                         ixa->ixa_fragsize = IP_MAXPACKET;
2181                         ixa->ixa_extra_ident = num_lso_seg - 1;
2182 
2183                         DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2184                             boolean_t, B_TRUE);
2185 
2186                         tcp_send_data(tcp, mp);
2187 
2188                         /*
2189                          * Restore values of ixa_fragsize and ixa_extra_ident.
2190                          */
2191                         ixa->ixa_fragsize = ixa->ixa_pmtu;
2192                         ixa->ixa_extra_ident = 0;
2193                         tcp->tcp_obsegs += num_lso_seg;
2194                         TCP_STAT(tcps, tcp_lso_times);
2195                         TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2196                 } else {
2197                         /*
2198                          * Make sure to clean up LSO information. Wherever a
2199                          * new mp uses the prepended header room after dupb(),
2200                          * lso_info_cleanup() should be called.
2201                          */
2202                         lso_info_cleanup(mp);
2203                         tcp_send_data(tcp, mp);
2204                         BUMP_LOCAL(tcp->tcp_obsegs);
2205                 }
2206         }
2207 
2208         return (0);
2209 }
2210 
2211 /*
2212  * Initiate closedown sequence on an active connection.  (May be called as
2213  * writer.)  Return value zero for OK return, non-zero for error return.
2214  */
2215 static int
2216 tcp_xmit_end(tcp_t *tcp)
2217 {
2218         mblk_t          *mp;
2219         tcp_stack_t     *tcps = tcp->tcp_tcps;
2220         iulp_t          uinfo;
2221         ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
2222         conn_t          *connp = tcp->tcp_connp;
2223 
2224         if (tcp->tcp_state < TCPS_SYN_RCVD ||


2264                  * so we have to check that and unset it first.
2265                  */
2266                 if (tcp->tcp_cork)
2267                         tcp->tcp_cork = B_FALSE;
2268                 tcp_wput_data(tcp, NULL, B_FALSE);
2269         }
2270 
2271         /*
2272          * If TCP does not get enough samples of RTT or tcp_rtt_updates
2273          * is 0, don't update the cache.
2274          */
2275         if (tcps->tcps_rtt_updates == 0 ||
2276             tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2277                 return (0);
2278 
2279         /*
2280          * We do not have a good algorithm to update ssthresh at this time.
2281          * So don't do any update.
2282          */
2283         bzero(&uinfo, sizeof (uinfo));
2284         uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2285         uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
2286 
2287         /*
2288          * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2289          * if source routed but we don't.
2290          */
2291         if (connp->conn_ipversion == IPV4_VERSION) {
2292                 if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2293                         return (0);
2294                 }
2295                 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2296         } else {
2297                 uint_t ifindex;
2298 
2299                 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2300                     &tcp->tcp_ip6h->ip6_dst))) {
2301                         return (0);
2302                 }
2303                 ifindex = 0;
2304                 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2305                         ip_xmit_attr_t *ixa = connp->conn_ixa;


2400                 }
2401         }
2402         if (ctl & TH_ACK) {
2403                 if (tcp->tcp_snd_ts_ok) {
2404                         uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2405 
2406                         U32_TO_BE32(llbolt,
2407                             (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2408                         U32_TO_BE32(tcp->tcp_ts_recent,
2409                             (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2410                 }
2411 
2412                 /* Update the latest receive window size in TCP header. */
2413                 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2414                 /* Track what we sent to the peer */
2415                 tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2416                 tcp->tcp_rack = ack;
2417                 tcp->tcp_rack_cnt = 0;
2418                 TCPS_BUMP_MIB(tcps, tcpOutAck);
2419         }
2420         BUMP_LOCAL(tcp->tcp_obsegs);
2421         tcpha->tha_seq = htonl(seq);
2422         tcpha->tha_ack = htonl(ack);
2423         /*
2424          * Include the adjustment for a source route if any.
2425          */
2426         sum = (sum >> 16) + (sum & 0xFFFF);
2427         tcpha->tha_sum = htons(sum);
2428         tcp_send_data(tcp, mp);
2429 }
2430 
2431 /*
2432  * Generate a reset based on an inbound packet, connp is set by caller
2433  * when RST is in response to an unexpected inbound packet for which
2434  * there is active tcp state in the system.
2435  *
2436  * IPSEC NOTE : Try to send the reply with the same protection as it came
2437  * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2438  * That way the packet will go out at the same level of protection as it
2439  * came in with.
2440  */


3369                 ASSERT(snxt_mp != NULL);
3370                 /* This should not happen.  Defensive coding again... */
3371                 if (snxt_mp == NULL) {
3372                         return;
3373                 }
3374 
3375                 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3376                     &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3377                 if (xmit_mp == NULL)
3378                         return;
3379 
3380                 usable_swnd -= seg_len;
3381                 tcp->tcp_pipe += seg_len;
3382                 tcp->tcp_sack_snxt = begin + seg_len;
3383 
3384                 tcp_send_data(tcp, xmit_mp);
3385 
3386                 /*
3387                  * Update the send timestamp to avoid false retransmission.
3388                  */

3389                 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();



3390 
3391                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3392                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3393                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);


3394                 /*
3395                  * Update tcp_rexmit_max to extend this SACK recovery phase.
3396                  * This happens when new data sent during fast recovery is
3397          * also lost.  If TCP retransmits that new data, it needs
3398          * to extend the SACK recovery phase to avoid starting another
3399                  * fast retransmit/recovery unnecessarily.
3400                  */
3401                 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3402                         tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3403                 }
3404         }
3405 }
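
The SEQ_GT() test above is modular sequence-number arithmetic, so the recovery extension stays correct across 32-bit wraparound; the conventional definition is a signed subtraction:

/*
 * Sequence-space "greater than": valid while the operands are within
 * 2^31 of each other, even across wraparound.
 */
#define	SEQ_GT_SKETCH(a, b)	((int32_t)((a) - (b)) > 0)

/* e.g. SEQ_GT_SKETCH(0x00000010U, 0xfffffff0U) is true: 0x10 is "ahead". */
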
3406 
3407 /*
3408  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3409  * or ICMP errors.
3410  */
3411 void
3412 tcp_ss_rexmit(tcp_t *tcp)
3413 {


3441 
3442                         if (win < cnt) {
3443                                 cnt = win;
3444                         }
3445                         if (SEQ_GT(snxt + cnt, smax)) {
3446                                 cnt = smax - snxt;
3447                         }
3448                         xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3449                             &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3450                         if (xmit_mp == NULL)
3451                                 return;
3452 
3453                         tcp_send_data(tcp, xmit_mp);
3454 
3455                         snxt += cnt;
3456                         win -= cnt;
3457                         /*
3458                          * Update the send timestamp to avoid false
3459                          * retransmission.
3460                          */

3461                         old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();



3462                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3463                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);


3464 
3465                         tcp->tcp_rexmit_nxt = snxt;
3466                 }
3467                 /*
3468                  * If we have transmitted all we have at the time
3469          * we started the retransmission, we can leave
3470                  * the rest of the job to tcp_wput_data().  But we
3471                  * need to check the send window first.  If the
3472                  * win is not 0, go on with tcp_wput_data().
3473                  */
3474                 if (SEQ_LT(snxt, smax) || win == 0) {
3475                         return;
3476                 }
3477         }
3478         /* Only call tcp_wput_data() if there is data to be sent. */
3479         if (tcp->tcp_unsent) {
3480                 tcp_wput_data(tcp, NULL, B_FALSE);
3481         }
3482 }
3483 


3601         /*
3602          * If the SACK option is set, delete the entire list of
3603          * notsack'ed blocks.
3604          */
3605         TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3606 
3607         if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3608                 /*
3609                  * Make sure the timer is running so that we will probe a zero
3610                  * window.
3611                  */
3612                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3613 }
3614 
3615 /*
3616  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3617  * with the template header, as well as other options such as time-stamp,
3618  * ECN and/or SACK.
3619  */
3620 static void
3621 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
3622 {
3623         tcpha_t *tcp_tmpl, *tcpha;
3624         uint32_t *dst, *src;
3625         int hdrlen;
3626         conn_t *connp = tcp->tcp_connp;
3627 
3628         ASSERT(OK_32PTR(rptr));
3629 
3630         /* Template header */
3631         tcp_tmpl = tcp->tcp_tcpha;
3632 
3633         /* Header of outgoing packet */
3634         tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3635 
3636         /* dst and src are opaque 32-bit fields, used for copying */
3637         dst = (uint32_t *)rptr;
3638         src = (uint32_t *)connp->conn_ht_iphc;
3639         hdrlen = connp->conn_ht_iphc_len;
3640 
3641         /* Fill time-stamp option if needed */
3642         if (tcp->tcp_snd_ts_ok) {
3643                 U32_TO_BE32((uint32_t)now,
3644                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3645                 U32_TO_BE32(tcp->tcp_ts_recent,
3646                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3647         } else {
3648                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3649         }
3650 
3651         /*
3652          * Copy the template header; is this really more efficient than
3653          * calling bcopy()?  For simple IPv4/TCP, it may be the case,
3654          * but perhaps not for other scenarios.
3655          */
3656         dst[0] = src[0];
3657         dst[1] = src[1];
3658         dst[2] = src[2];
3659         dst[3] = src[3];
3660         dst[4] = src[4];
3661         dst[5] = src[5];
3662         dst[6] = src[6];
3663         dst[7] = src[7];
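
On the efficiency question in the comment above: the unrolled word copy trades generality for a fixed-size fast path; it needs the OK_32PTR alignment guarantee but avoids bcopy()'s length and alignment dispatch for the common 40-byte IPv4+TCP template. The generic equivalent would simply be:

/* Generic equivalent of the unrolled word-copy above. */
bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
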




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  25  */
  26 
  27 /* This file contains all TCP output processing functions. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsun.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/stropts.h>
  34 #include <sys/strlog.h>
  35 #define _SUN_TPI_VERSION 2
  36 #include <sys/tihdr.h>
  37 #include <sys/suntpi.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/timod.h>
  40 #include <sys/pattr.h>
  41 #include <sys/squeue_impl.h>
  42 #include <sys/squeue.h>
  43 #include <sys/sockio.h>
  44 #include <sys/tsol/tnet.h>


  46 #include <inet/common.h>
  47 #include <inet/ip.h>
  48 #include <inet/tcp.h>
  49 #include <inet/tcp_impl.h>
  50 #include <inet/snmpcom.h>
  51 #include <inet/proto_set.h>
  52 #include <inet/ipsec_impl.h>
  53 #include <inet/ip_ndp.h>
  54 
  55 static mblk_t   *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
  56 static void     tcp_wput_cmdblk(queue_t *, mblk_t *);
  57 static void     tcp_wput_flush(tcp_t *, mblk_t *);
  58 static void     tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
  59 static int      tcp_xmit_end(tcp_t *);
  60 static int      tcp_send(tcp_t *, const int, const int, const int,
  61                     const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
  62 static void     tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
  63                     int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
  64 static boolean_t        tcp_send_rst_chk(tcp_stack_t *);
  65 static void     tcp_process_shrunk_swnd(tcp_t *, uint32_t);
  66 static void     tcp_fill_header(tcp_t *, uchar_t *, int);
  67 
  68 /*
  69  * Functions called directly via squeue having a prototype of edesc_t.
  70  */
  71 static void     tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
  72 static void     tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
  73 static void     tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  74 
  75 /*
  76  * This controls how tiny a write must be before we try to copy it
  77  * into the mblk on the tail of the transmit queue.  Not much
  78  * speedup is observed for values larger than sixteen.  Zero will
  79  * disable the optimisation.
  80  */
  81 static int tcp_tx_pull_len = 16;
  82 
  83 static void
  84 cc_after_idle(tcp_t *tcp)
  85 {
  86         uint32_t old_cwnd = tcp->tcp_cwnd;
  87 
  88         if (CC_ALGO(tcp)->after_idle != NULL)
  89                 CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
  90 
  91         DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
  92             uint32_t, tcp->tcp_cwnd);
  93 }
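
The after_idle hook is supplied by the FreeBSD-derived pluggable congestion control framework that DLPX-25998 introduces; each algorithm may reset its state when a connection has sat idle. As an illustration only (not the shipped module; struct cc_var and the CCV() accessor come from the framework's cc.h, and cc_cwnd_initial() is a hypothetical stand-in for the module's RFC 3390 initial-window math), a NewReno-style hook might collapse cwnd back toward the initial window:

/*
 * Illustrative after_idle: restart from a fresh initial window so an
 * idle connection must rebuild its ACK clock.
 */
static void
sketch_after_idle(struct cc_var *ccv)
{
	uint32_t cw_init = cc_cwnd_initial(ccv);	/* hypothetical */

	if (CCV(ccv, tcp_cwnd) > cw_init)
		CCV(ccv, tcp_cwnd) = cw_init;
}
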
  94 
  95 void
  96 tcp_wput(queue_t *q, mblk_t *mp)
  97 {
  98         conn_t  *connp = Q_TO_CONN(q);
  99         tcp_t   *tcp;
 100         void (*output_proc)();
 101         t_scalar_t type;
 102         uchar_t *rptr;
 103         struct iocblk   *iocp;
 104         size_t size;
 105 
 106         ASSERT(connp->conn_ref >= 2);
 107 
 108         switch (DB_TYPE(mp)) {
 109         case M_DATA:
 110                 tcp = connp->conn_tcp;
 111                 ASSERT(tcp != NULL);
 112 
 113                 size = msgdsize(mp);
 114 


 212 /*
 213  * The TCP normal data output path.
 214  * NOTE: the logic of the fast path is duplicated from this function.
 215  */
 216 void
 217 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 218 {
 219         int             len;
 220         mblk_t          *local_time;
 221         mblk_t          *mp1;
 222         uint32_t        snxt;
 223         int             tail_unsent;
 224         int             tcpstate;
 225         int             usable = 0;
 226         mblk_t          *xmit_tail;
 227         int32_t         mss;
 228         int32_t         num_sack_blk = 0;
 229         int32_t         total_hdr_len;
 230         int32_t         tcp_hdr_len;
 231         int             rc;

 232         conn_t          *connp = tcp->tcp_connp;
 233         clock_t         now = LBOLT_FASTPATH;
 234 
 235         tcpstate = tcp->tcp_state;
 236         if (mp == NULL) {
 237                 /*
 238                  * tcp_wput_data() with NULL mp should only be called when
 239                  * there is unsent data.
 240                  */
 241                 ASSERT(tcp->tcp_unsent > 0);
 242                 /* Really tacky... but we need this for detached closes. */
 243                 len = tcp->tcp_unsent;
 244                 goto data_null;
 245         }
 246 
 247         ASSERT(mp->b_datap->db_type == M_DATA);
 248         /*
 249          * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
 250          * or before a connection attempt has begun.
 251          */


 366          * includes SACK options.
 367          */
 368         if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
 369                 int32_t opt_len;
 370 
 371                 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
 372                     tcp->tcp_num_sack_blk);
 373                 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
 374                     2 + TCPOPT_HEADER_LEN;
 375                 mss = tcp->tcp_mss - opt_len;
 376                 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
 377                 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
 378         } else {
 379                 mss = tcp->tcp_mss;
 380                 total_hdr_len = connp->conn_ht_iphc_len;
 381                 tcp_hdr_len = connp->conn_ht_ulp_len;
 382         }
 383 
 384         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 385             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
 386                 cc_after_idle(tcp);
 387         }
 388         if (tcpstate == TCPS_SYN_RCVD) {
 389                 /*
 390                  * The three-way connection establishment handshake is not
 391                  * complete yet. We want to queue the data for transmission
 392                  * after entering ESTABLISHED state (RFC 793). A jump to
 393                  * the "done" label effectively leaves data on the queue.
 394                  */
 395                 goto done;
 396         } else {
 397                 int usable_r;
 398 
 399                 /*
 400                  * In the special case when cwnd is zero, which can only
 401                  * happen if the connection is ECN capable, return now.
 402                  * New segments are sent using tcp_timer().  The timer
 403                  * is set in tcp_input_data().
 404                  */
 405                 if (tcp->tcp_cwnd == 0) {
 406                         /*


 447                         return;
 448                 }
 449 
 450                 /* usable = MIN(swnd, cwnd) - unacked_bytes */
 451                 if (tcp->tcp_swnd > tcp->tcp_cwnd)
 452                         usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
 453 
 454                 /* usable = MIN(usable, unsent) */
 455                 if (usable_r > len)
 456                         usable_r = len;
 457 
 458                 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
 459                 if (usable_r > 0) {
 460                         usable = usable_r;
 461                 } else {
 462                         /* Bypass all other unnecessary processing. */
 463                         goto done;
 464                 }
 465         }
 466 
 467 #ifdef KERNEL_32
 468         local_time = (mblk_t *)now;
 469 #else
 470         local_time = (mblk_t *)(intptr_t)gethrtime();
 471 #endif
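
Note that local_time (and, below, b_prev) now carries a nanosecond gethrtime() stamp instead of a lbolt tick; this is what feeds the high-resolution round-trip times of DLPX-43064. A sketch of how the ACK path can turn the stashed stamp back into an RTT sample:

/* Sketch: recover a nanosecond RTT sample from the transmit stamp. */
static hrtime_t
rtt_sample_from_mblk(const mblk_t *mp)
{
	/* b_prev was overloaded at transmit time to hold gethrtime(). */
	return (gethrtime() - (hrtime_t)(intptr_t)mp->b_prev);
}
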
 472 
 473         /*
 474          * "Our" Nagle Algorithm.  This is not the same as in the old
 475          * BSD.  This is more in line with the true intent of Nagle.
 476          *
 477          * The conditions are:
 478          * 1. The amount of unsent data (or amount of data which can be
 479          *    sent, whichever is smaller) is less than Nagle limit.
 480          * 2. The last sent size is also less than Nagle limit.
 481          * 3. There is unack'ed data.
 482          * 4. Urgent pointer is not set.  Send urgent data ignoring the
 483          *    Nagle algorithm.  This reduces the probability that urgent
 484          *    bytes get "merged" together.
 485          * 5. The app has not closed the connection.  This eliminates the
 486          *    wait time of the receiving side waiting for the last piece of
 487          *    (small) data.
 488          *
 489          * If all are satisfied, exit without sending anything.  Note
 490          * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
 491          * the smaller of 1 MSS and global tcp_naglim_def (default to be


1188          *    un-acked     usable
1189          *  |--------------|-----------------|
1190          *  tcp_suna       tcp_snxt       tcp_suna+tcp_swnd
1191          */
1192         /* END CSTYLED */
1193 
1194         /* start sending from tcp_snxt */
1195         snxt = tcp->tcp_snxt;
1196 
1197         /*
1198          * Check to see if this connection has been idled for some
1199          * time and no ACK is expected.  If it is, we need to slow
1200          * start again to get back the connection's "self-clock" as
1201          * described in VJ's paper.
1202          *
1203          * Reinitialize tcp_cwnd after idle.
1204          */
1205         now = LBOLT_FASTPATH;
1206         if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1207             (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1208                 cc_after_idle(tcp);
1209         }
1210 
1211         usable = tcp->tcp_swnd;              /* tcp window size */
1212         if (usable > tcp->tcp_cwnd)
1213                 usable = tcp->tcp_cwnd;      /* congestion window smaller */
1214         usable -= snxt;         /* subtract stuff already sent */
1215         suna = tcp->tcp_suna;
1216         usable += suna;
1217         /* usable can be < 0 if the congestion window is smaller */
1218         if (len > usable) {
1219                 /* Can't send complete M_DATA in one shot */
1220                 goto slow;
1221         }
1222 
1223         mutex_enter(&tcp->tcp_non_sq_lock);
1224         if (tcp->tcp_flow_stopped &&
1225             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1226                 tcp_clrqfull(tcp);
1227         }
1228         mutex_exit(&tcp->tcp_non_sq_lock);


1251                 return;
1252         }
1253 
1254         /*
1255          * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
1256          * send now.
1257          */
1258 
1259         if (snxt == suna) {
1260                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1261         }
1262 
1263         /* we have always sent something */
1264         tcp->tcp_rack_cnt = 0;
1265 
1266         tcp->tcp_snxt = snxt + len;
1267         tcp->tcp_rack = tcp->tcp_rnxt;
1268 
1269         if ((mp1 = dupb(mp)) == 0)
1270                 goto no_memory;
1271 #ifdef KERNEL_32
1272         mp->b_prev = (mblk_t *)(uintptr_t)now;
1273 #else
1274         mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1275 #endif
1276         mp->b_next = (mblk_t *)(uintptr_t)snxt;
1277 
1278         /* adjust tcp header information */
1279         tcpha = tcp->tcp_tcpha;
1280         tcpha->tha_flags = (TH_ACK|TH_PUSH);
1281 
1282         sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1283         sum = (sum >> 16) + (sum & 0xFFFF);
1284         tcpha->tha_sum = htons(sum);
1285 
1286         tcpha->tha_seq = htonl(snxt);
1287 
1288         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1289         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1290         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1291         tcp->tcp_cs.tcp_out_data_segs++;
1292         tcp->tcp_cs.tcp_out_data_bytes += len;
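
The tcp_cs counters are the new per-connection statistics (DLPX-37540) that connstat reports (DLPX-37544); plain, non-atomic increments suffice because all processing for one connection is serialized on its squeue. A hedged sketch of the counter group used above (the authoritative definition lives in the TCP headers; only the two fields referenced here are shown, and the widths are assumed):

typedef struct tcp_conn_stats_sketch {
	uint64_t	tcp_out_data_segs;	/* data segments sent */
	uint64_t	tcp_out_data_bytes;	/* payload bytes sent */
} tcp_conn_stats_sketch_t;
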
1293 
1294         /* Update the latest receive window size in TCP header. */
1295         tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1296 
1297         tcp->tcp_last_sent_len = (ushort_t)len;
1298 
1299         plen = len + connp->conn_ht_iphc_len;
1300 
1301         ixa = connp->conn_ixa;
1302         ixa->ixa_pktlen = plen;
1303 
1304         if (ixa->ixa_flags & IXAF_IS_IPV4) {
1305                 tcp->tcp_ipha->ipha_length = htons(plen);
1306         } else {
1307                 tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1308         }
1309 
1310         /* see if we need to allocate an mblk for the headers */
1311         hdrlen = connp->conn_ht_iphc_len;
1312         rptr = mp1->b_rptr - hdrlen;
1313         db = mp1->b_datap;
1314         if ((db->db_ref != 2) || rptr < db->db_base ||
1315             (!OK_32PTR(rptr))) {
1316                 /* NOTE: we assume allocb returns an OK_32PTR */
1317                 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1318                 if (!mp) {
1319                         freemsg(mp1);
1320                         goto no_memory;
1321                 }
1322                 mp->b_cont = mp1;
1323                 mp1 = mp;
1324                 /* Leave room for Link Level header */
1325                 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1326                 mp1->b_wptr = &rptr[hdrlen];
1327         }
1328         mp1->b_rptr = rptr;
1329 
1330         /* Fill in the timestamp option. */
1331         if (tcp->tcp_snd_ts_ok) {
1332                 U32_TO_BE32(now,
1333                     (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);


1334                 U32_TO_BE32(tcp->tcp_ts_recent,
1335                     (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1336         } else {
1337                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1338         }
1339 
1340         /* copy header into outgoing packet */
1341         dst = (ipaddr_t *)rptr;
1342         src = (ipaddr_t *)connp->conn_ht_iphc;
1343         dst[0] = src[0];
1344         dst[1] = src[1];
1345         dst[2] = src[2];
1346         dst[3] = src[3];
1347         dst[4] = src[4];
1348         dst[5] = src[5];
1349         dst[6] = src[6];
1350         dst[7] = src[7];
1351         dst[8] = src[8];
1352         dst[9] = src[9];
1353         if (hdrlen -= 40) {
1354                 hdrlen >>= 2;
1355                 dst += 10;


1959                                 }
1960                         } else
1961                                 (*xmit_tail)->b_rptr = prev_rptr;
1962 
1963                         if (mp == NULL) {
1964                                 return (-1);
1965                         }
1966                         mp1 = mp->b_cont;
1967 
1968                         if (len <= mss) /* LSO is unusable (!do_lso_send) */
1969                                 tcp->tcp_last_sent_len = (ushort_t)len;
1970                         while (mp1->b_cont) {
1971                                 *xmit_tail = (*xmit_tail)->b_cont;
1972                                 (*xmit_tail)->b_prev = local_time;
1973                                 (*xmit_tail)->b_next =
1974                                     (mblk_t *)(uintptr_t)(*snxt);
1975                                 mp1 = mp1->b_cont;
1976                         }
1977                         *snxt += len;
1978                         *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1979                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1980                         TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1981                         TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1982                         tcp->tcp_cs.tcp_out_data_segs++;
1983                         tcp->tcp_cs.tcp_out_data_bytes += len;
1984                         tcp_send_data(tcp, mp);
1985                         continue;
1986                 }
1987 
1988                 *snxt += len;   /* Adjust later if we don't send all of len */
1989                 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1990                 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1991                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1992                 tcp->tcp_cs.tcp_out_data_segs++;
1993                 tcp->tcp_cs.tcp_out_data_bytes += len;
1994 
1995                 if (*tail_unsent) {
1996                         /* Are the bytes above us in flight? */
1997                         rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1998                         if (rptr != (*xmit_tail)->b_rptr) {
1999                                 *tail_unsent -= len;
2000                                 if (len <= mss) /* LSO is unusable */
2001                                         tcp->tcp_last_sent_len = (ushort_t)len;
2002                                 len += total_hdr_len;
2003                                 ixa->ixa_pktlen = len;
2004 
2005                                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
2006                                         tcp->tcp_ipha->ipha_length = htons(len);
2007                                 } else {
2008                                         tcp->tcp_ip6h->ip6_plen =
2009                                             htons(len - IPV6_HDR_LEN);
2010                                 }
2011 
2012                                 mp = dupb(*xmit_tail);
2013                                 if (mp == NULL) {


2070 
2071                 must_alloc:;
2072                         mp1 = allocb(connp->conn_ht_iphc_allocated +
2073                             tcps->tcps_wroff_xtra, BPRI_MED);
2074                         if (mp1 == NULL) {
2075                                 freemsg(mp);
2076                                 return (-1);    /* out_of_mem */
2077                         }
2078                         mp1->b_cont = mp;
2079                         mp = mp1;
2080                         /* Leave room for Link Level header */
2081                         len = total_hdr_len;
2082                         rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2083                         mp->b_wptr = &rptr[len];
2084                 }
2085 
2086                 /*
2087                  * Fill in the header using the template header, and add
2088                  * options such as time-stamp, ECN and/or SACK, as needed.
2089                  */
2090                 tcp_fill_header(tcp, rptr, num_sack_blk);
2091 
2092                 mp->b_rptr = rptr;
2093 
2094                 if (*tail_unsent) {
2095                         int spill = *tail_unsent;
2096 
2097                         mp1 = mp->b_cont;
2098                         if (mp1 == NULL)
2099                                 mp1 = mp;
2100 
2101                         /*
2102                          * If we're a little short, tack on more mblks until
2103                          * there is no more spillover.
2104                          */
2105                         while (spill < 0) {
2106                                 mblk_t *nmp;
2107                                 int nmpsz;
2108 
2109                                 nmp = (*xmit_tail)->b_cont;
2110                                 nmpsz = MBLKL(nmp);


2149                                 if (mp1 == NULL) {
2150                                         *tail_unsent = spill;
2151                                         freemsg(mp);
2152                                         return (-1);    /* out_of_mem */
2153                                 }
2154                         }
2155 
2156                         /* Trim back any surplus on the last mblk */
2157                         if (spill >= 0) {
2158                                 mp1->b_wptr -= spill;
2159                                 *tail_unsent = spill;
2160                         } else {
2161                                 /*
2162                                  * We did not send everything we could in
2163                                  * order to remain within the b_cont limit.
2164                                  */
2165                                 *usable -= spill;
2166                                 *snxt += spill;
2167                                 tcp->tcp_last_sent_len += spill;
2168                                 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2169                                 tcp->tcp_cs.tcp_out_data_bytes += spill;
2170                                 /*
2171                                  * Adjust the checksum
2172                                  */
2173                                 tcpha = (tcpha_t *)(rptr +
2174                                     ixa->ixa_ip_hdr_length);
2175                                 sum += spill;
2176                                 sum = (sum >> 16) + (sum & 0xFFFF);
2177                                 tcpha->tha_sum = htons(sum);
2178                                 if (connp->conn_ipversion == IPV4_VERSION) {
2179                                         sum = ntohs(
2180                                             ((ipha_t *)rptr)->ipha_length) +
2181                                             spill;
2182                                         ((ipha_t *)rptr)->ipha_length =
2183                                             htons(sum);
2184                                 } else {
2185                                         sum = ntohs(
2186                                             ((ip6_t *)rptr)->ip6_plen) +
2187                                             spill;
2188                                         ((ip6_t *)rptr)->ip6_plen =
2189                                             htons(sum);


2198                 } else {
2199                         ixa->ixa_flags &= ~IXAF_REACH_CONF;
2200                 }
2201 
2202                 if (do_lso_send) {
2203                         /* Append LSO information to the mp. */
2204                         lso_info_set(mp, mss, HW_LSO);
2205                         ixa->ixa_fragsize = IP_MAXPACKET;
2206                         ixa->ixa_extra_ident = num_lso_seg - 1;
2207 
2208                         DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2209                             boolean_t, B_TRUE);
2210 
2211                         tcp_send_data(tcp, mp);
2212 
2213                         /*
2214                          * Restore values of ixa_fragsize and ixa_extra_ident.
2215                          */
2216                         ixa->ixa_fragsize = ixa->ixa_pmtu;
2217                         ixa->ixa_extra_ident = 0;
2218                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2219                         TCP_STAT(tcps, tcp_lso_times);
2220                         TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2221                 } else {
2222                         /*
2223                          * Make sure to clean up LSO information. Wherever a
2224                          * new mp uses the prepended header room after dupb(),
2225                          * lso_info_cleanup() should be called.
2226                          */
2227                         lso_info_cleanup(mp);
2228                         tcp_send_data(tcp, mp);
2229                         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2230                 }
2231         }
2232 
2233         return (0);
2234 }
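
A note on the LSO bookkeeping above: ixa_fragsize is raised to IP_MAXPACKET so
IP passes the oversized mblk through to the NIC untouched, and ixa_extra_ident
reserves num_lso_seg - 1 additional IPv4 ident values so the hardware-cut
segments each get a unique ident. A hedged sketch of that arithmetic
(hypothetical helper; the real num_lso_seg is computed earlier in tcp_send()
from the usable window and MSS):

        #include <sys/types.h>

        /*
         * Illustrative only: "len" payload bytes cut into MSS-sized
         * segments by the NIC yield ceil(len / mss) wire segments;
         * the first uses the normal ident, the rest consume the
         * extras reserved via ixa_extra_ident.
         */
        static uint_t
        lso_extra_idents(uint_t len, uint_t mss)
        {
                uint_t num_lso_seg = (len + mss - 1) / mss;

                return (num_lso_seg - 1);
        }

Restoring ixa_fragsize from ixa_pmtu immediately after the send matters because
the same ip_xmit_attr_t is reused for subsequent non-LSO transmissions.
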
2235 
2236 /*
2237  * Initiate the closedown sequence on an active connection.  (May be called
2238  * as writer.)  Returns zero on success, non-zero on error.
2239  */
2240 static int
2241 tcp_xmit_end(tcp_t *tcp)
2242 {
2243         mblk_t          *mp;
2244         tcp_stack_t     *tcps = tcp->tcp_tcps;
2245         iulp_t          uinfo;
2246         ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
2247         conn_t          *connp = tcp->tcp_connp;
2248 
2249         if (tcp->tcp_state < TCPS_SYN_RCVD ||


2289                  * so we have to check that and unset it first.
2290                  */
2291                 if (tcp->tcp_cork)
2292                         tcp->tcp_cork = B_FALSE;
2293                 tcp_wput_data(tcp, NULL, B_FALSE);
2294         }
2295 
2296         /*
2297          * If TCP has not gotten enough RTT samples or tcp_rtt_updates
2298          * is 0, don't update the destination cache.
2299          */
2300         if (tcps->tcps_rtt_updates == 0 ||
2301             tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2302                 return (0);
2303 
2304         /*
2305          * We do not have a good algorithm to update ssthresh at this time,
2306          * so we don't update it.
2307          */
2308         bzero(&uinfo, sizeof (uinfo));
2309         uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
2310         uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2311 
2312         /*
2313          * Note that uinfo is kept for conn_faddr in the DCE.  We could update
2314          * it even when the connection is source routed, but we don't.
2315          */
2316         if (connp->conn_ipversion == IPV4_VERSION) {
2317                 if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2318                         return (0);
2319                 }
2320                 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2321         } else {
2322                 uint_t ifindex;
2323 
2324                 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2325                     &tcp->tcp_ip6h->ip6_dst))) {
2326                         return (0);
2327                 }
2328                 ifindex = 0;
2329                 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2330                         ip_xmit_attr_t *ixa = connp->conn_ixa;
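
The RTT values cached here are converted from the nanosecond resolution of
tcp_rtt_sa/tcp_rtt_sd down to the millisecond granularity of the DCE's
iulp_rtt fields via NSEC2MSEC(). A minimal sketch of the conversion (the macro
shown is equivalent to the NSEC2MSEC definition in the illumos <sys/time.h>):

        #include <stdint.h>

        #define NSEC2MSEC(n)    ((n) / 1000000)

        static uint32_t
        rtt_ns_to_ms(int64_t rtt_ns)
        {
                /* e.g. a 1,500,000 ns smoothed RTT caches as 1 ms. */
                return ((uint32_t)NSEC2MSEC(rtt_ns));
        }
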


2425                 }
2426         }
2427         if (ctl & TH_ACK) {
2428                 if (tcp->tcp_snd_ts_ok) {
2429                         uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2430 
2431                         U32_TO_BE32(llbolt,
2432                             (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2433                         U32_TO_BE32(tcp->tcp_ts_recent,
2434                             (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2435                 }
2436 
2437         /* Update the latest receive window size in the TCP header. */
2438                 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2439                 /* Track what we sent to the peer */
2440                 tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2441                 tcp->tcp_rack = ack;
2442                 tcp->tcp_rack_cnt = 0;
2443                 TCPS_BUMP_MIB(tcps, tcpOutAck);
2444         }
2445         TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2446         tcpha->tha_seq = htonl(seq);
2447         tcpha->tha_ack = htonl(ack);
2448         /*
2449          * Include the adjustment for a source route if any.
2450          */
2451         sum = (sum >> 16) + (sum & 0xFFFF);
2452         tcpha->tha_sum = htons(sum);
2453         tcp_send_data(tcp, mp);
2454 }
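
The two U32_TO_BE32() stores above assume the template header carries the
RFC 7323 timestamp option immediately after the 20-byte base header, padded as
NOP, NOP, kind 8, length 10, so TSval lands at offset TCP_MIN_HEADER_LENGTH + 4
and TSecr at TCP_MIN_HEADER_LENGTH + 8. A self-contained sketch of that layout
(hypothetical helper, not from this file):

        #include <stdint.h>
        #include <string.h>
        #include <arpa/inet.h>

        #define TCP_MIN_HEADER_LENGTH   20

        /*
         * Illustrative only: write the padded timestamp option the
         * way the template header lays it out.
         */
        static void
        fill_ts_option(uint8_t *tcph, uint32_t tsval, uint32_t tsecr)
        {
                uint8_t *opt = tcph + TCP_MIN_HEADER_LENGTH;
                uint32_t v;

                opt[0] = 1;             /* TCPOPT_NOP */
                opt[1] = 1;             /* TCPOPT_NOP */
                opt[2] = 8;             /* TCPOPT_TSTAMP */
                opt[3] = 10;            /* option length */
                v = htonl(tsval);
                memcpy(opt + 4, &v, sizeof (v));        /* offset 24 */
                v = htonl(tsecr);
                memcpy(opt + 8, &v, sizeof (v));        /* offset 28 */
        }
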
2455 
2456 /*
2457  * Generate a reset based on an inbound packet; connp is set by the caller
2458  * when the RST is in response to an unexpected inbound packet for which
2459  * there is active tcp state in the system.
2460  *
2461  * IPSEC NOTE: Try to send the reply with the same protection as the packet
2462  * came in with.  We have the ip_recv_attr_t, which is reversed to form the
2463  * ip_xmit_attr_t, so the reset goes out with the same level of protection
2464  * as the inbound packet.
2465  */


3394                 ASSERT(snxt_mp != NULL);
3395                 /* This should not happen.  Defensive coding again... */
3396                 if (snxt_mp == NULL) {
3397                         return;
3398                 }
3399 
3400                 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3401                     &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3402                 if (xmit_mp == NULL)
3403                         return;
3404 
3405                 usable_swnd -= seg_len;
3406                 tcp->tcp_pipe += seg_len;
3407                 tcp->tcp_sack_snxt = begin + seg_len;
3408 
3409                 tcp_send_data(tcp, xmit_mp);
3410 
3411                 /*
3412                  * Update the send timestamp to avoid false retransmission.
3413                  */
3417                 snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3419 
3420                 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3421                 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3422                 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3423                 tcp->tcp_cs.tcp_out_retrans_segs++;
3424                 tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
3425                 /*
3426                  * Update tcp_rexmit_max to extend this SACK recovery phase.
3427                  * This happens when new data sent during fast recovery is
3428                  * also lost.  If TCP retransmits that new data, it needs
3429                  * to extend the SACK recovery phase to avoid starting
3430                  * another fast retransmit/recovery unnecessarily.
3431                  */
3432                 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3433                         tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3434                 }
3435         }
3436 }
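
The SEQ_GT() test above extends the recovery point using modular 32-bit
sequence arithmetic, so it stays correct across sequence-number wraparound.
The illumos TCP headers define these comparisons with a signed subtraction;
a minimal sketch:

        #include <stdint.h>

        /* Sequence comparisons as defined in the illumos TCP headers. */
        #define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
        #define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)

        /*
         * e.g. SEQ_GT(0x00000010, 0xfffffff0) is true: sequence 0x10
         * lies 0x20 bytes after 0xfffffff0 once the space wraps.
         */
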
3437 
3438 /*
3439  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3440  * or an ICMP error.
3441  */
3442 void
3443 tcp_ss_rexmit(tcp_t *tcp)
3444 {


3472 
3473                         if (win < cnt) {
3474                                 cnt = win;
3475                         }
3476                         if (SEQ_GT(snxt + cnt, smax)) {
3477                                 cnt = smax - snxt;
3478                         }
3479                         xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3480                             &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3481                         if (xmit_mp == NULL)
3482                                 return;
3483 
3484                         tcp_send_data(tcp, xmit_mp);
3485 
3486                         snxt += cnt;
3487                         win -= cnt;
3488                         /*
3489                          * Update the send timestamp to avoid false
3490                          * retransmission.
3491                          */
3495                         old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3497                         TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3498                         TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3499                         tcp->tcp_cs.tcp_out_retrans_segs++;
3500                         tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
3501 
3502                         tcp->tcp_rexmit_nxt = snxt;
3503                 }
3504                 /*
3505                  * If we have transmitted everything we had at the time
3506                  * we started the retransmission, we can leave the rest
3507                  * of the job to tcp_wput_data().  But we need to check
3508                  * the send window first.  If the window is not 0, go on
3509                  * with tcp_wput_data().
3510                  */
3511                 if (SEQ_LT(snxt, smax) || win == 0) {
3512                         return;
3513                 }
3514         }
3515         /* Only call tcp_wput_data() if there is data to be sent. */
3516         if (tcp->tcp_unsent) {
3517                 tcp_wput_data(tcp, NULL, B_FALSE);
3518         }
3519 }
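
Both retransmission paths above stamp the just-sent mblk's b_prev with
gethrtime() so that the ACK processing path can take a high-resolution RTT
sample and recognize false retransmissions. Because b_prev is a pointer field,
the timestamp is smuggled through an intptr_t cast, which only round-trips
losslessly where pointers are 64 bits wide. A hedged sketch of the round trip
(hypothetical helpers using the illumos kernel types):

        #include <sys/types.h>
        #include <sys/time.h>
        #include <sys/stream.h>

        /* Illustrative only: stash a send timestamp in an mblk ... */
        static void
        stamp_send_time(mblk_t *mp)
        {
                mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
        }

        /* ... and recover it when the corresponding ACK arrives. */
        static hrtime_t
        sent_time(const mblk_t *mp)
        {
                return ((hrtime_t)(intptr_t)mp->b_prev);
        }
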
3520 


3638         /*
3639          * If the SACK option is set, delete the entire list of
3640          * notsack'ed blocks.
3641          */
3642         TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3643 
3644         if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3645                 /*
3646                  * Make sure the timer is running so that we will probe a zero
3647                  * window.
3648                  */
3649                 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3650 }
3651 
3652 /*
3653  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3654  * from the template header, along with other options such as time-stamp,
3655  * ECN and/or SACK.
3656  */
3657 static void
3658 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3659 {
3660         tcpha_t *tcp_tmpl, *tcpha;
3661         uint32_t *dst, *src;
3662         int hdrlen;
3663         conn_t *connp = tcp->tcp_connp;
3664 
3665         ASSERT(OK_32PTR(rptr));
3666 
3667         /* Template header */
3668         tcp_tmpl = tcp->tcp_tcpha;
3669 
3670         /* Header of outgoing packet */
3671         tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3672 
3673         /* dst and src are opaque 32-bit word pointers, used for copying */
3674         dst = (uint32_t *)rptr;
3675         src = (uint32_t *)connp->conn_ht_iphc;
3676         hdrlen = connp->conn_ht_iphc_len;
3677 
3678         /* Fill time-stamp option if needed */
3679         if (tcp->tcp_snd_ts_ok) {
3680                 U32_TO_BE32(LBOLT_FASTPATH,
3681                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3682                 U32_TO_BE32(tcp->tcp_ts_recent,
3683                     (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3684         } else {
3685                 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3686         }
3687 
3688         /*
3689          * Copy the template header; is this really more efficient than
3690          * calling bcopy()?  For simple IPv4/TCP it may be, but perhaps
3691          * not for other scenarios.
3692          */
3693         dst[0] = src[0];
3694         dst[1] = src[1];
3695         dst[2] = src[2];
3696         dst[3] = src[3];
3697         dst[4] = src[4];
3698         dst[5] = src[5];
3699         dst[6] = src[6];
3700         dst[7] = src[7];