4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
25 */
26
27 /* This file contains all TCP output processing functions. */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/tcp.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/snmpcom.h>
51 #include <inet/proto_set.h>
52 #include <inet/ipsec_impl.h>
53 #include <inet/ip_ndp.h>
54
55 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
56 static void tcp_wput_cmdblk(queue_t *, mblk_t *);
57 static void tcp_wput_flush(tcp_t *, mblk_t *);
58 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
59 static int tcp_xmit_end(tcp_t *);
60 static int tcp_send(tcp_t *, const int, const int, const int,
61 const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
62 static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
63 int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
64 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
65 static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
66 static void tcp_fill_header(tcp_t *, uchar_t *, int);
67
68 /*
69 * Functions called directly via squeue having a prototype of edesc_t.
70 */
71 static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
72 static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
74
75 /*
76 * This controls how tiny a write must be before we try to copy it
77 * into the mblk on the tail of the transmit queue. Not much
78 * speedup is observed for values larger than sixteen. Zero will
79 * disable the optimisation.
80 */
81 static int tcp_tx_pull_len = 16;
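/*
 * Conceptually (a sketch, not the actual transmit-queue code, which is
 * elsewhere in this file), the coalescing test this knob controls is:
 *
 *	if (msgdsize(mp) <= tcp_tx_pull_len &&
 *	    <tail mblk of the transmit queue has that much room>)
 *		copy the new bytes onto the tail mblk instead of
 *		linking in a fresh mblk;
 *
 * so a burst of tiny writes is batched into fewer, larger mblks.
 */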
82
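/*
 * Notify the congestion control module, if one is attached, that the
 * connection is coming out of an idle period (no unacknowledged data
 * and nothing received for at least one RTO; see the callers in
 * tcp_wput_data() and the fast path below).  The algorithm's
 * after_idle hook typically shrinks cwnd so that the post-idle burst
 * is re-clocked, per Van Jacobson's self-clocking argument cited in
 * the comment further down.
 */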
83 static void
84 cc_after_idle(tcp_t *tcp)
85 {
86 uint32_t old_cwnd = tcp->tcp_cwnd;
87
88 if (CC_ALGO(tcp)->after_idle != NULL)
89 CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
90
91 DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
92 uint32_t, tcp->tcp_cwnd);
93 }
94
95 void
96 tcp_wput(queue_t *q, mblk_t *mp)
97 {
98 conn_t *connp = Q_TO_CONN(q);
99 tcp_t *tcp;
100 void (*output_proc)();
101 t_scalar_t type;
102 uchar_t *rptr;
103 struct iocblk *iocp;
104 size_t size;
105
106 ASSERT(connp->conn_ref >= 2);
107
108 switch (DB_TYPE(mp)) {
109 case M_DATA:
110 tcp = connp->conn_tcp;
111 ASSERT(tcp != NULL);
112
113 size = msgdsize(mp);
114
212 /*
213 * The TCP normal data output path.
214 * NOTE: the logic of the fast path is duplicated from this function.
215 */
216 void
217 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
218 {
219 int len;
220 mblk_t *local_time;
221 mblk_t *mp1;
222 uint32_t snxt;
223 int tail_unsent;
224 int tcpstate;
225 int usable = 0;
226 mblk_t *xmit_tail;
227 int32_t mss;
228 int32_t num_sack_blk = 0;
229 int32_t total_hdr_len;
230 int32_t tcp_hdr_len;
231 int rc;
232 conn_t *connp = tcp->tcp_connp;
233 clock_t now = LBOLT_FASTPATH;
234
235 tcpstate = tcp->tcp_state;
236 if (mp == NULL) {
237 /*
238 * tcp_wput_data() with NULL mp should only be called when
239 * there is unsent data.
240 */
241 ASSERT(tcp->tcp_unsent > 0);
242 /* Really tacky... but we need this for detached closes. */
243 len = tcp->tcp_unsent;
244 goto data_null;
245 }
246
247 ASSERT(mp->b_datap->db_type == M_DATA);
248 /*
249 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
250 * or before a connection attempt has begun.
251 */
366 * includes SACK options.
367 */
368 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
369 int32_t opt_len;
370
371 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
372 tcp->tcp_num_sack_blk);
373 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
374 2 + TCPOPT_HEADER_LEN;
375 mss = tcp->tcp_mss - opt_len;
376 total_hdr_len = connp->conn_ht_iphc_len + opt_len;
377 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
378 } else {
379 mss = tcp->tcp_mss;
380 total_hdr_len = connp->conn_ht_iphc_len;
381 tcp_hdr_len = connp->conn_ht_ulp_len;
382 }
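	/*
	 * A worked example of the option arithmetic above, assuming the
	 * usual 8-byte sack_blk_t (two 32-bit sequence numbers), a
	 * 1-byte TCPOPT_NOP_LEN and a 2-byte TCPOPT_HEADER_LEN: with
	 * num_sack_blk = 3,
	 *
	 *	opt_len = 3 * 8 + 1 * 2 + 2 = 28 bytes
	 *
	 * so an MSS of 1460 shrinks to 1432, and both header lengths
	 * grow by the same 28 bytes.
	 */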
383
384 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
385 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
386 cc_after_idle(tcp);
387 }
388 if (tcpstate == TCPS_SYN_RCVD) {
389 /*
390 * The three-way connection establishment handshake is not
391 * complete yet. We want to queue the data for transmission
392 * after entering ESTABLISHED state (RFC793). A jump to
393 * "done" label effectively leaves data on the queue.
394 */
395 goto done;
396 } else {
397 int usable_r;
398
399 /*
400 * In the special case when cwnd is zero, which can only
401 * happen if the connection is ECN capable, return now.
402 * New segments are sent using tcp_timer(). The timer
403 * is set in tcp_input_data().
404 */
405 if (tcp->tcp_cwnd == 0) {
406 /*
447 return;
448 }
449
450 /* usable = MIN(swnd, cwnd) - unacked_bytes */
451 if (tcp->tcp_swnd > tcp->tcp_cwnd)
452 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
453
454 /* usable = MIN(usable, unsent) */
455 if (usable_r > len)
456 usable_r = len;
457
458 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
459 if (usable_r > 0) {
460 usable = usable_r;
461 } else {
462 /* Bypass all other unnecessary processing. */
463 goto done;
464 }
465 }
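		/*
		 * Following the formulas in the running comments: with
		 * swnd = 10000, cwnd = 6000, 2000 bytes un-acked and
		 * 1500 bytes unsent,
		 *
		 *	usable = MIN(10000, 6000) - 2000 = 4000
		 *	usable = MIN(4000, 1500) = 1500
		 *
		 * and 1500 bytes may go out.  A zero or negative
		 * result bypasses the send path entirely; per the
		 * MAX() rule above, only urgent data is still allowed
		 * a single byte.
		 */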
466
467 #ifdef KERNEL_32
468 local_time = (mblk_t *)now;
469 #else
470 local_time = (mblk_t *)(intptr_t)gethrtime();
471 #endif
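	/*
	 * Note that local_time is an hrtime_t (or a clock_t on a
	 * KERNEL_32 build) smuggled through an mblk_t pointer: the
	 * transmit path below stores it in each queued mblk's b_prev,
	 * and the starting sequence number in b_next, so the
	 * retransmit code can later recover when and what was sent
	 * without any extra bookkeeping structure.
	 */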
472
473 /*
474 * "Our" Nagle Algorithm. This is not the same as in the old
475 * BSD. This is more in line with the true intent of Nagle.
476 *
477 * The conditions are:
478 * 1. The amount of unsent data (or amount of data which can be
479 * sent, whichever is smaller) is less than Nagle limit.
480 * 2. The last sent size is also less than Nagle limit.
481 * 3. There is unack'ed data.
482 * 4. Urgent pointer is not set. Send urgent data ignoring the
483 * Nagle algorithm. This reduces the probability that urgent
484 * bytes get "merged" together.
485 * 5. The app has not closed the connection. This eliminates the
486 * wait time of the receiving side waiting for the last piece of
487 * (small) data.
488 *
489 * If all are satisfied, exit without sending anything. Note
490 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
491 * the smaller of 1 MSS and global tcp_naglim_def (default to be
1188 * un-acked usable
1189 * |--------------|-----------------|
1190 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1191 */
1192 /* END CSTYLED */
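	/*
	 * Plugging numbers into the picture above: with
	 * tcp_suna = 1000, tcp_snxt = 3000 and tcp_swnd = 5000,
	 * 2000 bytes are un-acked and the usable window runs from
	 * 3000 to 6000, i.e. at most 3000 new bytes may be sent
	 * before the congestion window is even considered.
	 */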
1193
1194 /* start sending from tcp_snxt */
1195 snxt = tcp->tcp_snxt;
1196
1197 /*
1198 * Check to see if this connection has been idle for some
1199 * time and no ACK is expected. If it is, we need to slow
1200 * start again to get back the connection's "self-clock" as
1201 * described in VJ's paper.
1202 *
1203 * Reinitialize tcp_cwnd after idle.
1204 */
1205 now = LBOLT_FASTPATH;
1206 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1207 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1208 cc_after_idle(tcp);
1209 }
1210
1211 usable = tcp->tcp_swnd; /* tcp window size */
1212 if (usable > tcp->tcp_cwnd)
1213 usable = tcp->tcp_cwnd; /* congestion window smaller */
1214 usable -= snxt; /* subtract stuff already sent */
1215 suna = tcp->tcp_suna;
1216 usable += suna;
1217 /* usable can be < 0 if the congestion window is smaller */
1218 if (len > usable) {
1219 /* Can't send complete M_DATA in one shot */
1220 goto slow;
1221 }
1222
1223 mutex_enter(&tcp->tcp_non_sq_lock);
1224 if (tcp->tcp_flow_stopped &&
1225 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1226 tcp_clrqfull(tcp);
1227 }
1228 mutex_exit(&tcp->tcp_non_sq_lock);
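	/*
	 * conn_sndlowat is the low-water mark for the send buffer:
	 * once the connection has been flow-controlled
	 * (tcp_flow_stopped) and the unsent backlog drains to or below
	 * that mark, tcp_clrqfull() lifts the back-pressure so the
	 * module above resumes passing data down.
	 */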
1251 return;
1252 }
1253
1254 /*
1255 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1256 * send now.
1257 */
1258
1259 if (snxt == suna) {
1260 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1261 }
1262
1263 /* we have always sent something */
1264 tcp->tcp_rack_cnt = 0;
1265
1266 tcp->tcp_snxt = snxt + len;
1267 tcp->tcp_rack = tcp->tcp_rnxt;
1268
1269 if ((mp1 = dupb(mp)) == 0)
1270 goto no_memory;
1271 #ifdef KERNEL_32
1272 mp->b_prev = (mblk_t *)(uintptr_t)now;
1273 #else
1274 mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1275 #endif
1276 mp->b_next = (mblk_t *)(uintptr_t)snxt;
1277
1278 /* adjust tcp header information */
1279 tcpha = tcp->tcp_tcpha;
1280 tcpha->tha_flags = (TH_ACK|TH_PUSH);
1281
1282 sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1283 sum = (sum >> 16) + (sum & 0xFFFF);
1284 tcpha->tha_sum = htons(sum);
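	/*
	 * The addition above can carry past 16 bits, so it is folded
	 * back in one's-complement fashion; for example, a raw sum of
	 * 0x1abcd folds to 0x0001 + 0xabcd = 0xabce.  What is stored
	 * in tha_sum here is only the pseudo-header and length
	 * contribution; the remaining header and payload bytes are
	 * summed later, e.g. by hardware checksum offload.
	 */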
1285
1286 tcpha->tha_seq = htonl(snxt);
1287
1288 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1289 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1290 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1291 tcp->tcp_cs.tcp_out_data_segs++;
1292 tcp->tcp_cs.tcp_out_data_bytes += len;
1293
1294 /* Update the latest receive window size in TCP header. */
1295 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1296
1297 tcp->tcp_last_sent_len = (ushort_t)len;
1298
1299 plen = len + connp->conn_ht_iphc_len;
1300
1301 ixa = connp->conn_ixa;
1302 ixa->ixa_pktlen = plen;
1303
1304 if (ixa->ixa_flags & IXAF_IS_IPV4) {
1305 tcp->tcp_ipha->ipha_length = htons(plen);
1306 } else {
1307 tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1308 }
1309
1310 /* see if we need to allocate a mblk for the headers */
1311 hdrlen = connp->conn_ht_iphc_len;
1312 rptr = mp1->b_rptr - hdrlen;
1313 db = mp1->b_datap;
1314 if ((db->db_ref != 2) || rptr < db->db_base ||
1315 (!OK_32PTR(rptr))) {
1316 /* NOTE: we assume allocb returns an OK_32PTR */
1317 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1318 if (!mp) {
1319 freemsg(mp1);
1320 goto no_memory;
1321 }
1322 mp->b_cont = mp1;
1323 mp1 = mp;
1324 /* Leave room for Link Level header */
1325 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1326 mp1->b_wptr = &rptr[hdrlen];
1327 }
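	/*
	 * The branch above is taken only if one of three tests fails:
	 * the dblk is shared beyond this dupb() pair (db_ref != 2),
	 * there are fewer than hdrlen bytes of headroom below the
	 * payload, or the prospective header start is not 32-bit
	 * aligned.  In the common case all three hold and the header
	 * is written directly in front of the data, with no extra
	 * allocation.
	 */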
1328 mp1->b_rptr = rptr;
1329
1330 /* Fill in the timestamp option. */
1331 if (tcp->tcp_snd_ts_ok) {
1332 U32_TO_BE32(now,
1333 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1334 U32_TO_BE32(tcp->tcp_ts_recent,
1335 (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1336 } else {
1337 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1338 }
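	/*
	 * The +4/+8 offsets come from the standard layout of the
	 * timestamp option in the template header (RFC 7323, padded
	 * with two NOPs for alignment):
	 *
	 *	offset 20:  NOP  NOP  kind=8  len=10
	 *	offset 24:  TS Value (our clock, filled in here)
	 *	offset 28:  TS Echo Reply (tcp_ts_recent)
	 *
	 * i.e. TCP_MIN_HEADER_LENGTH + 4 and + 8 respectively.
	 */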
1339
1340 /* copy header into outgoing packet */
1341 dst = (ipaddr_t *)rptr;
1342 src = (ipaddr_t *)connp->conn_ht_iphc;
1343 dst[0] = src[0];
1344 dst[1] = src[1];
1345 dst[2] = src[2];
1346 dst[3] = src[3];
1347 dst[4] = src[4];
1348 dst[5] = src[5];
1349 dst[6] = src[6];
1350 dst[7] = src[7];
1351 dst[8] = src[8];
1352 dst[9] = src[9];
1353 if (hdrlen -= 40) {
1354 hdrlen >>= 2;
1355 dst += 10;
1959 }
1960 } else
1961 (*xmit_tail)->b_rptr = prev_rptr;
1962
1963 if (mp == NULL) {
1964 return (-1);
1965 }
1966 mp1 = mp->b_cont;
1967
1968 if (len <= mss) /* LSO is unusable (!do_lso_send) */
1969 tcp->tcp_last_sent_len = (ushort_t)len;
1970 while (mp1->b_cont) {
1971 *xmit_tail = (*xmit_tail)->b_cont;
1972 (*xmit_tail)->b_prev = local_time;
1973 (*xmit_tail)->b_next =
1974 (mblk_t *)(uintptr_t)(*snxt);
1975 mp1 = mp1->b_cont;
1976 }
1977 *snxt += len;
1978 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1979 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1980 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1981 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1982 tcp->tcp_cs.tcp_out_data_segs++;
1983 tcp->tcp_cs.tcp_out_data_bytes += len;
1984 tcp_send_data(tcp, mp);
1985 continue;
1986 }
1987
1988 *snxt += len; /* Adjust later if we don't send all of len */
1989 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1990 TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1991 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1992 tcp->tcp_cs.tcp_out_data_segs++;
1993 tcp->tcp_cs.tcp_out_data_bytes += len;
1994
1995 if (*tail_unsent) {
1996 /* Are the bytes above us in flight? */
1997 rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1998 if (rptr != (*xmit_tail)->b_rptr) {
1999 *tail_unsent -= len;
2000 if (len <= mss) /* LSO is unusable */
2001 tcp->tcp_last_sent_len = (ushort_t)len;
2002 len += total_hdr_len;
2003 ixa->ixa_pktlen = len;
2004
2005 if (ixa->ixa_flags & IXAF_IS_IPV4) {
2006 tcp->tcp_ipha->ipha_length = htons(len);
2007 } else {
2008 tcp->tcp_ip6h->ip6_plen =
2009 htons(len - IPV6_HDR_LEN);
2010 }
2011
2012 mp = dupb(*xmit_tail);
2013 if (mp == NULL) {
2070
2071 must_alloc:;
2072 mp1 = allocb(connp->conn_ht_iphc_allocated +
2073 tcps->tcps_wroff_xtra, BPRI_MED);
2074 if (mp1 == NULL) {
2075 freemsg(mp);
2076 return (-1); /* out_of_mem */
2077 }
2078 mp1->b_cont = mp;
2079 mp = mp1;
2080 /* Leave room for Link Level header */
2081 len = total_hdr_len;
2082 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2083 mp->b_wptr = &rptr[len];
2084 }
2085
2086 /*
2087 * Fill in the header using the template header, and add
2088 * options such as time-stamp, ECN and/or SACK, as needed.
2089 */
2090 tcp_fill_header(tcp, rptr, num_sack_blk);
2091
2092 mp->b_rptr = rptr;
2093
2094 if (*tail_unsent) {
2095 int spill = *tail_unsent;
2096
2097 mp1 = mp->b_cont;
2098 if (mp1 == NULL)
2099 mp1 = mp;
2100
2101 /*
2102 * If we're a little short, tack on more mblks until
2103 * there is no more spillover.
2104 */
2105 while (spill < 0) {
2106 mblk_t *nmp;
2107 int nmpsz;
2108
2109 nmp = (*xmit_tail)->b_cont;
2110 nmpsz = MBLKL(nmp);
2149 if (mp1 == NULL) {
2150 *tail_unsent = spill;
2151 freemsg(mp);
2152 return (-1); /* out_of_mem */
2153 }
2154 }
2155
2156 /* Trim back any surplus on the last mblk */
2157 if (spill >= 0) {
2158 mp1->b_wptr -= spill;
2159 *tail_unsent = spill;
2160 } else {
2161 /*
2162 * We did not send everything we could in
2163 * order to remain within the b_cont limit.
2164 */
2165 *usable -= spill;
2166 *snxt += spill;
2167 tcp->tcp_last_sent_len += spill;
2168 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2169 tcp->tcp_cs.tcp_out_data_bytes += spill;
2170 /*
2171 * Adjust the checksum
2172 */
2173 tcpha = (tcpha_t *)(rptr +
2174 ixa->ixa_ip_hdr_length);
2175 sum += spill;
2176 sum = (sum >> 16) + (sum & 0xFFFF);
2177 tcpha->tha_sum = htons(sum);
2178 if (connp->conn_ipversion == IPV4_VERSION) {
2179 sum = ntohs(
2180 ((ipha_t *)rptr)->ipha_length) +
2181 spill;
2182 ((ipha_t *)rptr)->ipha_length =
2183 htons(sum);
2184 } else {
2185 sum = ntohs(
2186 ((ip6_t *)rptr)->ip6_plen) +
2187 spill;
2188 ((ip6_t *)rptr)->ip6_plen =
2189 htons(sum);
2198 } else {
2199 ixa->ixa_flags &= ~IXAF_REACH_CONF;
2200 }
2201
2202 if (do_lso_send) {
2203 /* Append LSO information to the mp. */
2204 lso_info_set(mp, mss, HW_LSO);
2205 ixa->ixa_fragsize = IP_MAXPACKET;
2206 ixa->ixa_extra_ident = num_lso_seg - 1;
2207
2208 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2209 boolean_t, B_TRUE);
2210
2211 tcp_send_data(tcp, mp);
2212
2213 /*
2214 * Restore values of ixa_fragsize and ixa_extra_ident.
2215 */
2216 ixa->ixa_fragsize = ixa->ixa_pmtu;
2217 ixa->ixa_extra_ident = 0;
2218 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2219 TCP_STAT(tcps, tcp_lso_times);
2220 TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2221 } else {
2222 /*
2223 * Make sure to clean up LSO information. Wherever a
2224 * new mp uses the prepended header room after dupb(),
2225 * lso_info_cleanup() should be called.
2226 */
2227 lso_info_cleanup(mp);
2228 tcp_send_data(tcp, mp);
2229 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2230 }
2231 }
2232
2233 return (0);
2234 }
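/*
 * A note on the LSO arithmetic above: one jumbo mblk handed to the NIC
 * becomes num_lso_seg wire packets, each needing its own IPv4 ident, so
 * ixa_extra_ident reserves num_lso_seg - 1 idents beyond the one the
 * packet itself consumes.  For example, 14600 bytes of payload at an
 * MSS of 1460 give num_lso_seg = 10 and ixa_extra_ident = 9.
 * ixa_fragsize is raised to IP_MAXPACKET for the send so IP itself
 * never fragments the jumbo frame, then restored to the path MTU.
 */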
2235
2236 /*
2237 * Initiate closedown sequence on an active connection. (May be called as
2238 * writer.) Return value zero for OK return, non-zero for error return.
2239 */
2240 static int
2241 tcp_xmit_end(tcp_t *tcp)
2242 {
2243 mblk_t *mp;
2244 tcp_stack_t *tcps = tcp->tcp_tcps;
2245 iulp_t uinfo;
2246 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2247 conn_t *connp = tcp->tcp_connp;
2248
2249 if (tcp->tcp_state < TCPS_SYN_RCVD ||
2289 * so we have to check that and unset it first.
2290 */
2291 if (tcp->tcp_cork)
2292 tcp->tcp_cork = B_FALSE;
2293 tcp_wput_data(tcp, NULL, B_FALSE);
2294 }
2295
2296 /*
2297 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2298 * is 0, don't update the cache.
2299 */
2300 if (tcps->tcps_rtt_updates == 0 ||
2301 tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2302 return (0);
2303
2304 /*
2305 * We do not have a good algorithm to update ssthresh at this time.
2306 * So don't do any update.
2307 */
2308 bzero(&uinfo, sizeof (uinfo));
2309 uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
2310 uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2311
2312 /*
2313 * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2314 * if source routed but we don't.
2315 */
2316 if (connp->conn_ipversion == IPV4_VERSION) {
2317 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
2318 return (0);
2319 }
2320 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2321 } else {
2322 uint_t ifindex;
2323
2324 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2325 &tcp->tcp_ip6h->ip6_dst))) {
2326 return (0);
2327 }
2328 ifindex = 0;
2329 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2330 ip_xmit_attr_t *ixa = connp->conn_ixa;
2425 }
2426 }
2427 if (ctl & TH_ACK) {
2428 if (tcp->tcp_snd_ts_ok) {
2429 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2430
2431 U32_TO_BE32(llbolt,
2432 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2433 U32_TO_BE32(tcp->tcp_ts_recent,
2434 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2435 }
2436
2437 /* Update the latest receive window size in TCP header. */
2438 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2439 /* Track what we sent to the peer */
2440 tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2441 tcp->tcp_rack = ack;
2442 tcp->tcp_rack_cnt = 0;
2443 TCPS_BUMP_MIB(tcps, tcpOutAck);
2444 }
2445 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2446 tcpha->tha_seq = htonl(seq);
2447 tcpha->tha_ack = htonl(ack);
2448 /*
2449 * Include the adjustment for a source route if any.
2450 */
2451 sum = (sum >> 16) + (sum & 0xFFFF);
2452 tcpha->tha_sum = htons(sum);
2453 tcp_send_data(tcp, mp);
2454 }
2455
2456 /*
2457 * Generate a reset based on an inbound packet; connp is set by the caller
2458 * when RST is in response to an unexpected inbound packet for which
2459 * there is active tcp state in the system.
2460 *
2461 * IPSEC NOTE : Try to send the reply with the same protection as it came
2462 * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2463 * That way the packet will go out at the same level of protection as it
2464 * came in with.
2465 */
3394 ASSERT(snxt_mp != NULL);
3395 /* This should not happen. Defensive coding again... */
3396 if (snxt_mp == NULL) {
3397 return;
3398 }
3399
3400 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3401 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3402 if (xmit_mp == NULL)
3403 return;
3404
3405 usable_swnd -= seg_len;
3406 tcp->tcp_pipe += seg_len;
3407 tcp->tcp_sack_snxt = begin + seg_len;
3408
3409 tcp_send_data(tcp, xmit_mp);
3410
3411 /*
3412 * Update the send timestamp to avoid false retransmission.
3413 */
3414 #ifdef KERNEL_32
3415 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3416 #else
3417 snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3418 #endif
3419
3420 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3421 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3422 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3423 tcp->tcp_cs.tcp_out_retrans_segs++;
3424 tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
3425 /*
3426 * Update tcp_rexmit_max to extend this SACK recovery phase.
3427 * This happens when new data sent during fast recovery is
3428 * also lost. If TCP retransmits that new data, it needs
3429 * to extend the SACK recovery phase to avoid starting another
3430 * fast retransmit/recovery unnecessarily.
3431 */
3432 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3433 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3434 }
3435 }
3436 }
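/*
 * An example of the tcp_rexmit_max extension above: if recovery began
 * with tcp_rexmit_max = 1200 and a SACK-driven retransmission of 500
 * bytes starting at sequence 1000 has just gone out, tcp_sack_snxt is
 * now 1500, so tcp_rexmit_max is pulled forward to 1500 and the
 * recovery phase covers the newly retransmitted range as well.
 */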
3437
3438 /*
3439 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3440 * or ICMP errors.
3441 */
3442 void
3443 tcp_ss_rexmit(tcp_t *tcp)
3444 {
3472
3473 if (win < cnt) {
3474 cnt = win;
3475 }
3476 if (SEQ_GT(snxt + cnt, smax)) {
3477 cnt = smax - snxt;
3478 }
3479 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3480 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3481 if (xmit_mp == NULL)
3482 return;
3483
3484 tcp_send_data(tcp, xmit_mp);
3485
3486 snxt += cnt;
3487 win -= cnt;
3488 /*
3489 * Update the send timestamp to avoid false
3490 * retransmission.
3491 */
3492 #ifdef KERNEL_32
3493 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3494 #else
3495 old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3496 #endif
3497 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3498 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3499 tcp->tcp_cs.tcp_out_retrans_segs++;
3500 tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
3501
3502 tcp->tcp_rexmit_nxt = snxt;
3503 }
3504 /*
3505 * If we have transmitted all we have at the time
3506 * we started the retransmission, we can leave
3507 * the rest of the job to tcp_wput_data(). But we
3508 * need to check the send window first. If the
3509 * win is not 0, go on with tcp_wput_data().
3510 */
3511 if (SEQ_LT(snxt, smax) || win == 0) {
3512 return;
3513 }
3514 }
3515 /* Only call tcp_wput_data() if there is data to be sent. */
3516 if (tcp->tcp_unsent) {
3517 tcp_wput_data(tcp, NULL, B_FALSE);
3518 }
3519 }
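/*
 * To illustrate the clamping in the loop above: with snxt = 5000,
 * smax = 6000, win = 3000 and cnt = 1460, the window check leaves cnt
 * alone (3000 >= 1460) but SEQ_GT(5000 + 1460, 6000) trims it to
 * smax - snxt = 1000, so the final segment of the slow-start
 * retransmission stops exactly at the old send maximum.
 */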
3520
3638 /*
3639 * If the SACK option is set, delete the entire list of
3640 * notsack'ed blocks.
3641 */
3642 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3643
3644 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3645 /*
3646 * Make sure the timer is running so that we will probe a zero
3647 * window.
3648 */
3649 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3650 }
3651
3652 /*
3653 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3654 * with the template header, as well as other options such as time-stamp,
3655 * ECN and/or SACK.
3656 */
3657 static void
3658 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3659 {
3660 tcpha_t *tcp_tmpl, *tcpha;
3661 uint32_t *dst, *src;
3662 int hdrlen;
3663 conn_t *connp = tcp->tcp_connp;
3664
3665 ASSERT(OK_32PTR(rptr));
3666
3667 /* Template header */
3668 tcp_tmpl = tcp->tcp_tcpha;
3669
3670 /* Header of outgoing packet */
3671 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3672
3673 /* dst and src are opaque 32-bit fields, used for copying */
3674 dst = (uint32_t *)rptr;
3675 src = (uint32_t *)connp->conn_ht_iphc;
3676 hdrlen = connp->conn_ht_iphc_len;
3677
3678 /* Fill time-stamp option if needed */
3679 if (tcp->tcp_snd_ts_ok) {
3680 U32_TO_BE32(LBOLT_FASTPATH,
3681 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3682 U32_TO_BE32(tcp->tcp_ts_recent,
3683 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3684 } else {
3685 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3686 }
3687
3688 /*
3689 * Copy the template header; is this really more efficient than
3690 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3691 * but perhaps not for other scenarios.
3692 */
3693 dst[0] = src[0];
3694 dst[1] = src[1];
3695 dst[2] = src[2];
3696 dst[3] = src[3];
3697 dst[4] = src[4];
3698 dst[5] = src[5];
3699 dst[6] = src[6];
3700 dst[7] = src[7];