6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017 Joyent, Inc.
25 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2013,2014 by Delphix. All rights reserved.
27 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
28 */
29 /* Copyright (c) 1990 Mentat Inc. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/suntpi.h>
43 #include <sys/xti_inet.h>
44 #include <sys/cmn_err.h>
45 #include <sys/debug.h>
46 #include <sys/sdt.h>
57
58 #include <sys/errno.h>
59 #include <sys/signal.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/isa_defs.h>
64 #include <sys/md5.h>
65 #include <sys/random.h>
66 #include <sys/uio.h>
67 #include <sys/systm.h>
68 #include <netinet/in.h>
69 #include <netinet/tcp.h>
70 #include <netinet/ip6.h>
71 #include <netinet/icmp6.h>
72 #include <net/if.h>
73 #include <net/route.h>
74 #include <inet/ipsec_impl.h>
75
76 #include <inet/common.h>
77 #include <inet/ip.h>
78 #include <inet/ip_impl.h>
79 #include <inet/ip6.h>
80 #include <inet/ip_ndp.h>
81 #include <inet/proto_set.h>
82 #include <inet/mib2.h>
83 #include <inet/optcom.h>
84 #include <inet/snmpcom.h>
85 #include <inet/kstatcom.h>
86 #include <inet/tcp.h>
87 #include <inet/tcp_impl.h>
88 #include <inet/tcp_cluster.h>
89 #include <inet/udp_impl.h>
90 #include <net/pfkeyv2.h>
91 #include <inet/ipdrop.h>
92
93 #include <inet/ipclassifier.h>
94 #include <inet/ip_ire.h>
95 #include <inet/ip_ftable.h>
96 #include <inet/ip_if.h>
249 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
250 #endif /* _ILP32 */
251
252 /*
253 * Minimum number of connections which can be created per listener. Used
254 * when the listener connection count is in effect.
255 */
256 static uint32_t tcp_min_conn_listener = 2;
257
258 uint32_t tcp_early_abort = 30;
259
260 /* TCP Timer control structure */
261 typedef struct tcpt_s {
262 pfv_t tcpt_pfv; /* The routine we are to call */
263 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
264 } tcpt_t;
265
266 /*
267 * Functions called directly via squeue having a prototype of edesc_t.
268 */
269 void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
270 ip_recv_attr_t *ira);
271 void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
272 ip_recv_attr_t *ira);
273 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
274 ip_recv_attr_t *dummy);
275
276
277 /* Prototype for TCP functions */
278 static void tcp_random_init(void);
279 int tcp_random(void);
280 static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
281 in_port_t dstport, uint_t srcid);
282 static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
283 in_port_t dstport, uint32_t flowinfo,
284 uint_t srcid, uint32_t scope_id);
285 static void tcp_iss_init(tcp_t *tcp);
286 static void tcp_reinit(tcp_t *tcp);
287 static void tcp_reinit_values(tcp_t *tcp);
288
289 static void tcp_wsrv(queue_t *q);
290 static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
559 netstack_rele(ns);
560 ASSERT(tcps != NULL);
561 tcp->tcp_tcps = NULL;
562
563 bzero(tcp, sizeof (tcp_t));
564
565 /* restore the state */
566 tcp->tcp_timercache = mp;
567
568 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
569
570 tcp->tcp_connp = connp;
571
572 ASSERT(connp->conn_tcp == tcp);
573 ASSERT(connp->conn_flags & IPCL_TCPCONN);
574 connp->conn_state_flags = CONN_INCIPIENT;
575 ASSERT(connp->conn_proto == IPPROTO_TCP);
576 ASSERT(connp->conn_ref == 1);
577 }
578
579 /*
580 * Adapt to the information, such as rtt and rtt_sd, provided from the
581 * DCE and IRE maintained by IP.
582 *
583 * Checks for multicast and broadcast destination address.
584 * Returns zero if ok; an errno on failure.
585 *
586 * Note that the MSS calculation here is based on the info given in
587 * the DCE and IRE. We do not do any calculation based on TCP options. They
588 * will be handled in tcp_input_data() when TCP knows which options to use.
589 *
590 * Note on how TCP gets its parameters for a connection.
591 *
592 * When a tcp_t structure is allocated, it gets all the default parameters.
593 * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
594 * spipe, rpipe, ... from the route metrics. Route metric overrides the
595 * default.
596 *
597 * An incoming SYN with a multicast or broadcast destination address is dropped
598 * in ip_fanout_v4/v6.
599 *
623 */
624 flags |= IPDF_UNIQUE_DCE;
625
626 if (!tcps->tcps_ignore_path_mtu)
627 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
628
629 /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
630 mutex_enter(&connp->conn_lock);
631 error = conn_connect(connp, &uinfo, flags);
632 mutex_exit(&connp->conn_lock);
633 if (error != 0)
634 return (error);
635
636 error = tcp_build_hdrs(tcp);
637 if (error != 0)
638 return (error);
639
640 tcp->tcp_localnet = uinfo.iulp_localnet;
641
642 if (uinfo.iulp_rtt != 0) {
643 clock_t rto;
644
645 tcp->tcp_rtt_sa = uinfo.iulp_rtt;
646 tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
647 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
648 tcps->tcps_rexmit_interval_extra +
649 (tcp->tcp_rtt_sa >> 5);
650
651 TCP_SET_RTO(tcp, rto);
652 }
653 if (uinfo.iulp_ssthresh != 0)
654 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
655 else
656 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
657 if (uinfo.iulp_spipe > 0) {
658 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
659 tcps->tcps_max_buf);
660 if (tcps->tcps_snd_lowat_fraction != 0) {
661 connp->conn_sndlowat = connp->conn_sndbuf /
662 tcps->tcps_snd_lowat_fraction;
663 }
664 (void) tcp_maxpsz_set(tcp, B_TRUE);
665 }
666 /*
667 * Note that up till now, acceptor always inherits receive
668 * window from the listener. But if there is a metrics
669 * associated with a host, we should use that instead of
670 * inheriting it from listener. Thus we need to pass this
671 * info back to the caller.
1221 CONN_DEC_REF(tcp->tcp_connp);
1222 }
1223
1224 /*
1225 * The tcp_t is going away. Remove it from all lists and set it
1226 * to TCPS_CLOSED. The freeing up of memory is deferred until
1227 * tcp_inactive. This is needed since a thread in tcp_rput might have
1228 * done a CONN_INC_REF on this structure before it was removed from the
1229 * hashes.
1230 */
1231 void
1232 tcp_closei_local(tcp_t *tcp)
1233 {
1234 conn_t *connp = tcp->tcp_connp;
1235 tcp_stack_t *tcps = tcp->tcp_tcps;
1236 int32_t oldstate;
1237
1238 if (!TCP_IS_SOCKET(tcp))
1239 tcp_acceptor_hash_remove(tcp);
1240
1241 TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
1242 tcp->tcp_ibsegs = 0;
1243 TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
1244 tcp->tcp_obsegs = 0;
1245
1246 /*
1247 * This can be called via tcp_time_wait_processing() if TCP gets a
1248 * SYN with sequence number outside the TIME-WAIT connection's
1249 * window. So we need to check for TIME-WAIT state here as the
1250 * connection counter is already decremented. See SET_TIME_WAIT()
1251 * macro
1252 */
1253 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1254 tcp->tcp_state < TCPS_TIME_WAIT) {
1255 TCPS_CONN_DEC(tcps);
1256 }
1257
1258 /*
1259 * If we are an eager connection hanging off a listener that
1260 * hasn't formally accepted the connection yet, get off its
1261 * list and blow off any data that we have accumulated.
1262 */
1263 if (tcp->tcp_listener != NULL) {
1264 tcp_t *listener = tcp->tcp_listener;
1265 mutex_enter(&listener->tcp_eager_lock);
1404 if (tcp->tcp_rthdrdstopts != NULL) {
1405 mi_free(tcp->tcp_rthdrdstopts);
1406 tcp->tcp_rthdrdstopts = NULL;
1407 tcp->tcp_rthdrdstoptslen = 0;
1408 }
1409 ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1410 if (tcp->tcp_rthdr != NULL) {
1411 mi_free(tcp->tcp_rthdr);
1412 tcp->tcp_rthdr = NULL;
1413 tcp->tcp_rthdrlen = 0;
1414 }
1415 ASSERT(tcp->tcp_rthdrlen == 0);
1416
1417 /*
1418 * The following really blows away a union.
1419 * Because it happens to have exactly two members of identical size,
1420 * the following code is enough.
1421 */
1422 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1423
1424 /*
1425 * Destroy any association with SO_REUSEPORT group.
1426 */
1427 if (tcp->tcp_rg_bind != NULL) {
1428 /*
1429 * This is only necessary for connections which enabled
1430 * SO_REUSEPORT but were never bound. Such connections should
1431 * be the one and only member of the tcp_rg_tp to which they
1432 * have been associated.
1433 */
1434 VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
1435 tcp_rg_destroy(tcp->tcp_rg_bind);
1436 tcp->tcp_rg_bind = NULL;
1437 }
1438
1439 /*
1440 * If this is a non-STREAM socket still holding on to an upper
1441 * handle, release it. As a result of fallback we might also see
1442 * STREAMS based conns with upper handles, in which case there is
1443 * nothing to do other than clearing the field.
1465 * outside the squeue. So when the interrupt comes, we have a clean
1466 * connection sitting in the freelist. Obviously, this buys us
1467 * performance.
1468 *
1469 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1470 * has multiple disadvantages - tying up the squeue during alloc.
1471 * But allocating the conn/tcp in IP land is also not the best since
1472 * we can't check the 'q' and 'q0' which are protected by squeue and
1473 * blindly allocate memory which might have to be freed here if we are
1474 * not allowed to accept the connection. By using the freelist and
1475 * putting the conn/tcp back in freelist, we don't pay a penalty for
1476 * allocating memory without checking 'q/q0' and freeing it if we can't
1477 * accept the connection.
1478 *
1479 * Care should be taken to put the conn back in the same squeue's freelist
1480 * from which it was allocated. Best results are obtained if conn is
1481 * allocated from listener's squeue and freed to the same. Time wait
1482 * collector will free up the freelist if the connection ends up sitting
1483 * there for too long.
1484 */
1485 void *
1486 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1487 {
1488 tcp_t *tcp = NULL;
1489 conn_t *connp = NULL;
1490 squeue_t *sqp = (squeue_t *)arg;
1491 tcp_squeue_priv_t *tcp_time_wait;
1492 netstack_t *ns;
1493 mblk_t *tcp_rsrv_mp = NULL;
1494
1495 tcp_time_wait =
1496 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1497
1498 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1499 tcp = tcp_time_wait->tcp_free_list;
1500 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1501 if (tcp != NULL) {
1502 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1503 tcp_time_wait->tcp_free_list_cnt--;
1504 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1505 tcp->tcp_time_wait_next = NULL;
1506 connp = tcp->tcp_connp;
1507 connp->conn_flags |= IPCL_REUSED;
1508
1509 ASSERT(tcp->tcp_tcps == NULL);
1510 ASSERT(connp->conn_netstack == NULL);
1511 ASSERT(tcp->tcp_rsrv_mp != NULL);
1512 ns = tcps->tcps_netstack;
1513 netstack_hold(ns);
1514 connp->conn_netstack = ns;
1515 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1516 tcp->tcp_tcps = tcps;
1517 ipcl_globalhash_insert(connp);
1518
1519 connp->conn_ixa->ixa_notify_cookie = tcp;
1520 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1521 connp->conn_recv = tcp_input_data;
1522 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1523 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1524 return ((void *)connp);
1525 }
1526 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1527 /*
1528 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1529 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1530 */
1531 tcp_rsrv_mp = allocb(0, BPRI_HI);
1532 if (tcp_rsrv_mp == NULL)
1533 return (NULL);
1534
1535 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1536 tcps->tcps_netstack)) == NULL) {
1537 freeb(tcp_rsrv_mp);
1538 return (NULL);
1539 }
1540
1541 tcp = connp->conn_tcp;
1542 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1543 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1544
1545 tcp->tcp_tcps = tcps;
1546
1547 connp->conn_recv = tcp_input_data;
1548 connp->conn_recvicmp = tcp_icmp_input;
1549 connp->conn_verifyicmp = tcp_verifyicmp;
1550
1551 /*
1552 * Register tcp_notify to listen to capability changes detected by IP.
1553 * This upcall is made in the context of the call to conn_ip_output
1554 * thus it is inside the squeue.
1555 */
1556 connp->conn_ixa->ixa_notify = tcp_notify;
1557 connp->conn_ixa->ixa_notify_cookie = tcp;
1558
1559 return ((void *)connp);
1560 }
1561
1562 /*
1563 * Handle connect to IPv4 destinations, including connections for AF_INET6
1564 * sockets connecting to IPv4 mapped IPv6 destinations.
1565 * Returns zero if OK, a positive errno, or a negative TLI error.
1566 */
1567 static int
1568 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1569 uint_t srcid)
1570 {
1571 ipaddr_t dstaddr = *dstaddrp;
1572 uint16_t lport;
1573 conn_t *connp = tcp->tcp_connp;
1574 tcp_stack_t *tcps = tcp->tcp_tcps;
1575 int error;
1576
1577 ASSERT(connp->conn_ipversion == IPV4_VERSION);
1578
1579 /* Check for attempt to connect to INADDR_ANY */
1909 */
1910 static void
1911 tcp_reinit(tcp_t *tcp)
1912 {
1913 mblk_t *mp;
1914 tcp_stack_t *tcps = tcp->tcp_tcps;
1915 conn_t *connp = tcp->tcp_connp;
1916 int32_t oldstate;
1917
1918 /* tcp_reinit should never be called for detached tcp_t's */
1919 ASSERT(tcp->tcp_listener == NULL);
1920 ASSERT((connp->conn_family == AF_INET &&
1921 connp->conn_ipversion == IPV4_VERSION) ||
1922 (connp->conn_family == AF_INET6 &&
1923 (connp->conn_ipversion == IPV4_VERSION ||
1924 connp->conn_ipversion == IPV6_VERSION)));
1925
1926 /* Cancel outstanding timers */
1927 tcp_timers_stop(tcp);
1928
1929 /*
1930 * Reset everything in the state vector, after updating global
1931 * MIB data from instance counters.
1932 */
1933 TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
1934 tcp->tcp_ibsegs = 0;
1935 TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
1936 tcp->tcp_obsegs = 0;
1937
1938 tcp_close_mpp(&tcp->tcp_xmit_head);
1939 if (tcp->tcp_snd_zcopy_aware)
1940 tcp_zcopy_notify(tcp);
1941 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
1942 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
1943 mutex_enter(&tcp->tcp_non_sq_lock);
1944 if (tcp->tcp_flow_stopped &&
1945 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1946 tcp_clrqfull(tcp);
1947 }
1948 mutex_exit(&tcp->tcp_non_sq_lock);
1949 tcp_close_mpp(&tcp->tcp_reass_head);
1950 tcp->tcp_reass_tail = NULL;
1951 if (tcp->tcp_rcv_list != NULL) {
1952 /* Free b_next chain */
1953 tcp_close_mpp(&tcp->tcp_rcv_list);
1954 tcp->tcp_rcv_last_head = NULL;
1955 tcp->tcp_rcv_last_tail = NULL;
1956 tcp->tcp_rcv_cnt = 0;
1957 }
2089
2090 /* Should be ASSERT NULL on these with new code! */
2091 ASSERT(tcp->tcp_time_wait_next == NULL);
2092 ASSERT(tcp->tcp_time_wait_prev == NULL);
2093 ASSERT(tcp->tcp_time_wait_expire == 0);
2094 PRESERVE(tcp->tcp_state);
2095 PRESERVE(connp->conn_rq);
2096 PRESERVE(connp->conn_wq);
2097
2098 ASSERT(tcp->tcp_xmit_head == NULL);
2099 ASSERT(tcp->tcp_xmit_last == NULL);
2100 ASSERT(tcp->tcp_unsent == 0);
2101 ASSERT(tcp->tcp_xmit_tail == NULL);
2102 ASSERT(tcp->tcp_xmit_tail_unsent == 0);
2103
2104 tcp->tcp_snxt = 0; /* Displayed in mib */
2105 tcp->tcp_suna = 0; /* Displayed in mib */
2106 tcp->tcp_swnd = 0;
2107 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */
2108
2109 ASSERT(tcp->tcp_ibsegs == 0);
2110 ASSERT(tcp->tcp_obsegs == 0);
2111
2112 if (connp->conn_ht_iphc != NULL) {
2113 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
2114 connp->conn_ht_iphc = NULL;
2115 connp->conn_ht_iphc_allocated = 0;
2116 connp->conn_ht_iphc_len = 0;
2117 connp->conn_ht_ulp = NULL;
2118 connp->conn_ht_ulp_len = 0;
2119 tcp->tcp_ipha = NULL;
2120 tcp->tcp_ip6h = NULL;
2121 tcp->tcp_tcpha = NULL;
2122 }
2123
2124 /* We clear any IP_OPTIONS and extension headers */
2125 ip_pkt_free(&connp->conn_xmit_ipp);
2126
2127 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
2128 DONTCARE(tcp->tcp_ipha);
2129 DONTCARE(tcp->tcp_ip6h);
2130 DONTCARE(tcp->tcp_tcpha);
2131 tcp->tcp_valid_bits = 0;
2183 tcp->tcp_initial_pmtu = 0;
2184
2185 ASSERT(tcp->tcp_reass_head == NULL);
2186 ASSERT(tcp->tcp_reass_tail == NULL);
2187
2188 tcp->tcp_cwnd_cnt = 0;
2189
2190 ASSERT(tcp->tcp_rcv_list == NULL);
2191 ASSERT(tcp->tcp_rcv_last_head == NULL);
2192 ASSERT(tcp->tcp_rcv_last_tail == NULL);
2193 ASSERT(tcp->tcp_rcv_cnt == 0);
2194
2195 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
2196 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
2197 tcp->tcp_csuna = 0;
2198
2199 tcp->tcp_rto = 0; /* Displayed in MIB */
2200 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
2201 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
2202 tcp->tcp_rtt_update = 0;
2203
2204 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2205 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2206
2207 tcp->tcp_rack = 0; /* Displayed in mib */
2208 tcp->tcp_rack_cnt = 0;
2209 tcp->tcp_rack_cur_max = 0;
2210 tcp->tcp_rack_abs_max = 0;
2211
2212 tcp->tcp_max_swnd = 0;
2213
2214 ASSERT(tcp->tcp_listener == NULL);
2215
2216 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
2217 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
2218 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
2219 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */
2220
2221 ASSERT(tcp->tcp_conn_req_cnt_q == 0);
2222 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
2318 tcp->tcp_in_ack_unsent = 0;
2319 tcp->tcp_cork = B_FALSE;
2320 tcp->tcp_tconnind_started = B_FALSE;
2321
2322 PRESERVE(tcp->tcp_squeue_bytes);
2323
2324 tcp->tcp_closemp_used = B_FALSE;
2325
2326 PRESERVE(tcp->tcp_rsrv_mp);
2327 PRESERVE(tcp->tcp_rsrv_mp_lock);
2328
2329 #ifdef DEBUG
2330 DONTCARE(tcp->tcmp_stk[0]);
2331 #endif
2332
2333 PRESERVE(tcp->tcp_connid);
2334
2335 ASSERT(tcp->tcp_listen_cnt == NULL);
2336 ASSERT(tcp->tcp_reass_tid == 0);
2337
2338 #undef DONTCARE
2339 #undef PRESERVE
2340 }
2341
2342 /*
2343 * Initialize the various fields in tcp_t. If parent (the listener) is non
2344 * NULL, certain values will be inherited from it.
2345 */
2346 void
2347 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2348 {
2349 tcp_stack_t *tcps = tcp->tcp_tcps;
2350 conn_t *connp = tcp->tcp_connp;
2351 clock_t rto;
2352
2353 ASSERT((connp->conn_family == AF_INET &&
2354 connp->conn_ipversion == IPV4_VERSION) ||
2355 (connp->conn_family == AF_INET6 &&
2356 (connp->conn_ipversion == IPV4_VERSION ||
2357 connp->conn_ipversion == IPV6_VERSION)));
2358
2359 if (parent == NULL) {
2360 tcp->tcp_naglim = tcps->tcps_naglim_def;
2361
2362 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2363 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2364 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2365
2366 tcp->tcp_first_ctimer_threshold =
2367 tcps->tcps_ip_notify_cinterval;
2368 tcp->tcp_second_ctimer_threshold =
2369 tcps->tcps_ip_abort_cinterval;
2370 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2371 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2372
2373 tcp->tcp_fin_wait_2_flush_interval =
2374 tcps->tcps_fin_wait_2_flush_interval;
2375
2376 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2377 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2378 tcp->tcp_ka_cnt = 0;
2379 tcp->tcp_ka_rinterval = 0;
2380
2381 /*
2382 * Default value of tcp_init_cwnd is 0, so no need to set here
2383 * if parent is NULL. But we need to inherit it from parent.
2384 */
2385 } else {
2386 /* Inherit various TCP parameters from the parent. */
2387 tcp->tcp_naglim = parent->tcp_naglim;
2388
2389 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2390 tcp->tcp_rto_min = parent->tcp_rto_min;
2391 tcp->tcp_rto_max = parent->tcp_rto_max;
2392
2393 tcp->tcp_first_ctimer_threshold =
2394 parent->tcp_first_ctimer_threshold;
2395 tcp->tcp_second_ctimer_threshold =
2396 parent->tcp_second_ctimer_threshold;
2397 tcp->tcp_first_timer_threshold =
2398 parent->tcp_first_timer_threshold;
2399 tcp->tcp_second_timer_threshold =
2400 parent->tcp_second_timer_threshold;
2401
2402 tcp->tcp_fin_wait_2_flush_interval =
2403 parent->tcp_fin_wait_2_flush_interval;
2404
2405 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2406 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2407 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2408 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2409
2410 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2411 }
2412
2413 /*
2414 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2415 * will be close to tcp_rexmit_interval_initial. By doing this, we
2416 * allow the algorithm to adjust slowly to large fluctuations of RTT
2417 * during first few transmissions of a connection as seen in slow
2418 * links.
2419 */
2420 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
2421 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
2422 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2423 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
2424 tcps->tcps_conn_grace_period;
2425 TCP_SET_RTO(tcp, rto);
2426
2427 tcp->tcp_timer_backoff = 0;
2428 tcp->tcp_ms_we_have_waited = 0;
2429 tcp->tcp_last_recv_time = ddi_get_lbolt();
2430 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2431 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2432
2433 tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2434
2435 /* NOTE: ISS is now set in tcp_set_destination(). */
2436
2437 /* Reset fusion-related fields */
2438 tcp->tcp_fused = B_FALSE;
2439 tcp->tcp_unfusable = B_FALSE;
2440 tcp->tcp_fused_sigurg = B_FALSE;
2441 tcp->tcp_loopback_peer = NULL;
2442
2443 /* We rebuild the header template on the next connect/conn_request */
2444
2445 connp->conn_mlp_type = mlptSingle;
2642 return (NULL);
2643 }
2644
2645 ns = netstack_find_by_cred(credp);
2646 ASSERT(ns != NULL);
2647 tcps = ns->netstack_tcp;
2648 ASSERT(tcps != NULL);
2649
2650 /*
2651 * For exclusive stacks we set the zoneid to zero
2652 * to make TCP operate as if in the global zone.
2653 */
2654 if (tcps->tcps_netstack->netstack_stackid !=
2655 GLOBAL_NETSTACKID)
2656 zoneid = GLOBAL_ZONEID;
2657 else
2658 zoneid = crgetzoneid(credp);
2659 }
2660
2661 sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2662 connp = (conn_t *)tcp_get_conn(sqp, tcps);
2663 /*
2664 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2665 * so we drop it by one.
2666 */
2667 netstack_rele(tcps->tcps_netstack);
2668 if (connp == NULL) {
2669 *errorp = ENOSR;
2670 return (NULL);
2671 }
2672 ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2673
2674 connp->conn_sqp = sqp;
2675 connp->conn_initial_sqp = connp->conn_sqp;
2676 connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2677 tcp = connp->conn_tcp;
2678
2679 /*
2680 * Besides asking IP to set the checksum for us, have conn_ip_output
2681 * to do the following checks when necessary:
2682 *
3830 * there are many CPUs as we will be adding them 1 by 1.
3831 *
3832 * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3833 * are not freed until the stack is going away. So there is no need
3834 * to grab a lock to access the per CPU tcps_sc[x] pointer.
3835 */
3836 mutex_enter(&cpu_lock);
3837 tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3838 mutex_exit(&cpu_lock);
3839 tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *),
3840 KM_SLEEP);
3841 for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3842 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3843 KM_SLEEP);
3844 }
3845
3846 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3847 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3848 offsetof(tcp_listener_t, tl_link));
3849
3850 return (tcps);
3851 }
3852
3853 /*
3854 * Called when the IP module is about to be unloaded.
3855 */
3856 void
3857 tcp_ddi_g_destroy(void)
3858 {
3859 tcp_g_kstat_fini(tcp_g_kstat);
3860 tcp_g_kstat = NULL;
3861 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3862
3863 mutex_destroy(&tcp_random_lock);
3864
3865 kmem_cache_destroy(tcp_timercache);
3866 kmem_cache_destroy(tcp_notsack_blk_cache);
3867
3868 netstack_unregister(NS_TCP);
3869 }
|
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2017 Joyent, Inc.
25 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
27 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
28 */
29 /* Copyright (c) 1990 Mentat Inc. */
30
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/strsubr.h>
35 #include <sys/stropts.h>
36 #include <sys/strlog.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/timod.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/suntpi.h>
43 #include <sys/xti_inet.h>
44 #include <sys/cmn_err.h>
45 #include <sys/debug.h>
46 #include <sys/sdt.h>
57
58 #include <sys/errno.h>
59 #include <sys/signal.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/isa_defs.h>
64 #include <sys/md5.h>
65 #include <sys/random.h>
66 #include <sys/uio.h>
67 #include <sys/systm.h>
68 #include <netinet/in.h>
69 #include <netinet/tcp.h>
70 #include <netinet/ip6.h>
71 #include <netinet/icmp6.h>
72 #include <net/if.h>
73 #include <net/route.h>
74 #include <inet/ipsec_impl.h>
75
76 #include <inet/common.h>
77 #include <inet/cc.h>
78 #include <inet/ip.h>
79 #include <inet/ip_impl.h>
80 #include <inet/ip6.h>
81 #include <inet/ip_ndp.h>
82 #include <inet/proto_set.h>
83 #include <inet/mib2.h>
84 #include <inet/optcom.h>
85 #include <inet/snmpcom.h>
86 #include <inet/kstatcom.h>
87 #include <inet/tcp.h>
88 #include <inet/tcp_impl.h>
89 #include <inet/tcp_cluster.h>
90 #include <inet/udp_impl.h>
91 #include <net/pfkeyv2.h>
92 #include <inet/ipdrop.h>
93
94 #include <inet/ipclassifier.h>
95 #include <inet/ip_ire.h>
96 #include <inet/ip_ftable.h>
97 #include <inet/ip_if.h>
250 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
251 #endif /* _ILP32 */
252
253 /*
254 * Minimum number of connections which can be created per listener. Used
255 * when the listener connection count is in effect.
256 */
257 static uint32_t tcp_min_conn_listener = 2;
258
259 uint32_t tcp_early_abort = 30;
260
261 /* TCP Timer control structure */
262 typedef struct tcpt_s {
263 pfv_t tcpt_pfv; /* The routine we are to call */
264 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
265 } tcpt_t;
266
267 /*
268 * Functions called directly via squeue having a prototype of edesc_t.
269 */
270 void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
271 ip_recv_attr_t *ira);
272 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
273 ip_recv_attr_t *dummy);
274
275
276 /* Prototype for TCP functions */
277 static void tcp_random_init(void);
278 int tcp_random(void);
279 static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
280 in_port_t dstport, uint_t srcid);
281 static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
282 in_port_t dstport, uint32_t flowinfo,
283 uint_t srcid, uint32_t scope_id);
284 static void tcp_iss_init(tcp_t *tcp);
285 static void tcp_reinit(tcp_t *tcp);
286 static void tcp_reinit_values(tcp_t *tcp);
287
288 static void tcp_wsrv(queue_t *q);
289 static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
558 netstack_rele(ns);
559 ASSERT(tcps != NULL);
560 tcp->tcp_tcps = NULL;
561
562 bzero(tcp, sizeof (tcp_t));
563
564 /* restore the state */
565 tcp->tcp_timercache = mp;
566
567 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
568
569 tcp->tcp_connp = connp;
570
571 ASSERT(connp->conn_tcp == tcp);
572 ASSERT(connp->conn_flags & IPCL_TCPCONN);
573 connp->conn_state_flags = CONN_INCIPIENT;
574 ASSERT(connp->conn_proto == IPPROTO_TCP);
575 ASSERT(connp->conn_ref == 1);
576 }
577
578 #pragma inline(tcp_calculate_rto)
579
580 /*
581 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sd)
582 *
583 * Add tcp_rexmit_interval extra in case of extreme environment where the
584 * algorithm fails to work. The default value of tcp_rexmit_interval_extra
585 * should be 0.
586 *
587 * As we use a finer grained clock than BSD and update RTO for every ACKs, add
588 * in another .25 of RTT to the deviation of RTO to accommodate burstiness of
589 * 1/4 of window size.
590 */
591 clock_t
592 tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps)
593 {
594 clock_t rto;
595
596 rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
597 tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra +
598 tcps->tcps_conn_grace_period;
599
600 if (rto < tcp->tcp_rto_min)
601 rto = tcp->tcp_rto_min;
602 else if (rto > tcp->tcp_rto_max)
603 rto = tcp->tcp_rto_max;
604
605 return (rto);
606 }
607
608 /*
609 * Adapt to the information, such as rtt and rtt_sd, provided from the
610 * DCE and IRE maintained by IP.
611 *
612 * Checks for multicast and broadcast destination address.
613 * Returns zero if ok; an errno on failure.
614 *
615 * Note that the MSS calculation here is based on the info given in
616 * the DCE and IRE. We do not do any calculation based on TCP options. They
617 * will be handled in tcp_input_data() when TCP knows which options to use.
618 *
619 * Note on how TCP gets its parameters for a connection.
620 *
621 * When a tcp_t structure is allocated, it gets all the default parameters.
622 * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
623 * spipe, rpipe, ... from the route metrics. Route metric overrides the
624 * default.
625 *
626 * An incoming SYN with a multicast or broadcast destination address is dropped
627 * in ip_fanout_v4/v6.
628 *
652 */
653 flags |= IPDF_UNIQUE_DCE;
654
655 if (!tcps->tcps_ignore_path_mtu)
656 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
657
658 /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
659 mutex_enter(&connp->conn_lock);
660 error = conn_connect(connp, &uinfo, flags);
661 mutex_exit(&connp->conn_lock);
662 if (error != 0)
663 return (error);
664
665 error = tcp_build_hdrs(tcp);
666 if (error != 0)
667 return (error);
668
669 tcp->tcp_localnet = uinfo.iulp_localnet;
670
671 if (uinfo.iulp_rtt != 0) {
672 tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
673 tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
674 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
675 }
676 if (uinfo.iulp_ssthresh != 0)
677 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
678 else
679 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
680 if (uinfo.iulp_spipe > 0) {
681 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
682 tcps->tcps_max_buf);
683 if (tcps->tcps_snd_lowat_fraction != 0) {
684 connp->conn_sndlowat = connp->conn_sndbuf /
685 tcps->tcps_snd_lowat_fraction;
686 }
687 (void) tcp_maxpsz_set(tcp, B_TRUE);
688 }
689 /*
690 * Note that up till now, acceptor always inherits receive
691 * window from the listener. But if there is a metrics
692 * associated with a host, we should use that instead of
693 * inheriting it from listener. Thus we need to pass this
694 * info back to the caller.
1244 CONN_DEC_REF(tcp->tcp_connp);
1245 }
1246
1247 /*
1248 * The tcp_t is going away. Remove it from all lists and set it
1249 * to TCPS_CLOSED. The freeing up of memory is deferred until
1250 * tcp_inactive. This is needed since a thread in tcp_rput might have
1251 * done a CONN_INC_REF on this structure before it was removed from the
1252 * hashes.
1253 */
1254 void
1255 tcp_closei_local(tcp_t *tcp)
1256 {
1257 conn_t *connp = tcp->tcp_connp;
1258 tcp_stack_t *tcps = tcp->tcp_tcps;
1259 int32_t oldstate;
1260
1261 if (!TCP_IS_SOCKET(tcp))
1262 tcp_acceptor_hash_remove(tcp);
1263
1264 /*
1265 * This can be called via tcp_time_wait_processing() if TCP gets a
1266 * SYN with sequence number outside the TIME-WAIT connection's
1267 * window. So we need to check for TIME-WAIT state here as the
1268 * connection counter is already decremented. See SET_TIME_WAIT()
1269 * macro
1270 */
1271 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1272 tcp->tcp_state < TCPS_TIME_WAIT) {
1273 TCPS_CONN_DEC(tcps);
1274 }
1275
1276 /*
1277 * If we are an eager connection hanging off a listener that
1278 * hasn't formally accepted the connection yet, get off its
1279 * list and blow off any data that we have accumulated.
1280 */
1281 if (tcp->tcp_listener != NULL) {
1282 tcp_t *listener = tcp->tcp_listener;
1283 mutex_enter(&listener->tcp_eager_lock);
1422 if (tcp->tcp_rthdrdstopts != NULL) {
1423 mi_free(tcp->tcp_rthdrdstopts);
1424 tcp->tcp_rthdrdstopts = NULL;
1425 tcp->tcp_rthdrdstoptslen = 0;
1426 }
1427 ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1428 if (tcp->tcp_rthdr != NULL) {
1429 mi_free(tcp->tcp_rthdr);
1430 tcp->tcp_rthdr = NULL;
1431 tcp->tcp_rthdrlen = 0;
1432 }
1433 ASSERT(tcp->tcp_rthdrlen == 0);
1434
1435 /*
1436 * The following is really blowing away a union.
1437 * It happens to have exactly two members of identical size, so
1438 * the following code is enough.
1439 */
1440 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1441
1442 /* Allow the CC algorithm to clean up after itself. */
1443 if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
1444 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
1445
1446 /*
1447 * Destroy any association with SO_REUSEPORT group.
1448 */
1449 if (tcp->tcp_rg_bind != NULL) {
1450 /*
1451 * This is only necessary for connections which enabled
1452 * SO_REUSEPORT but were never bound. Such connections should
1453 * be the one and only member of the tcp_rg_tp to which they
1454 * have been associated.
1455 */
1456 VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
1457 tcp_rg_destroy(tcp->tcp_rg_bind);
1458 tcp->tcp_rg_bind = NULL;
1459 }
1460
1461 /*
1462 * If this is a non-STREAM socket still holding on to an upper
1463 * handle, release it. As a result of fallback we might also see
1464 * STREAMS based conns with upper handles, in which case there is
1465 * nothing to do other than clearing the field.
1487 * outside the squeue. So when the interrupt comes, we have a clean
1488 * connection sitting in the freelist. Obviously, this buys us
1489 * performance.
1490 *
1491 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1492 * has multiple disadvantages - tying up the squeue during alloc.
1493 * But allocating the conn/tcp in IP land is also not the best since
1494 * we can't check the 'q' and 'q0' which are protected by squeue and
1495 * blindly allocate memory which might have to be freed here if we are
1496 * not allowed to accept the connection. By using the freelist and
1497 * putting the conn/tcp back in freelist, we don't pay a penalty for
1498 * allocating memory without checking 'q/q0' and freeing it if we can't
1499 * accept the connection.
1500 *
1501 * Care should be taken to put the conn back in the same squeue's freelist
1502 * from which it was allocated. Best results are obtained if conn is
1503 * allocated from listener's squeue and freed to the same. Time wait
1504 * collector will free up the freelist if the connection ends up sitting
1505 * there for too long.
1506 */
1507 conn_t *
1508 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1509 {
1510 tcp_t *tcp = NULL;
1511 conn_t *connp = NULL;
1512 squeue_t *sqp = (squeue_t *)arg;
1513 tcp_squeue_priv_t *tcp_time_wait;
1514 netstack_t *ns;
1515 mblk_t *tcp_rsrv_mp = NULL;
1516
1517 tcp_time_wait =
1518 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1519
1520 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1521 tcp = tcp_time_wait->tcp_free_list;
1522 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1523 if (tcp != NULL) {
1524 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1525 tcp_time_wait->tcp_free_list_cnt--;
1526 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1527 tcp->tcp_time_wait_next = NULL;
1528 connp = tcp->tcp_connp;
1529 connp->conn_flags |= IPCL_REUSED;
1530
1531 ASSERT(tcp->tcp_tcps == NULL);
1532 ASSERT(connp->conn_netstack == NULL);
1533 ASSERT(tcp->tcp_rsrv_mp != NULL);
1534 ns = tcps->tcps_netstack;
1535 netstack_hold(ns);
1536 connp->conn_netstack = ns;
1537 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1538 tcp->tcp_tcps = tcps;
1539 ipcl_globalhash_insert(connp);
1540
1541 connp->conn_ixa->ixa_notify_cookie = tcp;
1542 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1543 connp->conn_recv = tcp_input_data;
1544 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1545 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1546 return (connp);
1547 }
1548 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1549 /*
1550 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1551 * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1552 */
1553 tcp_rsrv_mp = allocb(0, BPRI_HI);
1554 if (tcp_rsrv_mp == NULL)
1555 return (NULL);
1556
1557 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1558 tcps->tcps_netstack)) == NULL) {
1559 freeb(tcp_rsrv_mp);
1560 return (NULL);
1561 }
1562
1563 tcp = connp->conn_tcp;
1564 tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1565 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1566
1567 tcp->tcp_tcps = tcps;
1568
1569 connp->conn_recv = tcp_input_data;
1570 connp->conn_recvicmp = tcp_icmp_input;
1571 connp->conn_verifyicmp = tcp_verifyicmp;
1572
1573 /*
1574 * Register tcp_notify to listen to capability changes detected by IP.
1575 * This upcall is made in the context of the call to conn_ip_output
1576 * thus it is inside the squeue.
1577 */
1578 connp->conn_ixa->ixa_notify = tcp_notify;
1579 connp->conn_ixa->ixa_notify_cookie = tcp;
1580
1581 return (connp);
1582 }
1583
1584 /*
1585 * Handle connect to IPv4 destinations, including connections for AF_INET6
1586 * sockets connecting to IPv4 mapped IPv6 destinations.
1587 * Returns zero if OK, a positive errno, or a negative TLI error.
1588 */
1589 static int
1590 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1591 uint_t srcid)
1592 {
1593 ipaddr_t dstaddr = *dstaddrp;
1594 uint16_t lport;
1595 conn_t *connp = tcp->tcp_connp;
1596 tcp_stack_t *tcps = tcp->tcp_tcps;
1597 int error;
1598
1599 ASSERT(connp->conn_ipversion == IPV4_VERSION);
1600
1601 /* Check for attempt to connect to INADDR_ANY */
1931 */
1932 static void
1933 tcp_reinit(tcp_t *tcp)
1934 {
1935 mblk_t *mp;
1936 tcp_stack_t *tcps = tcp->tcp_tcps;
1937 conn_t *connp = tcp->tcp_connp;
1938 int32_t oldstate;
1939
1940 /* tcp_reinit should never be called for detached tcp_t's */
1941 ASSERT(tcp->tcp_listener == NULL);
1942 ASSERT((connp->conn_family == AF_INET &&
1943 connp->conn_ipversion == IPV4_VERSION) ||
1944 (connp->conn_family == AF_INET6 &&
1945 (connp->conn_ipversion == IPV4_VERSION ||
1946 connp->conn_ipversion == IPV6_VERSION)));
1947
1948 /* Cancel outstanding timers */
1949 tcp_timers_stop(tcp);
1950
1951 tcp_close_mpp(&tcp->tcp_xmit_head);
1952 if (tcp->tcp_snd_zcopy_aware)
1953 tcp_zcopy_notify(tcp);
1954 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
1955 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
1956 mutex_enter(&tcp->tcp_non_sq_lock);
1957 if (tcp->tcp_flow_stopped &&
1958 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1959 tcp_clrqfull(tcp);
1960 }
1961 mutex_exit(&tcp->tcp_non_sq_lock);
1962 tcp_close_mpp(&tcp->tcp_reass_head);
1963 tcp->tcp_reass_tail = NULL;
1964 if (tcp->tcp_rcv_list != NULL) {
1965 /* Free b_next chain */
1966 tcp_close_mpp(&tcp->tcp_rcv_list);
1967 tcp->tcp_rcv_last_head = NULL;
1968 tcp->tcp_rcv_last_tail = NULL;
1969 tcp->tcp_rcv_cnt = 0;
1970 }
2102
2103 /* Should be ASSERT NULL on these with new code! */
2104 ASSERT(tcp->tcp_time_wait_next == NULL);
2105 ASSERT(tcp->tcp_time_wait_prev == NULL);
2106 ASSERT(tcp->tcp_time_wait_expire == 0);
2107 PRESERVE(tcp->tcp_state);
2108 PRESERVE(connp->conn_rq);
2109 PRESERVE(connp->conn_wq);
2110
2111 ASSERT(tcp->tcp_xmit_head == NULL);
2112 ASSERT(tcp->tcp_xmit_last == NULL);
2113 ASSERT(tcp->tcp_unsent == 0);
2114 ASSERT(tcp->tcp_xmit_tail == NULL);
2115 ASSERT(tcp->tcp_xmit_tail_unsent == 0);
2116
2117 tcp->tcp_snxt = 0; /* Displayed in mib */
2118 tcp->tcp_suna = 0; /* Displayed in mib */
2119 tcp->tcp_swnd = 0;
2120 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */
2121
2122 if (connp->conn_ht_iphc != NULL) {
2123 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
2124 connp->conn_ht_iphc = NULL;
2125 connp->conn_ht_iphc_allocated = 0;
2126 connp->conn_ht_iphc_len = 0;
2127 connp->conn_ht_ulp = NULL;
2128 connp->conn_ht_ulp_len = 0;
2129 tcp->tcp_ipha = NULL;
2130 tcp->tcp_ip6h = NULL;
2131 tcp->tcp_tcpha = NULL;
2132 }
2133
2134 /* We clear any IP_OPTIONS and extension headers */
2135 ip_pkt_free(&connp->conn_xmit_ipp);
2136
2137 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
2138 DONTCARE(tcp->tcp_ipha);
2139 DONTCARE(tcp->tcp_ip6h);
2140 DONTCARE(tcp->tcp_tcpha);
2141 tcp->tcp_valid_bits = 0;
2193 tcp->tcp_initial_pmtu = 0;
2194
2195 ASSERT(tcp->tcp_reass_head == NULL);
2196 ASSERT(tcp->tcp_reass_tail == NULL);
2197
2198 tcp->tcp_cwnd_cnt = 0;
2199
2200 ASSERT(tcp->tcp_rcv_list == NULL);
2201 ASSERT(tcp->tcp_rcv_last_head == NULL);
2202 ASSERT(tcp->tcp_rcv_last_tail == NULL);
2203 ASSERT(tcp->tcp_rcv_cnt == 0);
2204
2205 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
2206 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
2207 tcp->tcp_csuna = 0;
2208
2209 tcp->tcp_rto = 0; /* Displayed in MIB */
2210 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
2211 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
2212 tcp->tcp_rtt_update = 0;
2213 tcp->tcp_rtt_sum = 0;
2214 tcp->tcp_rtt_cnt = 0;
2215
2216 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2217 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2218
2219 tcp->tcp_rack = 0; /* Displayed in mib */
2220 tcp->tcp_rack_cnt = 0;
2221 tcp->tcp_rack_cur_max = 0;
2222 tcp->tcp_rack_abs_max = 0;
2223
2224 tcp->tcp_max_swnd = 0;
2225
2226 ASSERT(tcp->tcp_listener == NULL);
2227
2228 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
2229 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
2230 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
2231 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */
2232
2233 ASSERT(tcp->tcp_conn_req_cnt_q == 0);
2234 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
2330 tcp->tcp_in_ack_unsent = 0;
2331 tcp->tcp_cork = B_FALSE;
2332 tcp->tcp_tconnind_started = B_FALSE;
2333
2334 PRESERVE(tcp->tcp_squeue_bytes);
2335
2336 tcp->tcp_closemp_used = B_FALSE;
2337
2338 PRESERVE(tcp->tcp_rsrv_mp);
2339 PRESERVE(tcp->tcp_rsrv_mp_lock);
2340
2341 #ifdef DEBUG
2342 DONTCARE(tcp->tcmp_stk[0]);
2343 #endif
2344
2345 PRESERVE(tcp->tcp_connid);
2346
2347 ASSERT(tcp->tcp_listen_cnt == NULL);
2348 ASSERT(tcp->tcp_reass_tid == 0);
2349
2350 /* Allow the CC algorithm to clean up after itself. */
2351 if (tcp->tcp_cc_algo->cb_destroy != NULL)
2352 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
2353 tcp->tcp_cc_algo = NULL;
2354
2355 #undef DONTCARE
2356 #undef PRESERVE
2357 }
2358
2359 /*
2360 * Initialize the various fields in tcp_t. If parent (the listener) is non
2361 * NULL, certain values will be inherited from it.
2362 */
2363 void
2364 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2365 {
2366 tcp_stack_t *tcps = tcp->tcp_tcps;
2367 conn_t *connp = tcp->tcp_connp;
2368
2369 ASSERT((connp->conn_family == AF_INET &&
2370 connp->conn_ipversion == IPV4_VERSION) ||
2371 (connp->conn_family == AF_INET6 &&
2372 (connp->conn_ipversion == IPV4_VERSION ||
2373 connp->conn_ipversion == IPV6_VERSION)));
2374
2375 tcp->tcp_ccv.type = IPPROTO_TCP;
2376 tcp->tcp_ccv.ccvc.tcp = tcp;
2377
2378 if (parent == NULL) {
2379 tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
2380
2381 tcp->tcp_naglim = tcps->tcps_naglim_def;
2382
2383 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2384 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2385 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2386
2387 tcp->tcp_first_ctimer_threshold =
2388 tcps->tcps_ip_notify_cinterval;
2389 tcp->tcp_second_ctimer_threshold =
2390 tcps->tcps_ip_abort_cinterval;
2391 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2392 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2393
2394 tcp->tcp_fin_wait_2_flush_interval =
2395 tcps->tcps_fin_wait_2_flush_interval;
2396
2397 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2398 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2399 tcp->tcp_ka_cnt = 0;
2400 tcp->tcp_ka_rinterval = 0;
2401
2402 /*
2403 * Default value of tcp_init_cwnd is 0, so no need to set here
2404 * if parent is NULL. But we need to inherit it from parent.
2405 */
2406 } else {
2407 /* Inherit various TCP parameters from the parent. */
2408 tcp->tcp_cc_algo = parent->tcp_cc_algo;
2409
2410 tcp->tcp_naglim = parent->tcp_naglim;
2411
2412 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2413 tcp->tcp_rto_min = parent->tcp_rto_min;
2414 tcp->tcp_rto_max = parent->tcp_rto_max;
2415
2416 tcp->tcp_first_ctimer_threshold =
2417 parent->tcp_first_ctimer_threshold;
2418 tcp->tcp_second_ctimer_threshold =
2419 parent->tcp_second_ctimer_threshold;
2420 tcp->tcp_first_timer_threshold =
2421 parent->tcp_first_timer_threshold;
2422 tcp->tcp_second_timer_threshold =
2423 parent->tcp_second_timer_threshold;
2424
2425 tcp->tcp_fin_wait_2_flush_interval =
2426 parent->tcp_fin_wait_2_flush_interval;
2427
2428 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2429 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2430 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2431 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2432
2433 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2434 }
2435
2436 if (tcp->tcp_cc_algo->cb_init != NULL)
2437 VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
2438
2439 /*
2440 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2441 * will be close to tcp_rexmit_interval_initial. By doing this, we
2442 * allow the algorithm to adjust slowly to large fluctuations of RTT
2443 * during first few transmissions of a connection as seen in slow
2444 * links.
2445 */
2446 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2447 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2448 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
2449
2450 tcp->tcp_timer_backoff = 0;
2451 tcp->tcp_ms_we_have_waited = 0;
2452 tcp->tcp_last_recv_time = ddi_get_lbolt();
2453 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2454 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2455
2456 tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2457
2458 /* NOTE: ISS is now set in tcp_set_destination(). */
2459
2460 /* Reset fusion-related fields */
2461 tcp->tcp_fused = B_FALSE;
2462 tcp->tcp_unfusable = B_FALSE;
2463 tcp->tcp_fused_sigurg = B_FALSE;
2464 tcp->tcp_loopback_peer = NULL;
2465
2466 /* We rebuild the header template on the next connect/conn_request */
2467
2468 connp->conn_mlp_type = mlptSingle;
2665 return (NULL);
2666 }
2667
2668 ns = netstack_find_by_cred(credp);
2669 ASSERT(ns != NULL);
2670 tcps = ns->netstack_tcp;
2671 ASSERT(tcps != NULL);
2672
2673 /*
2674 * For exclusive stacks we set the zoneid to zero
2675 * to make TCP operate as if in the global zone.
2676 */
2677 if (tcps->tcps_netstack->netstack_stackid !=
2678 GLOBAL_NETSTACKID)
2679 zoneid = GLOBAL_ZONEID;
2680 else
2681 zoneid = crgetzoneid(credp);
2682 }
2683
2684 sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2685 connp = tcp_get_conn(sqp, tcps);
2686 /*
2687 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2688 * so we drop it by one.
2689 */
2690 netstack_rele(tcps->tcps_netstack);
2691 if (connp == NULL) {
2692 *errorp = ENOSR;
2693 return (NULL);
2694 }
2695 ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2696
2697 connp->conn_sqp = sqp;
2698 connp->conn_initial_sqp = connp->conn_sqp;
2699 connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2700 tcp = connp->conn_tcp;
2701
2702 /*
2703 * Besides asking IP to set the checksum for us, have conn_ip_output
2704 * to do the following checks when necessary:
2705 *
3853 * there are many CPUs as we will be adding them 1 by 1.
3854 *
3855 * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3856 * are not freed until the stack is going away. So there is no need
3857 * to grab a lock to access the per CPU tcps_sc[x] pointer.
3858 */
3859 mutex_enter(&cpu_lock);
3860 tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3861 mutex_exit(&cpu_lock);
3862 tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *),
3863 KM_SLEEP);
3864 for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3865 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3866 KM_SLEEP);
3867 }
3868
3869 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3870 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3871 offsetof(tcp_listener_t, tl_link));
3872
3873 tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
3874 ASSERT3P(tcps->tcps_default_cc_algo, !=, NULL);
3875
3876 return (tcps);
3877 }
3878
3879 /*
3880 * Called when the IP module is about to be unloaded.
3881 */
3882 void
3883 tcp_ddi_g_destroy(void)
3884 {
3885 tcp_g_kstat_fini(tcp_g_kstat);
3886 tcp_g_kstat = NULL;
3887 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3888
3889 mutex_destroy(&tcp_random_lock);
3890
3891 kmem_cache_destroy(tcp_timercache);
3892 kmem_cache_destroy(tcp_notsack_blk_cache);
3893
3894 netstack_unregister(NS_TCP);
3895 }
|