Print this page
DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-45697 Adding Avg. RTT to connstat
DLPX-43064 include high-resolution round-trip times in connstat (EP-652)
DLPX-42721 Create inline function for TCP RTO calculation
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2017 Joyent, Inc.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013,2014 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


  57 
  58 #include <sys/errno.h>
  59 #include <sys/signal.h>
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <sys/sockio.h>
  63 #include <sys/isa_defs.h>
  64 #include <sys/md5.h>
  65 #include <sys/random.h>
  66 #include <sys/uio.h>
  67 #include <sys/systm.h>
  68 #include <netinet/in.h>
  69 #include <netinet/tcp.h>
  70 #include <netinet/ip6.h>
  71 #include <netinet/icmp6.h>
  72 #include <net/if.h>
  73 #include <net/route.h>
  74 #include <inet/ipsec_impl.h>
  75 
  76 #include <inet/common.h>

  77 #include <inet/ip.h>
  78 #include <inet/ip_impl.h>
  79 #include <inet/ip6.h>
  80 #include <inet/ip_ndp.h>
  81 #include <inet/proto_set.h>
  82 #include <inet/mib2.h>
  83 #include <inet/optcom.h>
  84 #include <inet/snmpcom.h>
  85 #include <inet/kstatcom.h>
  86 #include <inet/tcp.h>
  87 #include <inet/tcp_impl.h>
  88 #include <inet/tcp_cluster.h>
  89 #include <inet/udp_impl.h>
  90 #include <net/pfkeyv2.h>
  91 #include <inet/ipdrop.h>
  92 
  93 #include <inet/ipclassifier.h>
  94 #include <inet/ip_ire.h>
  95 #include <inet/ip_ftable.h>
  96 #include <inet/ip_if.h>


 249                 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
 250 #endif  /* _ILP32 */
 251 
 252 /*
 253  * Minimum number of connections which can be created per listener.  Used
 254  * when the listener connection count is in effect.
 255  */
 256 static uint32_t tcp_min_conn_listener = 2;
 257 
 258 uint32_t tcp_early_abort = 30;
 259 
 260 /* TCP timer control structure: a routine paired with its tcp_t argument */
 261 typedef struct tcpt_s {
 262         pfv_t   tcpt_pfv;       /* The routine we are to call */
 263         tcp_t   *tcpt_tcp;      /* The tcp_t we pass to that routine */
 264 } tcpt_t;
 265 
 266 /*
 267  * Functions called directly via squeue; each has the edesc_t prototype.
 268  */
 269 void            tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
 270     ip_recv_attr_t *ira);
 271 void            tcp_input_data(void *arg, mblk_t *mp, void *arg2,
 272     ip_recv_attr_t *ira);
 273 static void     tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
 274     ip_recv_attr_t *dummy);
 275 
 276 
 277 /* Prototypes for TCP functions */
 278 static void     tcp_random_init(void);
 279 int             tcp_random(void);
 280 static int      tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
 281                     in_port_t dstport, uint_t srcid);
 282 static int      tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
 283                     in_port_t dstport, uint32_t flowinfo,
 284                     uint_t srcid, uint32_t scope_id);
 285 static void     tcp_iss_init(tcp_t *tcp);
 286 static void     tcp_reinit(tcp_t *tcp);
 287 static void     tcp_reinit_values(tcp_t *tcp);
 288 
 289 static void     tcp_wsrv(queue_t *q);
 290 static void     tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);


 559         netstack_rele(ns);
 560         ASSERT(tcps != NULL);
 561         tcp->tcp_tcps = NULL;
 562 
 563         bzero(tcp, sizeof (tcp_t));
 564 
 565         /* restore the state */
 566         tcp->tcp_timercache = mp;
 567 
 568         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
 569 
 570         tcp->tcp_connp = connp;
 571 
 572         ASSERT(connp->conn_tcp == tcp);
 573         ASSERT(connp->conn_flags & IPCL_TCPCONN);
 574         connp->conn_state_flags = CONN_INCIPIENT;
 575         ASSERT(connp->conn_proto == IPPROTO_TCP);
 576         ASSERT(connp->conn_ref == 1);
 577 }
 578 


 579 /*




























 580  * Adapt to the information, such as rtt and rtt_sd, provided from the
 581  * DCE and IRE maintained by IP.
 582  *
 583  * Checks for multicast and broadcast destination address.
 584  * Returns zero if ok; an errno on failure.
 585  *
 586  * Note that the MSS calculation here is based on the info given in
 587  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
 588  * will be handled in tcp_input_data() when TCP knows which options to use.
 589  *
 590  * Note on how TCP gets its parameters for a connection.
 591  *
 592  * When a tcp_t structure is allocated, it gets all the default parameters.
 593  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
 594  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
 595  * default.
 596  *
 597  * An incoming SYN with a multicast or broadcast destination address is dropped
 598  * in ip_fanout_v4/v6.
 599  *


 623          */
 624         flags |= IPDF_UNIQUE_DCE;
 625 
 626         if (!tcps->tcps_ignore_path_mtu)
 627                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
 628 
 629         /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
 630         mutex_enter(&connp->conn_lock);
 631         error = conn_connect(connp, &uinfo, flags);
 632         mutex_exit(&connp->conn_lock);
 633         if (error != 0)
 634                 return (error);
 635 
 636         error = tcp_build_hdrs(tcp);
 637         if (error != 0)
 638                 return (error);
 639 
 640         tcp->tcp_localnet = uinfo.iulp_localnet;
 641 
 642         if (uinfo.iulp_rtt != 0) {
 643                 clock_t rto;
 644 
 645                 tcp->tcp_rtt_sa = uinfo.iulp_rtt;
 646                 tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
 647                 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
 648                     tcps->tcps_rexmit_interval_extra +
 649                     (tcp->tcp_rtt_sa >> 5);
 650 
 651                 TCP_SET_RTO(tcp, rto);
 652         }
 653         if (uinfo.iulp_ssthresh != 0)
 654                 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
 655         else
 656                 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
 657         if (uinfo.iulp_spipe > 0) {
 658                 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
 659                     tcps->tcps_max_buf);
 660                 if (tcps->tcps_snd_lowat_fraction != 0) {
 661                         connp->conn_sndlowat = connp->conn_sndbuf /
 662                             tcps->tcps_snd_lowat_fraction;
 663                 }
 664                 (void) tcp_maxpsz_set(tcp, B_TRUE);
 665         }
 666         /*
 667          * Note that up till now, acceptor always inherits receive
 668          * window from the listener.  But if there is a metrics
 669          * associated with a host, we should use that instead of
 670          * inheriting it from listener. Thus we need to pass this
 671          * info back to the caller.


1221         CONN_DEC_REF(tcp->tcp_connp);
1222 }
1223 
1224 /*
1225  * The tcp_t is going away. Remove it from all lists and set it
1226  * to TCPS_CLOSED. The freeing up of memory is deferred until
1227  * tcp_inactive. This is needed since a thread in tcp_rput might have
1228  * done a CONN_INC_REF on this structure before it was removed from the
1229  * hashes.
1230  */
1231 void
1232 tcp_closei_local(tcp_t *tcp)
1233 {
1234         conn_t          *connp = tcp->tcp_connp;
1235         tcp_stack_t     *tcps = tcp->tcp_tcps;
1236         int32_t         oldstate;
1237 
1238         if (!TCP_IS_SOCKET(tcp))
1239                 tcp_acceptor_hash_remove(tcp);
1240 
1241         TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
1242         tcp->tcp_ibsegs = 0;
1243         TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
1244         tcp->tcp_obsegs = 0;
1245 
1246         /*
1247          * This can be called via tcp_time_wait_processing() if TCP gets a
1248          * SYN with sequence number outside the TIME-WAIT connection's
1249          * window.  So we need to check for TIME-WAIT state here as the
1250          * connection counter is already decremented.  See SET_TIME_WAIT()
1251          * macro
1252          */
1253         if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1254             tcp->tcp_state < TCPS_TIME_WAIT) {
1255                 TCPS_CONN_DEC(tcps);
1256         }
1257 
1258         /*
1259          * If we are an eager connection hanging off a listener that
1260          * hasn't formally accepted the connection yet, get off its
1261          * list and blow off any data that we have accumulated.
1262          */
1263         if (tcp->tcp_listener != NULL) {
1264                 tcp_t   *listener = tcp->tcp_listener;
1265                 mutex_enter(&listener->tcp_eager_lock);


1404         if (tcp->tcp_rthdrdstopts != NULL) {
1405                 mi_free(tcp->tcp_rthdrdstopts);
1406                 tcp->tcp_rthdrdstopts = NULL;
1407                 tcp->tcp_rthdrdstoptslen = 0;
1408         }
1409         ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1410         if (tcp->tcp_rthdr != NULL) {
1411                 mi_free(tcp->tcp_rthdr);
1412                 tcp->tcp_rthdr = NULL;
1413                 tcp->tcp_rthdrlen = 0;
1414         }
1415         ASSERT(tcp->tcp_rthdrlen == 0);
1416 
1417         /*
1418          * The following really blows away a union.
1419          * Since it happens to have exactly two members of identical size,
1420          * the following code is enough.
1421          */
1422         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1423 




1424         /*
1425          * Destroy any association with SO_REUSEPORT group.
1426          */
1427         if (tcp->tcp_rg_bind != NULL) {
1428                 /*
1429                  * This is only necessary for connections which enabled
1430                  * SO_REUSEPORT but were never bound.  Such connections should
1431                  * be the one and only member of the tcp_rg_tp to which they
1432                  * have been associated.
1433                  */
1434                 VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
1435                 tcp_rg_destroy(tcp->tcp_rg_bind);
1436                 tcp->tcp_rg_bind = NULL;
1437         }
1438 
1439         /*
1440          * If this is a non-STREAM socket still holding on to an upper
1441          * handle, release it. As a result of fallback we might also see
1442          * STREAMS based conns with upper handles, in which case there is
1443          * nothing to do other than clearing the field.


1465  * outside the squeue. So when the interrupt comes, we have a clean
1466  * connection sitting in the freelist. Obviously, this buys us
1467  * performance.
1468  *
1469  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1470  * has multiple disadvantages - tying up the squeue during alloc.
1471  * But allocating the conn/tcp in IP land is also not the best since
1472  * we can't check the 'q' and 'q0' which are protected by squeue and
1473  * blindly allocate memory which might have to be freed here if we are
1474  * not allowed to accept the connection. By using the freelist and
1475  * putting the conn/tcp back in freelist, we don't pay a penalty for
1476  * allocating memory without checking 'q/q0' and freeing it if we can't
1477  * accept the connection.
1478  *
1479  * Care should be taken to put the conn back in the same squeue's freelist
1480  * from which it was allocated. Best results are obtained if conn is
1481  * allocated from listener's squeue and freed to the same. Time wait
1482  * collector will free up the freelist if the connection ends up sitting
1483  * there for too long.
1484  */
1485 void *
1486 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1487 {
1488         tcp_t                   *tcp = NULL;
1489         conn_t                  *connp = NULL;
1490         squeue_t                *sqp = (squeue_t *)arg;
1491         tcp_squeue_priv_t       *tcp_time_wait;
1492         netstack_t              *ns;
1493         mblk_t                  *tcp_rsrv_mp = NULL;
1494 
             /*
              * arg is the squeue whose free list is consulted; tcps is the
              * TCP stack the returned conn is bound to.  Returns the conn_t
              * (as a void *), or NULL if a fresh one could not be allocated.
              */
1495         tcp_time_wait =
1496             *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1497 
             /*
              * Reuse path: pop the head of this squeue's free list, if any.
              * The lock only covers unlinking the entry and adjusting the
              * count.
              */
1498         mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1499         tcp = tcp_time_wait->tcp_free_list;
1500         ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1501         if (tcp != NULL) {
1502                 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1503                 tcp_time_wait->tcp_free_list_cnt--;
1504                 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1505                 tcp->tcp_time_wait_next = NULL;
1506                 connp = tcp->tcp_connp;
1507                 connp->conn_flags |= IPCL_REUSED;
1508 
                     /*
                      * A cached tcp_t is expected to have no stack or
                      * netstack association but to still own its
                      * pre-allocated tcp_rsrv_mp.
                      */
1509                 ASSERT(tcp->tcp_tcps == NULL);
1510                 ASSERT(connp->conn_netstack == NULL);
1511                 ASSERT(tcp->tcp_rsrv_mp != NULL);
                     /* Bind the conn to the caller's netstack, taking a hold. */
1512                 ns = tcps->tcps_netstack;
1513                 netstack_hold(ns);
1514                 connp->conn_netstack = ns;
1515                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1516                 tcp->tcp_tcps = tcps;
1517                 ipcl_globalhash_insert(connp);
1518 
                     /*
                      * Only the notify cookie and conn_recv are reassigned;
                      * the other callbacks are asserted to be already set.
                      */
1519                 connp->conn_ixa->ixa_notify_cookie = tcp;
1520                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1521                 connp->conn_recv = tcp_input_data;
1522                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1523                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1524                 return ((void *)connp);
1525         }
1526         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
             /* Free list was empty: build a new conn_t/tcp_t from scratch. */
1527         /*
1528          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1529          * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1530          */
1531         tcp_rsrv_mp = allocb(0, BPRI_HI);
1532         if (tcp_rsrv_mp == NULL)
1533                 return (NULL);
1534 
             /* KM_NOSLEEP allocation; release the mblk again if it fails. */
1535         if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1536             tcps->tcps_netstack)) == NULL) {
1537                 freeb(tcp_rsrv_mp);
1538                 return (NULL);
1539         }
1540 
1541         tcp = connp->conn_tcp;
1542         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1543         mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1544 
1545         tcp->tcp_tcps = tcps;
1546 
             /* Install the TCP input, ICMP input and ICMP verify callbacks. */
1547         connp->conn_recv = tcp_input_data;
1548         connp->conn_recvicmp = tcp_icmp_input;
1549         connp->conn_verifyicmp = tcp_verifyicmp;
1550 
1551         /*
1552          * Register tcp_notify to listen to capability changes detected by IP.
1553          * This upcall is made in the context of the call to conn_ip_output
1554          * thus it is inside the squeue.
1555          */
1556         connp->conn_ixa->ixa_notify = tcp_notify;
1557         connp->conn_ixa->ixa_notify_cookie = tcp;
1558 
1559         return ((void *)connp);
1560 }
1561 
1562 /*
1563  * Handle connect to IPv4 destinations, including connections for AF_INET6
1564  * sockets connecting to IPv4 mapped IPv6 destinations.
1565  * Returns zero if OK, a positive errno, or a negative TLI error.
1566  */
1567 static int
1568 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1569     uint_t srcid)
1570 {
1571         ipaddr_t        dstaddr = *dstaddrp;
1572         uint16_t        lport;
1573         conn_t          *connp = tcp->tcp_connp;
1574         tcp_stack_t     *tcps = tcp->tcp_tcps;
1575         int             error;
1576 
1577         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1578 
1579         /* Check for attempt to connect to INADDR_ANY */


1909  */
1910 static void
1911 tcp_reinit(tcp_t *tcp)
1912 {
1913         mblk_t          *mp;
1914         tcp_stack_t     *tcps = tcp->tcp_tcps;
1915         conn_t          *connp  = tcp->tcp_connp;
1916         int32_t         oldstate;
1917 
1918         /* tcp_reinit should never be called for detached tcp_t's */
1919         ASSERT(tcp->tcp_listener == NULL);
1920         ASSERT((connp->conn_family == AF_INET &&
1921             connp->conn_ipversion == IPV4_VERSION) ||
1922             (connp->conn_family == AF_INET6 &&
1923             (connp->conn_ipversion == IPV4_VERSION ||
1924             connp->conn_ipversion == IPV6_VERSION)));
1925 
1926         /* Cancel outstanding timers */
1927         tcp_timers_stop(tcp);
1928 
1929         /*
1930          * Reset everything in the state vector, after updating global
1931          * MIB data from instance counters.
1932          */
1933         TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs);
1934         tcp->tcp_ibsegs = 0;
1935         TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs);
1936         tcp->tcp_obsegs = 0;
1937 
1938         tcp_close_mpp(&tcp->tcp_xmit_head);
1939         if (tcp->tcp_snd_zcopy_aware)
1940                 tcp_zcopy_notify(tcp);
1941         tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
1942         tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
1943         mutex_enter(&tcp->tcp_non_sq_lock);
1944         if (tcp->tcp_flow_stopped &&
1945             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1946                 tcp_clrqfull(tcp);
1947         }
1948         mutex_exit(&tcp->tcp_non_sq_lock);
1949         tcp_close_mpp(&tcp->tcp_reass_head);
1950         tcp->tcp_reass_tail = NULL;
1951         if (tcp->tcp_rcv_list != NULL) {
1952                 /* Free b_next chain */
1953                 tcp_close_mpp(&tcp->tcp_rcv_list);
1954                 tcp->tcp_rcv_last_head = NULL;
1955                 tcp->tcp_rcv_last_tail = NULL;
1956                 tcp->tcp_rcv_cnt = 0;
1957         }


2089 
2090         /* Should be ASSERT NULL on these with new code! */
2091         ASSERT(tcp->tcp_time_wait_next == NULL);
2092         ASSERT(tcp->tcp_time_wait_prev == NULL);
2093         ASSERT(tcp->tcp_time_wait_expire == 0);
2094         PRESERVE(tcp->tcp_state);
2095         PRESERVE(connp->conn_rq);
2096         PRESERVE(connp->conn_wq);
2097 
2098         ASSERT(tcp->tcp_xmit_head == NULL);
2099         ASSERT(tcp->tcp_xmit_last == NULL);
2100         ASSERT(tcp->tcp_unsent == 0);
2101         ASSERT(tcp->tcp_xmit_tail == NULL);
2102         ASSERT(tcp->tcp_xmit_tail_unsent == 0);
2103 
2104         tcp->tcp_snxt = 0;                   /* Displayed in mib */
2105         tcp->tcp_suna = 0;                   /* Displayed in mib */
2106         tcp->tcp_swnd = 0;
2107         DONTCARE(tcp->tcp_cwnd);     /* Init in tcp_process_options */
2108 
2109         ASSERT(tcp->tcp_ibsegs == 0);
2110         ASSERT(tcp->tcp_obsegs == 0);
2111 
2112         if (connp->conn_ht_iphc != NULL) {
2113                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
2114                 connp->conn_ht_iphc = NULL;
2115                 connp->conn_ht_iphc_allocated = 0;
2116                 connp->conn_ht_iphc_len = 0;
2117                 connp->conn_ht_ulp = NULL;
2118                 connp->conn_ht_ulp_len = 0;
2119                 tcp->tcp_ipha = NULL;
2120                 tcp->tcp_ip6h = NULL;
2121                 tcp->tcp_tcpha = NULL;
2122         }
2123 
2124         /* We clear any IP_OPTIONS and extension headers */
2125         ip_pkt_free(&connp->conn_xmit_ipp);
2126 
2127         DONTCARE(tcp->tcp_naglim);           /* Init in tcp_init_values */
2128         DONTCARE(tcp->tcp_ipha);
2129         DONTCARE(tcp->tcp_ip6h);
2130         DONTCARE(tcp->tcp_tcpha);
2131         tcp->tcp_valid_bits = 0;


2183         tcp->tcp_initial_pmtu = 0;
2184 
2185         ASSERT(tcp->tcp_reass_head == NULL);
2186         ASSERT(tcp->tcp_reass_tail == NULL);
2187 
2188         tcp->tcp_cwnd_cnt = 0;
2189 
2190         ASSERT(tcp->tcp_rcv_list == NULL);
2191         ASSERT(tcp->tcp_rcv_last_head == NULL);
2192         ASSERT(tcp->tcp_rcv_last_tail == NULL);
2193         ASSERT(tcp->tcp_rcv_cnt == 0);
2194 
2195         DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
2196         DONTCARE(tcp->tcp_cwnd_max);         /* Init in tcp_init_values */
2197         tcp->tcp_csuna = 0;
2198 
2199         tcp->tcp_rto = 0;                    /* Displayed in MIB */
2200         DONTCARE(tcp->tcp_rtt_sa);           /* Init in tcp_init_values */
2201         DONTCARE(tcp->tcp_rtt_sd);           /* Init in tcp_init_values */
2202         tcp->tcp_rtt_update = 0;


2203 
2204         DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2205         DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2206 
2207         tcp->tcp_rack = 0;                   /* Displayed in mib */
2208         tcp->tcp_rack_cnt = 0;
2209         tcp->tcp_rack_cur_max = 0;
2210         tcp->tcp_rack_abs_max = 0;
2211 
2212         tcp->tcp_max_swnd = 0;
2213 
2214         ASSERT(tcp->tcp_listener == NULL);
2215 
2216         DONTCARE(tcp->tcp_irs);                      /* tcp_valid_bits cleared */
2217         DONTCARE(tcp->tcp_iss);                      /* tcp_valid_bits cleared */
2218         DONTCARE(tcp->tcp_fss);                      /* tcp_valid_bits cleared */
2219         DONTCARE(tcp->tcp_urg);                      /* tcp_valid_bits cleared */
2220 
2221         ASSERT(tcp->tcp_conn_req_cnt_q == 0);
2222         ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);


2318         tcp->tcp_in_ack_unsent = 0;
2319         tcp->tcp_cork = B_FALSE;
2320         tcp->tcp_tconnind_started = B_FALSE;
2321 
2322         PRESERVE(tcp->tcp_squeue_bytes);
2323 
2324         tcp->tcp_closemp_used = B_FALSE;
2325 
2326         PRESERVE(tcp->tcp_rsrv_mp);
2327         PRESERVE(tcp->tcp_rsrv_mp_lock);
2328 
2329 #ifdef DEBUG
2330         DONTCARE(tcp->tcmp_stk[0]);
2331 #endif
2332 
2333         PRESERVE(tcp->tcp_connid);
2334 
2335         ASSERT(tcp->tcp_listen_cnt == NULL);
2336         ASSERT(tcp->tcp_reass_tid == 0);
2337 





2338 #undef  DONTCARE
2339 #undef  PRESERVE
2340 }
2341 
2342 /*
2343  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2344  * NULL, certain values will be inherited from it.
2345  */
2346 void
2347 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2348 {
2349         tcp_stack_t     *tcps = tcp->tcp_tcps;
2350         conn_t          *connp = tcp->tcp_connp;
2351         clock_t         rto;
2352 
2353         ASSERT((connp->conn_family == AF_INET &&
2354             connp->conn_ipversion == IPV4_VERSION) ||
2355             (connp->conn_family == AF_INET6 &&
2356             (connp->conn_ipversion == IPV4_VERSION ||
2357             connp->conn_ipversion == IPV6_VERSION)));
2358 



2359         if (parent == NULL) {


2360                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2361 
2362                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2363                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2364                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2365 
2366                 tcp->tcp_first_ctimer_threshold =
2367                     tcps->tcps_ip_notify_cinterval;
2368                 tcp->tcp_second_ctimer_threshold =
2369                     tcps->tcps_ip_abort_cinterval;
2370                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2371                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2372 
2373                 tcp->tcp_fin_wait_2_flush_interval =
2374                     tcps->tcps_fin_wait_2_flush_interval;
2375 
2376                 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2377                 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2378                 tcp->tcp_ka_cnt = 0;
2379                 tcp->tcp_ka_rinterval = 0;
2380 
2381                 /*
2382                  * Default value of tcp_init_cwnd is 0, so no need to set here
2383                  * if parent is NULL.  But we need to inherit it from parent.
2384                  */
2385         } else {
2386                 /* Inherit various TCP parameters from the parent. */


2387                 tcp->tcp_naglim = parent->tcp_naglim;
2388 
2389                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2390                 tcp->tcp_rto_min = parent->tcp_rto_min;
2391                 tcp->tcp_rto_max = parent->tcp_rto_max;
2392 
2393                 tcp->tcp_first_ctimer_threshold =
2394                     parent->tcp_first_ctimer_threshold;
2395                 tcp->tcp_second_ctimer_threshold =
2396                     parent->tcp_second_ctimer_threshold;
2397                 tcp->tcp_first_timer_threshold =
2398                     parent->tcp_first_timer_threshold;
2399                 tcp->tcp_second_timer_threshold =
2400                     parent->tcp_second_timer_threshold;
2401 
2402                 tcp->tcp_fin_wait_2_flush_interval =
2403                     parent->tcp_fin_wait_2_flush_interval;
2404 
2405                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2406                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2407                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2408                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2409 
2410                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2411         }
2412 



2413         /*
2414          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2415          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2416          * allow the algorithm to adjust slowly to large fluctuations of RTT
2417          * during first few transmissions of a connection as seen in slow
2418          * links.
2419          */
2420         tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
2421         tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
2422         rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2423             tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
2424             tcps->tcps_conn_grace_period;
2425         TCP_SET_RTO(tcp, rto);
2426 
2427         tcp->tcp_timer_backoff = 0;
2428         tcp->tcp_ms_we_have_waited = 0;
2429         tcp->tcp_last_recv_time = ddi_get_lbolt();
2430         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2431         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2432 
2433         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2434 
2435         /* NOTE:  ISS is now set in tcp_set_destination(). */
2436 
2437         /* Reset fusion-related fields */
2438         tcp->tcp_fused = B_FALSE;
2439         tcp->tcp_unfusable = B_FALSE;
2440         tcp->tcp_fused_sigurg = B_FALSE;
2441         tcp->tcp_loopback_peer = NULL;
2442 
2443         /* We rebuild the header template on the next connect/conn_request */
2444 
2445         connp->conn_mlp_type = mlptSingle;


2642                         return (NULL);
2643                 }
2644 
2645                 ns = netstack_find_by_cred(credp);
2646                 ASSERT(ns != NULL);
2647                 tcps = ns->netstack_tcp;
2648                 ASSERT(tcps != NULL);
2649 
2650                 /*
2651                  * For exclusive stacks we set the zoneid to zero
2652                  * to make TCP operate as if in the global zone.
2653                  */
2654                 if (tcps->tcps_netstack->netstack_stackid !=
2655                     GLOBAL_NETSTACKID)
2656                         zoneid = GLOBAL_ZONEID;
2657                 else
2658                         zoneid = crgetzoneid(credp);
2659         }
2660 
2661         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2662         connp = (conn_t *)tcp_get_conn(sqp, tcps);
2663         /*
2664          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2665          * so we drop it by one.
2666          */
2667         netstack_rele(tcps->tcps_netstack);
2668         if (connp == NULL) {
2669                 *errorp = ENOSR;
2670                 return (NULL);
2671         }
2672         ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2673 
2674         connp->conn_sqp = sqp;
2675         connp->conn_initial_sqp = connp->conn_sqp;
2676         connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2677         tcp = connp->conn_tcp;
2678 
2679         /*
2680          * Besides asking IP to set the checksum for us, have conn_ip_output
2681          * do the following checks when necessary:
2682          *


3830          * there are many CPUs as we will be adding them 1 by 1.
3831          *
3832          * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3833          * are not freed until the stack is going away.  So there is no need
3834          * to grab a lock to access the per CPU tcps_sc[x] pointer.
3835          */
3836         mutex_enter(&cpu_lock);
3837         tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3838         mutex_exit(&cpu_lock);
3839         tcps->tcps_sc = kmem_zalloc(max_ncpus  * sizeof (tcp_stats_cpu_t *),
3840             KM_SLEEP);
3841         for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3842                 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3843                     KM_SLEEP);
3844         }
3845 
3846         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3847         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3848             offsetof(tcp_listener_t, tl_link));
3849 



3850         return (tcps);
3851 }
3852 
3853 /*
3854  * Called when the IP module is about to be unloaded.
      *
      * Tears down the TCP global state visible here: the global kstat and
      * statistics, tcp_random_lock, the timer and notsack kmem caches, and
      * the NS_TCP netstack registration.
3855  */
3856 void
3857 tcp_ddi_g_destroy(void)
3858 {
             /* Finalize the global kstat and zero the global statistics. */
3859         tcp_g_kstat_fini(tcp_g_kstat);
3860         tcp_g_kstat = NULL;
3861         bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3862 
3863         mutex_destroy(&tcp_random_lock);
3864 
             /* Destroy the two kmem caches named below. */
3865         kmem_cache_destroy(tcp_timercache);
3866         kmem_cache_destroy(tcp_notsack_blk_cache);
3867 
             /* Finally, drop the NS_TCP registration with netstack. */
3868         netstack_unregister(NS_TCP);
3869 }




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2017 Joyent, Inc.
  25  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  27  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  28  */
  29 /* Copyright (c) 1990 Mentat Inc. */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/strsubr.h>
  35 #include <sys/stropts.h>
  36 #include <sys/strlog.h>
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/suntpi.h>
  43 #include <sys/xti_inet.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/sdt.h>


  57 
  58 #include <sys/errno.h>
  59 #include <sys/signal.h>
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <sys/sockio.h>
  63 #include <sys/isa_defs.h>
  64 #include <sys/md5.h>
  65 #include <sys/random.h>
  66 #include <sys/uio.h>
  67 #include <sys/systm.h>
  68 #include <netinet/in.h>
  69 #include <netinet/tcp.h>
  70 #include <netinet/ip6.h>
  71 #include <netinet/icmp6.h>
  72 #include <net/if.h>
  73 #include <net/route.h>
  74 #include <inet/ipsec_impl.h>
  75 
  76 #include <inet/common.h>
  77 #include <inet/cc.h>
  78 #include <inet/ip.h>
  79 #include <inet/ip_impl.h>
  80 #include <inet/ip6.h>
  81 #include <inet/ip_ndp.h>
  82 #include <inet/proto_set.h>
  83 #include <inet/mib2.h>
  84 #include <inet/optcom.h>
  85 #include <inet/snmpcom.h>
  86 #include <inet/kstatcom.h>
  87 #include <inet/tcp.h>
  88 #include <inet/tcp_impl.h>
  89 #include <inet/tcp_cluster.h>
  90 #include <inet/udp_impl.h>
  91 #include <net/pfkeyv2.h>
  92 #include <inet/ipdrop.h>
  93 
  94 #include <inet/ipclassifier.h>
  95 #include <inet/ip_ire.h>
  96 #include <inet/ip_ftable.h>
  97 #include <inet/ip_if.h>


 250                 ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
 251 #endif  /* _ILP32 */
 252 
 253 /*
 254  * Minimum number of connections which can be created per listener.  Used
 255  * when the listener connection count is in effect.
 256  */
 257 static uint32_t tcp_min_conn_listener = 2;
 258 
 259 uint32_t tcp_early_abort = 30;
 260 
 261 /* TCP Timer control structure: a routine to call plus its tcp_t argument */
 262 typedef struct tcpt_s {
 263         pfv_t   tcpt_pfv;       /* The routine we are to call */
 264         tcp_t   *tcpt_tcp;      /* The parameter we are to pass in */
 265 } tcpt_t;
 266 
 267 /*
 268  * Functions called directly via squeue having a prototype of edesc_t.
 269  */


 270 void            tcp_input_data(void *arg, mblk_t *mp, void *arg2,
 271     ip_recv_attr_t *ira);
 272 static void     tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
 273     ip_recv_attr_t *dummy);
 274 
 275 
 276 /* Prototype for TCP functions */
 277 static void     tcp_random_init(void);
 278 int             tcp_random(void);
 279 static int      tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
 280                     in_port_t dstport, uint_t srcid);
 281 static int      tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
 282                     in_port_t dstport, uint32_t flowinfo,
 283                     uint_t srcid, uint32_t scope_id);
 284 static void     tcp_iss_init(tcp_t *tcp);
 285 static void     tcp_reinit(tcp_t *tcp);
 286 static void     tcp_reinit_values(tcp_t *tcp);
 287 
 288 static void     tcp_wsrv(queue_t *q);
 289 static void     tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);


 558         netstack_rele(ns);
 559         ASSERT(tcps != NULL);
 560         tcp->tcp_tcps = NULL;
 561 
 562         bzero(tcp, sizeof (tcp_t));
 563 
 564         /* restore the state */
 565         tcp->tcp_timercache = mp;
 566 
 567         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
 568 
 569         tcp->tcp_connp = connp;
 570 
 571         ASSERT(connp->conn_tcp == tcp);
 572         ASSERT(connp->conn_flags & IPCL_TCPCONN);
 573         connp->conn_state_flags = CONN_INCIPIENT;
 574         ASSERT(connp->conn_proto == IPPROTO_TCP);
 575         ASSERT(connp->conn_ref == 1);
 576 }
 577 
 578 #pragma inline(tcp_calculate_rto)
 579 
 580 /*
 581  * RTO = average estimates (sa / 8) + 4 * deviation estimates (sd)
 582  *
 583  * Add tcp_rexmit_interval extra in case of extreme environment where the
 584  * algorithm fails to work.  The default value of tcp_rexmit_interval_extra
 585  * should be 0.
 586  *
 587  * As we use a finer grained clock than BSD and update RTO for every ACKs, add
 588  * in another .25 of RTT to the deviation of RTO to accommodate burstiness of
 589  * 1/4 of window size.
 590  */
 591 clock_t
 592 tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps)
 593 {
 594         clock_t rto;
 595 
 596         rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
 597             tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra +
 598             tcps->tcps_conn_grace_period;
 599 
 600         if (rto < tcp->tcp_rto_min)
 601                 rto = tcp->tcp_rto_min;
 602         else if (rto > tcp->tcp_rto_max)
 603                 rto = tcp->tcp_rto_max;
 604 
 605         return (rto);
 606 }
 607 
 608 /*
 609  * Adapt to the information, such as rtt and rtt_sd, provided from the
 610  * DCE and IRE maintained by IP.
 611  *
 612  * Checks for multicast and broadcast destination address.
 613  * Returns zero if ok; an errno on failure.
 614  *
 615  * Note that the MSS calculation here is based on the info given in
 616  * the DCE and IRE.  We do not do any calculation based on TCP options.  They
 617  * will be handled in tcp_input_data() when TCP knows which options to use.
 618  *
 619  * Note on how TCP gets its parameters for a connection.
 620  *
 621  * When a tcp_t structure is allocated, it gets all the default parameters.
 622  * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
 623  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
 624  * default.
 625  *
 626  * An incoming SYN with a multicast or broadcast destination address is dropped
 627  * in ip_fanout_v4/v6.
 628  *


 652          */
 653         flags |= IPDF_UNIQUE_DCE;
 654 
 655         if (!tcps->tcps_ignore_path_mtu)
 656                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
 657 
 658         /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
 659         mutex_enter(&connp->conn_lock);
 660         error = conn_connect(connp, &uinfo, flags);
 661         mutex_exit(&connp->conn_lock);
 662         if (error != 0)
 663                 return (error);
 664 
 665         error = tcp_build_hdrs(tcp);
 666         if (error != 0)
 667                 return (error);
 668 
 669         tcp->tcp_localnet = uinfo.iulp_localnet;
 670 
 671         if (uinfo.iulp_rtt != 0) {
 672                 tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
 673                 tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
 674                 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);






 675         }
 676         if (uinfo.iulp_ssthresh != 0)
 677                 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
 678         else
 679                 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
 680         if (uinfo.iulp_spipe > 0) {
 681                 connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
 682                     tcps->tcps_max_buf);
 683                 if (tcps->tcps_snd_lowat_fraction != 0) {
 684                         connp->conn_sndlowat = connp->conn_sndbuf /
 685                             tcps->tcps_snd_lowat_fraction;
 686                 }
 687                 (void) tcp_maxpsz_set(tcp, B_TRUE);
 688         }
 689         /*
 690          * Note that up till now, acceptor always inherits receive
 691          * window from the listener.  But if there is a metrics
 692          * associated with a host, we should use that instead of
 693          * inheriting it from listener. Thus we need to pass this
 694          * info back to the caller.


1244         CONN_DEC_REF(tcp->tcp_connp);
1245 }
1246 
1247 /*
1248  * The tcp_t is going away. Remove it from all lists and set it
1249  * to TCPS_CLOSED. The freeing up of memory is deferred until
1250  * tcp_inactive. This is needed since a thread in tcp_rput might have
1251  * done a CONN_INC_REF on this structure before it was removed from the
1252  * hashes.
1253  */
1254 void
1255 tcp_closei_local(tcp_t *tcp)
1256 {
1257         conn_t          *connp = tcp->tcp_connp;
1258         tcp_stack_t     *tcps = tcp->tcp_tcps;
1259         int32_t         oldstate;
1260 
1261         if (!TCP_IS_SOCKET(tcp))
1262                 tcp_acceptor_hash_remove(tcp);
1263 





1264         /*
1265          * This can be called via tcp_time_wait_processing() if TCP gets a
1266          * SYN with sequence number outside the TIME-WAIT connection's
1267          * window.  So we need to check for TIME-WAIT state here as the
1268          * connection counter is already decremented.  See SET_TIME_WAIT()
1269          * macro
1270          */
1271         if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1272             tcp->tcp_state < TCPS_TIME_WAIT) {
1273                 TCPS_CONN_DEC(tcps);
1274         }
1275 
1276         /*
1277          * If we are an eager connection hanging off a listener that
1278          * hasn't formally accepted the connection yet, get off its
1279          * list and blow off any data that we have accumulated.
1280          */
1281         if (tcp->tcp_listener != NULL) {
1282                 tcp_t   *listener = tcp->tcp_listener;
1283                 mutex_enter(&listener->tcp_eager_lock);


1422         if (tcp->tcp_rthdrdstopts != NULL) {
1423                 mi_free(tcp->tcp_rthdrdstopts);
1424                 tcp->tcp_rthdrdstopts = NULL;
1425                 tcp->tcp_rthdrdstoptslen = 0;
1426         }
1427         ASSERT(tcp->tcp_rthdrdstoptslen == 0);
1428         if (tcp->tcp_rthdr != NULL) {
1429                 mi_free(tcp->tcp_rthdr);
1430                 tcp->tcp_rthdr = NULL;
1431                 tcp->tcp_rthdrlen = 0;
1432         }
1433         ASSERT(tcp->tcp_rthdrlen == 0);
1434 
1435         /*
1436          * What follows really blows away a union.  Because the union
1437          * happens to have exactly two members of identical size,
1438          * the following code is enough.
1439          */
1440         tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
1441 
1442         /* Allow the CC algorithm to clean up after itself. */
1443         if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
1444                 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
1445 
1446         /*
1447          * Destroy any association with SO_REUSEPORT group.
1448          */
1449         if (tcp->tcp_rg_bind != NULL) {
1450                 /*
1451                  * This is only necessary for connections which enabled
1452                  * SO_REUSEPORT but were never bound.  Such connections should
1453                  * be the one and only member of the tcp_rg_tp to which they
1454                  * have been associated.
1455                  */
1456                 VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
1457                 tcp_rg_destroy(tcp->tcp_rg_bind);
1458                 tcp->tcp_rg_bind = NULL;
1459         }
1460 
1461         /*
1462          * If this is a non-STREAM socket still holding on to an upper
1463          * handle, release it. As a result of fallback we might also see
1464          * STREAMS based conns with upper handles, in which case there is
1465          * nothing to do other than clearing the field.


1487  * outside the squeue. So when the interrupt comes, we have a clean
1488  * connection sitting in the freelist. Obviously, this buys us
1489  * performance.
1490  *
1491  * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
1492  * has multiple disadvantages - tying up the squeue during alloc.
1493  * But allocating the conn/tcp in IP land is also not the best since
1494  * we can't check the 'q' and 'q0' which are protected by squeue and
1495  * blindly allocate memory which might have to be freed here if we are
1496  * not allowed to accept the connection. By using the freelist and
1497  * putting the conn/tcp back in freelist, we don't pay a penalty for
1498  * allocating memory without checking 'q/q0' and freeing it if we can't
1499  * accept the connection.
1500  *
1501  * Care should be taken to put the conn back in the same squeue's freelist
1502  * from which it was allocated. Best results are obtained if conn is
1503  * allocated from listener's squeue and freed to the same. Time wait
1504  * collector will free up the freelist if the connection ends up sitting
1505  * there for too long.
1506  */
1507 conn_t *
1508 tcp_get_conn(void *arg, tcp_stack_t *tcps)
1509 {
1510         tcp_t                   *tcp = NULL;
1511         conn_t                  *connp = NULL;
1512         squeue_t                *sqp = (squeue_t *)arg;
1513         tcp_squeue_priv_t       *tcp_time_wait;
1514         netstack_t              *ns;
1515         mblk_t                  *tcp_rsrv_mp = NULL;
1516 
1517         tcp_time_wait =
1518             *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1519 
1520         mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1521         tcp = tcp_time_wait->tcp_free_list;
1522         ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
1523         if (tcp != NULL) {
1524                 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1525                 tcp_time_wait->tcp_free_list_cnt--;
1526                 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1527                 tcp->tcp_time_wait_next = NULL;
1528                 connp = tcp->tcp_connp;
1529                 connp->conn_flags |= IPCL_REUSED;
1530 
1531                 ASSERT(tcp->tcp_tcps == NULL);
1532                 ASSERT(connp->conn_netstack == NULL);
1533                 ASSERT(tcp->tcp_rsrv_mp != NULL);
1534                 ns = tcps->tcps_netstack;
1535                 netstack_hold(ns);
1536                 connp->conn_netstack = ns;
1537                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
1538                 tcp->tcp_tcps = tcps;
1539                 ipcl_globalhash_insert(connp);
1540 
1541                 connp->conn_ixa->ixa_notify_cookie = tcp;
1542                 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
1543                 connp->conn_recv = tcp_input_data;
1544                 ASSERT(connp->conn_recvicmp == tcp_icmp_input);
1545                 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
1546                 return (connp);
1547         }
1548         mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1549         /*
1550          * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until
1551          * this conn_t/tcp_t is freed at ipcl_conn_destroy().
1552          */
1553         tcp_rsrv_mp = allocb(0, BPRI_HI);
1554         if (tcp_rsrv_mp == NULL)
1555                 return (NULL);
1556 
1557         if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
1558             tcps->tcps_netstack)) == NULL) {
1559                 freeb(tcp_rsrv_mp);
1560                 return (NULL);
1561         }
1562 
1563         tcp = connp->conn_tcp;
1564         tcp->tcp_rsrv_mp = tcp_rsrv_mp;
1565         mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
1566 
1567         tcp->tcp_tcps = tcps;
1568 
1569         connp->conn_recv = tcp_input_data;
1570         connp->conn_recvicmp = tcp_icmp_input;
1571         connp->conn_verifyicmp = tcp_verifyicmp;
1572 
1573         /*
1574          * Register tcp_notify to listen to capability changes detected by IP.
1575          * This upcall is made in the context of the call to conn_ip_output
1576          * thus it is inside the squeue.
1577          */
1578         connp->conn_ixa->ixa_notify = tcp_notify;
1579         connp->conn_ixa->ixa_notify_cookie = tcp;
1580 
1581         return (connp);
1582 }
1583 
1584 /*
1585  * Handle connect to IPv4 destinations, including connections for AF_INET6
1586  * sockets connecting to IPv4 mapped IPv6 destinations.
1587  * Returns zero if OK, a positive errno, or a negative TLI error.
1588  */
1589 static int
1590 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
1591     uint_t srcid)
1592 {
1593         ipaddr_t        dstaddr = *dstaddrp;
1594         uint16_t        lport;
1595         conn_t          *connp = tcp->tcp_connp;
1596         tcp_stack_t     *tcps = tcp->tcp_tcps;
1597         int             error;
1598 
1599         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1600 
1601         /* Check for attempt to connect to INADDR_ANY */


1931  */
1932 static void
1933 tcp_reinit(tcp_t *tcp)
1934 {
1935         mblk_t          *mp;
1936         tcp_stack_t     *tcps = tcp->tcp_tcps;
1937         conn_t          *connp  = tcp->tcp_connp;
1938         int32_t         oldstate;
1939 
1940         /* tcp_reinit should never be called for detached tcp_t's */
1941         ASSERT(tcp->tcp_listener == NULL);
1942         ASSERT((connp->conn_family == AF_INET &&
1943             connp->conn_ipversion == IPV4_VERSION) ||
1944             (connp->conn_family == AF_INET6 &&
1945             (connp->conn_ipversion == IPV4_VERSION ||
1946             connp->conn_ipversion == IPV6_VERSION)));
1947 
1948         /* Cancel outstanding timers */
1949         tcp_timers_stop(tcp);
1950 









1951         tcp_close_mpp(&tcp->tcp_xmit_head);
1952         if (tcp->tcp_snd_zcopy_aware)
1953                 tcp_zcopy_notify(tcp);
1954         tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
1955         tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
1956         mutex_enter(&tcp->tcp_non_sq_lock);
1957         if (tcp->tcp_flow_stopped &&
1958             TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1959                 tcp_clrqfull(tcp);
1960         }
1961         mutex_exit(&tcp->tcp_non_sq_lock);
1962         tcp_close_mpp(&tcp->tcp_reass_head);
1963         tcp->tcp_reass_tail = NULL;
1964         if (tcp->tcp_rcv_list != NULL) {
1965                 /* Free b_next chain */
1966                 tcp_close_mpp(&tcp->tcp_rcv_list);
1967                 tcp->tcp_rcv_last_head = NULL;
1968                 tcp->tcp_rcv_last_tail = NULL;
1969                 tcp->tcp_rcv_cnt = 0;
1970         }


2102 
2103         /* Should be ASSERT NULL on these with new code! */
2104         ASSERT(tcp->tcp_time_wait_next == NULL);
2105         ASSERT(tcp->tcp_time_wait_prev == NULL);
2106         ASSERT(tcp->tcp_time_wait_expire == 0);
2107         PRESERVE(tcp->tcp_state);
2108         PRESERVE(connp->conn_rq);
2109         PRESERVE(connp->conn_wq);
2110 
2111         ASSERT(tcp->tcp_xmit_head == NULL);
2112         ASSERT(tcp->tcp_xmit_last == NULL);
2113         ASSERT(tcp->tcp_unsent == 0);
2114         ASSERT(tcp->tcp_xmit_tail == NULL);
2115         ASSERT(tcp->tcp_xmit_tail_unsent == 0);
2116 
2117         tcp->tcp_snxt = 0;                   /* Displayed in mib */
2118         tcp->tcp_suna = 0;                   /* Displayed in mib */
2119         tcp->tcp_swnd = 0;
2120         DONTCARE(tcp->tcp_cwnd);     /* Init in tcp_process_options */
2121 



2122         if (connp->conn_ht_iphc != NULL) {
2123                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
2124                 connp->conn_ht_iphc = NULL;
2125                 connp->conn_ht_iphc_allocated = 0;
2126                 connp->conn_ht_iphc_len = 0;
2127                 connp->conn_ht_ulp = NULL;
2128                 connp->conn_ht_ulp_len = 0;
2129                 tcp->tcp_ipha = NULL;
2130                 tcp->tcp_ip6h = NULL;
2131                 tcp->tcp_tcpha = NULL;
2132         }
2133 
2134         /* We clear any IP_OPTIONS and extension headers */
2135         ip_pkt_free(&connp->conn_xmit_ipp);
2136 
2137         DONTCARE(tcp->tcp_naglim);           /* Init in tcp_init_values */
2138         DONTCARE(tcp->tcp_ipha);
2139         DONTCARE(tcp->tcp_ip6h);
2140         DONTCARE(tcp->tcp_tcpha);
2141         tcp->tcp_valid_bits = 0;


2193         tcp->tcp_initial_pmtu = 0;
2194 
2195         ASSERT(tcp->tcp_reass_head == NULL);
2196         ASSERT(tcp->tcp_reass_tail == NULL);
2197 
2198         tcp->tcp_cwnd_cnt = 0;
2199 
2200         ASSERT(tcp->tcp_rcv_list == NULL);
2201         ASSERT(tcp->tcp_rcv_last_head == NULL);
2202         ASSERT(tcp->tcp_rcv_last_tail == NULL);
2203         ASSERT(tcp->tcp_rcv_cnt == 0);
2204 
2205         DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
2206         DONTCARE(tcp->tcp_cwnd_max);         /* Init in tcp_init_values */
2207         tcp->tcp_csuna = 0;
2208 
2209         tcp->tcp_rto = 0;                    /* Displayed in MIB */
2210         DONTCARE(tcp->tcp_rtt_sa);           /* Init in tcp_init_values */
2211         DONTCARE(tcp->tcp_rtt_sd);           /* Init in tcp_init_values */
2212         tcp->tcp_rtt_update = 0;
2213         tcp->tcp_rtt_sum = 0;
2214         tcp->tcp_rtt_cnt = 0;
2215 
2216         DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2217         DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
2218 
2219         tcp->tcp_rack = 0;                   /* Displayed in mib */
2220         tcp->tcp_rack_cnt = 0;
2221         tcp->tcp_rack_cur_max = 0;
2222         tcp->tcp_rack_abs_max = 0;
2223 
2224         tcp->tcp_max_swnd = 0;
2225 
2226         ASSERT(tcp->tcp_listener == NULL);
2227 
2228         DONTCARE(tcp->tcp_irs);                      /* tcp_valid_bits cleared */
2229         DONTCARE(tcp->tcp_iss);                      /* tcp_valid_bits cleared */
2230         DONTCARE(tcp->tcp_fss);                      /* tcp_valid_bits cleared */
2231         DONTCARE(tcp->tcp_urg);                      /* tcp_valid_bits cleared */
2232 
2233         ASSERT(tcp->tcp_conn_req_cnt_q == 0);
2234         ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);


2330         tcp->tcp_in_ack_unsent = 0;
2331         tcp->tcp_cork = B_FALSE;
2332         tcp->tcp_tconnind_started = B_FALSE;
2333 
2334         PRESERVE(tcp->tcp_squeue_bytes);
2335 
2336         tcp->tcp_closemp_used = B_FALSE;
2337 
2338         PRESERVE(tcp->tcp_rsrv_mp);
2339         PRESERVE(tcp->tcp_rsrv_mp_lock);
2340 
2341 #ifdef DEBUG
2342         DONTCARE(tcp->tcmp_stk[0]);
2343 #endif
2344 
2345         PRESERVE(tcp->tcp_connid);
2346 
2347         ASSERT(tcp->tcp_listen_cnt == NULL);
2348         ASSERT(tcp->tcp_reass_tid == 0);
2349 
2350         /* Allow the CC algorithm to clean up after itself. */
2351         if (tcp->tcp_cc_algo->cb_destroy != NULL)
2352                 tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
2353         tcp->tcp_cc_algo = NULL;
2354 
2355 #undef  DONTCARE
2356 #undef  PRESERVE
2357 }
2358 
2359 /*
2360  * Initialize the various fields in tcp_t.  If parent (the listener) is non
2361  * NULL, certain values will be inherited from it.
2362  */
2363 void
2364 tcp_init_values(tcp_t *tcp, tcp_t *parent)
2365 {
2366         tcp_stack_t     *tcps = tcp->tcp_tcps;
2367         conn_t          *connp = tcp->tcp_connp;

2368 
2369         ASSERT((connp->conn_family == AF_INET &&
2370             connp->conn_ipversion == IPV4_VERSION) ||
2371             (connp->conn_family == AF_INET6 &&
2372             (connp->conn_ipversion == IPV4_VERSION ||
2373             connp->conn_ipversion == IPV6_VERSION)));
2374 
2375         tcp->tcp_ccv.type = IPPROTO_TCP;
2376         tcp->tcp_ccv.ccvc.tcp = tcp;
2377 
2378         if (parent == NULL) {
2379                 tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
2380 
2381                 tcp->tcp_naglim = tcps->tcps_naglim_def;
2382 
2383                 tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
2384                 tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
2385                 tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
2386 
2387                 tcp->tcp_first_ctimer_threshold =
2388                     tcps->tcps_ip_notify_cinterval;
2389                 tcp->tcp_second_ctimer_threshold =
2390                     tcps->tcps_ip_abort_cinterval;
2391                 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
2392                 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
2393 
2394                 tcp->tcp_fin_wait_2_flush_interval =
2395                     tcps->tcps_fin_wait_2_flush_interval;
2396 
2397                 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
2398                 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
2399                 tcp->tcp_ka_cnt = 0;
2400                 tcp->tcp_ka_rinterval = 0;
2401 
2402                 /*
2403                  * Default value of tcp_init_cwnd is 0, so no need to set here
2404                  * if parent is NULL.  But we need to inherit it from parent.
2405                  */
2406         } else {
2407                 /* Inherit various TCP parameters from the parent. */
2408                 tcp->tcp_cc_algo = parent->tcp_cc_algo;
2409 
2410                 tcp->tcp_naglim = parent->tcp_naglim;
2411 
2412                 tcp->tcp_rto_initial = parent->tcp_rto_initial;
2413                 tcp->tcp_rto_min = parent->tcp_rto_min;
2414                 tcp->tcp_rto_max = parent->tcp_rto_max;
2415 
2416                 tcp->tcp_first_ctimer_threshold =
2417                     parent->tcp_first_ctimer_threshold;
2418                 tcp->tcp_second_ctimer_threshold =
2419                     parent->tcp_second_ctimer_threshold;
2420                 tcp->tcp_first_timer_threshold =
2421                     parent->tcp_first_timer_threshold;
2422                 tcp->tcp_second_timer_threshold =
2423                     parent->tcp_second_timer_threshold;
2424 
2425                 tcp->tcp_fin_wait_2_flush_interval =
2426                     parent->tcp_fin_wait_2_flush_interval;
2427 
2428                 tcp->tcp_ka_interval = parent->tcp_ka_interval;
2429                 tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
2430                 tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
2431                 tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;
2432 
2433                 tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
2434         }
2435 
2436         if (tcp->tcp_cc_algo->cb_init != NULL)
2437                 VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
2438 
2439         /*
2440          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
2441          * will be close to tcp_rexmit_interval_initial.  By doing this, we
2442          * allow the algorithm to adjust slowly to large fluctuations of RTT
2443          * during first few transmissions of a connection as seen in slow
2444          * links.
2445          */
2446         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
2447         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
2448         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);



2449 
2450         tcp->tcp_timer_backoff = 0;
2451         tcp->tcp_ms_we_have_waited = 0;
2452         tcp->tcp_last_recv_time = ddi_get_lbolt();
2453         tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
2454         tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2455 
2456         tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
2457 
2458         /* NOTE:  ISS is now set in tcp_set_destination(). */
2459 
2460         /* Reset fusion-related fields */
2461         tcp->tcp_fused = B_FALSE;
2462         tcp->tcp_unfusable = B_FALSE;
2463         tcp->tcp_fused_sigurg = B_FALSE;
2464         tcp->tcp_loopback_peer = NULL;
2465 
2466         /* We rebuild the header template on the next connect/conn_request */
2467 
2468         connp->conn_mlp_type = mlptSingle;


2665                         return (NULL);
2666                 }
2667 
2668                 ns = netstack_find_by_cred(credp);
2669                 ASSERT(ns != NULL);
2670                 tcps = ns->netstack_tcp;
2671                 ASSERT(tcps != NULL);
2672 
2673                 /*
2674                  * For exclusive stacks we set the zoneid to zero
2675                  * to make TCP operate as if in the global zone.
2676                  */
2677                 if (tcps->tcps_netstack->netstack_stackid !=
2678                     GLOBAL_NETSTACKID)
2679                         zoneid = GLOBAL_ZONEID;
2680                 else
2681                         zoneid = crgetzoneid(credp);
2682         }
2683 
2684         sqp = IP_SQUEUE_GET((uint_t)gethrtime());
2685         connp = tcp_get_conn(sqp, tcps);
2686         /*
2687          * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
2688          * so we drop it by one.
2689          */
2690         netstack_rele(tcps->tcps_netstack);
2691         if (connp == NULL) {
2692                 *errorp = ENOSR;
2693                 return (NULL);
2694         }
2695         ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
2696 
2697         connp->conn_sqp = sqp;
2698         connp->conn_initial_sqp = connp->conn_sqp;
2699         connp->conn_ixa->ixa_sqp = connp->conn_sqp;
2700         tcp = connp->conn_tcp;
2701 
2702         /*
2703          * Besides asking IP to set the checksum for us, have conn_ip_output
2704          * to do the following checks when necessary:
2705          *


3853          * there are many CPUs as we will be adding them 1 by 1.
3854          *
3855          * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers
3856          * are not freed until the stack is going away.  So there is no need
3857          * to grab a lock to access the per CPU tcps_sc[x] pointer.
3858          */
3859         mutex_enter(&cpu_lock);
3860         tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus);
3861         mutex_exit(&cpu_lock);
3862         tcps->tcps_sc = kmem_zalloc(max_ncpus  * sizeof (tcp_stats_cpu_t *),
3863             KM_SLEEP);
3864         for (i = 0; i < tcps->tcps_sc_cnt; i++) {
3865                 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
3866                     KM_SLEEP);
3867         }
3868 
3869         mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL);
3870         list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
3871             offsetof(tcp_listener_t, tl_link));
3872 
3873         tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
3874         ASSERT3P(tcps->tcps_default_cc_algo, !=, NULL);
3875 
3876         return (tcps);
3877 }
3878 
3879 /*
3880  * Called when the IP module is about to be unloaded.
      *
      * Releases TCP's global resources in this order: tcp_g_kstat is handed to
      * tcp_g_kstat_fini() and the pointer cleared, tcp_g_statistics is zeroed,
      * tcp_random_lock is destroyed, the tcp_timercache and
      * tcp_notsack_blk_cache kmem caches are destroyed, and finally NS_TCP is
      * unregistered with netstack_unregister().
3881  */
3882 void
3883 tcp_ddi_g_destroy(void)
3884 {
3885         tcp_g_kstat_fini(tcp_g_kstat);
3886         tcp_g_kstat = NULL;
3887         bzero(&tcp_g_statistics, sizeof (tcp_g_statistics));
3888 
3889         mutex_destroy(&tcp_random_lock);
3890 
3891         kmem_cache_destroy(tcp_timercache);
3892         kmem_cache_destroy(tcp_notsack_blk_cache);
3893 
3894         netstack_unregister(NS_TCP);
3895 }