1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/strlog.h>
  28 #include <sys/policy.h>
  29 #include <sys/strsun.h>
  30 #include <sys/squeue_impl.h>
  31 #include <sys/squeue.h>
  32 
  33 #include <inet/common.h>
  34 #include <inet/ip.h>
  35 #include <inet/tcp.h>
  36 #include <inet/tcp_impl.h>
  37 
  38 /*
 * Control whether TCP can enter defensive mode when under memory pressure.
 * tcp_conn_reclaim() tests this flag first and returns without looking at
 * any stack while it is B_FALSE.
 */
  39 static boolean_t tcp_do_reclaim = B_TRUE;
  40 
  41 /*
  42  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
  43  *
  44  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
  45  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
  46  * (defined in tcp.h) needs to be filled in and passed into the kernel
  47  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
  48  * structure contains the four-tuple of a TCP connection and a range of TCP
  49  * states (specified by ac_start and ac_end). The use of wildcard addresses
  50  * and ports is allowed. Connections with a matching four tuple and a state
  51  * within the specified range will be aborted. The valid states for the
  52  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
  53  * inclusive.
  54  *
  55  * An application which has its connection aborted by this ioctl will receive
  56  * an error that is dependent on the connection state at the time of the abort.
  57  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
  58  * though a RST packet has been received.  If the connection state is equal to
  59  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
  60  * and all resources associated with the connection will be freed.
  61  */
  62 static mblk_t   *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
  63 static void     tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
  64 static void     tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
  65     ip_recv_attr_t *dummy);
  66 static int      tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
  67 void    tcp_ioctl_abort_conn(queue_t *, mblk_t *);
  68 static int      tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
  69     boolean_t, tcp_stack_t *);
  70 
  71 /*
  72  * Macros used for accessing the different types of sockaddr
  73  * structures inside a tcp_ioc_abort_conn_t.
  74  */
  75 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local)
  76 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote)
  77 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
  78 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
  79 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port)
  80 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port)
  81 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local)
  82 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote)
  83 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr)
  84 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr)
  85 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port)
  86 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port)
  87 
  88 /*
  89  * Return the correct error code to mimic the behavior
  90  * of a connection reset.
  91  */
  92 #define TCP_AC_GET_ERRCODE(state, err) {        \
  93                 switch ((state)) {              \
  94                 case TCPS_SYN_SENT:             \
  95                 case TCPS_SYN_RCVD:             \
  96                         (err) = ECONNREFUSED;   \
  97                         break;                  \
  98                 case TCPS_ESTABLISHED:          \
  99                 case TCPS_FIN_WAIT_1:           \
 100                 case TCPS_FIN_WAIT_2:           \
 101                 case TCPS_CLOSE_WAIT:           \
 102                         (err) = ECONNRESET;     \
 103                         break;                  \
 104                 case TCPS_CLOSING:              \
 105                 case TCPS_LAST_ACK:             \
 106                 case TCPS_TIME_WAIT:            \
 107                         (err) = 0;              \
 108                         break;                  \
 109                 default:                        \
 110                         (err) = ENXIO;          \
 111                 }                               \
 112         }
 113 
 114 /*
 115  * Check if a tcp structure matches the info in acp.
 116  */
 117 #define TCP_AC_ADDR_MATCH(acp, connp, tcp)                      \
 118         (((acp)->ac_local.ss_family == AF_INET) ?            \
 119         ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||                \
 120         TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&  \
 121         (TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||                \
 122         TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \
 123         (TCP_AC_V4LPORT((acp)) == 0 ||                          \
 124         TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&     \
 125         (TCP_AC_V4RPORT((acp)) == 0 ||                          \
 126         TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) &&     \
 127         (acp)->ac_start <= (tcp)->tcp_state &&                 \
 128         (acp)->ac_end >= (tcp)->tcp_state) :                   \
 129         ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||        \
 130         IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),          \
 131         &(connp)->conn_laddr_v6)) &&                             \
 132         (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||        \
 133         IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),         \
 134         &(connp)->conn_faddr_v6)) &&                             \
 135         (TCP_AC_V6LPORT((acp)) == 0 ||                          \
 136         TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&     \
 137         (TCP_AC_V6RPORT((acp)) == 0 ||                          \
 138         TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) &&     \
 139         (acp)->ac_start <= (tcp)->tcp_state &&                 \
 140         (acp)->ac_end >= (tcp)->tcp_state))
 141 
 142 #define TCP_AC_MATCH(acp, connp, tcp)                           \
 143         (((acp)->ac_zoneid == ALL_ZONES ||                   \
 144         (acp)->ac_zoneid == (connp)->conn_zoneid) ?               \
 145         TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0)
 146 
 147 /*
 148  * Build a message containing a tcp_ioc_abort_conn_t structure
 149  * which is filled in with information from acp and tp.
 150  */
 151 static mblk_t *
 152 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
 153 {
 154         mblk_t *mp;
 155         tcp_ioc_abort_conn_t *tacp;
 156 
 157         mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
 158         if (mp == NULL)
 159                 return (NULL);
 160 
 161         *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
 162         tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
 163             sizeof (uint32_t));
 164 
 165         tacp->ac_start = acp->ac_start;
 166         tacp->ac_end = acp->ac_end;
 167         tacp->ac_zoneid = acp->ac_zoneid;
 168 
 169         if (acp->ac_local.ss_family == AF_INET) {
 170                 tacp->ac_local.ss_family = AF_INET;
 171                 tacp->ac_remote.ss_family = AF_INET;
 172                 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
 173                 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
 174                 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
 175                 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
 176         } else {
 177                 tacp->ac_local.ss_family = AF_INET6;
 178                 tacp->ac_remote.ss_family = AF_INET6;
 179                 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
 180                 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
 181                 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
 182                 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
 183         }
 184         mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
 185         return (mp);
 186 }
 187 
 188 /*
 189  * Print a tcp_ioc_abort_conn_t structure.
 190  */
 191 static void
 192 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
 193 {
 194         char lbuf[128];
 195         char rbuf[128];
 196         sa_family_t af;
 197         in_port_t lport, rport;
 198         ushort_t logflags;
 199 
 200         af = acp->ac_local.ss_family;
 201 
 202         if (af == AF_INET) {
 203                 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
 204                     lbuf, 128);
 205                 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
 206                     rbuf, 128);
 207                 lport = ntohs(TCP_AC_V4LPORT(acp));
 208                 rport = ntohs(TCP_AC_V4RPORT(acp));
 209         } else {
 210                 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
 211                     lbuf, 128);
 212                 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
 213                     rbuf, 128);
 214                 lport = ntohs(TCP_AC_V6LPORT(acp));
 215                 rport = ntohs(TCP_AC_V6RPORT(acp));
 216         }
 217 
 218         logflags = SL_TRACE | SL_NOTE;
 219         /*
 220          * Don't print this message to the console if the operation was done
 221          * to a non-global zone.
 222          */
 223         if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 224                 logflags |= SL_CONSOLE;
 225         (void) strlog(TCP_MOD_ID, 0, 1, logflags,
 226             "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
 227             "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
 228             acp->ac_start, acp->ac_end);
 229 }
 230 
 231 /*
 232  * Called using SQ_FILL when a message built using
 233  * tcp_ioctl_abort_build_msg is put into a queue.
 234  * Note that when we get here there is no wildcard in acp any more.
 235  */
 236 /* ARGSUSED2 */
 237 static void
 238 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
 239     ip_recv_attr_t *dummy)
 240 {
 241         conn_t                  *connp = (conn_t *)arg;
 242         tcp_t                   *tcp = connp->conn_tcp;
 243         tcp_ioc_abort_conn_t    *acp;
 244 
 245         /*
 246          * Don't accept any input on a closed tcp as this TCP logically does
 247          * not exist on the system. Don't proceed further with this TCP.
 248          * For eg. this packet could trigger another close of this tcp
 249          * which would be disastrous for tcp_refcnt. tcp_close_detached /
 250          * tcp_clean_death / tcp_closei_local must be called at most once
 251          * on a TCP.
 252          */
 253         if (tcp->tcp_state == TCPS_CLOSED ||
 254             tcp->tcp_state == TCPS_BOUND) {
 255                 freemsg(mp);
 256                 return;
 257         }
 258 
 259         acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
 260         if (tcp->tcp_state <= acp->ac_end) {
 261                 /*
 262                  * If we get here, we are already on the correct
 263                  * squeue. This ioctl follows the following path
 264                  * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
 265                  * ->tcp_ioctl_abort->squeue_enter (if on a
 266                  * different squeue)
 267                  */
 268                 int errcode;
 269 
 270                 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
 271                 (void) tcp_clean_death(tcp, errcode);
 272         }
 273         freemsg(mp);
 274 }
 275 
 276 /*
 277  * Abort all matching connections on a hash chain.
 278  */
 279 static int
 280 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
 281     boolean_t exact, tcp_stack_t *tcps)
 282 {
 283         int nmatch, err = 0;
 284         tcp_t *tcp;
 285         MBLKP mp, last, listhead = NULL;
 286         conn_t  *tconnp;
 287         connf_t *connfp;
 288         ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
 289 
 290         connfp = &ipst->ips_ipcl_conn_fanout[index];
 291 
 292 startover:
 293         nmatch = 0;
 294 
 295         mutex_enter(&connfp->connf_lock);
 296         for (tconnp = connfp->connf_head; tconnp != NULL;
 297             tconnp = tconnp->conn_next) {
 298                 tcp = tconnp->conn_tcp;
 299                 /*
 300                  * We are missing a check on sin6_scope_id for linklocals here,
 301                  * but current usage is just for aborting based on zoneid
 302                  * for shared-IP zones.
 303                  */
 304                 if (TCP_AC_MATCH(acp, tconnp, tcp)) {
 305                         CONN_INC_REF(tconnp);
 306                         mp = tcp_ioctl_abort_build_msg(acp, tcp);
 307                         if (mp == NULL) {
 308                                 err = ENOMEM;
 309                                 CONN_DEC_REF(tconnp);
 310                                 break;
 311                         }
 312                         mp->b_prev = (mblk_t *)tcp;
 313 
 314                         if (listhead == NULL) {
 315                                 listhead = mp;
 316                                 last = mp;
 317                         } else {
 318                                 last->b_next = mp;
 319                                 last = mp;
 320                         }
 321                         nmatch++;
 322                         if (exact)
 323                                 break;
 324                 }
 325 
 326                 /* Avoid holding lock for too long. */
 327                 if (nmatch >= 500)
 328                         break;
 329         }
 330         mutex_exit(&connfp->connf_lock);
 331 
 332         /* Pass mp into the correct tcp */
 333         while ((mp = listhead) != NULL) {
 334                 listhead = listhead->b_next;
 335                 tcp = (tcp_t *)mp->b_prev;
 336                 mp->b_next = mp->b_prev = NULL;
 337                 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
 338                     tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
 339                     SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
 340         }
 341 
 342         *count += nmatch;
 343         if (nmatch >= 500 && err == 0)
 344                 goto startover;
 345         return (err);
 346 }
 347 
 348 /*
 349  * Abort all connections that matches the attributes specified in acp.
 350  */
 351 static int
 352 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
 353 {
 354         sa_family_t af;
 355         uint32_t  ports;
 356         uint16_t *pports;
 357         int err = 0, count = 0;
 358         boolean_t exact = B_FALSE; /* set when there is no wildcard */
 359         int index = -1;
 360         ushort_t logflags;
 361         ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
 362 
 363         af = acp->ac_local.ss_family;
 364 
 365         if (af == AF_INET) {
 366                 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
 367                     TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
 368                         pports = (uint16_t *)&ports;
 369                         pports[1] = TCP_AC_V4LPORT(acp);
 370                         pports[0] = TCP_AC_V4RPORT(acp);
 371                         exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
 372                 }
 373         } else {
 374                 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
 375                     TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
 376                         pports = (uint16_t *)&ports;
 377                         pports[1] = TCP_AC_V6LPORT(acp);
 378                         pports[0] = TCP_AC_V6RPORT(acp);
 379                         exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
 380                 }
 381         }
 382 
 383         /*
 384          * For cases where remote addr, local port, and remote port are non-
 385          * wildcards, tcp_ioctl_abort_bucket will only be called once.
 386          */
 387         if (index != -1) {
 388                 err = tcp_ioctl_abort_bucket(acp, index,
 389                     &count, exact, tcps);
 390         } else {
 391                 /*
 392                  * loop through all entries for wildcard case
 393                  */
 394                 for (index = 0;
 395                     index < ipst->ips_ipcl_conn_fanout_size;
 396                     index++) {
 397                         err = tcp_ioctl_abort_bucket(acp, index,
 398                             &count, exact, tcps);
 399                         if (err != 0)
 400                                 break;
 401                 }
 402         }
 403 
 404         logflags = SL_TRACE | SL_NOTE;
 405         /*
 406          * Don't print this message to the console if the operation was done
 407          * to a non-global zone.
 408          */
 409         if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 410                 logflags |= SL_CONSOLE;
 411         (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
 412             "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
 413         if (err == 0 && count == 0)
 414                 err = ENOENT;
 415         return (err);
 416 }
 417 
 418 /*
 419  * Process the TCP_IOC_ABORT_CONN ioctl request.
 420  */
 421 void
 422 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
 423 {
 424         int     err;
 425         IOCP    iocp;
 426         MBLKP   mp1;
 427         sa_family_t laf, raf;
 428         tcp_ioc_abort_conn_t *acp;
 429         zone_t          *zptr;
 430         conn_t          *connp = Q_TO_CONN(q);
 431         zoneid_t        zoneid = connp->conn_zoneid;
 432         tcp_t           *tcp = connp->conn_tcp;
 433         tcp_stack_t     *tcps = tcp->tcp_tcps;
 434 
 435         iocp = (IOCP)mp->b_rptr;
 436 
 437         if ((mp1 = mp->b_cont) == NULL ||
 438             iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
 439                 err = EINVAL;
 440                 goto out;
 441         }
 442 
 443         /* check permissions */
 444         if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
 445                 err = EPERM;
 446                 goto out;
 447         }
 448 
 449         if (mp1->b_cont != NULL) {
 450                 freemsg(mp1->b_cont);
 451                 mp1->b_cont = NULL;
 452         }
 453 
 454         acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
 455         laf = acp->ac_local.ss_family;
 456         raf = acp->ac_remote.ss_family;
 457 
 458         /* check that a zone with the supplied zoneid exists */
 459         if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
 460                 zptr = zone_find_by_id(zoneid);
 461                 if (zptr != NULL) {
 462                         zone_rele(zptr);
 463                 } else {
 464                         err = EINVAL;
 465                         goto out;
 466                 }
 467         }
 468 
 469         /*
 470          * For exclusive stacks we set the zoneid to zero
 471          * to make TCP operate as if in the global zone.
 472          */
 473         if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
 474                 acp->ac_zoneid = GLOBAL_ZONEID;
 475 
 476         if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
 477             acp->ac_start > acp->ac_end || laf != raf ||
 478             (laf != AF_INET && laf != AF_INET6)) {
 479                 err = EINVAL;
 480                 goto out;
 481         }
 482 
 483         tcp_ioctl_abort_dump(acp);
 484         err = tcp_ioctl_abort(acp, tcps);
 485 
 486 out:
 487         if (mp1 != NULL) {
 488                 freemsg(mp1);
 489                 mp->b_cont = NULL;
 490         }
 491 
 492         if (err != 0)
 493                 miocnak(q, mp, 0, err);
 494         else
 495                 miocack(q, mp, 0, 0);
 496 }
 497 
 498 /*
 499  * Timeout function to reset the TCP stack variable tcps_reclaim to false.
 500  */
 501 void
 502 tcp_reclaim_timer(void *arg)
 503 {
 504         tcp_stack_t *tcps = (tcp_stack_t *)arg;
 505         int64_t tot_conn = 0;
 506         int i;
 507         extern pgcnt_t lotsfree, needfree;
 508 
 509         for (i = 0; i < tcps->tcps_sc_cnt; i++)
 510                 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
 511 
 512         /*
 513          * This happens only when a stack is going away.  tcps_reclaim_tid
 514          * should not be reset to 0 when returning in this case.
 515          */
 516         mutex_enter(&tcps->tcps_reclaim_lock);
 517         if (!tcps->tcps_reclaim) {
 518                 mutex_exit(&tcps->tcps_reclaim_lock);
 519                 return;
 520         }
 521 
 522         if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
 523                 tcps->tcps_reclaim = B_FALSE;
 524                 tcps->tcps_reclaim_tid = 0;
 525         } else {
 526                 /* Stay in defensive mode and restart the timer */
 527                 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
 528                     tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
 529         }
 530         mutex_exit(&tcps->tcps_reclaim_lock);
 531 }
 532 
 533 /*
 534  * Kmem reclaim call back function.  When the system is under memory
 535  * pressure, we set the TCP stack variable tcps_reclaim to true.  This
 536  * variable is reset to false after tcps_reclaim_period msecs.  During this
 537  * period, TCP will be more aggressive in aborting connections not making
 538  * progress, meaning retransmitting for some time (tcp_early_abort seconds).
 539  * TCP will also not accept new connection request for those listeners whose
 540  * q or q0 is not empty.
 541  */
 542 /* ARGSUSED */
 543 void
 544 tcp_conn_reclaim(void *arg)
 545 {
 546         netstack_handle_t nh;
 547         netstack_t *ns;
 548         tcp_stack_t *tcps;
 549         extern pgcnt_t lotsfree, needfree;
 550 
 551         if (!tcp_do_reclaim)
 552                 return;
 553 
 554         /*
 555          * The reclaim function may be called even when the system is not
 556          * really under memory pressure.
 557          */
 558         if (freemem >= lotsfree + needfree)
 559                 return;
 560 
 561         netstack_next_init(&nh);
 562         while ((ns = netstack_next(&nh)) != NULL) {
 563                 int i;
 564                 int64_t tot_conn = 0;
 565 
 566                 /*
 567                  * During boot time, the first netstack_t is created and
 568                  * initialized before TCP has registered with the netstack
 569                  * framework.  If this reclaim function is called before TCP
 570                  * has finished its initialization, netstack_next() will
 571                  * return the first netstack_t (since its netstack_flags is
 572                  * not NSF_UNINIT).  And its netstack_tcp will be NULL.  We
 573                  * need to catch it.
 574                  *
 575                  * All subsequent netstack_t creation will not have this
 576                  * problem since the initialization is not finished until TCP
 577                  * has finished its own tcp_stack_t initialization.  Hence
 578                  * netstack_next() will not return one with NULL netstack_tcp.
 579                  */
 580                 if ((tcps = ns->netstack_tcp) == NULL) {
 581                         netstack_rele(ns);
 582                         continue;
 583                 }
 584 
 585                 /*
 586                  * Even if the system is under memory pressure, the reason may
 587                  * not be because of TCP activity.  Check the number of
 588                  * connections in each stack.  If the number exceeds the
 589                  * threshold (maxusers), turn on defensive mode.
 590                  */
 591                 for (i = 0; i < tcps->tcps_sc_cnt; i++)
 592                         tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
 593                 if (tot_conn < maxusers) {
 594                         netstack_rele(ns);
 595                         continue;
 596                 }
 597 
 598                 mutex_enter(&tcps->tcps_reclaim_lock);
 599                 if (!tcps->tcps_reclaim) {
 600                         tcps->tcps_reclaim = B_TRUE;
 601                         tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
 602                             tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
 603                         TCP_STAT(tcps, tcp_reclaim_cnt);
 604                 }
 605                 mutex_exit(&tcps->tcps_reclaim_lock);
 606                 netstack_rele(ns);
 607         }
 608         netstack_next_fini(&nh);
 609 }
 610 
 611 /*
 612  * Given a tcp_stack_t and a port (in host byte order), find a listener
 613  * configuration for that port and return the ratio.
 614  */
 615 uint32_t
 616 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
 617 {
 618         tcp_listener_t  *tl;
 619         uint32_t ratio = 0;
 620 
 621         mutex_enter(&tcps->tcps_listener_conf_lock);
 622         for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
 623             tl = list_next(&tcps->tcps_listener_conf, tl)) {
 624                 if (tl->tl_port == port) {
 625                         ratio = tl->tl_ratio;
 626                         break;
 627                 }
 628         }
 629         mutex_exit(&tcps->tcps_listener_conf_lock);
 630         return (ratio);
 631 }
 632 
 633 /*
 634  * To remove all listener limit configuration in a tcp_stack_t.
 635  */
 636 void
 637 tcp_listener_conf_cleanup(tcp_stack_t *tcps)
 638 {
 639         tcp_listener_t  *tl;
 640 
 641         mutex_enter(&tcps->tcps_listener_conf_lock);
 642         while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
 643                 list_remove(&tcps->tcps_listener_conf, tl);
 644                 kmem_free(tl, sizeof (tcp_listener_t));
 645         }
 646         mutex_destroy(&tcps->tcps_listener_conf_lock);
 647         list_destroy(&tcps->tcps_listener_conf);
 648 }
 649 
 650 /*
 651  * When a CPU is added, we need to allocate the per CPU stats struct.
 652  */
 653 void
 654 tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid)
 655 {
 656         int i;
 657 
 658         if (cpu_seqid < tcps->tcps_sc_cnt)
 659                 return;
 660         for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) {
 661                 ASSERT(tcps->tcps_sc[i] == NULL);
 662                 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
 663                     KM_SLEEP);
 664         }
 665         membar_producer();
 666         tcps->tcps_sc_cnt = cpu_seqid + 1;
 667 }
 668 
 669 /*
 670  * Diagnostic routine used to return a string associated with the tcp state.
 671  * Note that if the caller does not supply a buffer, it will use an internal
 672  * static string.  This means that if multiple threads call this function at
 673  * the same time, output can be corrupted...  Note also that this function
 674  * does not check the size of the supplied buffer.  The caller has to make
 675  * sure that it is big enough.
 676  */
 677 char *
 678 tcp_display(tcp_t *tcp, char *sup_buf, char format)
 679 {
 680         char            buf1[30];
 681         static char     priv_buf[INET6_ADDRSTRLEN * 2 + 80];
 682         char            *buf;
 683         char            *cp;
 684         in6_addr_t      local, remote;
 685         char            local_addrbuf[INET6_ADDRSTRLEN];
 686         char            remote_addrbuf[INET6_ADDRSTRLEN];
 687         conn_t          *connp;
 688 
 689         if (sup_buf != NULL)
 690                 buf = sup_buf;
 691         else
 692                 buf = priv_buf;
 693 
 694         if (tcp == NULL)
 695                 return ("NULL_TCP");
 696 
 697         connp = tcp->tcp_connp;
 698         switch (tcp->tcp_state) {
 699         case TCPS_CLOSED:
 700                 cp = "TCP_CLOSED";
 701                 break;
 702         case TCPS_IDLE:
 703                 cp = "TCP_IDLE";
 704                 break;
 705         case TCPS_BOUND:
 706                 cp = "TCP_BOUND";
 707                 break;
 708         case TCPS_LISTEN:
 709                 cp = "TCP_LISTEN";
 710                 break;
 711         case TCPS_SYN_SENT:
 712                 cp = "TCP_SYN_SENT";
 713                 break;
 714         case TCPS_SYN_RCVD:
 715                 cp = "TCP_SYN_RCVD";
 716                 break;
 717         case TCPS_ESTABLISHED:
 718                 cp = "TCP_ESTABLISHED";
 719                 break;
 720         case TCPS_CLOSE_WAIT:
 721                 cp = "TCP_CLOSE_WAIT";
 722                 break;
 723         case TCPS_FIN_WAIT_1:
 724                 cp = "TCP_FIN_WAIT_1";
 725                 break;
 726         case TCPS_CLOSING:
 727                 cp = "TCP_CLOSING";
 728                 break;
 729         case TCPS_LAST_ACK:
 730                 cp = "TCP_LAST_ACK";
 731                 break;
 732         case TCPS_FIN_WAIT_2:
 733                 cp = "TCP_FIN_WAIT_2";
 734                 break;
 735         case TCPS_TIME_WAIT:
 736                 cp = "TCP_TIME_WAIT";
 737                 break;
 738         default:
 739                 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
 740                 cp = buf1;
 741                 break;
 742         }
 743         switch (format) {
 744         case DISP_ADDR_AND_PORT:
 745                 if (connp->conn_ipversion == IPV4_VERSION) {
 746                         /*
 747                          * Note that we use the remote address in the tcp_b
 748                          * structure.  This means that it will print out
 749                          * the real destination address, not the next hop's
 750                          * address if source routing is used.
 751                          */
 752                         IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
 753                         IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
 754 
 755                 } else {
 756                         local = connp->conn_laddr_v6;
 757                         remote = connp->conn_faddr_v6;
 758                 }
 759                 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
 760                     sizeof (local_addrbuf));
 761                 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
 762                     sizeof (remote_addrbuf));
 763                 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
 764                     local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
 765                     ntohs(connp->conn_fport), cp);
 766                 break;
 767         case DISP_PORT_ONLY:
 768         default:
 769                 (void) mi_sprintf(buf, "[%u, %u] %s",
 770                     ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
 771                 break;
 772         }
 773 
 774         return (buf);
 775 }