1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2016 Joyent, Inc.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #define _SUN_TPI_VERSION 2
  30 #include <sys/tihdr.h>
  31 #include <sys/socket.h>
  32 #include <sys/xti_xtiopt.h>
  33 #include <sys/xti_inet.h>
  34 #include <sys/policy.h>
  35 
  36 #include <inet/common.h>
  37 #include <netinet/ip6.h>
  38 #include <inet/ip.h>
  39 
  40 #include <netinet/in.h>
  41 #include <netinet/tcp.h>
  42 #include <inet/optcom.h>
  43 #include <inet/proto_set.h>
  44 #include <inet/tcp_impl.h>
  45 
  46 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  47 
  48 /*
  49  * Table of all known options handled on a TCP protocol stack.
  50  *
  51  * Note: This table contains options processed by both TCP and IP levels
  52  *       and is the superset of options that can be performed on a TCP over IP
  53  *       stack.
  54  */
  55 opdes_t tcp_opt_arr[] = {
  56 
  57 { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  58         sizeof (struct linger), 0 },
  59 
  60 { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  61 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  62 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  63 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  64         },
  65 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  66 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  67 { SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  68 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69 { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  70 { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  71 { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  72 { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  73         sizeof (struct timeval), 0 },
  74 { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  75         sizeof (struct timeval), 0 },
  76 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  77         },
  78 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  79 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  80         0 },
  81 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  82         0 },
  83 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  84         0 },
  85 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
  86         0 },
  87 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  88 
  89 { SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  90 
  91 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  92 
  93 { TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  94         },
  95 { TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
  96         536 },
  97 
  98 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
  99         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 100 
 101 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 102         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 103 
 104 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 105         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 106 
 107 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 108         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 109 
 110 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
 111         0 },
 112 
 113 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
 114         sizeof (int), 0 },
 115 
 116 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 117         },
 118 
 119 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
 120         sizeof (int), 0 },
 121 
 122 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 123         sizeof (int), 0 },
 124 
 125 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 126 
 127 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 128 
 129 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 130 
 131 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 132         sizeof (int), 0 },
 133 
 134 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 135 
 136 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 137 
 138 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 139 
 140 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 141 
 142 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 143 
 144 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 145         (OP_VARLEN|OP_NODEFAULT),
 146         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 147 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 148         (OP_VARLEN|OP_NODEFAULT),
 149         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 150 
 151 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 152 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 153 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 154         sizeof (int), -1 /* not initialized */ },
 155 
 156 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 157         sizeof (ipsec_req_t), -1 /* not initialized */ },
 158 
 159 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 160         sizeof (int),   0 /* no ifindex */ },
 161 
 162 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 163         sizeof (int), 0 },
 164 
 165 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 166         sizeof (int), -1 /* not initialized */ },
 167 
 168 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 169         sizeof (int),   0 /* no ifindex */ },
 170 
 171 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 172 
 173 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
 174         sizeof (in_addr_t),     -1 /* not initialized  */ },
 175 
 176 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
 177         sizeof (int), 0 },
 178 
 179 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 180         (OP_NODEFAULT|OP_VARLEN),
 181         sizeof (struct in6_pktinfo), -1 /* not initialized */ },
 182 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 183         OP_NODEFAULT,
 184         sizeof (sin6_t), -1 /* not initialized */ },
 185 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 186         (OP_VARLEN|OP_NODEFAULT), 255*8,
 187         -1 /* not initialized */ },
 188 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 189         (OP_VARLEN|OP_NODEFAULT), 255*8,
 190         -1 /* not initialized */ },
 191 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 192         (OP_VARLEN|OP_NODEFAULT), 255*8,
 193         -1 /* not initialized */ },
 194 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 195         (OP_VARLEN|OP_NODEFAULT), 255*8,
 196         -1 /* not initialized */ },
 197 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 198         OP_NODEFAULT,
 199         sizeof (int), -1 /* not initialized */ },
 200 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 201         OP_NODEFAULT,
 202         sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
 203 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 204         sizeof (int), 0 },
 205 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 206         sizeof (int), 0 },
 207 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 208         sizeof (int), 0 },
 209 
 210 /* Enable receipt of ancillary data */
 211 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 212         sizeof (int), 0 },
 213 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 214         sizeof (int), 0 },
 215 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 216         sizeof (int), 0 },
 217 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 218         sizeof (int), 0 },
 219 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 220         sizeof (int), 0 },
 221 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 222         sizeof (int), 0 },
 223 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 224         sizeof (int), 0 },
 225 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 226         sizeof (int), 0 },
 227 
 228 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 229         sizeof (ipsec_req_t), -1 /* not initialized */ },
 230 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 231         sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
 232 };
 233 
 234 /*
 235  * Table of all supported levels
 236  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 237  * any supported options so we need this info separately.
 238  *
 239  * This is needed only for topmost tpi providers and is used only by
 240  * XTI interfaces.
 241  */
 242 optlevel_t      tcp_valid_levels_arr[] = {
 243         XTI_GENERIC,
 244         SOL_SOCKET,
 245         IPPROTO_TCP,
 246         IPPROTO_IP,
 247         IPPROTO_IPV6
 248 };
 249 
 250 
 251 #define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
 252 #define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)
 253 
 254 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
 255 
 256 /*
 257  * Initialize option database object for TCP
 258  *
 259  * This object represents database of options to search passed to
 260  * {sock,tpi}optcom_req() interface routine to take care of option
 261  * management and associated methods.
 262  */
 263 
 264 optdb_obj_t tcp_opt_obj = {
 265         tcp_opt_default,        /* TCP default value function pointer */
 266         tcp_tpi_opt_get,        /* TCP get function pointer */
 267         tcp_tpi_opt_set,        /* TCP set function pointer */
 268         TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
 269         tcp_opt_arr,            /* TCP option database */
 270         TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
 271         tcp_valid_levels_arr    /* TCP valid level array */
 272 };
 273 
 274 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
 275 
 276 /*
 277  * Some TCP options can be "set" by requesting them in the option
 278  * buffer. This is needed for XTI feature test though we do not
 279  * allow it in general. We interpret that this mechanism is more
 280  * applicable to OSI protocols and need not be allowed in general.
 281  * This routine filters out options for which it is not allowed (most)
 282  * and lets through those (few) for which it is. [ The XTI interface
 283  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 284  * ever implemented will have to be allowed here ].
 285  */
 286 static boolean_t
 287 tcp_allow_connopt_set(int level, int name)
 288 {
 289 
 290         switch (level) {
 291         case IPPROTO_TCP:
 292                 switch (name) {
 293                 case TCP_NODELAY:
 294                         return (B_TRUE);
 295                 default:
 296                         return (B_FALSE);
 297                 }
 298                 /*NOTREACHED*/
 299         default:
 300                 return (B_FALSE);
 301         }
 302         /*NOTREACHED*/
 303 }
 304 
 305 /*
 306  * This routine gets default values of certain options whose default
 307  * values are maintained by protocol specific code
 308  */
 309 /* ARGSUSED */
 310 static int
 311 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
 312 {
 313         int32_t *i1 = (int32_t *)ptr;
 314         tcp_stack_t     *tcps = Q_TO_TCP(q)->tcp_tcps;
 315 
 316         switch (level) {
 317         case IPPROTO_TCP:
 318                 switch (name) {
 319                 case TCP_NOTIFY_THRESHOLD:
 320                         *i1 = tcps->tcps_ip_notify_interval;
 321                         break;
 322                 case TCP_ABORT_THRESHOLD:
 323                         *i1 = tcps->tcps_ip_abort_interval;
 324                         break;
 325                 case TCP_CONN_NOTIFY_THRESHOLD:
 326                         *i1 = tcps->tcps_ip_notify_cinterval;
 327                         break;
 328                 case TCP_CONN_ABORT_THRESHOLD:
 329                         *i1 = tcps->tcps_ip_abort_cinterval;
 330                         break;
 331                 default:
 332                         return (-1);
 333                 }
 334                 break;
 335         case IPPROTO_IP:
 336                 switch (name) {
 337                 case IP_TTL:
 338                         *i1 = tcps->tcps_ipv4_ttl;
 339                         break;
 340                 default:
 341                         return (-1);
 342                 }
 343                 break;
 344         case IPPROTO_IPV6:
 345                 switch (name) {
 346                 case IPV6_UNICAST_HOPS:
 347                         *i1 = tcps->tcps_ipv6_hoplimit;
 348                         break;
 349                 default:
 350                         return (-1);
 351                 }
 352                 break;
 353         default:
 354                 return (-1);
 355         }
 356         return (sizeof (int));
 357 }
 358 
 359 /*
 360  * TCP routine to get the values of options.
 361  */
 362 int
 363 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 364 {
 365         int             *i1 = (int *)ptr;
 366         tcp_t           *tcp = connp->conn_tcp;
 367         conn_opt_arg_t  coas;
 368         int             retval;
 369 
 370         coas.coa_connp = connp;
 371         coas.coa_ixa = connp->conn_ixa;
 372         coas.coa_ipp = &connp->conn_xmit_ipp;
 373         coas.coa_ancillary = B_FALSE;
 374         coas.coa_changed = 0;
 375 
 376         switch (level) {
 377         case SOL_SOCKET:
 378                 switch (name) {
 379                 case SO_SND_COPYAVOID:
 380                         *i1 = tcp->tcp_snd_zcopy_on ?
 381                             SO_SND_COPYAVOID : 0;
 382                         return (sizeof (int));
 383                 case SO_ACCEPTCONN:
 384                         *i1 = (tcp->tcp_state == TCPS_LISTEN);
 385                         return (sizeof (int));
 386                 }
 387                 break;
 388         case IPPROTO_TCP:
 389                 switch (name) {
 390                 case TCP_NODELAY:
 391                         *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
 392                         return (sizeof (int));
 393                 case TCP_MAXSEG:
 394                         *i1 = tcp->tcp_mss;
 395                         return (sizeof (int));
 396                 case TCP_NOTIFY_THRESHOLD:
 397                         *i1 = (int)tcp->tcp_first_timer_threshold;
 398                         return (sizeof (int));
 399                 case TCP_ABORT_THRESHOLD:
 400                         *i1 = tcp->tcp_second_timer_threshold;
 401                         return (sizeof (int));
 402                 case TCP_CONN_NOTIFY_THRESHOLD:
 403                         *i1 = tcp->tcp_first_ctimer_threshold;
 404                         return (sizeof (int));
 405                 case TCP_CONN_ABORT_THRESHOLD:
 406                         *i1 = tcp->tcp_second_ctimer_threshold;
 407                         return (sizeof (int));
 408                 case TCP_INIT_CWND:
 409                         *i1 = tcp->tcp_init_cwnd;
 410                         return (sizeof (int));
 411                 case TCP_KEEPALIVE_THRESHOLD:
 412                         *i1 = tcp->tcp_ka_interval;
 413                         return (sizeof (int));
 414 
 415                 /*
 416                  * TCP_KEEPIDLE expects value in seconds, but
 417                  * tcp_ka_interval is in milliseconds.
 418                  */
 419                 case TCP_KEEPIDLE:
 420                         *i1 = tcp->tcp_ka_interval / 1000;
 421                         return (sizeof (int));
 422                 case TCP_KEEPCNT:
 423                         *i1 = tcp->tcp_ka_cnt;
 424                         return (sizeof (int));
 425 
 426                 /*
 427                  * TCP_KEEPINTVL expects value in seconds, but
 428                  * tcp_ka_rinterval is in milliseconds.
 429                  */
 430                 case TCP_KEEPINTVL:
 431                         *i1 = tcp->tcp_ka_rinterval / 1000;
 432                         return (sizeof (int));
 433                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 434                         *i1 = tcp->tcp_ka_abort_thres;
 435                         return (sizeof (int));
 436                 case TCP_CORK:
 437                         *i1 = tcp->tcp_cork;
 438                         return (sizeof (int));
 439                 case TCP_RTO_INITIAL:
 440                         *i1 = tcp->tcp_rto_initial;
 441                         return (sizeof (uint32_t));
 442                 case TCP_RTO_MIN:
 443                         *i1 = tcp->tcp_rto_min;
 444                         return (sizeof (uint32_t));
 445                 case TCP_RTO_MAX:
 446                         *i1 = tcp->tcp_rto_max;
 447                         return (sizeof (uint32_t));
 448                 case TCP_LINGER2:
 449                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 450                         return (sizeof (int));
 451                 }
 452                 break;
 453         case IPPROTO_IP:
 454                 if (connp->conn_family != AF_INET)
 455                         return (-1);
 456                 switch (name) {
 457                 case IP_OPTIONS:
 458                 case T_IP_OPTIONS:
 459                         /* Caller ensures enough space */
 460                         return (ip_opt_get_user(connp, ptr));
 461                 default:
 462                         break;
 463                 }
 464                 break;
 465 
 466         case IPPROTO_IPV6:
 467                 /*
 468                  * IPPROTO_IPV6 options are only supported for sockets
 469                  * that are using IPv6 on the wire.
 470                  */
 471                 if (connp->conn_ipversion != IPV6_VERSION) {
 472                         return (-1);
 473                 }
 474                 switch (name) {
 475                 case IPV6_PATHMTU:
 476                         if (tcp->tcp_state < TCPS_ESTABLISHED)
 477                                 return (-1);
 478                         break;
 479                 }
 480                 break;
 481         }
 482         mutex_enter(&connp->conn_lock);
 483         retval = conn_opt_get(&coas, level, name, ptr);
 484         mutex_exit(&connp->conn_lock);
 485         return (retval);
 486 }
 487 
 488 /*
 489  * Set a TCP connection's participation in SO_REUSEPORT.  This operation is
 490  * performed under the protection of the squeue via tcp_setsockopt.
 491  * The manipulation of tcp_rg_bind, as part of this operation, is subject to
 492  * these constraints:
 493  * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
 494  *    under the protection of the squeue.
 495  * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
 496  *    altered until such time as tcp_free() cleans up the connection.
 497  * 3. A connection undergoing bind, which matches to a connection participating
 498  *    in port-reuse, will switch its tcp_rg_bind pointer when it joins the
 499  *    group of an existing connection in tcp_bindi().
 500  */
 501 static int
 502 tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
 503 {
 504         tcp_t *tcp = connp->conn_tcp;
 505         struct tcp_rg_s *rg;
 506 
 507         if (!IPCL_IS_NONSTR(connp)) {
 508                 if (do_enable) {
 509                         /*
 510                          * SO_REUSEPORT cannot be enabled on sockets which have
 511                          * fallen back to the STREAMS API.
 512                          */
 513                         return (EINVAL);
 514                 } else {
 515                         /*
 516                          * A connection with SO_REUSEPORT enabled should be
 517                          * prevented from falling back to STREAMS mode via
 518                          * logic in tcp_fallback.  It is legal, however, for
 519                          * fallen-back connections to affirm the disabled state
 520                          * of SO_REUSEPORT.
 521                          */
 522                         ASSERT(connp->conn_reuseport == 0);
 523                         return (0);
 524                 }
 525         }
 526         if (tcp->tcp_state <= TCPS_CLOSED) {
 527                 return (EINVAL);
 528         }
 529         if (connp->conn_reuseport == 0 && do_enable) {
 530                 /* disabled -> enabled */
 531                 if (tcp->tcp_rg_bind != NULL) {
 532                         tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
 533                 } else {
 534                         /*
 535                          * Connection state is not a concern when initially
 536                          * populating tcp_rg_bind.  Setting it to non-NULL on a
 537                          * bound or listening connection would only mean that
 538                          * new reused-port binds become a possibility.
 539                          */
 540                         if ((rg = tcp_rg_init(tcp)) == NULL) {
 541                                 return (ENOMEM);
 542                         }
 543                         tcp->tcp_rg_bind = rg;
 544                 }
 545                 connp->conn_reuseport = 1;
 546         } else if (connp->conn_reuseport != 0 && !do_enable) {
 547                 /* enabled -> disabled */
 548                 ASSERT(tcp->tcp_rg_bind != NULL);
 549                 if (tcp->tcp_state == TCPS_IDLE) {
 550                         /*
 551                          * If the connection has not been bound yet, discard
 552                          * the reuse group state.  Since disabling SO_REUSEPORT
 553                          * on a bound socket will _not_ prevent others from
 554                          * reusing the port, the presence of tcp_rg_bind is
 555                          * used to determine reuse availability, not
 556                          * conn_reuseport.
 557                          *
 558                          * This allows proper behavior for examples such as:
 559                          *
 560                          * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
 561                          * bind(fd1, &myaddr, ...);
 562                          * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
 563                          *
 564                          * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
 565                          * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
 566                          *
 567                          */
 568                         rg = tcp->tcp_rg_bind;
 569                         tcp->tcp_rg_bind = NULL;
 570                         VERIFY(tcp_rg_remove(rg, tcp));
 571                         tcp_rg_destroy(rg);
 572                 } else {
 573                         /*
 574                          * If a connection has been bound, it's no longer safe
 575                          * to manipulate tcp_rg_bind until connection clean-up
 576                          * during tcp_free.  Just mark the member status of the
 577                          * connection as inactive.
 578                          */
 579                         tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
 580                 }
 581                 connp->conn_reuseport = 0;
 582         }
 583         return (0);
 584 }
 585 
 586 /*
 587  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 588  * Parameters are assumed to be verified by the caller.
 589  */
 590 /* ARGSUSED */
 591 int
 592 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 593     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 594     void *thisdg_attrs, cred_t *cr)
 595 {
 596         tcp_t   *tcp = connp->conn_tcp;
 597         int     *i1 = (int *)invalp;
 598         boolean_t onoff = (*i1 == 0) ? 0 : 1;
 599         boolean_t checkonly;
 600         int     reterr;
 601         tcp_stack_t     *tcps = tcp->tcp_tcps;
 602         conn_opt_arg_t  coas;
 603         uint32_t        val = *((uint32_t *)invalp);
 604 
 605         coas.coa_connp = connp;
 606         coas.coa_ixa = connp->conn_ixa;
 607         coas.coa_ipp = &connp->conn_xmit_ipp;
 608         coas.coa_ancillary = B_FALSE;
 609         coas.coa_changed = 0;
 610 
 611         switch (optset_context) {
 612         case SETFN_OPTCOM_CHECKONLY:
 613                 checkonly = B_TRUE;
 614                 /*
 615                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
 616                  * inlen != 0 implies value supplied and
 617                  *      we have to "pretend" to set it.
 618                  * inlen == 0 implies that there is no
 619                  *      value part in T_CHECK request and just validation
 620                  * done elsewhere should be enough, we just return here.
 621                  */
 622                 if (inlen == 0) {
 623                         *outlenp = 0;
 624                         return (0);
 625                 }
 626                 break;
 627         case SETFN_OPTCOM_NEGOTIATE:
 628                 checkonly = B_FALSE;
 629                 break;
 630         case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
 631         case SETFN_CONN_NEGOTIATE:
 632                 checkonly = B_FALSE;
 633                 /*
 634                  * Negotiating local and "association-related" options
 635                  * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
 636                  * primitives is allowed by XTI, but we choose
 637                  * to not implement this style negotiation for Internet
 638                  * protocols (We interpret it is a must for OSI world but
 639                  * optional for Internet protocols) for all options.
 640                  * [ Will do only for the few options that enable test
 641                  * suites that our XTI implementation of this feature
 642                  * works for transports that do allow it ]
 643                  */
 644                 if (!tcp_allow_connopt_set(level, name)) {
 645                         *outlenp = 0;
 646                         return (EINVAL);
 647                 }
 648                 break;
 649         default:
 650                 /*
 651                  * We should never get here
 652                  */
 653                 *outlenp = 0;
 654                 return (EINVAL);
 655         }
 656 
 657         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
 658             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
 659 
 660         /*
 661          * For TCP, we should have no ancillary data sent down
 662          * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
 663          * has to be zero.
 664          */
 665         ASSERT(thisdg_attrs == NULL);
 666 
 667         /*
 668          * For fixed length options, no sanity check
 669          * of passed in length is done. It is assumed *_optcom_req()
 670          * routines do the right thing.
 671          */
 672         switch (level) {
 673         case SOL_SOCKET:
 674                 switch (name) {
 675                 case SO_KEEPALIVE:
 676                         if (checkonly) {
 677                                 /* check only case */
 678                                 break;
 679                         }
 680 
 681                         if (!onoff) {
 682                                 if (connp->conn_keepalive) {
 683                                         if (tcp->tcp_ka_tid != 0) {
 684                                                 (void) TCP_TIMER_CANCEL(tcp,
 685                                                     tcp->tcp_ka_tid);
 686                                                 tcp->tcp_ka_tid = 0;
 687                                         }
 688                                         connp->conn_keepalive = 0;
 689                                 }
 690                                 break;
 691                         }
 692                         if (!connp->conn_keepalive) {
 693                                 /* Crank up the keepalive timer */
 694                                 tcp->tcp_ka_last_intrvl = 0;
 695                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 696                                     tcp_keepalive_timer, tcp->tcp_ka_interval);
 697                                 connp->conn_keepalive = 1;
 698                         }
 699                         break;
 700                 case SO_SNDBUF: {
 701                         if (*i1 > tcps->tcps_max_buf) {
 702                                 *outlenp = 0;
 703                                 return (ENOBUFS);
 704                         }
 705                         if (checkonly)
 706                                 break;
 707 
 708                         connp->conn_sndbuf = *i1;
 709                         if (tcps->tcps_snd_lowat_fraction != 0) {
 710                                 connp->conn_sndlowat = connp->conn_sndbuf /
 711                                     tcps->tcps_snd_lowat_fraction;
 712                         }
 713                         (void) tcp_maxpsz_set(tcp, B_TRUE);
 714                         /*
 715                          * If we are flow-controlled, recheck the condition.
 716                          * There are apps that increase SO_SNDBUF size when
 717                          * flow-controlled (EWOULDBLOCK), and expect the flow
 718                          * control condition to be lifted right away.
 719                          */
 720                         mutex_enter(&tcp->tcp_non_sq_lock);
 721                         if (tcp->tcp_flow_stopped &&
 722                             TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
 723                                 tcp_clrqfull(tcp);
 724                         }
 725                         mutex_exit(&tcp->tcp_non_sq_lock);
 726                         *outlenp = inlen;
 727                         return (0);
 728                 }
 729                 case SO_RCVBUF:
 730                         if (*i1 > tcps->tcps_max_buf) {
 731                                 *outlenp = 0;
 732                                 return (ENOBUFS);
 733                         }
 734                         /* Silently ignore zero */
 735                         if (!checkonly && *i1 != 0) {
 736                                 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
 737                                 (void) tcp_rwnd_set(tcp, *i1);
 738                         }
 739                         /*
 740                          * XXX should we return the rwnd here
 741                          * and tcp_opt_get ?
 742                          */
 743                         *outlenp = inlen;
 744                         return (0);
 745                 case SO_SND_COPYAVOID:
 746                         if (!checkonly) {
 747                                 if (tcp->tcp_loopback ||
 748                                     (onoff != 1) || !tcp_zcopy_check(tcp)) {
 749                                         *outlenp = 0;
 750                                         return (EOPNOTSUPP);
 751                                 }
 752                                 tcp->tcp_snd_zcopy_aware = 1;
 753                         }
 754                         *outlenp = inlen;
 755                         return (0);
 756                 case SO_REUSEPORT:
 757                         if (!checkonly) {
 758                                 return (tcp_set_reuseport(connp, *i1 != 0));
 759                         }
 760                         return (0);
 761                 }
 762                 break;
 763         case IPPROTO_TCP:
 764                 switch (name) {
 765                 case TCP_NODELAY:
 766                         if (!checkonly)
 767                                 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 768                         break;
 769                 case TCP_NOTIFY_THRESHOLD:
 770                         if (!checkonly)
 771                                 tcp->tcp_first_timer_threshold = *i1;
 772                         break;
 773                 case TCP_ABORT_THRESHOLD:
 774                         if (!checkonly)
 775                                 tcp->tcp_second_timer_threshold = *i1;
 776                         break;
 777                 case TCP_CONN_NOTIFY_THRESHOLD:
 778                         if (!checkonly)
 779                                 tcp->tcp_first_ctimer_threshold = *i1;
 780                         break;
 781                 case TCP_CONN_ABORT_THRESHOLD:
 782                         if (!checkonly)
 783                                 tcp->tcp_second_ctimer_threshold = *i1;
 784                         break;
 785                 case TCP_RECVDSTADDR:
 786                         if (tcp->tcp_state > TCPS_LISTEN) {
 787                                 *outlenp = 0;
 788                                 return (EOPNOTSUPP);
 789                         }
 790                         /* Setting done in conn_opt_set */
 791                         break;
 792                 case TCP_INIT_CWND:
 793                         if (checkonly)
 794                                 break;
 795 
 796                         /*
 797                          * Only allow socket with network configuration
 798                          * privilege to set the initial cwnd to be larger
 799                          * than allowed by RFC 3390.
 800                          */
 801                         if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
 802                                 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
 803                                     != 0) {
 804                                         *outlenp = 0;
 805                                         return (reterr);
 806                                 }
 807                                 if (val > tcp_max_init_cwnd) {
 808                                         *outlenp = 0;
 809                                         return (EINVAL);
 810                                 }
 811                         }
 812 
 813                         tcp->tcp_init_cwnd = val;
 814 
 815                         /*
 816                          * If the socket is connected, AND no outbound data
 817                          * has been sent, reset the actual cwnd values.
 818                          */
 819                         if (tcp->tcp_state == TCPS_ESTABLISHED &&
 820                             tcp->tcp_iss == tcp->tcp_snxt - 1) {
 821                                 tcp->tcp_cwnd =
 822                                     MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
 823                         }
 824                         break;
 825 
 826                 /*
 827                  * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
 828                  * is in milliseconds. TCP_KEEPIDLE is introduced for
 829                  * compatibility with other Unix flavors.
 830                  * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
 831                  * converting the input to milliseconds.
 832                  */
 833                 case TCP_KEEPIDLE:
 834                         *i1 *= 1000;
 835                         /* FALLTHRU */
 836 
 837                 case TCP_KEEPALIVE_THRESHOLD:
 838                         if (checkonly)
 839                                 break;
 840 
 841                         if (*i1 < tcps->tcps_keepalive_interval_low ||
 842                             *i1 > tcps->tcps_keepalive_interval_high) {
 843                                 *outlenp = 0;
 844                                 return (EINVAL);
 845                         }
 846                         if (*i1 != tcp->tcp_ka_interval) {
 847                                 tcp->tcp_ka_interval = *i1;
 848                                 /*
 849                                  * Check if we need to restart the
 850                                  * keepalive timer.
 851                                  */
 852                                 if (tcp->tcp_ka_tid != 0) {
 853                                         ASSERT(connp->conn_keepalive);
 854                                         (void) TCP_TIMER_CANCEL(tcp,
 855                                             tcp->tcp_ka_tid);
 856                                         tcp->tcp_ka_last_intrvl = 0;
 857                                         tcp->tcp_ka_tid = TCP_TIMER(tcp,
 858                                             tcp_keepalive_timer,
 859                                             tcp->tcp_ka_interval);
 860                                 }
 861                         }
 862                         break;
 863 
 864                 /*
 865                  * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
 866                  * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
 867                  * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 868                  * tcp_ka_cnt.
 869                  */
 870                 case TCP_KEEPCNT:
 871                         if (checkonly)
 872                                 break;
 873 
 874                         if (*i1 == 0) {
 875                                 return (EINVAL);
 876                         } else if (tcp->tcp_ka_rinterval == 0) {
 877                                 /*
 878                                  * When TCP_KEEPCNT is specified without first
 879                                  * specifying a TCP_KEEPINTVL, we infer an
 880                                  * interval based on a tunable specific to our
 881                                  * stack: the tcp_keepalive_abort_interval.
 882                                  * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
 883                                  * the unlikely event that that has been set.)
 884                                  * Given the abort interval's default value of
 885                                  * 480 seconds, low TCP_KEEPCNT values can
 886                                  * result in intervals that exceed the default
 887                                  * maximum RTO of 60 seconds.  Rather than
 888                                  * fail in these cases, we (implicitly) clamp
 889                                  * the interval at the maximum RTO; if the
 890                                  * TCP_KEEPCNT is shortly followed by a
 891                                  * TCP_KEEPINTVL (as we expect), the abort
 892                                  * threshold will be recalculated correctly --
 893                                  * and if a TCP_KEEPINTVL is not forthcoming,
 894                                  * keep-alive will at least operate reasonably
 895                                  * given the underconfigured state.
 896                                  */
 897                                 uint32_t interval;
 898 
 899                                 interval = tcp->tcp_ka_abort_thres / *i1;
 900 
 901                                 if (interval < tcp->tcp_rto_min)
 902                                         interval = tcp->tcp_rto_min;
 903 
 904                                 if (interval > tcp->tcp_rto_max)
 905                                         interval = tcp->tcp_rto_max;
 906 
 907                                 tcp->tcp_ka_rinterval = interval;
 908                         } else {
 909                                 if ((*i1 * tcp->tcp_ka_rinterval) <
 910                                     tcps->tcps_keepalive_abort_interval_low ||
 911                                     (*i1 * tcp->tcp_ka_rinterval) >
 912                                     tcps->tcps_keepalive_abort_interval_high)
 913                                         return (EINVAL);
 914                                 tcp->tcp_ka_abort_thres =
 915                                     (*i1 * tcp->tcp_ka_rinterval);
 916                         }
 917                         tcp->tcp_ka_cnt = *i1;
 918                         break;
 919                 case TCP_KEEPINTVL:
 920                         /*
 921                          * TCP_KEEPINTVL is specified in seconds, but
 922                          * tcp_ka_rinterval is in milliseconds.
 923                          */
 924 
 925                         if (checkonly)
 926                                 break;
 927 
 928                         if ((*i1 * 1000) < tcp->tcp_rto_min ||
 929                             (*i1 * 1000) > tcp->tcp_rto_max)
 930                                 return (EINVAL);
 931 
 932                         if (tcp->tcp_ka_cnt == 0) {
 933                                 tcp->tcp_ka_cnt =
 934                                     tcp->tcp_ka_abort_thres / (*i1 * 1000);
 935                         } else {
 936                                 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
 937                                     tcps->tcps_keepalive_abort_interval_low ||
 938                                     (*i1 * tcp->tcp_ka_cnt * 1000) >
 939                                     tcps->tcps_keepalive_abort_interval_high)
 940                                         return (EINVAL);
 941                                 tcp->tcp_ka_abort_thres =
 942                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 943                         }
 944                         tcp->tcp_ka_rinterval = *i1 * 1000;
 945                         break;
 946                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 947                         if (!checkonly) {
 948                                 if (*i1 <
 949                                     tcps->tcps_keepalive_abort_interval_low ||
 950                                     *i1 >
 951                                     tcps->tcps_keepalive_abort_interval_high) {
 952                                         *outlenp = 0;
 953                                         return (EINVAL);
 954                                 }
 955                                 tcp->tcp_ka_abort_thres = *i1;
 956                                 tcp->tcp_ka_cnt = 0;
 957                                 tcp->tcp_ka_rinterval = 0;
 958                         }
 959                         break;
 960                 case TCP_CORK:
 961                         if (!checkonly) {
 962                                 /*
 963                                  * if tcp->tcp_cork was set and is now
 964                                  * being unset, we have to make sure that
 965                                  * the remaining data gets sent out. Also
 966                                  * unset tcp->tcp_cork so that tcp_wput_data()
 967                                  * can send data even if it is less than mss
 968                                  */
 969                                 if (tcp->tcp_cork && onoff == 0 &&
 970                                     tcp->tcp_unsent > 0) {
 971                                         tcp->tcp_cork = B_FALSE;
 972                                         tcp_wput_data(tcp, NULL, B_FALSE);
 973                                 }
 974                                 tcp->tcp_cork = onoff;
 975                         }
 976                         break;
 977                 case TCP_RTO_INITIAL: {
 978                         clock_t rto;
 979 
 980                         if (checkonly || val == 0)
 981                                 break;
 982 
 983                         /*
 984                          * Sanity checks
 985                          *
 986                          * The initial RTO should be bounded by the minimum
 987                          * and maximum RTO.  And it should also be smaller
 988                          * than the connect attempt abort timeout.  Otherwise,
 989                          * the connection won't be aborted in a period
 990                          * reasonably close to that timeout.
 991                          */
 992                         if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
 993                             val > tcp->tcp_second_ctimer_threshold ||
 994                             val < tcps->tcps_rexmit_interval_initial_low ||
 995                             val > tcps->tcps_rexmit_interval_initial_high) {
 996                                 *outlenp = 0;
 997                                 return (EINVAL);
 998                         }
 999                         tcp->tcp_rto_initial = val;
1000 
1001                         /*
1002                          * If TCP has not sent anything, need to re-calculate
1003                          * tcp_rto.  Otherwise, this option change does not
1004                          * really affect anything.
1005                          */
1006                         if (tcp->tcp_state >= TCPS_SYN_SENT)
1007                                 break;
1008 
1009                         tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
1010                         tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
1011                         rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1012                             tcps->tcps_rexmit_interval_extra +
1013                             (tcp->tcp_rtt_sa >> 5) +
1014                             tcps->tcps_conn_grace_period;
1015                         TCP_SET_RTO(tcp, rto);
1016                         break;
1017                 }
1018                 case TCP_RTO_MIN:
1019                         if (checkonly || val == 0)
1020                                 break;
1021 
1022                         if (val < tcps->tcps_rexmit_interval_min_low ||
1023                             val > tcps->tcps_rexmit_interval_min_high ||
1024                             val > tcp->tcp_rto_max) {
1025                                 *outlenp = 0;
1026                                 return (EINVAL);
1027                         }
1028                         tcp->tcp_rto_min = val;
1029                         if (tcp->tcp_rto < val)
1030                                 tcp->tcp_rto = val;
1031                         break;
1032                 case TCP_RTO_MAX:
1033                         if (checkonly || val == 0)
1034                                 break;
1035 
1036                         /*
1037                          * Sanity checks
1038                          *
1039                          * The maximum RTO should not be larger than the
1040                          * connection abort timeout.  Otherwise, the
1041                          * connection won't be aborted in a period reasonably
1042                          * close to that timeout.
1043                          */
1044                         if (val < tcps->tcps_rexmit_interval_max_low ||
1045                             val > tcps->tcps_rexmit_interval_max_high ||
1046                             val < tcp->tcp_rto_min ||
1047                             val > tcp->tcp_second_timer_threshold) {
1048                                 *outlenp = 0;
1049                                 return (EINVAL);
1050                         }
1051                         tcp->tcp_rto_max = val;
1052                         if (tcp->tcp_rto > val)
1053                                 tcp->tcp_rto = val;
1054                         break;
1055                 case TCP_LINGER2:
1056                         if (checkonly || *i1 == 0)
1057                                 break;
1058 
1059                         /*
                         * Note that the option value is in seconds.  It must
                         * be no smaller than the lower bound of the private
                         * parameter tcp_fin_wait_2_flush_interval and no
                         * larger than the current value of that parameter.
                         * The upper limit is there to avoid an app setting
                         * TCP_LINGER2 to a big value, causing resources to
                         * be held up for too long in FIN-WAIT-2 state.
1068                          */
1069                         if (*i1 < 0 ||
1070                             tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1071                             *i1 ||
1072                             tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1073                             *i1) {
1074                                 *outlenp = 0;
1075                                 return (EINVAL);
1076                         }
1077                         tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1078                         break;
1079                 default:
1080                         break;
1081                 }
1082                 break;
1083         case IPPROTO_IP:
1084                 switch (name) {
1085                 case IP_SEC_OPT:
1086                         /*
1087                          * We should not allow policy setting after
1088                          * we start listening for connections.
1089                          */
1090                         if (tcp->tcp_state == TCPS_LISTEN) {
1091                                 return (EINVAL);
1092                         }
1093                         break;
1094                 }
1095                 break;
1096         case IPPROTO_IPV6:
1097                 /*
1098                  * IPPROTO_IPV6 options are only supported for sockets
1099                  * that are using IPv6 on the wire.
1100                  */
1101                 if (connp->conn_ipversion != IPV6_VERSION) {
1102                         *outlenp = 0;
1103                         return (EINVAL);
1104                 }
1105 
1106                 switch (name) {
1107                 case IPV6_RECVPKTINFO:
1108                         if (!checkonly) {
1109                                 /* Force it to be sent up with the next msg */
1110                                 tcp->tcp_recvifindex = 0;
1111                         }
1112                         break;
1113                 case IPV6_RECVTCLASS:
1114                         if (!checkonly) {
1115                                 /* Force it to be sent up with the next msg */
1116                                 tcp->tcp_recvtclass = 0xffffffffU;
1117                         }
1118                         break;
1119                 case IPV6_RECVHOPLIMIT:
1120                         if (!checkonly) {
1121                                 /* Force it to be sent up with the next msg */
1122                                 tcp->tcp_recvhops = 0xffffffffU;
1123                         }
1124                         break;
1125                 case IPV6_PKTINFO:
1126                         /* This is an extra check for TCP */
1127                         if (inlen == sizeof (struct in6_pktinfo)) {
1128                                 struct in6_pktinfo *pkti;
1129 
1130                                 pkti = (struct in6_pktinfo *)invalp;
1131                                 /*
1132                                  * RFC 3542 states that ipi6_addr must be
1133                                  * the unspecified address when setting the
1134                                  * IPV6_PKTINFO sticky socket option on a
1135                                  * TCP socket.
1136                                  */
1137                                 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1138                                         return (EINVAL);
1139                         }
1140                         break;
1141                 case IPV6_SEC_OPT:
1142                         /*
1143                          * We should not allow policy setting after
1144                          * we start listening for connections.
1145                          */
1146                         if (tcp->tcp_state == TCPS_LISTEN) {
1147                                 return (EINVAL);
1148                         }
1149                         break;
1150                 }
1151                 break;
1152         }
1153         reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1154             checkonly, cr);
1155         if (reterr != 0) {
1156                 *outlenp = 0;
1157                 return (reterr);
1158         }
1159 
1160         /*
1161          * Common case of OK return with outval same as inval
1162          */
1163         if (invalp != outvalp) {
1164                 /* don't trust bcopy for identical src/dst */
1165                 (void) bcopy(invalp, outvalp, inlen);
1166         }
1167         *outlenp = inlen;
1168 
1169         if (coas.coa_changed & COA_HEADER_CHANGED) {
                /* If we are connected we rebuild the headers */
1171                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1172                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1173                         reterr = tcp_build_hdrs(tcp);
1174                         if (reterr != 0)
1175                                 return (reterr);
1176                 }
1177         }
1178         if (coas.coa_changed & COA_ROUTE_CHANGED) {
1179                 in6_addr_t nexthop;
1180 
1181                 /*
1182                  * If we are connected we re-cache the information.
1183                  * We ignore errors to preserve BSD behavior.
1184                  * Note that we don't redo IPsec policy lookup here
1185                  * since the final destination (or source) didn't change.
1186                  */
1187                 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1188                     &connp->conn_faddr_v6, &nexthop);
1189 
1190                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1191                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1192                         (void) ip_attr_connect(connp, connp->conn_ixa,
1193                             &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1194                             &nexthop, connp->conn_fport, NULL, NULL,
1195                             IPDF_VERIFY_DST);
1196                 }
1197         }
1198         if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1199                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1200         }
1201         if (coas.coa_changed & COA_WROFF_CHANGED) {
1202                 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1203                     tcps->tcps_wroff_xtra;
1204                 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1205                     connp->conn_wroff);
1206         }
1207         if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1208                 if (IPCL_IS_NONSTR(connp))
1209                         proto_set_rx_oob_opt(connp, onoff);
1210         }
1211         return (0);
1212 }