1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2016 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #define _SUN_TPI_VERSION 2
  31 #include <sys/tihdr.h>
  32 #include <sys/socket.h>
  33 #include <sys/xti_xtiopt.h>
  34 #include <sys/xti_inet.h>
  35 #include <sys/policy.h>
  36 
  37 #include <inet/common.h>
  38 #include <netinet/ip6.h>
  39 #include <inet/ip.h>
  40 
  41 #include <netinet/in.h>
  42 #include <netinet/tcp.h>
  43 #include <inet/optcom.h>
  44 #include <inet/proto_set.h>
  45 #include <inet/tcp_impl.h>
  46 
  47 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  48 
  49 /*
  50  * Table of all known options handled on a TCP protocol stack.
  51  *
  52  * Note: This table contains options processed by both TCP and IP levels
  53  *       and is the superset of options that can be performed on a TCP over IP
  54  *       stack.
  55  */
  56 opdes_t tcp_opt_arr[] = {
  57 
  58 { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  59         sizeof (struct linger), 0 },
  60 
  61 { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  62 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  63 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  64 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  65         },
  66 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  67 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  68 { SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  70 { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  71 { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  72 { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  73 { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  74         sizeof (struct timeval), 0 },
  75 { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  76         sizeof (struct timeval), 0 },
  77 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  78         },
  79 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  80 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  81         0 },
  82 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  83         0 },
  84 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  85         0 },
  86 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
  87         0 },
  88 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  89 
  90 { SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  91 
  92 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  93 
  94 { TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  95         },
  96 { TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
  97         536 },
  98 
  99 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 100         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 101 
 102 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 103         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 104 
 105 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 106         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 107 
 108 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 109         OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 110 
 111 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
 112         0 },
 113 
 114 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
 115         sizeof (int), 0 },
 116 
 117 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 118         },
 119 
 120 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
 121         sizeof (int), 0 },
 122 
 123 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 124         sizeof (int), 0 },
 125 
 126 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 127 
 128 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 129 
 130 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 131 
 132 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 133         sizeof (int), 0 },
 134 
 135 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 136 
 137 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 138 
 139 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 140 
 141 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 142 
 143 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 144 
 145 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 146         (OP_VARLEN|OP_NODEFAULT),
 147         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 148 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 149         (OP_VARLEN|OP_NODEFAULT),
 150         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 151 
 152 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 153 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 154 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 155         sizeof (int), -1 /* not initialized */ },
 156 
 157 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 158         sizeof (ipsec_req_t), -1 /* not initialized */ },
 159 
 160 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 161         sizeof (int),   0 /* no ifindex */ },
 162 
 163 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 164         sizeof (int), 0 },
 165 
 166 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 167         sizeof (int), -1 /* not initialized */ },
 168 
 169 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 170         sizeof (int),   0 /* no ifindex */ },
 171 
 172 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 173 
 174 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
 175         sizeof (in_addr_t),     -1 /* not initialized  */ },
 176 
 177 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
 178         sizeof (int), 0 },
 179 
 180 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 181         (OP_NODEFAULT|OP_VARLEN),
 182         sizeof (struct in6_pktinfo), -1 /* not initialized */ },
 183 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 184         OP_NODEFAULT,
 185         sizeof (sin6_t), -1 /* not initialized */ },
 186 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 187         (OP_VARLEN|OP_NODEFAULT), 255*8,
 188         -1 /* not initialized */ },
 189 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 190         (OP_VARLEN|OP_NODEFAULT), 255*8,
 191         -1 /* not initialized */ },
 192 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 193         (OP_VARLEN|OP_NODEFAULT), 255*8,
 194         -1 /* not initialized */ },
 195 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 196         (OP_VARLEN|OP_NODEFAULT), 255*8,
 197         -1 /* not initialized */ },
 198 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 199         OP_NODEFAULT,
 200         sizeof (int), -1 /* not initialized */ },
 201 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 202         OP_NODEFAULT,
 203         sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
 204 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 205         sizeof (int), 0 },
 206 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 207         sizeof (int), 0 },
 208 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 209         sizeof (int), 0 },
 210 
 211 /* Enable receipt of ancillary data */
 212 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 213         sizeof (int), 0 },
 214 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 215         sizeof (int), 0 },
 216 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 217         sizeof (int), 0 },
 218 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 219         sizeof (int), 0 },
 220 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 221         sizeof (int), 0 },
 222 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 223         sizeof (int), 0 },
 224 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 225         sizeof (int), 0 },
 226 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 227         sizeof (int), 0 },
 228 
 229 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 230         sizeof (ipsec_req_t), -1 /* not initialized */ },
 231 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 232         sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
 233 };
 234 
 235 /*
 236  * Table of all supported levels
 237  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 238  * any supported options so we need this info separately.
 239  *
 240  * This is needed only for topmost tpi providers and is used only by
 241  * XTI interfaces.
 242  */
 243 optlevel_t      tcp_valid_levels_arr[] = {
 244         XTI_GENERIC,
 245         SOL_SOCKET,
 246         IPPROTO_TCP,
 247         IPPROTO_IP,
 248         IPPROTO_IPV6
 249 };
 250 
 251 
 252 #define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
 253 #define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)
 254 
 255 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
 256 
 257 /*
 258  * Initialize option database object for TCP
 259  *
 260  * This object represents database of options to search passed to
 261  * {sock,tpi}optcom_req() interface routine to take care of option
 262  * management and associated methods.
 263  */
 264 
 265 optdb_obj_t tcp_opt_obj = {
 266         tcp_opt_default,        /* TCP default value function pointer */
 267         tcp_tpi_opt_get,        /* TCP get function pointer */
 268         tcp_tpi_opt_set,        /* TCP set function pointer */
 269         TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
 270         tcp_opt_arr,            /* TCP option database */
 271         TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
 272         tcp_valid_levels_arr    /* TCP valid level array */
 273 };
 274 
 275 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
 276 
 277 /*
 278  * Some TCP options can be "set" by requesting them in the option
 279  * buffer. This is needed for XTI feature test though we do not
 280  * allow it in general. We interpret that this mechanism is more
 281  * applicable to OSI protocols and need not be allowed in general.
 282  * This routine filters out options for which it is not allowed (most)
 283  * and lets through those (few) for which it is. [ The XTI interface
 284  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 285  * ever implemented will have to be allowed here ].
 286  */
 287 static boolean_t
 288 tcp_allow_connopt_set(int level, int name)
 289 {
 290 
 291         switch (level) {
 292         case IPPROTO_TCP:
 293                 switch (name) {
 294                 case TCP_NODELAY:
 295                         return (B_TRUE);
 296                 default:
 297                         return (B_FALSE);
 298                 }
 299                 /*NOTREACHED*/
 300         default:
 301                 return (B_FALSE);
 302         }
 303         /*NOTREACHED*/
 304 }
 305 
 306 /*
 307  * This routine gets default values of certain options whose default
 308  * values are maintained by protocol specific code
 309  */
 310 /* ARGSUSED */
 311 static int
 312 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
 313 {
 314         int32_t *i1 = (int32_t *)ptr;
 315         tcp_stack_t     *tcps = Q_TO_TCP(q)->tcp_tcps;
 316 
 317         switch (level) {
 318         case IPPROTO_TCP:
 319                 switch (name) {
 320                 case TCP_NOTIFY_THRESHOLD:
 321                         *i1 = tcps->tcps_ip_notify_interval;
 322                         break;
 323                 case TCP_ABORT_THRESHOLD:
 324                         *i1 = tcps->tcps_ip_abort_interval;
 325                         break;
 326                 case TCP_CONN_NOTIFY_THRESHOLD:
 327                         *i1 = tcps->tcps_ip_notify_cinterval;
 328                         break;
 329                 case TCP_CONN_ABORT_THRESHOLD:
 330                         *i1 = tcps->tcps_ip_abort_cinterval;
 331                         break;
 332                 default:
 333                         return (-1);
 334                 }
 335                 break;
 336         case IPPROTO_IP:
 337                 switch (name) {
 338                 case IP_TTL:
 339                         *i1 = tcps->tcps_ipv4_ttl;
 340                         break;
 341                 default:
 342                         return (-1);
 343                 }
 344                 break;
 345         case IPPROTO_IPV6:
 346                 switch (name) {
 347                 case IPV6_UNICAST_HOPS:
 348                         *i1 = tcps->tcps_ipv6_hoplimit;
 349                         break;
 350                 default:
 351                         return (-1);
 352                 }
 353                 break;
 354         default:
 355                 return (-1);
 356         }
 357         return (sizeof (int));
 358 }
 359 
 360 /*
 361  * TCP routine to get the values of options.
 362  */
 363 int
 364 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 365 {
 366         int             *i1 = (int *)ptr;
 367         tcp_t           *tcp = connp->conn_tcp;
 368         conn_opt_arg_t  coas;
 369         int             retval;
 370 
 371         coas.coa_connp = connp;
 372         coas.coa_ixa = connp->conn_ixa;
 373         coas.coa_ipp = &connp->conn_xmit_ipp;
 374         coas.coa_ancillary = B_FALSE;
 375         coas.coa_changed = 0;
 376 
 377         switch (level) {
 378         case SOL_SOCKET:
 379                 switch (name) {
 380                 case SO_SND_COPYAVOID:
 381                         *i1 = tcp->tcp_snd_zcopy_on ?
 382                             SO_SND_COPYAVOID : 0;
 383                         return (sizeof (int));
 384                 case SO_ACCEPTCONN:
 385                         *i1 = (tcp->tcp_state == TCPS_LISTEN);
 386                         return (sizeof (int));
 387                 }
 388                 break;
 389         case IPPROTO_TCP:
 390                 switch (name) {
 391                 case TCP_NODELAY:
 392                         *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
 393                         return (sizeof (int));
 394                 case TCP_MAXSEG:
 395                         *i1 = tcp->tcp_mss;
 396                         return (sizeof (int));
 397                 case TCP_NOTIFY_THRESHOLD:
 398                         *i1 = (int)tcp->tcp_first_timer_threshold;
 399                         return (sizeof (int));
 400                 case TCP_ABORT_THRESHOLD:
 401                         *i1 = tcp->tcp_second_timer_threshold;
 402                         return (sizeof (int));
 403                 case TCP_CONN_NOTIFY_THRESHOLD:
 404                         *i1 = tcp->tcp_first_ctimer_threshold;
 405                         return (sizeof (int));
 406                 case TCP_CONN_ABORT_THRESHOLD:
 407                         *i1 = tcp->tcp_second_ctimer_threshold;
 408                         return (sizeof (int));
 409                 case TCP_INIT_CWND:
 410                         *i1 = tcp->tcp_init_cwnd;
 411                         return (sizeof (int));
 412                 case TCP_KEEPALIVE_THRESHOLD:
 413                         *i1 = tcp->tcp_ka_interval;
 414                         return (sizeof (int));
 415 
 416                 /*
 417                  * TCP_KEEPIDLE expects value in seconds, but
 418                  * tcp_ka_interval is in milliseconds.
 419                  */
 420                 case TCP_KEEPIDLE:
 421                         *i1 = tcp->tcp_ka_interval / 1000;
 422                         return (sizeof (int));
 423                 case TCP_KEEPCNT:
 424                         *i1 = tcp->tcp_ka_cnt;
 425                         return (sizeof (int));
 426 
 427                 /*
 428                  * TCP_KEEPINTVL expects value in seconds, but
 429                  * tcp_ka_rinterval is in milliseconds.
 430                  */
 431                 case TCP_KEEPINTVL:
 432                         *i1 = tcp->tcp_ka_rinterval / 1000;
 433                         return (sizeof (int));
 434                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 435                         *i1 = tcp->tcp_ka_abort_thres;
 436                         return (sizeof (int));
 437                 case TCP_CORK:
 438                         *i1 = tcp->tcp_cork;
 439                         return (sizeof (int));
 440                 case TCP_RTO_INITIAL:
 441                         *i1 = tcp->tcp_rto_initial;
 442                         return (sizeof (uint32_t));
 443                 case TCP_RTO_MIN:
 444                         *i1 = tcp->tcp_rto_min;
 445                         return (sizeof (uint32_t));
 446                 case TCP_RTO_MAX:
 447                         *i1 = tcp->tcp_rto_max;
 448                         return (sizeof (uint32_t));
 449                 case TCP_LINGER2:
 450                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 451                         return (sizeof (int));
 452                 }
 453                 break;
 454         case IPPROTO_IP:
 455                 if (connp->conn_family != AF_INET)
 456                         return (-1);
 457                 switch (name) {
 458                 case IP_OPTIONS:
 459                 case T_IP_OPTIONS:
 460                         /* Caller ensures enough space */
 461                         return (ip_opt_get_user(connp, ptr));
 462                 default:
 463                         break;
 464                 }
 465                 break;
 466 
 467         case IPPROTO_IPV6:
 468                 /*
 469                  * IPPROTO_IPV6 options are only supported for sockets
 470                  * that are using IPv6 on the wire.
 471                  */
 472                 if (connp->conn_ipversion != IPV6_VERSION) {
 473                         return (-1);
 474                 }
 475                 switch (name) {
 476                 case IPV6_PATHMTU:
 477                         if (tcp->tcp_state < TCPS_ESTABLISHED)
 478                                 return (-1);
 479                         break;
 480                 }
 481                 break;
 482         }
 483         mutex_enter(&connp->conn_lock);
 484         retval = conn_opt_get(&coas, level, name, ptr);
 485         mutex_exit(&connp->conn_lock);
 486         return (retval);
 487 }
 488 
 489 /*
 490  * Set a TCP connection's participation in SO_REUSEPORT.  This operation is
 491  * performed under the protection of the squeue via tcp_setsockopt.
 492  * The manipulation of tcp_rg_bind, as part of this operation, is subject to
 493  * these constraints:
 494  * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
 495  *    under the protection of the squeue.
 496  * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
 497  *    altered until such time as tcp_free() cleans up the connection.
 498  * 3. A connection undergoing bind, which matches to a connection participating
 499  *    in port-reuse, will switch its tcp_rg_bind pointer when it joins the
 500  *    group of an existing connection in tcp_bindi().
 501  */
 502 static int
 503 tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
 504 {
 505         tcp_t *tcp = connp->conn_tcp;
 506         struct tcp_rg_s *rg;
 507 
 508         if (!IPCL_IS_NONSTR(connp)) {
 509                 if (do_enable) {
 510                         /*
 511                          * SO_REUSEPORT cannot be enabled on sockets which have
 512                          * fallen back to the STREAMS API.
 513                          */
 514                         return (EINVAL);
 515                 } else {
 516                         /*
 517                          * A connection with SO_REUSEPORT enabled should be
 518                          * prevented from falling back to STREAMS mode via
 519                          * logic in tcp_fallback.  It is legal, however, for
 520                          * fallen-back connections to affirm the disabled state
 521                          * of SO_REUSEPORT.
 522                          */
 523                         ASSERT(connp->conn_reuseport == 0);
 524                         return (0);
 525                 }
 526         }
 527         if (tcp->tcp_state <= TCPS_CLOSED) {
 528                 return (EINVAL);
 529         }
 530         if (connp->conn_reuseport == 0 && do_enable) {
 531                 /* disabled -> enabled */
 532                 if (tcp->tcp_rg_bind != NULL) {
 533                         tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
 534                 } else {
 535                         /*
 536                          * Connection state is not a concern when initially
 537                          * populating tcp_rg_bind.  Setting it to non-NULL on a
 538                          * bound or listening connection would only mean that
 539                          * new reused-port binds become a possibility.
 540                          */
 541                         if ((rg = tcp_rg_init(tcp)) == NULL) {
 542                                 return (ENOMEM);
 543                         }
 544                         tcp->tcp_rg_bind = rg;
 545                 }
 546                 connp->conn_reuseport = 1;
 547         } else if (connp->conn_reuseport != 0 && !do_enable) {
 548                 /* enabled -> disabled */
 549                 ASSERT(tcp->tcp_rg_bind != NULL);
 550                 if (tcp->tcp_state == TCPS_IDLE) {
 551                         /*
 552                          * If the connection has not been bound yet, discard
 553                          * the reuse group state.  Since disabling SO_REUSEPORT
 554                          * on a bound socket will _not_ prevent others from
 555                          * reusing the port, the presence of tcp_rg_bind is
 556                          * used to determine reuse availability, not
 557                          * conn_reuseport.
 558                          *
 559                          * This allows proper behavior for examples such as:
 560                          *
 561                          * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
 562                          * bind(fd1, &myaddr, ...);
 563                          * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
 564                          *
 565                          * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
 566                          * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
 567                          *
 568                          */
 569                         rg = tcp->tcp_rg_bind;
 570                         tcp->tcp_rg_bind = NULL;
 571                         VERIFY(tcp_rg_remove(rg, tcp));
 572                         tcp_rg_destroy(rg);
 573                 } else {
 574                         /*
 575                          * If a connection has been bound, it's no longer safe
 576                          * to manipulate tcp_rg_bind until connection clean-up
 577                          * during tcp_free.  Just mark the member status of the
 578                          * connection as inactive.
 579                          */
 580                         tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
 581                 }
 582                 connp->conn_reuseport = 0;
 583         }
 584         return (0);
 585 }
 586 
 587 /*
 588  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 589  * Parameters are assumed to be verified by the caller.
 590  */
 591 /* ARGSUSED */
 592 int
 593 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 594     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 595     void *thisdg_attrs, cred_t *cr)
 596 {
 597         tcp_t   *tcp = connp->conn_tcp;
 598         int     *i1 = (int *)invalp;
 599         boolean_t onoff = (*i1 == 0) ? 0 : 1;
 600         boolean_t checkonly;
 601         int     reterr;
 602         tcp_stack_t     *tcps = tcp->tcp_tcps;
 603         conn_opt_arg_t  coas;
 604         uint32_t        val = *((uint32_t *)invalp);
 605 
 606         coas.coa_connp = connp;
 607         coas.coa_ixa = connp->conn_ixa;
 608         coas.coa_ipp = &connp->conn_xmit_ipp;
 609         coas.coa_ancillary = B_FALSE;
 610         coas.coa_changed = 0;
 611 
 612         switch (optset_context) {
 613         case SETFN_OPTCOM_CHECKONLY:
 614                 checkonly = B_TRUE;
 615                 /*
 616                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
 617                  * inlen != 0 implies value supplied and
 618                  *      we have to "pretend" to set it.
 619                  * inlen == 0 implies that there is no
 620                  *      value part in T_CHECK request and just validation
 621                  * done elsewhere should be enough, we just return here.
 622                  */
 623                 if (inlen == 0) {
 624                         *outlenp = 0;
 625                         return (0);
 626                 }
 627                 break;
 628         case SETFN_OPTCOM_NEGOTIATE:
 629                 checkonly = B_FALSE;
 630                 break;
 631         case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
 632         case SETFN_CONN_NEGOTIATE:
 633                 checkonly = B_FALSE;
 634                 /*
 635                  * Negotiating local and "association-related" options
 636                  * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
 637                  * primitives is allowed by XTI, but we choose
 638                  * to not implement this style negotiation for Internet
 639                  * protocols (We interpret it is a must for OSI world but
 640                  * optional for Internet protocols) for all options.
 641                  * [ Will do only for the few options that enable test
 642                  * suites that our XTI implementation of this feature
 643                  * works for transports that do allow it ]
 644                  */
 645                 if (!tcp_allow_connopt_set(level, name)) {
 646                         *outlenp = 0;
 647                         return (EINVAL);
 648                 }
 649                 break;
 650         default:
 651                 /*
 652                  * We should never get here
 653                  */
 654                 *outlenp = 0;
 655                 return (EINVAL);
 656         }
 657 
 658         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
 659             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
 660 
 661         /*
 662          * For TCP, we should have no ancillary data sent down
 663          * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
 664          * has to be zero.
 665          */
 666         ASSERT(thisdg_attrs == NULL);
 667 
 668         /*
 669          * For fixed length options, no sanity check
 670          * of passed in length is done. It is assumed *_optcom_req()
 671          * routines do the right thing.
 672          */
 673         switch (level) {
 674         case SOL_SOCKET:
 675                 switch (name) {
 676                 case SO_KEEPALIVE:
 677                         if (checkonly) {
 678                                 /* check only case */
 679                                 break;
 680                         }
 681 
 682                         if (!onoff) {
 683                                 if (connp->conn_keepalive) {
 684                                         if (tcp->tcp_ka_tid != 0) {
 685                                                 (void) TCP_TIMER_CANCEL(tcp,
 686                                                     tcp->tcp_ka_tid);
 687                                                 tcp->tcp_ka_tid = 0;
 688                                         }
 689                                         connp->conn_keepalive = 0;
 690                                 }
 691                                 break;
 692                         }
 693                         if (!connp->conn_keepalive) {
 694                                 /* Crank up the keepalive timer */
 695                                 tcp->tcp_ka_last_intrvl = 0;
 696                                 tcp->tcp_ka_tid = TCP_TIMER(tcp,
 697                                     tcp_keepalive_timer, tcp->tcp_ka_interval);
 698                                 connp->conn_keepalive = 1;
 699                         }
 700                         break;
 701                 case SO_SNDBUF: {
 702                         if (*i1 > tcps->tcps_max_buf) {
 703                                 *outlenp = 0;
 704                                 return (ENOBUFS);
 705                         }
 706                         if (checkonly)
 707                                 break;
 708 
 709                         connp->conn_sndbuf = *i1;
 710                         if (tcps->tcps_snd_lowat_fraction != 0) {
 711                                 connp->conn_sndlowat = connp->conn_sndbuf /
 712                                     tcps->tcps_snd_lowat_fraction;
 713                         }
 714                         (void) tcp_maxpsz_set(tcp, B_TRUE);
 715                         /*
 716                          * If we are flow-controlled, recheck the condition.
 717                          * There are apps that increase SO_SNDBUF size when
 718                          * flow-controlled (EWOULDBLOCK), and expect the flow
 719                          * control condition to be lifted right away.
 720                          */
 721                         mutex_enter(&tcp->tcp_non_sq_lock);
 722                         if (tcp->tcp_flow_stopped &&
 723                             TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
 724                                 tcp_clrqfull(tcp);
 725                         }
 726                         mutex_exit(&tcp->tcp_non_sq_lock);
 727                         *outlenp = inlen;
 728                         return (0);
 729                 }
 730                 case SO_RCVBUF:
 731                         if (*i1 > tcps->tcps_max_buf) {
 732                                 *outlenp = 0;
 733                                 return (ENOBUFS);
 734                         }
 735                         /* Silently ignore zero */
 736                         if (!checkonly && *i1 != 0) {
 737                                 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
 738                                 (void) tcp_rwnd_set(tcp, *i1);
 739                         }
 740                         /*
 741                          * XXX should we return the rwnd here
 742                          * and tcp_opt_get ?
 743                          */
 744                         *outlenp = inlen;
 745                         return (0);
 746                 case SO_SND_COPYAVOID:
 747                         if (!checkonly) {
 748                                 if (tcp->tcp_loopback ||
 749                                     (onoff != 1) || !tcp_zcopy_check(tcp)) {
 750                                         *outlenp = 0;
 751                                         return (EOPNOTSUPP);
 752                                 }
 753                                 tcp->tcp_snd_zcopy_aware = 1;
 754                         }
 755                         *outlenp = inlen;
 756                         return (0);
 757                 case SO_REUSEPORT:
 758                         if (!checkonly) {
 759                                 return (tcp_set_reuseport(connp, *i1 != 0));
 760                         }
 761                         return (0);
 762                 }
 763                 break;
 764         case IPPROTO_TCP:
 765                 switch (name) {
 766                 case TCP_NODELAY:
 767                         if (!checkonly)
 768                                 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 769                         break;
 770                 case TCP_NOTIFY_THRESHOLD:
 771                         if (!checkonly)
 772                                 tcp->tcp_first_timer_threshold = *i1;
 773                         break;
 774                 case TCP_ABORT_THRESHOLD:
 775                         if (!checkonly)
 776                                 tcp->tcp_second_timer_threshold = *i1;
 777                         break;
 778                 case TCP_CONN_NOTIFY_THRESHOLD:
 779                         if (!checkonly)
 780                                 tcp->tcp_first_ctimer_threshold = *i1;
 781                         break;
 782                 case TCP_CONN_ABORT_THRESHOLD:
 783                         if (!checkonly)
 784                                 tcp->tcp_second_ctimer_threshold = *i1;
 785                         break;
 786                 case TCP_RECVDSTADDR:
 787                         if (tcp->tcp_state > TCPS_LISTEN) {
 788                                 *outlenp = 0;
 789                                 return (EOPNOTSUPP);
 790                         }
 791                         /* Setting done in conn_opt_set */
 792                         break;
 793                 case TCP_INIT_CWND:
 794                         if (checkonly)
 795                                 break;
 796 
 797                         /*
 798                          * Only allow socket with network configuration
 799                          * privilege to set the initial cwnd to be larger
 800                          * than allowed by RFC 3390.
 801                          */
 802                         if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
 803                                 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
 804                                     != 0) {
 805                                         *outlenp = 0;
 806                                         return (reterr);
 807                                 }
 808                                 if (val > tcp_max_init_cwnd) {
 809                                         *outlenp = 0;
 810                                         return (EINVAL);
 811                                 }
 812                         }
 813 
 814                         tcp->tcp_init_cwnd = val;
 815 
 816                         /*
 817                          * If the socket is connected, AND no outbound data
 818                          * has been sent, reset the actual cwnd values.
 819                          */
 820                         if (tcp->tcp_state == TCPS_ESTABLISHED &&
 821                             tcp->tcp_iss == tcp->tcp_snxt - 1) {
 822                                 tcp->tcp_cwnd =
 823                                     MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
 824                         }
 825                         break;
 826 
 827                 /*
 828                  * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
 829                  * is in milliseconds. TCP_KEEPIDLE is introduced for
 830                  * compatibility with other Unix flavors.
 831                  * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
 832                  * converting the input to milliseconds.
 833                  */
 834                 case TCP_KEEPIDLE:
 835                         *i1 *= 1000;
 836                         /* FALLTHRU */
 837 
 838                 case TCP_KEEPALIVE_THRESHOLD:
 839                         if (checkonly)
 840                                 break;
 841 
 842                         if (*i1 < tcps->tcps_keepalive_interval_low ||
 843                             *i1 > tcps->tcps_keepalive_interval_high) {
 844                                 *outlenp = 0;
 845                                 return (EINVAL);
 846                         }
 847                         if (*i1 != tcp->tcp_ka_interval) {
 848                                 tcp->tcp_ka_interval = *i1;
 849                                 /*
 850                                  * Check if we need to restart the
 851                                  * keepalive timer.
 852                                  */
 853                                 if (tcp->tcp_ka_tid != 0) {
 854                                         ASSERT(connp->conn_keepalive);
 855                                         (void) TCP_TIMER_CANCEL(tcp,
 856                                             tcp->tcp_ka_tid);
 857                                         tcp->tcp_ka_last_intrvl = 0;
 858                                         tcp->tcp_ka_tid = TCP_TIMER(tcp,
 859                                             tcp_keepalive_timer,
 860                                             tcp->tcp_ka_interval);
 861                                 }
 862                         }
 863                         break;
 864 
 865                 /*
 866                  * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
 867                  * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
 868                  * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 869                  * tcp_ka_cnt.
 870                  */
 871                 case TCP_KEEPCNT:
 872                         if (checkonly)
 873                                 break;
 874 
 875                         if (*i1 == 0) {
 876                                 return (EINVAL);
 877                         } else if (tcp->tcp_ka_rinterval == 0) {
 878                                 /*
 879                                  * When TCP_KEEPCNT is specified without first
 880                                  * specifying a TCP_KEEPINTVL, we infer an
 881                                  * interval based on a tunable specific to our
 882                                  * stack: the tcp_keepalive_abort_interval.
 883                                  * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
 884                                  * the unlikely event that that has been set.)
 885                                  * Given the abort interval's default value of
 886                                  * 480 seconds, low TCP_KEEPCNT values can
 887                                  * result in intervals that exceed the default
 888                                  * maximum RTO of 60 seconds.  Rather than
 889                                  * fail in these cases, we (implicitly) clamp
 890                                  * the interval at the maximum RTO; if the
 891                                  * TCP_KEEPCNT is shortly followed by a
 892                                  * TCP_KEEPINTVL (as we expect), the abort
 893                                  * threshold will be recalculated correctly --
 894                                  * and if a TCP_KEEPINTVL is not forthcoming,
 895                                  * keep-alive will at least operate reasonably
 896                                  * given the underconfigured state.
 897                                  */
 898                                 uint32_t interval;
 899 
 900                                 interval = tcp->tcp_ka_abort_thres / *i1;
 901 
 902                                 if (interval < tcp->tcp_rto_min)
 903                                         interval = tcp->tcp_rto_min;
 904 
 905                                 if (interval > tcp->tcp_rto_max)
 906                                         interval = tcp->tcp_rto_max;
 907 
 908                                 tcp->tcp_ka_rinterval = interval;
 909                         } else {
 910                                 if ((*i1 * tcp->tcp_ka_rinterval) <
 911                                     tcps->tcps_keepalive_abort_interval_low ||
 912                                     (*i1 * tcp->tcp_ka_rinterval) >
 913                                     tcps->tcps_keepalive_abort_interval_high)
 914                                         return (EINVAL);
 915                                 tcp->tcp_ka_abort_thres =
 916                                     (*i1 * tcp->tcp_ka_rinterval);
 917                         }
 918                         tcp->tcp_ka_cnt = *i1;
 919                         break;
 920                 case TCP_KEEPINTVL:
 921                         /*
 922                          * TCP_KEEPINTVL is specified in seconds, but
 923                          * tcp_ka_rinterval is in milliseconds.
 924                          */
 925 
 926                         if (checkonly)
 927                                 break;
 928 
 929                         if ((*i1 * 1000) < tcp->tcp_rto_min ||
 930                             (*i1 * 1000) > tcp->tcp_rto_max)
 931                                 return (EINVAL);
 932 
 933                         if (tcp->tcp_ka_cnt == 0) {
 934                                 tcp->tcp_ka_cnt =
 935                                     tcp->tcp_ka_abort_thres / (*i1 * 1000);
 936                         } else {
 937                                 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
 938                                     tcps->tcps_keepalive_abort_interval_low ||
 939                                     (*i1 * tcp->tcp_ka_cnt * 1000) >
 940                                     tcps->tcps_keepalive_abort_interval_high)
 941                                         return (EINVAL);
 942                                 tcp->tcp_ka_abort_thres =
 943                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 944                         }
 945                         tcp->tcp_ka_rinterval = *i1 * 1000;
 946                         break;
 947                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 948                         if (!checkonly) {
 949                                 if (*i1 <
 950                                     tcps->tcps_keepalive_abort_interval_low ||
 951                                     *i1 >
 952                                     tcps->tcps_keepalive_abort_interval_high) {
 953                                         *outlenp = 0;
 954                                         return (EINVAL);
 955                                 }
 956                                 tcp->tcp_ka_abort_thres = *i1;
 957                                 tcp->tcp_ka_cnt = 0;
 958                                 tcp->tcp_ka_rinterval = 0;
 959                         }
 960                         break;
 961                 case TCP_CORK:
 962                         if (!checkonly) {
 963                                 /*
 964                                  * if tcp->tcp_cork was set and is now
 965                                  * being unset, we have to make sure that
 966                                  * the remaining data gets sent out. Also
 967                                  * unset tcp->tcp_cork so that tcp_wput_data()
 968                                  * can send data even if it is less than mss
 969                                  */
 970                                 if (tcp->tcp_cork && onoff == 0 &&
 971                                     tcp->tcp_unsent > 0) {
 972                                         tcp->tcp_cork = B_FALSE;
 973                                         tcp_wput_data(tcp, NULL, B_FALSE);
 974                                 }
 975                                 tcp->tcp_cork = onoff;
 976                         }
 977                         break;
 978                 case TCP_RTO_INITIAL:
 979                         if (checkonly || val == 0)
 980                                 break;
 981 
 982                         /*
 983                          * Sanity checks
 984                          *
 985                          * The initial RTO should be bounded by the minimum
 986                          * and maximum RTO.  And it should also be smaller
 987                          * than the connect attempt abort timeout.  Otherwise,
 988                          * the connection won't be aborted in a period
 989                          * reasonably close to that timeout.
 990                          */
 991                         if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
 992                             val > tcp->tcp_second_ctimer_threshold ||
 993                             val < tcps->tcps_rexmit_interval_initial_low ||
 994                             val > tcps->tcps_rexmit_interval_initial_high) {
 995                                 *outlenp = 0;
 996                                 return (EINVAL);
 997                         }
 998                         tcp->tcp_rto_initial = val;
 999 
1000                         /*
1001                          * If TCP has not sent anything, need to re-calculate
1002                          * tcp_rto.  Otherwise, this option change does not
1003                          * really affect anything.
1004                          */
1005                         if (tcp->tcp_state >= TCPS_SYN_SENT)
1006                                 break;
1007 
1008                         tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
1009                         tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
1010                         tcp->tcp_rto = tcp_calculate_rto(tcp, tcps);
1011                         break;
1012                 case TCP_RTO_MIN:
1013                         if (checkonly || val == 0)
1014                                 break;
1015 
1016                         if (val < tcps->tcps_rexmit_interval_min_low ||
1017                             val > tcps->tcps_rexmit_interval_min_high ||
1018                             val > tcp->tcp_rto_max) {
1019                                 *outlenp = 0;
1020                                 return (EINVAL);
1021                         }
1022                         tcp->tcp_rto_min = val;
1023                         if (tcp->tcp_rto < val)
1024                                 tcp->tcp_rto = val;
1025                         break;
1026                 case TCP_RTO_MAX:
1027                         if (checkonly || val == 0)
1028                                 break;
1029 
1030                         /*
1031                          * Sanity checks
1032                          *
1033                          * The maximum RTO should not be larger than the
1034                          * connection abort timeout.  Otherwise, the
1035                          * connection won't be aborted in a period reasonably
1036                          * close to that timeout.
1037                          */
1038                         if (val < tcps->tcps_rexmit_interval_max_low ||
1039                             val > tcps->tcps_rexmit_interval_max_high ||
1040                             val < tcp->tcp_rto_min ||
1041                             val > tcp->tcp_second_timer_threshold) {
1042                                 *outlenp = 0;
1043                                 return (EINVAL);
1044                         }
1045                         tcp->tcp_rto_max = val;
1046                         if (tcp->tcp_rto > val)
1047                                 tcp->tcp_rto = val;
1048                         break;
1049                 case TCP_LINGER2:
1050                         if (checkonly || *i1 == 0)
1051                                 break;
1052 
1053                         /*
1054                          * Note that the option value's unit is seconds.  And
1055                          * the value should be bigger than the private
1056                          * parameter tcp_fin_wait_2_flush_interval's lower
1057                          * bound and smaller than the current value of that
1058                          * parameter.  It should be smaller than the current
1059                          * value to avoid an app setting TCP_LINGER2 to a big
1060                          * value, causing resource to be held up too long in
1061                          * FIN-WAIT-2 state.
1062                          */
1063                         if (*i1 < 0 ||
1064                             tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1065                             *i1 ||
1066                             tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1067                             *i1) {
1068                                 *outlenp = 0;
1069                                 return (EINVAL);
1070                         }
1071                         tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1072                         break;
1073                 default:
1074                         break;
1075                 }
1076                 break;
1077         case IPPROTO_IP:
1078                 switch (name) {
1079                 case IP_SEC_OPT:
1080                         /*
1081                          * We should not allow policy setting after
1082                          * we start listening for connections.
1083                          */
1084                         if (tcp->tcp_state == TCPS_LISTEN) {
1085                                 return (EINVAL);
1086                         }
1087                         break;
1088                 }
1089                 break;
1090         case IPPROTO_IPV6:
1091                 /*
1092                  * IPPROTO_IPV6 options are only supported for sockets
1093                  * that are using IPv6 on the wire.
1094                  */
1095                 if (connp->conn_ipversion != IPV6_VERSION) {
1096                         *outlenp = 0;
1097                         return (EINVAL);
1098                 }
1099 
1100                 switch (name) {
1101                 case IPV6_RECVPKTINFO:
1102                         if (!checkonly) {
1103                                 /* Force it to be sent up with the next msg */
1104                                 tcp->tcp_recvifindex = 0;
1105                         }
1106                         break;
1107                 case IPV6_RECVTCLASS:
1108                         if (!checkonly) {
1109                                 /* Force it to be sent up with the next msg */
1110                                 tcp->tcp_recvtclass = 0xffffffffU;
1111                         }
1112                         break;
1113                 case IPV6_RECVHOPLIMIT:
1114                         if (!checkonly) {
1115                                 /* Force it to be sent up with the next msg */
1116                                 tcp->tcp_recvhops = 0xffffffffU;
1117                         }
1118                         break;
1119                 case IPV6_PKTINFO:
1120                         /* This is an extra check for TCP */
1121                         if (inlen == sizeof (struct in6_pktinfo)) {
1122                                 struct in6_pktinfo *pkti;
1123 
1124                                 pkti = (struct in6_pktinfo *)invalp;
1125                                 /*
1126                                  * RFC 3542 states that ipi6_addr must be
1127                                  * the unspecified address when setting the
1128                                  * IPV6_PKTINFO sticky socket option on a
1129                                  * TCP socket.
1130                                  */
1131                                 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1132                                         return (EINVAL);
1133                         }
1134                         break;
1135                 case IPV6_SEC_OPT:
1136                         /*
1137                          * We should not allow policy setting after
1138                          * we start listening for connections.
1139                          */
1140                         if (tcp->tcp_state == TCPS_LISTEN) {
1141                                 return (EINVAL);
1142                         }
1143                         break;
1144                 }
1145                 break;
1146         }
1147         reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1148             checkonly, cr);
1149         if (reterr != 0) {
1150                 *outlenp = 0;
1151                 return (reterr);
1152         }
1153 
1154         /*
1155          * Common case of OK return with outval same as inval
1156          */
1157         if (invalp != outvalp) {
1158                 /* don't trust bcopy for identical src/dst */
1159                 (void) bcopy(invalp, outvalp, inlen);
1160         }
1161         *outlenp = inlen;
1162 
1163         if (coas.coa_changed & COA_HEADER_CHANGED) {
1164                 /* If we are connected we rebuild the headers */
1165                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1166                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1167                         reterr = tcp_build_hdrs(tcp);
1168                         if (reterr != 0)
1169                                 return (reterr);
1170                 }
1171         }
1172         if (coas.coa_changed & COA_ROUTE_CHANGED) {
1173                 in6_addr_t nexthop;
1174 
1175                 /*
1176                  * If we are connected we re-cache the information.
1177                  * We ignore errors to preserve BSD behavior.
1178                  * Note that we don't redo IPsec policy lookup here
1179                  * since the final destination (or source) didn't change.
1180                  */
1181                 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1182                     &connp->conn_faddr_v6, &nexthop);
1183 
1184                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1185                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1186                         (void) ip_attr_connect(connp, connp->conn_ixa,
1187                             &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1188                             &nexthop, connp->conn_fport, NULL, NULL,
1189                             IPDF_VERIFY_DST);
1190                 }
1191         }
1192         if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1193                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1194         }
1195         if (coas.coa_changed & COA_WROFF_CHANGED) {
1196                 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1197                     tcps->tcps_wroff_xtra;
1198                 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1199                     connp->conn_wroff);
1200         }
1201         if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1202                 if (IPCL_IS_NONSTR(connp))
1203                         proto_set_rx_oob_opt(connp, onoff);
1204         }
1205         return (0);
1206 }