Print this page
    
OS-5598 newproc() performs inadequate clean-up after failed lwp_create() [fix debug build]
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5217 setsockopt(TCP_KEEPCNT) can return EINVAL spuriously
Reviewed by: Dave Pacheco <dap@joyent.com>
OS-4699 lxbrand netty complains about SO_LINGER (really IP_TOS)
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/tcp/tcp_opt_data.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_opt_data.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
       24 + * Copyright 2016 Joyent, Inc.
  24   25   */
  25   26  
  26   27  #include <sys/types.h>
  27   28  #include <sys/stream.h>
  28   29  #define _SUN_TPI_VERSION 2
  29   30  #include <sys/tihdr.h>
  30   31  #include <sys/socket.h>
  31   32  #include <sys/xti_xtiopt.h>
  32   33  #include <sys/xti_inet.h>
  33   34  #include <sys/policy.h>
  34   35  
  35   36  #include <inet/common.h>
  36   37  #include <netinet/ip6.h>
  37   38  #include <inet/ip.h>
  38   39  
  39   40  #include <netinet/in.h>
  40   41  #include <netinet/tcp.h>
  41   42  #include <inet/optcom.h>
  42   43  #include <inet/proto_set.h>
  43   44  #include <inet/tcp_impl.h>
  44   45  
  45   46  static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  46   47  
  47   48  /*
  48   49   * Table of all known options handled on a TCP protocol stack.
  49   50   *
  50   51   * Note: This table contains options processed by both TCP and IP levels
  51   52   *       and is the superset of options that can be performed on a TCP over IP
  52   53   *       stack.
            *
            * Every initializer below supplies eight values, beginning with the
            * option name and its level.  NOTE(review): the meaning and order of the
            * remaining values is fixed by the opdes_t declaration, which is not
            * visible in this listing (presumably <inet/optcom.h>) -- confirm it
            * there before adding or editing an entry.
            *
            * The number of entries is taken with A_CNT() (see TCP_OPT_ARR_CNT) and
            * the table is published through tcp_opt_obj, so a new row needs no
            * separate count update in this file.
  53   54   */
  54   55  opdes_t tcp_opt_arr[] = {
  
    | 
      ↓ open down ↓ | 
    21 lines elided | 
    
      ↑ open up ↑ | 
  
  55   56  
  56   57  { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  57   58          sizeof (struct linger), 0 },
  58   59  
  59   60  { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  60   61  { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  61   62  { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  62   63  { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  63   64          },
  64   65  { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  65      -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
       66 +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
       67 +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  66   68  { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  67   69  { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  68   70  { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69   71  { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  70   72  { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  71   73          sizeof (struct timeval), 0 },
  72   74  { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  73   75          sizeof (struct timeval), 0 },
  74   76  { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  75   77          },
  76   78  { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  77   79  { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  78   80          0 },
  79   81  { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  80   82          0 },
  81   83  { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
  82   84          0 },
  83   85  { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
  84   86          0 },
  85   87  { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  86   88  
  87   89  { SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  88   90  
  89   91  { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  90   92  
  91   93  { TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  92   94          },
  93   95  { TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
  94   96          536 },
  95   97  
  96   98  { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
  97   99          OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
  98  100  
  99  101  { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 100  102          OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 101  103  
 102  104  { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 103  105          OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 104  106  
 105  107  { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 106  108          OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
 107  109  
 108  110  { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
 109  111          0 },
 110  112  
 111  113  { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
 112  114          sizeof (int), 0 },
 113  115  
 114  116  { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 115  117          },
 116  118  
 117  119  { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
 118  120          sizeof (int), 0 },
 119  121  
 120  122  { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 121  123          sizeof (int), 0 },
 122  124  
 123  125  { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 124  126  
 125  127  { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 126  128  
 127  129  { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 128  130  
 129  131  { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 130  132          sizeof (int), 0 },
 131  133  
 132  134  { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 133  135  
 134  136  { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 135  137  
 136  138  { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 137  139  
 138  140  { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 139  141  
 140  142  { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 141  143  
 142  144  { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 143  145          (OP_VARLEN|OP_NODEFAULT),
 144  146          IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 145  147  { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 146  148          (OP_VARLEN|OP_NODEFAULT),
 147  149          IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 148  150  
 149  151  { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 150  152  { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 151  153  { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 152  154          sizeof (int), -1 /* not initialized */ },
 153  155  
 154  156  { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 155  157          sizeof (ipsec_req_t), -1 /* not initialized */ },
 156  158  
 157  159  { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 158  160          sizeof (int),   0 /* no ifindex */ },
 159  161  
 160  162  { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 161  163          sizeof (int), 0 },
 162  164  
 163  165  { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 164  166          sizeof (int), -1 /* not initialized */ },
 165  167  
 166  168  { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 167  169          sizeof (int),   0 /* no ifindex */ },
 168  170  
 169  171  { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 170  172  
 171  173  { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
 172  174          sizeof (in_addr_t),     -1 /* not initialized  */ },
 173  175  
 174  176  { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
 175  177          sizeof (int), 0 },
 176  178  
 177  179  { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 178  180          (OP_NODEFAULT|OP_VARLEN),
 179  181          sizeof (struct in6_pktinfo), -1 /* not initialized */ },
 180  182  { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 181  183          OP_NODEFAULT,
 182  184          sizeof (sin6_t), -1 /* not initialized */ },
 183  185  { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 184  186          (OP_VARLEN|OP_NODEFAULT), 255*8,
 185  187          -1 /* not initialized */ },
 186  188  { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 187  189          (OP_VARLEN|OP_NODEFAULT), 255*8,
 188  190          -1 /* not initialized */ },
 189  191  { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 190  192          (OP_VARLEN|OP_NODEFAULT), 255*8,
 191  193          -1 /* not initialized */ },
 192  194  { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 193  195          (OP_VARLEN|OP_NODEFAULT), 255*8,
 194  196          -1 /* not initialized */ },
 195  197  { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 196  198          OP_NODEFAULT,
 197  199          sizeof (int), -1 /* not initialized */ },
 198  200  { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
 199  201          OP_NODEFAULT,
 200  202          sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
 201  203  { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 202  204          sizeof (int), 0 },
 203  205  { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 204  206          sizeof (int), 0 },
 205  207  { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 206  208          sizeof (int), 0 },
 207  209  
 208  210  /* Enable receipt of ancillary data */
 209  211  { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 210  212          sizeof (int), 0 },
 211  213  { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 212  214          sizeof (int), 0 },
 213  215  { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 214  216          sizeof (int), 0 },
 215  217  { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 216  218          sizeof (int), 0 },
 217  219  { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 218  220          sizeof (int), 0 },
 219  221  { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 220  222          sizeof (int), 0 },
 221  223  { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 222  224          sizeof (int), 0 },
 223  225  { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 224  226          sizeof (int), 0 },
 225  227  
 226  228  { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 227  229          sizeof (ipsec_req_t), -1 /* not initialized */ },
 228  230  { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
 229  231          sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
 230  232  };
 231  233  
 232  234  /*
 233  235   * Table of all supported levels
 234  236   * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 235  237   * any supported options so we need this info separately.
 236  238   *
 237  239   * This is needed only for topmost tpi providers and is used only by
 238  240   * XTI interfaces.
            *
            * The entry count is taken via TCP_VALID_LEVELS_CNT and both the count
            * and the array are handed to the option framework through tcp_opt_obj.
            * XTI_GENERIC is the only level listed here that has no rows in
            * tcp_opt_arr.
 239  241   */
 240  242  optlevel_t      tcp_valid_levels_arr[] = {
 241  243          XTI_GENERIC,
 242  244          SOL_SOCKET,
 243  245          IPPROTO_TCP,
 244  246          IPPROTO_IP,
 245  247          IPPROTO_IPV6
 246  248  };
 247  249  
 248  250  
           /*
            * Entry counts of tcp_opt_arr and tcp_valid_levels_arr, derived with
            * A_CNT() (presumably the array-element-count macro -- confirm in the
            * common inet headers).  Both are consumed by the tcp_opt_obj
            * initializer.
            */
 249  251  #define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
 250  252  #define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)
 251  253  
 252  254  uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
 253  255  
 254  256  /*
 255  257   * Initialize option database object for TCP
 256  258   *
 257  259   * This object represents database of options to search passed to
 258  260   * {sock,tpi}optcom_req() interface routine to take care of option
 259  261   * management and associated methods.
            *
            * The initializer is positional; the trailing comment on each line names
            * the slot being filled.  tcp_opt_default is the static function defined
            * in this file; tcp_tpi_opt_get and tcp_tpi_opt_set are not defined in
            * the visible part of this file.
            * NOTE(review): the field order is dictated by optdb_obj_t, declared
            * outside this listing -- confirm it before reordering anything here.
 260  262   */
 261  263  
 262  264  optdb_obj_t tcp_opt_obj = {
 263  265          tcp_opt_default,        /* TCP default value function pointer */
 264  266          tcp_tpi_opt_get,        /* TCP get function pointer */
 265  267          tcp_tpi_opt_set,        /* TCP set function pointer */
 266  268          TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
 267  269          tcp_opt_arr,            /* TCP option database */
 268  270          TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
 269  271          tcp_valid_levels_arr    /* TCP valid level array */
 270  272  };
 271  273  
           /*
            * File-local copy of TCP_MAX_INIT_CWND.  Its consumer is not part of the
            * visible listing.
            * NOTE(review): presumably this bounds TCP_INIT_CWND in the set path --
            * confirm against tcp_opt_set().
            */
 272  274  static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
 273  275  
 274  276  /*
 275  277   * Some TCP options can be "set" by requesting them in the option
 276  278   * buffer. This is needed for XTI feature test though we do not
 277  279   * allow it in general. We interpret that this mechanism is more
 278  280   * applicable to OSI protocols and need not be allowed in general.
 279  281   * This routine filters out options for which it is not allowed (most)
 280  282   * and lets through those (few) for which it is. [ The XTI interface
 281  283   * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 282  284   * ever implemented will have to be allowed here ].
            *
            * level and name identify the option being requested.  Returns B_TRUE
            * only for the pair IPPROTO_TCP / TCP_NODELAY; every other level or name
            * yields B_FALSE.
            *
            * tcp_opt_set() consults this for the SETFN_UD_NEGOTIATE and
            * SETFN_CONN_NEGOTIATE contexts and fails the request with EINVAL when
            * B_FALSE comes back.
 283  285   */
 284  286  static boolean_t
 285  287  tcp_allow_connopt_set(int level, int name)
 286  288  {
 287  289  
 288  290          switch (level) {
 289  291          case IPPROTO_TCP:
 290  292                  switch (name) {
 291  293                  case TCP_NODELAY:
 292  294                          return (B_TRUE);
 293  295                  default:
 294  296                          return (B_FALSE);
 295  297                  }
 296  298                  /*NOTREACHED*/
 297  299          default:
 298  300                  return (B_FALSE);
 299  301          }
 300  302          /*NOTREACHED*/
 301  303  }
 302  304  
 303  305  /*
 304  306   * This routine gets default values of certain options whose default
 305  307   * values are maintained by protocol specific code
            *
            * q is used only to reach the per-stack tcp_stack_t (through Q_TO_TCP);
            * level and name select the option; ptr receives the default, stored as
            * an int32_t.
            *
            * The options answered here are the four IPPROTO_TCP *_THRESHOLD options,
            * IP_TTL and IPV6_UNICAST_HOPS -- the same set of rows that carry
            * OP_DEF_FN in tcp_opt_arr.  Each value is read from the matching tcps_*
            * field of the stack instance.
            *
            * Returns sizeof (int) when a value was stored, or -1 for any other
            * level/name combination, in which case nothing is written through ptr.
 306  308   */
 307  309  /* ARGSUSED */
 308  310  static int
 309  311  tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
 310  312  {
 311  313          int32_t *i1 = (int32_t *)ptr;
 312  314          tcp_stack_t     *tcps = Q_TO_TCP(q)->tcp_tcps;
 313  315  
 314  316          switch (level) {
 315  317          case IPPROTO_TCP:
 316  318                  switch (name) {
 317  319                  case TCP_NOTIFY_THRESHOLD:
 318  320                          *i1 = tcps->tcps_ip_notify_interval;
 319  321                          break;
 320  322                  case TCP_ABORT_THRESHOLD:
 321  323                          *i1 = tcps->tcps_ip_abort_interval;
 322  324                          break;
 323  325                  case TCP_CONN_NOTIFY_THRESHOLD:
 324  326                          *i1 = tcps->tcps_ip_notify_cinterval;
 325  327                          break;
 326  328                  case TCP_CONN_ABORT_THRESHOLD:
 327  329                          *i1 = tcps->tcps_ip_abort_cinterval;
 328  330                          break;
 329  331                  default:
 330  332                          return (-1);
 331  333                  }
 332  334                  break;
 333  335          case IPPROTO_IP:
 334  336                  switch (name) {
 335  337                  case IP_TTL:
 336  338                          *i1 = tcps->tcps_ipv4_ttl;
 337  339                          break;
 338  340                  default:
 339  341                          return (-1);
 340  342                  }
 341  343                  break;
 342  344          case IPPROTO_IPV6:
 343  345                  switch (name) {
 344  346                  case IPV6_UNICAST_HOPS:
 345  347                          *i1 = tcps->tcps_ipv6_hoplimit;
 346  348                          break;
 347  349                  default:
 348  350                          return (-1);
 349  351                  }
 350  352                  break;
 351  353          default:
 352  354                  return (-1);
 353  355          }
 354  356          return (sizeof (int));
 355  357  }
 356  358  
 357  359  /*
 358  360   * TCP routine to get the values of options.
            *
            * connp is the connection being queried, level and name select the
            * option, and ptr is the caller's buffer; scalar options are written
            * through it as an int.
            *
            * Options that TCP answers itself return their length straight from the
            * switch below (IP_OPTIONS / T_IP_OPTIONS return whatever
            * ip_opt_get_user() returns).  Any request that merely breaks out of the
            * switch is handed to conn_opt_get() with conn_lock held, and that
            * call's result is returned unchanged.
            *
            * -1 is returned early for an IPPROTO_IP request on a socket whose
            * conn_family is not AF_INET, for an IPPROTO_IPV6 request when
            * conn_ipversion is not IPV6_VERSION, and for IPV6_PATHMTU before the
            * connection has reached TCPS_ESTABLISHED.
 359  361   */
 360  362  int
 361  363  tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 362  364  {
 363  365          int             *i1 = (int *)ptr;
 364  366          tcp_t           *tcp = connp->conn_tcp;
 365  367          conn_opt_arg_t  coas;
 366  368          int             retval;
 367  369  
 368  370          coas.coa_connp = connp;
 369  371          coas.coa_ixa = connp->conn_ixa;
 370  372          coas.coa_ipp = &connp->conn_xmit_ipp;
 371  373          coas.coa_ancillary = B_FALSE;
 372  374          coas.coa_changed = 0;
 373  375  
 374  376          switch (level) {
 375  377          case SOL_SOCKET:
 376  378                  switch (name) {
 377  379                  case SO_SND_COPYAVOID:
 378  380                          *i1 = tcp->tcp_snd_zcopy_on ?
 379  381                              SO_SND_COPYAVOID : 0;
 380  382                          return (sizeof (int));
 381  383                  case SO_ACCEPTCONN:
 382  384                          *i1 = (tcp->tcp_state == TCPS_LISTEN);
 383  385                          return (sizeof (int));
 384  386                  }
 385  387                  break;
 386  388          case IPPROTO_TCP:
 387  389                  switch (name) {
 388  390                  case TCP_NODELAY:
                           /* Reported as the option constant itself, not 1. */
 389  391                          *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
 390  392                          return (sizeof (int));
 391  393                  case TCP_MAXSEG:
 392  394                          *i1 = tcp->tcp_mss;
 393  395                          return (sizeof (int));
 394  396                  case TCP_NOTIFY_THRESHOLD:
 395  397                          *i1 = (int)tcp->tcp_first_timer_threshold;
 396  398                          return (sizeof (int));
 397  399                  case TCP_ABORT_THRESHOLD:
 398  400                          *i1 = tcp->tcp_second_timer_threshold;
 399  401                          return (sizeof (int));
 400  402                  case TCP_CONN_NOTIFY_THRESHOLD:
 401  403                          *i1 = tcp->tcp_first_ctimer_threshold;
 402  404                          return (sizeof (int));
 403  405                  case TCP_CONN_ABORT_THRESHOLD:
 404  406                          *i1 = tcp->tcp_second_ctimer_threshold;
 405  407                          return (sizeof (int));
 406  408                  case TCP_INIT_CWND:
 407  409                          *i1 = tcp->tcp_init_cwnd;
 408  410                          return (sizeof (int));
 409  411                  case TCP_KEEPALIVE_THRESHOLD:
 410  412                          *i1 = tcp->tcp_ka_interval;
 411  413                          return (sizeof (int));
 412  414  
 413  415                  /*
 414  416                   * TCP_KEEPIDLE expects value in seconds, but
 415  417                   * tcp_ka_interval is in milliseconds.
 416  418                   */
 417  419                  case TCP_KEEPIDLE:
 418  420                          *i1 = tcp->tcp_ka_interval / 1000;
 419  421                          return (sizeof (int));
 420  422                  case TCP_KEEPCNT:
 421  423                          *i1 = tcp->tcp_ka_cnt;
 422  424                          return (sizeof (int));
 423  425  
 424  426                  /*
 425  427                   * TCP_KEEPINTVL expects value in seconds, but
 426  428                   * tcp_ka_rinterval is in milliseconds.
 427  429                   */
 428  430                  case TCP_KEEPINTVL:
 429  431                          *i1 = tcp->tcp_ka_rinterval / 1000;
 430  432                          return (sizeof (int));
 431  433                  case TCP_KEEPALIVE_ABORT_THRESHOLD:
 432  434                          *i1 = tcp->tcp_ka_abort_thres;
 433  435                          return (sizeof (int));
 434  436                  case TCP_CORK:
 435  437                          *i1 = tcp->tcp_cork;
 436  438                          return (sizeof (int));
 437  439                  case TCP_RTO_INITIAL:
 438  440                          *i1 = tcp->tcp_rto_initial;
 439  441                          return (sizeof (uint32_t));
 440  442                  case TCP_RTO_MIN:
 441  443                          *i1 = tcp->tcp_rto_min;
 442  444                          return (sizeof (uint32_t));
 443  445                  case TCP_RTO_MAX:
 444  446                          *i1 = tcp->tcp_rto_max;
 445  447                          return (sizeof (uint32_t));
 446  448                  case TCP_LINGER2:
 447  449                          *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 448  450                          return (sizeof (int));
 449  451                  }
 450  452                  break;
 451  453          case IPPROTO_IP:
 452  454                  if (connp->conn_family != AF_INET)
 453  455                          return (-1);
 454  456                  switch (name) {
 455  457                  case IP_OPTIONS:
 456  458                  case T_IP_OPTIONS:
 457  459                          /* Caller ensures enough space */
 458  460                          return (ip_opt_get_user(connp, ptr));
 459  461                  default:
 460  462                          break;
 461  463                  }
 462  464                  break;
 463  465  
 464  466          case IPPROTO_IPV6:
 465  467                  /*
 466  468                   * IPPROTO_IPV6 options are only supported for sockets
 467  469                   * that are using IPv6 on the wire.
 468  470                   */
 469  471                  if (connp->conn_ipversion != IPV6_VERSION) {
 470  472                          return (-1);
 471  473                  }
 472  474                  switch (name) {
 473  475                  case IPV6_PATHMTU:
 474  476                          if (tcp->tcp_state < TCPS_ESTABLISHED)
 475  477                                  return (-1);
 476  478                          break;
  
    | 
      ↓ open down ↓ | 
    401 lines elided | 
    
      ↑ open up ↑ | 
  
 477  479                  }
 478  480                  break;
 479  481          }
                   /* Not answered above: defer to the generic conn-level lookup. */
 480  482          mutex_enter(&connp->conn_lock);
 481  483          retval = conn_opt_get(&coas, level, name, ptr);
 482  484          mutex_exit(&connp->conn_lock);
 483  485          return (retval);
 484  486  }
 485  487  
 486  488  /*
      489 + * Set a TCP connection's participation in SO_REUSEPORT.  This operation is
      490 + * performed under the protection of the squeue via tcp_setsockopt.
      491 + * The manipulation of tcp_rg_bind, as part of this operation, is subject to
      492 + * these constraints:
      493 + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
      494 + *    under the protection of the squeue.
      495 + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
      496 + *    altered until such time as tcp_free() cleans up the connection.
      497 + * 3. A connection undergoing bind, which matches to a connection participating
      498 + *    in port-reuse, will switch its tcp_rg_bind pointer when it joins the
      499 + *    group of an existing connection in tcp_bindi().
            *
            * connp is the connection; do_enable is the requested SO_REUSEPORT state.
            *
            * Returns 0 on success, which includes a request that matches the current
            * conn_reuseport state (nothing is changed in that case).  Returns EINVAL
            * for an enable request on a connection that is not IPCL_IS_NONSTR, or
            * for any request on a connection whose state is at or below TCPS_CLOSED.
            * Returns ENOMEM when tcp_rg_init() cannot allocate a new reuse group.
      500 + */
      501 +static int
      502 +tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
      503 +{
      504 +        tcp_t *tcp = connp->conn_tcp;
      505 +        struct tcp_rg_s *rg;
      506 +
      507 +        if (!IPCL_IS_NONSTR(connp)) {
      508 +                if (do_enable) {
      509 +                        /*
      510 +                         * SO_REUSEPORT cannot be enabled on sockets which have
      511 +                         * fallen back to the STREAMS API.
      512 +                         */
      513 +                        return (EINVAL);
      514 +                } else {
      515 +                        /*
      516 +                         * A connection with SO_REUSEPORT enabled should be
      517 +                         * prevented from falling back to STREAMS mode via
      518 +                         * logic in tcp_fallback.  It is legal, however, for
      519 +                         * fallen-back connections to affirm the disabled state
      520 +                         * of SO_REUSEPORT.
      521 +                         */
      522 +                        ASSERT(connp->conn_reuseport == 0);
      523 +                        return (0);
      524 +                }
      525 +        }
      526 +        if (tcp->tcp_state <= TCPS_CLOSED) {
      527 +                return (EINVAL);
      528 +        }
      529 +        if (connp->conn_reuseport == 0 && do_enable) {
      530 +                /* disabled -> enabled */
      531 +                if (tcp->tcp_rg_bind != NULL) {
      532 +                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
      533 +                } else {
      534 +                        /*
      535 +                         * Connection state is not a concern when initially
      536 +                         * populating tcp_rg_bind.  Setting it to non-NULL on a
      537 +                         * bound or listening connection would only mean that
      538 +                         * new reused-port binds become a possibility.
      539 +                         */
      540 +                        if ((rg = tcp_rg_init(tcp)) == NULL) {
      541 +                                return (ENOMEM);
      542 +                        }
      543 +                        tcp->tcp_rg_bind = rg;
      544 +                }
      545 +                connp->conn_reuseport = 1;
      546 +        } else if (connp->conn_reuseport != 0 && !do_enable) {
      547 +                /* enabled -> disabled */
      548 +                ASSERT(tcp->tcp_rg_bind != NULL);
      549 +                if (tcp->tcp_state == TCPS_IDLE) {
      550 +                        /*
      551 +                         * If the connection has not been bound yet, discard
      552 +                         * the reuse group state.  Since disabling SO_REUSEPORT
      553 +                         * on a bound socket will _not_ prevent others from
      554 +                         * reusing the port, the presence of tcp_rg_bind is
      555 +                         * used to determine reuse availability, not
      556 +                         * conn_reuseport.
      557 +                         *
      558 +                         * This allows proper behavior for examples such as:
      559 +                         *
      560 +                         * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
      561 +                         * bind(fd1, &myaddr, ...);
      562 +                         * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
      563 +                         *
      564 +                         * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
      565 +                         * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
      566 +                         *
      567 +                         */
      568 +                        rg = tcp->tcp_rg_bind;
      569 +                        tcp->tcp_rg_bind = NULL;
      570 +                        VERIFY(tcp_rg_remove(rg, tcp));
      571 +                        tcp_rg_destroy(rg);
      572 +                } else {
      573 +                        /*
      574 +                         * If a connection has been bound, it's no longer safe
      575 +                         * to manipulate tcp_rg_bind until connection clean-up
      576 +                         * during tcp_free.  Just mark the member status of the
      577 +                         * connection as inactive.
      578 +                         */
      579 +                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
      580 +                }
      581 +                connp->conn_reuseport = 0;
      582 +        }
      583 +        return (0);
      584 +}
      585 +
      586 +/*
 487  587   * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 488  588   * Parameters are assumed to be verified by the caller.
 489  589   */
 490  590  /* ARGSUSED */
 491  591  int
 492  592  tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 493  593      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 494  594      void *thisdg_attrs, cred_t *cr)
 495  595  {
 496  596          tcp_t   *tcp = connp->conn_tcp;
 497  597          int     *i1 = (int *)invalp;
 498  598          boolean_t onoff = (*i1 == 0) ? 0 : 1;
 499  599          boolean_t checkonly;
 500  600          int     reterr;
 501  601          tcp_stack_t     *tcps = tcp->tcp_tcps;
 502  602          conn_opt_arg_t  coas;
 503  603          uint32_t        val = *((uint32_t *)invalp);
 504  604  
 505  605          coas.coa_connp = connp;
 506  606          coas.coa_ixa = connp->conn_ixa;
 507  607          coas.coa_ipp = &connp->conn_xmit_ipp;
 508  608          coas.coa_ancillary = B_FALSE;
 509  609          coas.coa_changed = 0;
 510  610  
 511  611          switch (optset_context) {
 512  612          case SETFN_OPTCOM_CHECKONLY:
 513  613                  checkonly = B_TRUE;
 514  614                  /*
 515  615                   * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
 516  616                   * inlen != 0 implies value supplied and
 517  617                   *      we have to "pretend" to set it.
 518  618                   * inlen == 0 implies that there is no
 519  619                   *      value part in T_CHECK request and just validation
 520  620                   * done elsewhere should be enough, we just return here.
 521  621                   */
 522  622                  if (inlen == 0) {
 523  623                          *outlenp = 0;
 524  624                          return (0);
 525  625                  }
 526  626                  break;
 527  627          case SETFN_OPTCOM_NEGOTIATE:
 528  628                  checkonly = B_FALSE;
 529  629                  break;
 530  630          case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
 531  631          case SETFN_CONN_NEGOTIATE:
 532  632                  checkonly = B_FALSE;
 533  633                  /*
 534  634                   * Negotiating local and "association-related" options
 535  635                   * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
 536  636                   * primitives is allowed by XTI, but we choose
 537  637                   * to not implement this style negotiation for Internet
 538  638                   * protocols (We interpret it as a must for OSI world but
 539  639                   * optional for Internet protocols) for all options.
 540  640                   * [ Will do only for the few options that enable test
 541  641                   * suites that our XTI implementation of this feature
 542  642                   * works for transports that do allow it ]
 543  643                   */
 544  644                  if (!tcp_allow_connopt_set(level, name)) {
 545  645                          *outlenp = 0;
 546  646                          return (EINVAL);
 547  647                  }
 548  648                  break;
 549  649          default:
 550  650                  /*
 551  651                   * We should never get here
 552  652                   */
 553  653                  *outlenp = 0;
 554  654                  return (EINVAL);
 555  655          }
 556  656  
 557  657          ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
 558  658              (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
 559  659  
 560  660          /*
 561  661           * For TCP, we should have no ancillary data sent down
 562  662           * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
 563  663           * has to be zero.
 564  664           */
 565  665          ASSERT(thisdg_attrs == NULL);
 566  666  
 567  667          /*
 568  668           * For fixed length options, no sanity check
 569  669           * of passed in length is done. It is assumed *_optcom_req()
 570  670           * routines do the right thing.
 571  671           */
 572  672          switch (level) {
 573  673          case SOL_SOCKET:
 574  674                  switch (name) {
 575  675                  case SO_KEEPALIVE:
 576  676                          if (checkonly) {
 577  677                                  /* check only case */
 578  678                                  break;
 579  679                          }
 580  680  
 581  681                          if (!onoff) {
 582  682                                  if (connp->conn_keepalive) {
 583  683                                          if (tcp->tcp_ka_tid != 0) {
 584  684                                                  (void) TCP_TIMER_CANCEL(tcp,
 585  685                                                      tcp->tcp_ka_tid);
 586  686                                                  tcp->tcp_ka_tid = 0;
 587  687                                          }
 588  688                                          connp->conn_keepalive = 0;
 589  689                                  }
 590  690                                  break;
 591  691                          }
 592  692                          if (!connp->conn_keepalive) {
 593  693                                  /* Crank up the keepalive timer */
 594  694                                  tcp->tcp_ka_last_intrvl = 0;
 595  695                                  tcp->tcp_ka_tid = TCP_TIMER(tcp,
 596  696                                      tcp_keepalive_timer, tcp->tcp_ka_interval);
 597  697                                  connp->conn_keepalive = 1;
 598  698                          }
 599  699                          break;
 600  700                  case SO_SNDBUF: {
 601  701                          if (*i1 > tcps->tcps_max_buf) {
 602  702                                  *outlenp = 0;
 603  703                                  return (ENOBUFS);
 604  704                          }
 605  705                          if (checkonly)
 606  706                                  break;
 607  707  
 608  708                          connp->conn_sndbuf = *i1;
 609  709                          if (tcps->tcps_snd_lowat_fraction != 0) {
 610  710                                  connp->conn_sndlowat = connp->conn_sndbuf /
 611  711                                      tcps->tcps_snd_lowat_fraction;
 612  712                          }
 613  713                          (void) tcp_maxpsz_set(tcp, B_TRUE);
 614  714                          /*
 615  715                           * If we are flow-controlled, recheck the condition.
 616  716                           * There are apps that increase SO_SNDBUF size when
 617  717                           * flow-controlled (EWOULDBLOCK), and expect the flow
 618  718                           * control condition to be lifted right away.
 619  719                           */
 620  720                          mutex_enter(&tcp->tcp_non_sq_lock);
 621  721                          if (tcp->tcp_flow_stopped &&
 622  722                              TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
 623  723                                  tcp_clrqfull(tcp);
 624  724                          }
 625  725                          mutex_exit(&tcp->tcp_non_sq_lock);
 626  726                          *outlenp = inlen;
 627  727                          return (0);
 628  728                  }
 629  729                  case SO_RCVBUF:
 630  730                          if (*i1 > tcps->tcps_max_buf) {
 631  731                                  *outlenp = 0;
 632  732                                  return (ENOBUFS);
 633  733                          }
 634  734                          /* Silently ignore zero */
 635  735                          if (!checkonly && *i1 != 0) {
 636  736                                  *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
 637  737                                  (void) tcp_rwnd_set(tcp, *i1);
 638  738                          }
 639  739                          /*
 640  740                           * XXX should we return the rwnd here
 641  741                           * and tcp_opt_get ?
 642  742                           */
 643  743                          *outlenp = inlen;
 644  744                          return (0);
 645  745                  case SO_SND_COPYAVOID:
  
    | 
      ↓ open down ↓ | 
    149 lines elided | 
    
      ↑ open up ↑ | 
  
 646  746                          if (!checkonly) {
 647  747                                  if (tcp->tcp_loopback ||
 648  748                                      (onoff != 1) || !tcp_zcopy_check(tcp)) {
 649  749                                          *outlenp = 0;
 650  750                                          return (EOPNOTSUPP);
 651  751                                  }
 652  752                                  tcp->tcp_snd_zcopy_aware = 1;
 653  753                          }
 654  754                          *outlenp = inlen;
 655  755                          return (0);
      756 +                case SO_REUSEPORT:
      757 +                        if (!checkonly) {
      758 +                                return (tcp_set_reuseport(connp, *i1 != 0));
      759 +                        }
      760 +                        return (0);
 656  761                  }
 657  762                  break;
 658  763          case IPPROTO_TCP:
 659  764                  switch (name) {
 660  765                  case TCP_NODELAY:
 661  766                          if (!checkonly)
 662  767                                  tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 663  768                          break;
 664  769                  case TCP_NOTIFY_THRESHOLD:
 665  770                          if (!checkonly)
 666  771                                  tcp->tcp_first_timer_threshold = *i1;
 667  772                          break;
 668  773                  case TCP_ABORT_THRESHOLD:
 669  774                          if (!checkonly)
 670  775                                  tcp->tcp_second_timer_threshold = *i1;
 671  776                          break;
 672  777                  case TCP_CONN_NOTIFY_THRESHOLD:
 673  778                          if (!checkonly)
 674  779                                  tcp->tcp_first_ctimer_threshold = *i1;
 675  780                          break;
 676  781                  case TCP_CONN_ABORT_THRESHOLD:
 677  782                          if (!checkonly)
 678  783                                  tcp->tcp_second_ctimer_threshold = *i1;
 679  784                          break;
 680  785                  case TCP_RECVDSTADDR:
 681  786                          if (tcp->tcp_state > TCPS_LISTEN) {
 682  787                                  *outlenp = 0;
 683  788                                  return (EOPNOTSUPP);
 684  789                          }
 685  790                          /* Setting done in conn_opt_set */
 686  791                          break;
 687  792                  case TCP_INIT_CWND:
 688  793                          if (checkonly)
 689  794                                  break;
 690  795  
 691  796                          /*
 692  797                           * Only allow socket with network configuration
 693  798                           * privilege to set the initial cwnd to be larger
 694  799                           * than allowed by RFC 3390.
 695  800                           */
 696  801                          if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
 697  802                                  if ((reterr = secpolicy_ip_config(cr, B_TRUE))
 698  803                                      != 0) {
 699  804                                          *outlenp = 0;
 700  805                                          return (reterr);
 701  806                                  }
 702  807                                  if (val > tcp_max_init_cwnd) {
 703  808                                          *outlenp = 0;
 704  809                                          return (EINVAL);
 705  810                                  }
 706  811                          }
 707  812  
 708  813                          tcp->tcp_init_cwnd = val;
 709  814  
 710  815                          /*
 711  816                           * If the socket is connected, AND no outbound data
 712  817                           * has been sent, reset the actual cwnd values.
 713  818                           */
 714  819                          if (tcp->tcp_state == TCPS_ESTABLISHED &&
 715  820                              tcp->tcp_iss == tcp->tcp_snxt - 1) {
 716  821                                  tcp->tcp_cwnd =
 717  822                                      MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
 718  823                          }
 719  824                          break;
 720  825  
 721  826                  /*
 722  827                   * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
 723  828                   * is in milliseconds. TCP_KEEPIDLE is introduced for
 724  829                   * compatibility with other Unix flavors.
 725  830                   * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
 726  831                   * converting the input to milliseconds.
 727  832                   */
 728  833                  case TCP_KEEPIDLE:
 729  834                          *i1 *= 1000;
 730  835                          /* FALLTHRU */
 731  836  
 732  837                  case TCP_KEEPALIVE_THRESHOLD:
 733  838                          if (checkonly)
 734  839                                  break;
 735  840  
 736  841                          if (*i1 < tcps->tcps_keepalive_interval_low ||
 737  842                              *i1 > tcps->tcps_keepalive_interval_high) {
 738  843                                  *outlenp = 0;
 739  844                                  return (EINVAL);
 740  845                          }
 741  846                          if (*i1 != tcp->tcp_ka_interval) {
 742  847                                  tcp->tcp_ka_interval = *i1;
 743  848                                  /*
 744  849                                   * Check if we need to restart the
 745  850                                   * keepalive timer.
 746  851                                   */
 747  852                                  if (tcp->tcp_ka_tid != 0) {
 748  853                                          ASSERT(connp->conn_keepalive);
 749  854                                          (void) TCP_TIMER_CANCEL(tcp,
 750  855                                              tcp->tcp_ka_tid);
 751  856                                          tcp->tcp_ka_last_intrvl = 0;
 752  857                                          tcp->tcp_ka_tid = TCP_TIMER(tcp,
 753  858                                              tcp_keepalive_timer,
 754  859                                              tcp->tcp_ka_interval);
 755  860                                  }
 756  861                          }
 757  862                          break;
 758  863  
 759  864                  /*
 760  865                   * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
 761  866                   * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
  
    | 
      ↓ open down ↓ | 
    96 lines elided | 
    
      ↑ open up ↑ | 
  
 762  867                   * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 763  868                   * tcp_ka_cnt.
 764  869                   */
 765  870                  case TCP_KEEPCNT:
 766  871                          if (checkonly)
 767  872                                  break;
 768  873  
 769  874                          if (*i1 == 0) {
 770  875                                  return (EINVAL);
 771  876                          } else if (tcp->tcp_ka_rinterval == 0) {
 772      -                                if ((tcp->tcp_ka_abort_thres / *i1) <
 773      -                                    tcp->tcp_rto_min ||
 774      -                                    (tcp->tcp_ka_abort_thres / *i1) >
 775      -                                    tcp->tcp_rto_max)
 776      -                                        return (EINVAL);
      877 +                                /*
      878 +                                 * When TCP_KEEPCNT is specified without first
      879 +                                 * specifying a TCP_KEEPINTVL, we infer an
      880 +                                 * interval based on a tunable specific to our
      881 +                                 * stack: the tcp_keepalive_abort_interval.
      882 +                                 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
      883 +                                 * the unlikely event that that has been set.)
      884 +                                 * Given the abort interval's default value of
      885 +                                 * 480 seconds, low TCP_KEEPCNT values can
      886 +                                 * result in intervals that exceed the default
      887 +                                 * maximum RTO of 60 seconds.  Rather than
      888 +                                 * fail in these cases, we (implicitly) clamp
      889 +                                 * the interval at the maximum RTO; if the
      890 +                                 * TCP_KEEPCNT is shortly followed by a
      891 +                                 * TCP_KEEPINTVL (as we expect), the abort
      892 +                                 * threshold will be recalculated correctly --
      893 +                                 * and if a TCP_KEEPINTVL is not forthcoming,
      894 +                                 * keep-alive will at least operate reasonably
      895 +                                 * given the underconfigured state.
      896 +                                 */
      897 +                                uint32_t interval;
 777  898  
 778      -                                tcp->tcp_ka_rinterval =
 779      -                                    tcp->tcp_ka_abort_thres / *i1;
      899 +                                interval = tcp->tcp_ka_abort_thres / *i1;
      900 +
      901 +                                if (interval < tcp->tcp_rto_min)
      902 +                                        interval = tcp->tcp_rto_min;
      903 +
      904 +                                if (interval > tcp->tcp_rto_max)
      905 +                                        interval = tcp->tcp_rto_max;
      906 +
      907 +                                tcp->tcp_ka_rinterval = interval;
 780  908                          } else {
 781  909                                  if ((*i1 * tcp->tcp_ka_rinterval) <
 782  910                                      tcps->tcps_keepalive_abort_interval_low ||
 783  911                                      (*i1 * tcp->tcp_ka_rinterval) >
 784  912                                      tcps->tcps_keepalive_abort_interval_high)
 785  913                                          return (EINVAL);
 786  914                                  tcp->tcp_ka_abort_thres =
 787  915                                      (*i1 * tcp->tcp_ka_rinterval);
 788  916                          }
 789  917                          tcp->tcp_ka_cnt = *i1;
 790  918                          break;
 791  919                  case TCP_KEEPINTVL:
 792  920                          /*
 793  921                           * TCP_KEEPINTVL is specified in seconds, but
 794  922                           * tcp_ka_rinterval is in milliseconds.
 795  923                           */
 796  924  
 797  925                          if (checkonly)
 798  926                                  break;
 799  927  
 800  928                          if ((*i1 * 1000) < tcp->tcp_rto_min ||
 801  929                              (*i1 * 1000) > tcp->tcp_rto_max)
 802  930                                  return (EINVAL);
 803  931  
 804  932                          if (tcp->tcp_ka_cnt == 0) {
 805  933                                  tcp->tcp_ka_cnt =
 806  934                                      tcp->tcp_ka_abort_thres / (*i1 * 1000);
 807  935                          } else {
 808  936                                  if ((*i1 * tcp->tcp_ka_cnt * 1000) <
 809  937                                      tcps->tcps_keepalive_abort_interval_low ||
 810  938                                      (*i1 * tcp->tcp_ka_cnt * 1000) >
 811  939                                      tcps->tcps_keepalive_abort_interval_high)
 812  940                                          return (EINVAL);
 813  941                                  tcp->tcp_ka_abort_thres =
 814  942                                      (*i1 * tcp->tcp_ka_cnt * 1000);
 815  943                          }
 816  944                          tcp->tcp_ka_rinterval = *i1 * 1000;
 817  945                          break;
 818  946                  case TCP_KEEPALIVE_ABORT_THRESHOLD:
 819  947                          if (!checkonly) {
 820  948                                  if (*i1 <
 821  949                                      tcps->tcps_keepalive_abort_interval_low ||
 822  950                                      *i1 >
 823  951                                      tcps->tcps_keepalive_abort_interval_high) {
 824  952                                          *outlenp = 0;
 825  953                                          return (EINVAL);
 826  954                                  }
 827  955                                  tcp->tcp_ka_abort_thres = *i1;
 828  956                                  tcp->tcp_ka_cnt = 0;
 829  957                                  tcp->tcp_ka_rinterval = 0;
 830  958                          }
 831  959                          break;
 832  960                  case TCP_CORK:
 833  961                          if (!checkonly) {
 834  962                                  /*
 835  963                                   * if tcp->tcp_cork was set and is now
 836  964                                   * being unset, we have to make sure that
 837  965                                   * the remaining data gets sent out. Also
 838  966                                   * unset tcp->tcp_cork so that tcp_wput_data()
 839  967                                   * can send data even if it is less than mss
 840  968                                   */
 841  969                                  if (tcp->tcp_cork && onoff == 0 &&
 842  970                                      tcp->tcp_unsent > 0) {
 843  971                                          tcp->tcp_cork = B_FALSE;
 844  972                                          tcp_wput_data(tcp, NULL, B_FALSE);
 845  973                                  }
 846  974                                  tcp->tcp_cork = onoff;
 847  975                          }
 848  976                          break;
 849  977                  case TCP_RTO_INITIAL: {
 850  978                          clock_t rto;
 851  979  
 852  980                          if (checkonly || val == 0)
 853  981                                  break;
 854  982  
 855  983                          /*
 856  984                           * Sanity checks
 857  985                           *
 858  986                           * The initial RTO should be bounded by the minimum
 859  987                           * and maximum RTO.  And it should also be smaller
 860  988                           * than the connect attempt abort timeout.  Otherwise,
 861  989                           * the connection won't be aborted in a period
 862  990                           * reasonably close to that timeout.
 863  991                           */
 864  992                          if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
 865  993                              val > tcp->tcp_second_ctimer_threshold ||
 866  994                              val < tcps->tcps_rexmit_interval_initial_low ||
 867  995                              val > tcps->tcps_rexmit_interval_initial_high) {
 868  996                                  *outlenp = 0;
 869  997                                  return (EINVAL);
 870  998                          }
 871  999                          tcp->tcp_rto_initial = val;
 872 1000  
 873 1001                          /*
 874 1002                           * If TCP has not sent anything, need to re-calculate
 875 1003                           * tcp_rto.  Otherwise, this option change does not
 876 1004                           * really affect anything.
 877 1005                           */
 878 1006                          if (tcp->tcp_state >= TCPS_SYN_SENT)
 879 1007                                  break;
 880 1008  
 881 1009                          tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
 882 1010                          tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
 883 1011                          rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
 884 1012                              tcps->tcps_rexmit_interval_extra +
 885 1013                              (tcp->tcp_rtt_sa >> 5) +
 886 1014                              tcps->tcps_conn_grace_period;
 887 1015                          TCP_SET_RTO(tcp, rto);
 888 1016                          break;
 889 1017                  }
 890 1018                  case TCP_RTO_MIN:
 891 1019                          if (checkonly || val == 0)
 892 1020                                  break;
 893 1021  
 894 1022                          if (val < tcps->tcps_rexmit_interval_min_low ||
 895 1023                              val > tcps->tcps_rexmit_interval_min_high ||
 896 1024                              val > tcp->tcp_rto_max) {
 897 1025                                  *outlenp = 0;
 898 1026                                  return (EINVAL);
 899 1027                          }
 900 1028                          tcp->tcp_rto_min = val;
 901 1029                          if (tcp->tcp_rto < val)
 902 1030                                  tcp->tcp_rto = val;
 903 1031                          break;
 904 1032                  case TCP_RTO_MAX:
 905 1033                          if (checkonly || val == 0)
 906 1034                                  break;
 907 1035  
 908 1036                          /*
 909 1037                           * Sanity checks
 910 1038                           *
 911 1039                           * The maximum RTO should not be larger than the
 912 1040                           * connection abort timeout.  Otherwise, the
 913 1041                           * connection won't be aborted in a period reasonably
 914 1042                           * close to that timeout.
 915 1043                           */
 916 1044                          if (val < tcps->tcps_rexmit_interval_max_low ||
 917 1045                              val > tcps->tcps_rexmit_interval_max_high ||
 918 1046                              val < tcp->tcp_rto_min ||
 919 1047                              val > tcp->tcp_second_timer_threshold) {
 920 1048                                  *outlenp = 0;
 921 1049                                  return (EINVAL);
 922 1050                          }
 923 1051                          tcp->tcp_rto_max = val;
 924 1052                          if (tcp->tcp_rto > val)
 925 1053                                  tcp->tcp_rto = val;
 926 1054                          break;
 927 1055                  case TCP_LINGER2:
 928 1056                          if (checkonly || *i1 == 0)
 929 1057                                  break;
 930 1058  
 931 1059                          /*
 932 1060                           * Note that the option value's unit is seconds.  And
 933 1061                           * the value should be bigger than the private
 934 1062                           * parameter tcp_fin_wait_2_flush_interval's lower
 935 1063                           * bound and smaller than the current value of that
 936 1064                           * parameter.  It should be smaller than the current
 937 1065                           * value to avoid an app setting TCP_LINGER2 to a big
 938 1066                           * value, causing resource to be held up too long in
 939 1067                           * FIN-WAIT-2 state.
 940 1068                           */
 941 1069                          if (*i1 < 0 ||
 942 1070                              tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
 943 1071                              *i1 ||
 944 1072                              tcps->tcps_fin_wait_2_flush_interval/SECONDS <
 945 1073                              *i1) {
  
    | 
      ↓ open down ↓ | 
    156 lines elided | 
    
      ↑ open up ↑ | 
  
 946 1074                                  *outlenp = 0;
 947 1075                                  return (EINVAL);
 948 1076                          }
 949 1077                          tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
 950 1078                          break;
 951 1079                  default:
 952 1080                          break;
 953 1081                  }
 954 1082                  break;
 955 1083          case IPPROTO_IP:
 956      -                if (connp->conn_family != AF_INET) {
 957      -                        *outlenp = 0;
 958      -                        return (EINVAL);
 959      -                }
 960 1084                  switch (name) {
 961 1085                  case IP_SEC_OPT:
 962 1086                          /*
 963 1087                           * We should not allow policy setting after
 964 1088                           * we start listening for connections.
 965 1089                           */
 966 1090                          if (tcp->tcp_state == TCPS_LISTEN) {
 967 1091                                  return (EINVAL);
 968 1092                          }
 969 1093                          break;
 970 1094                  }
 971 1095                  break;
 972 1096          case IPPROTO_IPV6:
 973 1097                  /*
 974 1098                   * IPPROTO_IPV6 options are only supported for sockets
 975 1099                   * that are using IPv6 on the wire.
 976 1100                   */
 977 1101                  if (connp->conn_ipversion != IPV6_VERSION) {
 978 1102                          *outlenp = 0;
 979 1103                          return (EINVAL);
 980 1104                  }
 981 1105  
 982 1106                  switch (name) {
 983 1107                  case IPV6_RECVPKTINFO:
 984 1108                          if (!checkonly) {
 985 1109                                  /* Force it to be sent up with the next msg */
 986 1110                                  tcp->tcp_recvifindex = 0;
 987 1111                          }
 988 1112                          break;
 989 1113                  case IPV6_RECVTCLASS:
 990 1114                          if (!checkonly) {
 991 1115                                  /* Force it to be sent up with the next msg */
 992 1116                                  tcp->tcp_recvtclass = 0xffffffffU;
 993 1117                          }
 994 1118                          break;
 995 1119                  case IPV6_RECVHOPLIMIT:
 996 1120                          if (!checkonly) {
 997 1121                                  /* Force it to be sent up with the next msg */
 998 1122                                  tcp->tcp_recvhops = 0xffffffffU;
 999 1123                          }
1000 1124                          break;
1001 1125                  case IPV6_PKTINFO:
1002 1126                          /* This is an extra check for TCP */
1003 1127                          if (inlen == sizeof (struct in6_pktinfo)) {
1004 1128                                  struct in6_pktinfo *pkti;
1005 1129  
1006 1130                                  pkti = (struct in6_pktinfo *)invalp;
1007 1131                                  /*
1008 1132                                   * RFC 3542 states that ipi6_addr must be
1009 1133                                   * the unspecified address when setting the
1010 1134                                   * IPV6_PKTINFO sticky socket option on a
1011 1135                                   * TCP socket.
1012 1136                                   */
1013 1137                                  if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1014 1138                                          return (EINVAL);
1015 1139                          }
1016 1140                          break;
1017 1141                  case IPV6_SEC_OPT:
1018 1142                          /*
1019 1143                           * We should not allow policy setting after
1020 1144                           * we start listening for connections.
1021 1145                           */
1022 1146                          if (tcp->tcp_state == TCPS_LISTEN) {
1023 1147                                  return (EINVAL);
1024 1148                          }
1025 1149                          break;
1026 1150                  }
1027 1151                  break;
1028 1152          }
1029 1153          reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1030 1154              checkonly, cr);
1031 1155          if (reterr != 0) {
1032 1156                  *outlenp = 0;
1033 1157                  return (reterr);
1034 1158          }
1035 1159  
1036 1160          /*
1037 1161           * Common case of OK return with outval same as inval
1038 1162           */
1039 1163          if (invalp != outvalp) {
1040 1164                  /* don't trust bcopy for identical src/dst */
1041 1165                  (void) bcopy(invalp, outvalp, inlen);
1042 1166          }
1043 1167          *outlenp = inlen;
1044 1168  
1045 1169          if (coas.coa_changed & COA_HEADER_CHANGED) {
1046 1170                  /* If we are connected we rebuild the headers */
1047 1171                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1048 1172                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1049 1173                          reterr = tcp_build_hdrs(tcp);
1050 1174                          if (reterr != 0)
1051 1175                                  return (reterr);
1052 1176                  }
1053 1177          }
1054 1178          if (coas.coa_changed & COA_ROUTE_CHANGED) {
1055 1179                  in6_addr_t nexthop;
1056 1180  
1057 1181                  /*
1058 1182                   * If we are connected we re-cache the information.
1059 1183                   * We ignore errors to preserve BSD behavior.
1060 1184                   * Note that we don't redo IPsec policy lookup here
1061 1185                   * since the final destination (or source) didn't change.
1062 1186                   */
1063 1187                  ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1064 1188                      &connp->conn_faddr_v6, &nexthop);
1065 1189  
1066 1190                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1067 1191                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1068 1192                          (void) ip_attr_connect(connp, connp->conn_ixa,
1069 1193                              &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1070 1194                              &nexthop, connp->conn_fport, NULL, NULL,
1071 1195                              IPDF_VERIFY_DST);
1072 1196                  }
1073 1197          }
1074 1198          if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1075 1199                  connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1076 1200          }
1077 1201          if (coas.coa_changed & COA_WROFF_CHANGED) {
1078 1202                  connp->conn_wroff = connp->conn_ht_iphc_allocated +
1079 1203                      tcps->tcps_wroff_xtra;
1080 1204                  (void) proto_set_tx_wroff(connp->conn_rq, connp,
1081 1205                      connp->conn_wroff);
1082 1206          }
1083 1207          if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1084 1208                  if (IPCL_IS_NONSTR(connp))
1085 1209                          proto_set_rx_oob_opt(connp, onoff);
1086 1210          }
1087 1211          return (0);
1088 1212  }
  
    | 
      ↓ open down ↓ | 
    119 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX