Print this page
    
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/tcp/tcp_bind.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_bind.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       25 + * Copyright 2016 Joyent, Inc.
  25   26   */
  26   27  
  27   28  #include <sys/types.h>
  28   29  #include <sys/stream.h>
  29   30  #include <sys/strsun.h>
  30   31  #include <sys/strsubr.h>
  31   32  #include <sys/stropts.h>
  32   33  #include <sys/strlog.h>
  33   34  #define _SUN_TPI_VERSION 2
  34   35  #include <sys/tihdr.h>
  35   36  #include <sys/suntpi.h>
  36   37  #include <sys/xti_inet.h>
  37   38  #include <sys/policy.h>
  38   39  #include <sys/squeue_impl.h>
  39   40  #include <sys/squeue.h>
  40   41  #include <sys/tsol/tnet.h>
  41   42  
  42   43  #include <rpc/pmap_prot.h>
  43   44  
  44   45  #include <inet/common.h>
  45   46  #include <inet/ip.h>
  46   47  #include <inet/tcp.h>
  47   48  #include <inet/tcp_impl.h>
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  48   49  #include <inet/proto_set.h>
  49   50  #include <inet/ipsec_impl.h>
  50   51  
  51   52  /* Setable in /etc/system */
  52   53  /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  53   54  static uint32_t tcp_random_anon_port = 1;
  54   55  
  55   56  static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  56   57                      cred_t *cr);
  57   58  static in_port_t        tcp_get_next_priv_port(const tcp_t *);
       59 +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
  58   60  
  59   61  /*
  60   62   * Hash list insertion routine for tcp_t structures. Each hash bucket
  61   63   * contains a list of tcp_t entries, and each entry is bound to a unique
  62   64   * port. If there are multiple tcp_t's that are bound to the same port, then
  63   65   * one of them will be linked into the hash bucket list, and the rest will
  64   66   * hang off of that one entry. For each port, entries bound to a specific IP
  65   67   * address will be inserted before those those bound to INADDR_ANY.
  66   68   */
  67   69  void
  68   70  tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
  69   71  {
  70   72          tcp_t   **tcpp;
  71   73          tcp_t   *tcpnext;
  72   74          tcp_t   *tcphash;
  73   75          conn_t  *connp = tcp->tcp_connp;
  74   76          conn_t  *connext;
  75   77  
  76   78          if (tcp->tcp_ptpbhn != NULL) {
  77   79                  ASSERT(!caller_holds_lock);
  78   80                  tcp_bind_hash_remove(tcp);
  79   81          }
  80   82          tcpp = &tbf->tf_tcp;
  81   83          if (!caller_holds_lock) {
  82   84                  mutex_enter(&tbf->tf_lock);
  83   85          } else {
  84   86                  ASSERT(MUTEX_HELD(&tbf->tf_lock));
  85   87          }
  86   88          tcphash = tcpp[0];
  87   89          tcpnext = NULL;
  88   90          if (tcphash != NULL) {
  89   91                  /* Look for an entry using the same port */
  90   92                  while ((tcphash = tcpp[0]) != NULL &&
  91   93                      connp->conn_lport != tcphash->tcp_connp->conn_lport)
  92   94                          tcpp = &(tcphash->tcp_bind_hash);
  93   95  
  94   96                  /* The port was not found, just add to the end */
  95   97                  if (tcphash == NULL)
  96   98                          goto insert;
  97   99  
  98  100                  /*
  99  101                   * OK, there already exists an entry bound to the
 100  102                   * same port.
 101  103                   *
 102  104                   * If the new tcp bound to the INADDR_ANY address
 103  105                   * and the first one in the list is not bound to
 104  106                   * INADDR_ANY we skip all entries until we find the
 105  107                   * first one bound to INADDR_ANY.
 106  108                   * This makes sure that applications binding to a
 107  109                   * specific address get preference over those binding to
 108  110                   * INADDR_ANY.
 109  111                   */
 110  112                  tcpnext = tcphash;
 111  113                  connext = tcpnext->tcp_connp;
 112  114                  tcphash = NULL;
 113  115                  if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
 114  116                      !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
 115  117                          while ((tcpnext = tcpp[0]) != NULL) {
 116  118                                  connext = tcpnext->tcp_connp;
 117  119                                  if (!V6_OR_V4_INADDR_ANY(
 118  120                                      connext->conn_bound_addr_v6))
 119  121                                          tcpp = &(tcpnext->tcp_bind_hash_port);
 120  122                                  else
 121  123                                          break;
 122  124                          }
 123  125                          if (tcpnext != NULL) {
 124  126                                  tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
 125  127                                  tcphash = tcpnext->tcp_bind_hash;
 126  128                                  if (tcphash != NULL) {
 127  129                                          tcphash->tcp_ptpbhn =
 128  130                                              &(tcp->tcp_bind_hash);
 129  131                                          tcpnext->tcp_bind_hash = NULL;
 130  132                                  }
 131  133                          }
 132  134                  } else {
 133  135                          tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
 134  136                          tcphash = tcpnext->tcp_bind_hash;
 135  137                          if (tcphash != NULL) {
 136  138                                  tcphash->tcp_ptpbhn =
 137  139                                      &(tcp->tcp_bind_hash);
 138  140                                  tcpnext->tcp_bind_hash = NULL;
 139  141                          }
 140  142                  }
 141  143          }
 142  144  insert:
 143  145          tcp->tcp_bind_hash_port = tcpnext;
 144  146          tcp->tcp_bind_hash = tcphash;
 145  147          tcp->tcp_ptpbhn = tcpp;
 146  148          tcpp[0] = tcp;
 147  149          if (!caller_holds_lock)
 148  150                  mutex_exit(&tbf->tf_lock);
 149  151  }
 150  152  
 151  153  /*
 152  154   * Hash list removal routine for tcp_t structures.
 153  155   */
 154  156  void
 155  157  tcp_bind_hash_remove(tcp_t *tcp)
 156  158  {
 157  159          tcp_t   *tcpnext;
 158  160          kmutex_t *lockp;
 159  161          tcp_stack_t     *tcps = tcp->tcp_tcps;
 160  162          conn_t          *connp = tcp->tcp_connp;
 161  163  
 162  164          if (tcp->tcp_ptpbhn == NULL)
 163  165                  return;
 164  166  
  
    | 
      ↓ open down ↓ | 
    97 lines elided | 
    
      ↑ open up ↑ | 
  
 165  167          /*
 166  168           * Extract the lock pointer in case there are concurrent
 167  169           * hash_remove's for this instance.
 168  170           */
 169  171          ASSERT(connp->conn_lport != 0);
 170  172          lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
 171  173              connp->conn_lport)].tf_lock;
 172  174  
 173  175          ASSERT(lockp != NULL);
 174  176          mutex_enter(lockp);
      177 +
      178 +        /* destroy any association with SO_REUSEPORT group */
      179 +        if (tcp->tcp_rg_bind != NULL) {
      180 +                if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
      181 +                        /* Last one out turns off the lights */
      182 +                        tcp_rg_destroy(tcp->tcp_rg_bind);
      183 +                }
      184 +                tcp->tcp_rg_bind = NULL;
      185 +        }
      186 +
 175  187          if (tcp->tcp_ptpbhn) {
 176  188                  tcpnext = tcp->tcp_bind_hash_port;
 177  189                  if (tcpnext != NULL) {
 178  190                          tcp->tcp_bind_hash_port = NULL;
 179  191                          tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 180  192                          tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
 181  193                          if (tcpnext->tcp_bind_hash != NULL) {
 182  194                                  tcpnext->tcp_bind_hash->tcp_ptpbhn =
 183  195                                      &(tcpnext->tcp_bind_hash);
 184  196                                  tcp->tcp_bind_hash = NULL;
 185  197                          }
 186  198                  } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
 187  199                          tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 188  200                          tcp->tcp_bind_hash = NULL;
 189  201                  }
 190  202                  *tcp->tcp_ptpbhn = tcpnext;
 191  203                  tcp->tcp_ptpbhn = NULL;
 192  204          }
 193  205          mutex_exit(lockp);
 194  206  }
 195  207  
 196  208  /*
 197  209   * Don't let port fall into the privileged range.
 198  210   * Since the extra privileged ports can be arbitrary we also
 199  211   * ensure that we exclude those from consideration.
 200  212   * tcp_g_epriv_ports is not sorted thus we loop over it until
 201  213   * there are no changes.
 202  214   *
 203  215   * Note: No locks are held when inspecting tcp_g_*epriv_ports
 204  216   * but instead the code relies on:
 205  217   * - the fact that the address of the array and its size never changes
 206  218   * - the atomic assignment of the elements of the array
 207  219   *
 208  220   * Returns 0 if there are no more ports available.
 209  221   *
 210  222   * TS note: skip multilevel ports.
 211  223   */
 212  224  in_port_t
 213  225  tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
 214  226  {
 215  227          int i, bump;
 216  228          boolean_t restart = B_FALSE;
 217  229          tcp_stack_t *tcps = tcp->tcp_tcps;
 218  230  
 219  231          if (random && tcp_random_anon_port != 0) {
 220  232                  (void) random_get_pseudo_bytes((uint8_t *)&port,
 221  233                      sizeof (in_port_t));
 222  234                  /*
 223  235                   * Unless changed by a sys admin, the smallest anon port
 224  236                   * is 32768 and the largest anon port is 65535.  It is
 225  237                   * very likely (50%) for the random port to be smaller
 226  238                   * than the smallest anon port.  When that happens,
 227  239                   * add port % (anon port range) to the smallest anon
 228  240                   * port to get the random port.  It should fall into the
 229  241                   * valid anon port range.
 230  242                   */
 231  243                  if ((port < tcps->tcps_smallest_anon_port) ||
 232  244                      (port > tcps->tcps_largest_anon_port)) {
 233  245                          if (tcps->tcps_smallest_anon_port ==
 234  246                              tcps->tcps_largest_anon_port) {
 235  247                                  bump = 0;
 236  248                          } else {
 237  249                                  bump = port % (tcps->tcps_largest_anon_port -
 238  250                                      tcps->tcps_smallest_anon_port);
 239  251                          }
 240  252                          port = tcps->tcps_smallest_anon_port + bump;
 241  253                  }
 242  254          }
 243  255  
 244  256  retry:
 245  257          if (port < tcps->tcps_smallest_anon_port)
 246  258                  port = (in_port_t)tcps->tcps_smallest_anon_port;
 247  259  
 248  260          if (port > tcps->tcps_largest_anon_port) {
 249  261                  if (restart)
 250  262                          return (0);
 251  263                  restart = B_TRUE;
 252  264                  port = (in_port_t)tcps->tcps_smallest_anon_port;
 253  265          }
 254  266  
 255  267          if (port < tcps->tcps_smallest_nonpriv_port)
 256  268                  port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
 257  269  
 258  270          for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
 259  271                  if (port == tcps->tcps_g_epriv_ports[i]) {
 260  272                          port++;
 261  273                          /*
 262  274                           * Make sure whether the port is in the
 263  275                           * valid range.
 264  276                           */
 265  277                          goto retry;
 266  278                  }
 267  279          }
 268  280          if (is_system_labeled() &&
 269  281              (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
 270  282              IPPROTO_TCP, B_TRUE)) != 0) {
 271  283                  port = i;
 272  284                  goto retry;
 273  285          }
 274  286          return (port);
 275  287  }
 276  288  
 277  289  /*
 278  290   * Return the next anonymous port in the privileged port range for
 279  291   * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 280  292   * downwards.  This is the same behavior as documented in the userland
 281  293   * library call rresvport(3N).
 282  294   *
 283  295   * TS note: skip multilevel ports.
 284  296   */
 285  297  static in_port_t
 286  298  tcp_get_next_priv_port(const tcp_t *tcp)
 287  299  {
 288  300          static in_port_t next_priv_port = IPPORT_RESERVED - 1;
 289  301          in_port_t nextport;
 290  302          boolean_t restart = B_FALSE;
 291  303          tcp_stack_t *tcps = tcp->tcp_tcps;
 292  304  retry:
 293  305          if (next_priv_port < tcps->tcps_min_anonpriv_port ||
 294  306              next_priv_port >= IPPORT_RESERVED) {
 295  307                  next_priv_port = IPPORT_RESERVED - 1;
 296  308                  if (restart)
 297  309                          return (0);
 298  310                  restart = B_TRUE;
 299  311          }
 300  312          if (is_system_labeled() &&
 301  313              (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
 302  314              next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
 303  315                  next_priv_port = nextport;
 304  316                  goto retry;
 305  317          }
 306  318          return (next_priv_port--);
 307  319  }
 308  320  
 309  321  static int
 310  322  tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
 311  323      boolean_t bind_to_req_port_only, cred_t *cr)
 312  324  {
 313  325          in_port_t       mlp_port;
 314  326          mlp_type_t      addrtype, mlptype;
 315  327          boolean_t       user_specified;
 316  328          in_port_t       allocated_port;
 317  329          in_port_t       requested_port = *requested_port_ptr;
 318  330          conn_t          *connp = tcp->tcp_connp;
 319  331          zone_t          *zone;
 320  332          tcp_stack_t     *tcps = tcp->tcp_tcps;
 321  333          in6_addr_t      v6addr = connp->conn_laddr_v6;
 322  334  
 323  335          /*
 324  336           * XXX It's up to the caller to specify bind_to_req_port_only or not.
 325  337           */
 326  338          ASSERT(cr != NULL);
 327  339  
 328  340          /*
 329  341           * Get a valid port (within the anonymous range and should not
 330  342           * be a privileged one) to use if the user has not given a port.
 331  343           * If multiple threads are here, they may all start with
 332  344           * with the same initial port. But, it should be fine as long as
 333  345           * tcp_bindi will ensure that no two threads will be assigned
 334  346           * the same port.
 335  347           *
 336  348           * NOTE: XXX If a privileged process asks for an anonymous port, we
 337  349           * still check for ports only in the range > tcp_smallest_non_priv_port,
 338  350           * unless TCP_ANONPRIVBIND option is set.
 339  351           */
 340  352          mlptype = mlptSingle;
 341  353          mlp_port = requested_port;
 342  354          if (requested_port == 0) {
 343  355                  requested_port = connp->conn_anon_priv_bind ?
 344  356                      tcp_get_next_priv_port(tcp) :
 345  357                      tcp_update_next_port(tcps->tcps_next_port_to_try,
 346  358                      tcp, B_TRUE);
 347  359                  if (requested_port == 0) {
 348  360                          return (-TNOADDR);
 349  361                  }
 350  362                  user_specified = B_FALSE;
 351  363  
 352  364                  /*
 353  365                   * If the user went through one of the RPC interfaces to create
 354  366                   * this socket and RPC is MLP in this zone, then give him an
 355  367                   * anonymous MLP.
 356  368                   */
 357  369                  if (connp->conn_anon_mlp && is_system_labeled()) {
 358  370                          zone = crgetzone(cr);
 359  371                          addrtype = tsol_mlp_addr_type(
 360  372                              connp->conn_allzones ? ALL_ZONES : zone->zone_id,
 361  373                              IPV6_VERSION, &v6addr,
 362  374                              tcps->tcps_netstack->netstack_ip);
 363  375                          if (addrtype == mlptSingle) {
 364  376                                  return (-TNOADDR);
 365  377                          }
 366  378                          mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
 367  379                              PMAPPORT, addrtype);
 368  380                          mlp_port = PMAPPORT;
 369  381                  }
 370  382          } else {
 371  383                  int i;
 372  384                  boolean_t priv = B_FALSE;
 373  385  
 374  386                  /*
 375  387                   * If the requested_port is in the well-known privileged range,
 376  388                   * verify that the stream was opened by a privileged user.
 377  389                   * Note: No locks are held when inspecting tcp_g_*epriv_ports
 378  390                   * but instead the code relies on:
 379  391                   * - the fact that the address of the array and its size never
 380  392                   *   changes
 381  393                   * - the atomic assignment of the elements of the array
 382  394                   */
 383  395                  if (requested_port < tcps->tcps_smallest_nonpriv_port) {
 384  396                          priv = B_TRUE;
 385  397                  } else {
 386  398                          for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
 387  399                                  if (requested_port ==
 388  400                                      tcps->tcps_g_epriv_ports[i]) {
 389  401                                          priv = B_TRUE;
 390  402                                          break;
 391  403                                  }
 392  404                          }
 393  405                  }
 394  406                  if (priv) {
 395  407                          if (secpolicy_net_privaddr(cr, requested_port,
 396  408                              IPPROTO_TCP) != 0) {
 397  409                                  if (connp->conn_debug) {
 398  410                                          (void) strlog(TCP_MOD_ID, 0, 1,
 399  411                                              SL_ERROR|SL_TRACE,
 400  412                                              "tcp_bind: no priv for port %d",
 401  413                                              requested_port);
 402  414                                  }
 403  415                                  return (-TACCES);
 404  416                          }
 405  417                  }
 406  418                  user_specified = B_TRUE;
 407  419  
 408  420                  connp = tcp->tcp_connp;
 409  421                  if (is_system_labeled()) {
 410  422                          zone = crgetzone(cr);
 411  423                          addrtype = tsol_mlp_addr_type(
 412  424                              connp->conn_allzones ? ALL_ZONES : zone->zone_id,
 413  425                              IPV6_VERSION, &v6addr,
 414  426                              tcps->tcps_netstack->netstack_ip);
 415  427                          if (addrtype == mlptSingle) {
 416  428                                  return (-TNOADDR);
 417  429                          }
 418  430                          mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
 419  431                              requested_port, addrtype);
 420  432                  }
 421  433          }
 422  434  
 423  435          if (mlptype != mlptSingle) {
 424  436                  if (secpolicy_net_bindmlp(cr) != 0) {
 425  437                          if (connp->conn_debug) {
 426  438                                  (void) strlog(TCP_MOD_ID, 0, 1,
 427  439                                      SL_ERROR|SL_TRACE,
 428  440                                      "tcp_bind: no priv for multilevel port %d",
 429  441                                      requested_port);
 430  442                          }
 431  443                          return (-TACCES);
 432  444                  }
 433  445  
 434  446                  /*
 435  447                   * If we're specifically binding a shared IP address and the
 436  448                   * port is MLP on shared addresses, then check to see if this
 437  449                   * zone actually owns the MLP.  Reject if not.
 438  450                   */
 439  451                  if (mlptype == mlptShared && addrtype == mlptShared) {
 440  452                          /*
 441  453                           * No need to handle exclusive-stack zones since
 442  454                           * ALL_ZONES only applies to the shared stack.
 443  455                           */
 444  456                          zoneid_t mlpzone;
 445  457  
 446  458                          mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
 447  459                              htons(mlp_port));
 448  460                          if (connp->conn_zoneid != mlpzone) {
 449  461                                  if (connp->conn_debug) {
 450  462                                          (void) strlog(TCP_MOD_ID, 0, 1,
 451  463                                              SL_ERROR|SL_TRACE,
 452  464                                              "tcp_bind: attempt to bind port "
 453  465                                              "%d on shared addr in zone %d "
 454  466                                              "(should be %d)",
 455  467                                              mlp_port, connp->conn_zoneid,
 456  468                                              mlpzone);
 457  469                                  }
 458  470                                  return (-TACCES);
 459  471                          }
 460  472                  }
 461  473  
 462  474                  if (!user_specified) {
 463  475                          int err;
 464  476                          err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
 465  477                              requested_port, B_TRUE);
 466  478                          if (err != 0) {
 467  479                                  if (connp->conn_debug) {
 468  480                                          (void) strlog(TCP_MOD_ID, 0, 1,
 469  481                                              SL_ERROR|SL_TRACE,
 470  482                                              "tcp_bind: cannot establish anon "
 471  483                                              "MLP for port %d",
 472  484                                              requested_port);
 473  485                                  }
 474  486                                  return (err);
 475  487                          }
 476  488                          connp->conn_anon_port = B_TRUE;
 477  489                  }
 478  490                  connp->conn_mlp_type = mlptype;
 479  491          }
 480  492  
 481  493          allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
 482  494              connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
 483  495              user_specified);
 484  496  
 485  497          if (allocated_port == 0) {
 486  498                  connp->conn_mlp_type = mlptSingle;
 487  499                  if (connp->conn_anon_port) {
 488  500                          connp->conn_anon_port = B_FALSE;
 489  501                          (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
 490  502                              requested_port, B_FALSE);
 491  503                  }
 492  504                  if (bind_to_req_port_only) {
 493  505                          if (connp->conn_debug) {
 494  506                                  (void) strlog(TCP_MOD_ID, 0, 1,
 495  507                                      SL_ERROR|SL_TRACE,
 496  508                                      "tcp_bind: requested addr busy");
 497  509                          }
 498  510                          return (-TADDRBUSY);
 499  511                  } else {
 500  512                          /* If we are out of ports, fail the bind. */
 501  513                          if (connp->conn_debug) {
 502  514                                  (void) strlog(TCP_MOD_ID, 0, 1,
 503  515                                      SL_ERROR|SL_TRACE,
 504  516                                      "tcp_bind: out of ports?");
 505  517                          }
 506  518                          return (-TNOADDR);
 507  519                  }
 508  520          }
 509  521  
 510  522          /* Pass the allocated port back */
 511  523          *requested_port_ptr = allocated_port;
 512  524          return (0);
 513  525  }
 514  526  
 515  527  /*
 516  528   * Check the address and check/pick a local port number.
 517  529   */
 518  530  int
 519  531  tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
 520  532      boolean_t bind_to_req_port_only)
 521  533  {
 522  534          tcp_t   *tcp = connp->conn_tcp;
 523  535          sin_t   *sin;
 524  536          sin6_t  *sin6;
 525  537          in_port_t       requested_port;
 526  538          ipaddr_t        v4addr;
 527  539          in6_addr_t      v6addr;
 528  540          ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
 529  541          zoneid_t        zoneid = IPCL_ZONEID(connp);
 530  542          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 531  543          uint_t          scopeid = 0;
 532  544          int             error = 0;
 533  545          ip_xmit_attr_t  *ixa = connp->conn_ixa;
 534  546  
 535  547          ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
 536  548  
 537  549          if (tcp->tcp_state == TCPS_BOUND) {
 538  550                  return (0);
 539  551          } else if (tcp->tcp_state > TCPS_BOUND) {
 540  552                  if (connp->conn_debug) {
 541  553                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 542  554                              "tcp_bind: bad state, %d", tcp->tcp_state);
 543  555                  }
 544  556                  return (-TOUTSTATE);
 545  557          }
 546  558  
 547  559          ASSERT(sa != NULL && len != 0);
 548  560  
 549  561          if (!OK_32PTR((char *)sa)) {
 550  562                  if (connp->conn_debug) {
 551  563                          (void) strlog(TCP_MOD_ID, 0, 1,
 552  564                              SL_ERROR|SL_TRACE,
 553  565                              "tcp_bind: bad address parameter, "
 554  566                              "address %p, len %d",
 555  567                              (void *)sa, len);
 556  568                  }
 557  569                  return (-TPROTO);
 558  570          }
 559  571  
 560  572          error = proto_verify_ip_addr(connp->conn_family, sa, len);
 561  573          if (error != 0) {
 562  574                  return (error);
 563  575          }
 564  576  
 565  577          switch (len) {
 566  578          case sizeof (sin_t):    /* Complete IPv4 address */
 567  579                  sin = (sin_t *)sa;
 568  580                  requested_port = ntohs(sin->sin_port);
 569  581                  v4addr = sin->sin_addr.s_addr;
 570  582                  IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
 571  583                  if (v4addr != INADDR_ANY) {
 572  584                          laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
 573  585                              B_FALSE);
 574  586                  }
 575  587                  break;
 576  588  
 577  589          case sizeof (sin6_t): /* Complete IPv6 address */
 578  590                  sin6 = (sin6_t *)sa;
 579  591                  v6addr = sin6->sin6_addr;
 580  592                  requested_port = ntohs(sin6->sin6_port);
 581  593                  if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
 582  594                          if (connp->conn_ipv6_v6only)
 583  595                                  return (EADDRNOTAVAIL);
 584  596  
 585  597                          IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
 586  598                          if (v4addr != INADDR_ANY) {
 587  599                                  laddr_type = ip_laddr_verify_v4(v4addr,
 588  600                                      zoneid, ipst, B_FALSE);
 589  601                          }
 590  602                  } else {
 591  603                          if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
 592  604                                  if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
 593  605                                          scopeid = sin6->sin6_scope_id;
 594  606                                  laddr_type = ip_laddr_verify_v6(&v6addr,
 595  607                                      zoneid, ipst, B_FALSE, scopeid);
 596  608                          }
 597  609                  }
 598  610                  break;
 599  611  
 600  612          default:
 601  613                  if (connp->conn_debug) {
 602  614                          (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 603  615                              "tcp_bind: bad address length, %d", len);
 604  616                  }
 605  617                  return (EAFNOSUPPORT);
 606  618                  /* return (-TBADADDR); */
 607  619          }
 608  620  
 609  621          /* Is the local address a valid unicast address? */
 610  622          if (laddr_type == IPVL_BAD)
 611  623                  return (EADDRNOTAVAIL);
 612  624  
 613  625          connp->conn_bound_addr_v6 = v6addr;
 614  626          if (scopeid != 0) {
 615  627                  ixa->ixa_flags |= IXAF_SCOPEID_SET;
 616  628                  ixa->ixa_scopeid = scopeid;
 617  629                  connp->conn_incoming_ifindex = scopeid;
 618  630          } else {
 619  631                  ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 620  632                  connp->conn_incoming_ifindex = connp->conn_bound_if;
 621  633          }
 622  634  
 623  635          connp->conn_laddr_v6 = v6addr;
 624  636          connp->conn_saddr_v6 = v6addr;
 625  637  
 626  638          bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
 627  639  
 628  640          error = tcp_bind_select_lport(tcp, &requested_port,
  
    | 
      ↓ open down ↓ | 
    444 lines elided | 
    
      ↑ open up ↑ | 
  
 629  641              bind_to_req_port_only, cr);
 630  642          if (error != 0) {
 631  643                  connp->conn_laddr_v6 = ipv6_all_zeros;
 632  644                  connp->conn_saddr_v6 = ipv6_all_zeros;
 633  645                  connp->conn_bound_addr_v6 = ipv6_all_zeros;
 634  646          }
 635  647          return (error);
 636  648  }
 637  649  
 638  650  /*
 639      - * If the "bind_to_req_port_only" parameter is set, if the requested port
 640      - * number is available, return it, If not return 0
      651 + * If the "bind_to_req_port_only" parameter is set and the requested port
      652 + * number is available, return it (else return 0).
 641  653   *
 642      - * If "bind_to_req_port_only" parameter is not set and
 643      - * If the requested port number is available, return it.  If not, return
 644      - * the first anonymous port we happen across.  If no anonymous ports are
 645      - * available, return 0. addr is the requested local address, if any.
      654 + * If "bind_to_req_port_only" parameter is not set and the requested port
      655 + * number is available, return it.  If not, return the first anonymous port we
      656 + * happen across.  If no anonymous ports are available, return 0.
 646  657   *
 647  658   * In either case, when succeeding update the tcp_t to record the port number
 648  659   * and insert it in the bind hash table.
 649  660   *
 650  661   * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 651  662   * without setting SO_REUSEADDR. This is needed so that they
 652  663   * can be viewed as two independent transport protocols.
 653  664   */
 654  665  in_port_t
 655  666  tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 656  667      int reuseaddr, boolean_t quick_connect,
 657  668      boolean_t bind_to_req_port_only, boolean_t user_specified)
 658  669  {
 659  670          /* number of times we have run around the loop */
 660  671          int count = 0;
 661  672          /* maximum number of times to run around the loop */
 662  673          int loopmax;
 663  674          conn_t *connp = tcp->tcp_connp;
 664  675          tcp_stack_t     *tcps = tcp->tcp_tcps;
      676 +        boolean_t reuseport = connp->conn_reuseport;
 665  677  
 666  678          /*
 667  679           * Lookup for free addresses is done in a loop and "loopmax"
 668  680           * influences how long we spin in the loop
 669  681           */
 670  682          if (bind_to_req_port_only) {
 671  683                  /*
 672  684                   * If the requested port is busy, don't bother to look
 673  685                   * for a new one. Setting loop maximum count to 1 has
 674  686                   * that effect.
 675  687                   */
 676  688                  loopmax = 1;
 677  689          } else {
 678  690                  /*
 679  691                   * If the requested port is busy, look for a free one
 680  692                   * in the anonymous port range.
 681  693                   * Set loopmax appropriately so that one does not look
 682  694                   * forever in the case all of the anonymous ports are in use.
 683  695                   */
 684  696                  if (connp->conn_anon_priv_bind) {
 685  697                          /*
 686  698                           * loopmax =
 687  699                           *      (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
 688  700                           */
 689  701                          loopmax = IPPORT_RESERVED -
 690  702                              tcps->tcps_min_anonpriv_port;
 
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
 
 691  703                  } else {
 692  704                          loopmax = (tcps->tcps_largest_anon_port -
 693  705                              tcps->tcps_smallest_anon_port + 1);
 694  706                  }
 695  707          }
 696  708          do {
 697  709                  uint16_t        lport;
 698  710                  tf_t            *tbf;
 699  711                  tcp_t           *ltcp;
 700  712                  conn_t          *lconnp;
      713 +                boolean_t       attempt_reuse = B_FALSE;
 701  714  
 702  715                  lport = htons(port);
 703  716  
 704  717                  /*
 705  718                   * Ensure that the tcp_t is not currently in the bind hash.
 706  719                   * Hold the lock on the hash bucket to ensure that
 707  720                   * the duplicate check plus the insertion is an atomic
 708  721                   * operation.
 709  722                   *
 710  723                   * This function does an inline lookup on the bind hash list
 711  724                   * Make sure that we access only members of tcp_t
 712  725                   * and that we don't look at tcp_tcp, since we are not
 713  726                   * doing a CONN_INC_REF.
 714  727                   */
 715  728                  tcp_bind_hash_remove(tcp);
 716  729                  tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
 
    | 
      ↓ open down ↓ | 
    6 lines elided | 
    
      ↑ open up ↑ | 
 
 717  730                  mutex_enter(&tbf->tf_lock);
 718  731                  for (ltcp = tbf->tf_tcp; ltcp != NULL;
 719  732                      ltcp = ltcp->tcp_bind_hash) {
 720  733                          if (lport == ltcp->tcp_connp->conn_lport)
 721  734                                  break;
 722  735                  }
 723  736  
 724  737                  for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 725  738                          boolean_t not_socket;
 726  739                          boolean_t exclbind;
      740 +                        boolean_t addrmatch;
 727  741  
 728  742                          lconnp = ltcp->tcp_connp;
 729  743  
 730  744                          /*
 731  745                           * On a labeled system, we must treat bindings to ports
 732  746                           * on shared IP addresses by sockets with MAC exemption
 733  747                           * privilege as being in all zones, as there's
 734  748                           * otherwise no way to identify the right receiver.
 735  749                           */
 736  750                          if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
 737  751                                  continue;
 738  752  
 739  753                          /*
 740  754                           * If TCP_EXCLBIND is set for either the bound or
 741  755                           * binding endpoint, the semantics of bind
 742  756                           * is changed according to the following.
 743  757                           *
 744  758                           * spec = specified address (v4 or v6)
 745  759                           * unspec = unspecified address (v4 or v6)
 746  760                           * A = specified addresses are different for endpoints
 747  761                           *
 748  762                           * bound        bind to         allowed
 749  763                           * -------------------------------------
 750  764                           * unspec       unspec          no
 751  765                           * unspec       spec            no
 752  766                           * spec         unspec          no
 753  767                           * spec         spec            yes if A
 754  768                           *
 755  769                           * For labeled systems, SO_MAC_EXEMPT behaves the same
 756  770                           * as TCP_EXCLBIND, except that zoneid is ignored.
 757  771                           *
 758  772                           * Note:
 759  773                           *
 760  774                           * 1. Because of TLI semantics, an endpoint can go
 761  775                           * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
 762  776                           * TCPS_BOUND, depending on whether it is originally
 763  777                           * a listener or not.  That is why we need to check
 764  778                           * for states greater than or equal to TCPS_BOUND
 765  779                           * here.
 766  780                           *
 767  781                           * 2. Ideally, we should only check for state equals
 768  782                           * to TCPS_LISTEN. And the following check should be
 769  783                           * added.
 770  784                           *
 771  785                           * if (ltcp->tcp_state == TCPS_LISTEN ||
 772  786                           *      !reuseaddr || !lconnp->conn_reuseaddr) {
 773  787                           *              ...
 774  788                           * }
 775  789                           *
 776  790                           * The semantics will be changed to this.  If the
 777  791                           * endpoint on the list is in state not equal to
 778  792                           * TCPS_LISTEN and both endpoints have SO_REUSEADDR
 779  793                           * set, let the bind succeed.
 780  794                           *
 781  795                           * Because of (1), we cannot do that for TLI
 782  796                           * endpoints.  But we can do that for socket endpoints.
 783  797                           * If in future, we can change this going back
 784  798                           * semantics, we can use the above check for TLI also.
 785  799                           */
 786  800                          not_socket = !(TCP_IS_SOCKET(ltcp) &&
 787  801                              TCP_IS_SOCKET(tcp));
 788  802                          exclbind = lconnp->conn_exclbind ||
 789  803                              connp->conn_exclbind;
 790  804  
 791  805                          if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 792  806                              (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 793  807                              (exclbind && (not_socket ||
 794  808                              ltcp->tcp_state <= TCPS_ESTABLISHED))) {
 795  809                                  if (V6_OR_V4_INADDR_ANY(
 796  810                                      lconnp->conn_bound_addr_v6) ||
 797  811                                      V6_OR_V4_INADDR_ANY(*laddr) ||
 798  812                                      IN6_ARE_ADDR_EQUAL(laddr,
 799  813                                      &lconnp->conn_bound_addr_v6)) {
 800  814                                          break;
 801  815                                  }
 802  816                                  continue;
 803  817                          }
 804  818  
 805  819                          /*
 806  820                           * Check ipversion to allow IPv4 and IPv6 sockets to
 807  821                           * have disjoint port number spaces, if *_EXCLBIND
 808  822                           * is not set and only if the application binds to a
 809  823                           * specific port. We use the same autoassigned port
 810  824                           * number space for IPv4 and IPv6 sockets.
 811  825                           */
 812  826                          if (connp->conn_ipversion != lconnp->conn_ipversion &&
 813  827                              bind_to_req_port_only)
 814  828                                  continue;
 815  829  
 816  830                          /*
 817  831                           * Ideally, we should make sure that the source
 818  832                           * address, remote address, and remote port in the
 819  833                           * four tuple for this tcp-connection is unique.
 820  834                           * However, trying to find out the local source
 821  835                           * address would require too much code duplication
 
    | 
      ↓ open down ↓ | 
    85 lines elided | 
    
      ↑ open up ↑ | 
 
 822  836                           * with IP, since IP needs to have that code
 823  837                           * to support userland TCP implementations.
 824  838                           */
 825  839                          if (quick_connect &&
 826  840                              (ltcp->tcp_state > TCPS_LISTEN) &&
 827  841                              ((connp->conn_fport != lconnp->conn_fport) ||
 828  842                              !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 829  843                              &lconnp->conn_faddr_v6)))
 830  844                                  continue;
 831  845  
      846 +                        addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
      847 +                            &lconnp->conn_bound_addr_v6);
      848 +
      849 +                        if (addrmatch && reuseport && bind_to_req_port_only &&
      850 +                            (ltcp->tcp_state == TCPS_BOUND ||
      851 +                            ltcp->tcp_state == TCPS_LISTEN)) {
      852 +                                /*
      853 +                                 * This entry is bound to the exact same
      854 +                                 * address and port.  If SO_REUSEPORT is set on
      855 +                                 * the calling socket, attempt to reuse this
      856 +                                 * binding if it too had SO_REUSEPORT enabled
      857 +                                 * when it was bound.
      858 +                                 */
      859 +                                attempt_reuse = (ltcp->tcp_rg_bind != NULL);
      860 +                                break;
      861 +                        }
      862 +
 832  863                          if (!reuseaddr) {
 833  864                                  /*
 834      -                                 * No socket option SO_REUSEADDR.
 835      -                                 * If existing port is bound to
 836      -                                 * a non-wildcard IP address
 837      -                                 * and the requesting stream is
 838      -                                 * bound to a distinct
 839      -                                 * different IP addresses
 840      -                                 * (non-wildcard, also), keep
 841      -                                 * going.
      865 +                                 * No socket option SO_REUSEADDR.  If an
      866 +                                 * existing port is bound to a non-wildcard IP
      867 +                                 * address and the requesting stream is bound
      868 +                                 * to a distinct IP address
      869 +                                 * (non-wildcard, also), keep going.
 842  870                                   */
 843  871                                  if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 844  872                                      !V6_OR_V4_INADDR_ANY(
 845  873                                      lconnp->conn_bound_addr_v6) &&
 846      -                                    !IN6_ARE_ADDR_EQUAL(laddr,
 847      -                                    &lconnp->conn_bound_addr_v6))
      874 +                                    !addrmatch)
 848  875                                          continue;
 849  876                                  if (ltcp->tcp_state >= TCPS_BOUND) {
 850  877                                          /*
 851  878                                           * This port is being used and
 852  879                                           * its state is >= TCPS_BOUND,
 853  880                                           * so we can't bind to it.
 854  881                                           */
 855  882                                          break;
 856  883                                  }
 857  884                          } else {
 858  885                                  /*
 859  886                                   * socket option SO_REUSEADDR is set on the
 860  887                                   * binding tcp_t.
 861  888                                   *
 862      -                                 * If two streams are bound to
 863      -                                 * same IP address or both addr
 864      -                                 * and bound source are wildcards
 865      -                                 * (INADDR_ANY), we want to stop
 866      -                                 * searching.
 867      -                                 * We have found a match of IP source
 868      -                                 * address and source port, which is
 869      -                                 * refused regardless of the
 870      -                                 * SO_REUSEADDR setting, so we break.
      889 +                                 * If two streams are bound to the same IP
      890 +                                 * address or both addr and bound source are
      891 +                                 * wildcards (INADDR_ANY), we want to stop
      892 +                                 * searching.  We have found a match of IP
      893 +                                 * source address and source port, which is
      894 +                                 * refused regardless of the SO_REUSEADDR
      895 +                                 * setting, so we break.
 871  896                                   */
 872      -                                if (IN6_ARE_ADDR_EQUAL(laddr,
 873      -                                    &lconnp->conn_bound_addr_v6) &&
      897 +                                if (addrmatch &&
 874  898                                      (ltcp->tcp_state == TCPS_LISTEN ||
 875  899                                      ltcp->tcp_state == TCPS_BOUND))
 876  900                                          break;
 877  901                          }
 878  902                  }
 879      -                if (ltcp != NULL) {
      903 +                if (ltcp != NULL && !attempt_reuse) {
 880  904                          /* The port number is busy */
 881  905                          mutex_exit(&tbf->tf_lock);
 882  906                  } else {
      907 +                        if (attempt_reuse) {
      908 +                                int err;
      909 +                                struct tcp_rg_s *rg;
      910 +
      911 +                                ASSERT(ltcp != NULL);
      912 +                                ASSERT(ltcp->tcp_rg_bind != NULL);
      913 +                                ASSERT(tcp->tcp_rg_bind != NULL);
      914 +                                ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
      915 +
      916 +                                err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
      917 +                                if (err != 0) {
      918 +                                        mutex_exit(&tbf->tf_lock);
      919 +                                        return (0);  /* group join failed */
      920 +                                }
      921 +                                /*
      922 +                                 * Now that the newly-binding socket has joined
      923 +                                 * the existing reuseport group on ltcp, it
      924 +                                 * should clean up its own (empty) group.
      925 +                                 */
      926 +                                rg = tcp->tcp_rg_bind;
      927 +                                tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
      928 +                                VERIFY(tcp_rg_remove(rg, tcp));
      929 +                                tcp_rg_destroy(rg);
      930 +                        }
      931 +
 883  932                          /*
 884  933                           * This port is ours. Insert in fanout and mark as
 885  934                           * bound to prevent others from getting the port
 886  935                           * number.
 887  936                           */
 888  937                          tcp->tcp_state = TCPS_BOUND;
 889  938                          DTRACE_TCP6(state__change, void, NULL,
 890  939                              ip_xmit_attr_t *, connp->conn_ixa,
 891  940                              void, NULL, tcp_t *, tcp, void, NULL,
 892  941                              int32_t, TCPS_IDLE);
 893  942  
 894  943                          connp->conn_lport = htons(port);
 895  944  
 896  945                          ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
 897  946                              connp->conn_lport)] == tbf);
 898  947                          tcp_bind_hash_insert(tbf, tcp, 1);
 899  948  
 900  949                          mutex_exit(&tbf->tf_lock);
 901  950  
 902  951                          /*
 903  952                           * We don't want tcp_next_port_to_try to "inherit"
 904  953                           * a port number supplied by the user in a bind.
 905  954                           */
 906  955                          if (user_specified)
 907  956                                  return (port);
 908  957  
 909  958                          /*
 910  959                           * This is the only place where tcp_next_port_to_try
 911  960                           * is updated. After the update, it may or may not
 912  961                           * be in the valid range.
 913  962                           */
 914  963                          if (!connp->conn_anon_priv_bind)
 915  964                                  tcps->tcps_next_port_to_try = port + 1;
 916  965                          return (port);
 917  966                  }
 918  967  
 919  968                  if (connp->conn_anon_priv_bind) {
 920  969                          port = tcp_get_next_priv_port(tcp);
 921  970                  } else {
 922  971                          if (count == 0 && user_specified) {
 923  972                                  /*
 924  973                                   * We may have to return an anonymous port. So
 925  974                                   * get one to start with.
 926  975                                   */
 927  976                                  port =
 928  977                                      tcp_update_next_port(
 929  978                                      tcps->tcps_next_port_to_try,
 930  979                                      tcp, B_TRUE);
 931  980                                  user_specified = B_FALSE;
 932  981                          } else {
 933  982                                  port = tcp_update_next_port(port + 1, tcp,
 934  983                                      B_FALSE);
 935  984                          }
 
    | 
      ↓ open down ↓ | 
    43 lines elided | 
    
      ↑ open up ↑ | 
 
 936  985                  }
 937  986                  if (port == 0)
 938  987                          break;
 939  988  
 940  989                  /*
 941  990                   * Don't let this loop run forever in the case where
 942  991                   * all of the anonymous ports are in use.
 943  992                   */
 944  993          } while (++count < loopmax);
 945  994          return (0);     /* no port could be bound */
      995 +}
      996 +
      997 +/* Max number of members in TCP SO_REUSEPORT group */
      998 +#define TCP_RG_SIZE_MAX         64
      999 +/* Step size when expanding members array */
     1000 +#define TCP_RG_SIZE_STEP        2
     1001 +
     1002 +/*
     1003 + * Create a SO_REUSEPORT group whose only member is "tcp".  The member array
     1004 + * starts out with two slots, and the new group has a member count and an
     1005 + * active count of one.  Both allocations use KM_NOSLEEP and may fail, in
     1006 + * which case nothing is leaked and NULL is returned.
     1007 + */
     1008 +tcp_rg_t *
     1009 +tcp_rg_init(tcp_t *tcp)
     1010 +{
     1011 +        const int kmflags = KM_NOSLEEP|KM_NORMALPRI;
     1012 +        const unsigned int nslots = 2;
     1013 +        tcp_rg_t *newrg;
     1014 +        tcp_t **slots;
     1015 +
     1016 +        newrg = kmem_alloc(sizeof (tcp_rg_t), kmflags);
     1017 +        if (newrg == NULL)
     1018 +                return (NULL);
     1019 +
     1020 +        slots = kmem_zalloc(nslots * sizeof (tcp_t *), kmflags);
     1021 +        if (slots == NULL) {
     1022 +                kmem_free(newrg, sizeof (tcp_rg_t));
     1023 +                return (NULL);
     1024 +        }
     1025 +        slots[0] = tcp;
     1026 +
     1027 +        /* The group itself was not zeroed, so set every field explicitly. */
     1028 +        mutex_init(&newrg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
     1029 +        newrg->tcprg_members = slots;
     1030 +        newrg->tcprg_size = nslots;
     1031 +        newrg->tcprg_count = 1;
     1032 +        newrg->tcprg_active = 1;
     1033 +        return (newrg);
     1034 +}
     1024 +
     1025 +/*
     1026 + * Free a SO_REUSEPORT group: its member array, its lock and the group
     1027 + * structure itself.  The group is expected to be empty; both the member
     1028 + * count and the active count are asserted to be zero.
     1029 + */
     1030 +void
     1031 +tcp_rg_destroy(tcp_rg_t *rg)
     1032 +{
     1033 +        mutex_enter(&rg->tcprg_lock);
     1034 +        ASSERT(rg->tcprg_count == 0);
     1035 +        ASSERT(rg->tcprg_active == 0);
     1036 +        /*
     1037 +         * Release the lock before tearing it down: mutex_destroy() should
     1038 +         * be called on a mutex that is not held.  NOTE(review): this relies
     1039 +         * on no other thread still referencing an empty group -- confirm
     1040 +         * against the callers.
     1041 +         */
     1042 +        mutex_exit(&rg->tcprg_lock);
     1043 +
     1044 +        kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
     1045 +        mutex_destroy(&rg->tcprg_lock);
     1046 +        /* Same size expression as the allocation in tcp_rg_init() */
     1047 +        kmem_free(rg, sizeof (tcp_rg_t));
     1048 +}
     1035 +
     1036 +/*
     1037 + * Add "tcp" to the SO_REUSEPORT group "rg".
     1038 + *
     1039 + * Returns 0 on success, or:
     1040 + *   EPERM  - the group is non-empty and the uid or zoneid of the new
     1041 + *            connection's credential differs from that of the first member;
     1042 + *   EINVAL - the member array is full and growing it would exceed
     1043 + *            TCP_RG_SIZE_MAX;
     1044 + *   ENOMEM - the larger member array could not be allocated.
     1045 + */
     1046 +static int
     1047 +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
     1048 +{
     1049 +        int rc = 0;
     1050 +
     1051 +        mutex_enter(&rg->tcprg_lock);
     1052 +
     1053 +        VERIFY(rg->tcprg_size > 0);
     1054 +        VERIFY(rg->tcprg_count <= rg->tcprg_size);
     1055 +
     1056 +        if (rg->tcprg_count != 0) {
     1057 +                cred_t *grpcr = rg->tcprg_members[0]->tcp_connp->conn_cred;
     1058 +                cred_t *ourcr = tcp->tcp_connp->conn_cred;
     1059 +
     1060 +                if (crgetuid(grpcr) != crgetuid(ourcr) ||
     1061 +                    crgetzoneid(grpcr) != crgetzoneid(ourcr)) {
     1062 +                        rc = EPERM;
     1063 +                        goto done;
     1064 +                }
     1065 +        }
     1066 +
     1067 +        if (rg->tcprg_count == rg->tcprg_size) {
     1068 +                /* Array is full: grow it by one step, up to the maximum. */
     1069 +                unsigned int curbytes = rg->tcprg_size * sizeof (tcp_t *);
     1070 +                unsigned int grown = rg->tcprg_size + TCP_RG_SIZE_STEP;
     1071 +                tcp_t **bigger;
     1072 +
     1073 +                if (grown > TCP_RG_SIZE_MAX) {
     1074 +                        rc = EINVAL;
     1075 +                        goto done;
     1076 +                }
     1077 +                bigger = kmem_zalloc(grown * sizeof (tcp_t *),
     1078 +                    KM_NOSLEEP|KM_NORMALPRI);
     1079 +                if (bigger == NULL) {
     1080 +                        rc = ENOMEM;
     1081 +                        goto done;
     1082 +                }
     1083 +                bcopy(rg->tcprg_members, bigger, curbytes);
     1084 +                kmem_free(rg->tcprg_members, curbytes);
     1085 +                rg->tcprg_members = bigger;
     1086 +                rg->tcprg_size = grown;
     1087 +        }
     1088 +
     1089 +        /* Append the new member; it counts as active from the start. */
     1090 +        rg->tcprg_members[rg->tcprg_count++] = tcp;
     1091 +        rg->tcprg_active++;
     1092 +
     1093 +done:
     1094 +        mutex_exit(&rg->tcprg_lock);
     1095 +        return (rc);
     1096 +}
     1082 +
     1083 +/*
     1084 + * Remove "tcp" from the SO_REUSEPORT group "rg", filling the vacated slot
     1085 + * with the last member so the array stays densely packed.  The active count
     1086 + * is dropped as well when the departing connection has conn_reuseport set.
     1087 + * Returns B_TRUE when the group has no members left afterwards.
     1088 + *
     1089 + * Callers are expected to pass a tcp_t that is a member of the group, and
     1090 + * this is asserted.  Should it not be found anyway, the group is left
     1091 + * untouched rather than dropping an unrelated member or writing past the
     1092 + * end of the member array.
     1093 + */
     1094 +boolean_t
     1095 +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
     1096 +{
     1097 +        int i;
     1098 +        boolean_t is_empty;
     1099 +
     1100 +        mutex_enter(&rg->tcprg_lock);
     1101 +        for (i = 0; i < rg->tcprg_count; i++) {
     1102 +                if (rg->tcprg_members[i] == tcp)
     1103 +                        break;
     1104 +        }
     1105 +        /* The item should be present */
     1106 +        ASSERT(i < rg->tcprg_count);
     1107 +        if (i < rg->tcprg_count) {
     1108 +                /* Move the last member into this position */
     1109 +                rg->tcprg_count--;
     1110 +                rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
     1111 +                rg->tcprg_members[rg->tcprg_count] = NULL;
     1112 +                if (tcp->tcp_connp->conn_reuseport != 0)
     1113 +                        rg->tcprg_active--;
     1114 +        }
     1115 +        is_empty = (rg->tcprg_count == 0);
     1116 +        mutex_exit(&rg->tcprg_lock);
     1117 +        return (is_empty);
     1118 +}
     1106 +
     1107 +/*
     1108 + * Adjust the active-member count of the group "rg" by one, under the group
     1109 + * lock: up when "is_active" is true, down otherwise.
     1110 + */
     1111 +void
     1112 +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
     1113 +{
     1114 +        mutex_enter(&rg->tcprg_lock);
     1115 +        if (!is_active)
     1116 +                rg->tcprg_active--;
     1117 +        else
     1118 +                rg->tcprg_active++;
     1119 +        mutex_exit(&rg->tcprg_lock);
 946 1120  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX