Print this page
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.

  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/strsun.h>
  30 #include <sys/strsubr.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strlog.h>
  33 #define _SUN_TPI_VERSION 2
  34 #include <sys/tihdr.h>
  35 #include <sys/suntpi.h>
  36 #include <sys/xti_inet.h>
  37 #include <sys/policy.h>
  38 #include <sys/squeue_impl.h>
  39 #include <sys/squeue.h>
  40 #include <sys/tsol/tnet.h>
  41 
  42 #include <rpc/pmap_prot.h>
  43 
  44 #include <inet/common.h>
  45 #include <inet/ip.h>
  46 #include <inet/tcp.h>
  47 #include <inet/tcp_impl.h>
  48 #include <inet/proto_set.h>
  49 #include <inet/ipsec_impl.h>
  50 
  51 /* Settable in /etc/system */
  52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  53 static uint32_t tcp_random_anon_port = 1;
  54 
  55 static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  56                     cred_t *cr);
  57 static in_port_t        tcp_get_next_priv_port(const tcp_t *);

  58 
  59 /*
  60  * Hash list insertion routine for tcp_t structures. Each hash bucket
  61  * contains a list of tcp_t entries, and each entry is bound to a unique
  62  * port. If there are multiple tcp_t's that are bound to the same port, then
  63  * one of them will be linked into the hash bucket list, and the rest will
  64  * hang off of that one entry. For each port, entries bound to a specific IP
  65  * address will be inserted before those bound to INADDR_ANY.
  66  */
  67 void
  68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
  69 {
  70         tcp_t   **tcpp;
  71         tcp_t   *tcpnext;
  72         tcp_t   *tcphash;
  73         conn_t  *connp = tcp->tcp_connp;
  74         conn_t  *connext;
  75 
  76         if (tcp->tcp_ptpbhn != NULL) {
  77                 ASSERT(!caller_holds_lock);


 155 tcp_bind_hash_remove(tcp_t *tcp)
 156 {
 157         tcp_t   *tcpnext;
 158         kmutex_t *lockp;
 159         tcp_stack_t     *tcps = tcp->tcp_tcps;
 160         conn_t          *connp = tcp->tcp_connp;
 161 
             /*
              * tcp_ptpbhn is reset to NULL at the end of a removal (see below),
              * so a NULL value here means this tcp_t is not linked into the
              * bind hash and there is nothing to do.  This first test is made
              * without holding the bucket lock.
              */
 162         if (tcp->tcp_ptpbhn == NULL)
 163                 return;
 164 
 165         /*
 166          * Extract the lock pointer in case there are concurrent
 167          * hash_remove's for this instance.
 168          */
 169         ASSERT(connp->conn_lport != 0);
 170         lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
 171             connp->conn_lport)].tf_lock;
 172 
 173         ASSERT(lockp != NULL);
 174         mutex_enter(lockp);










             /*
              * Re-test under the bucket lock: the check at the top of the
              * function was unlocked, so the entry may already have been
              * unlinked by the time the lock was acquired.
              */
 175         if (tcp->tcp_ptpbhn) {
 176                 tcpnext = tcp->tcp_bind_hash_port;
 177                 if (tcpnext != NULL) {
                             /*
                              * Another tcp_t hangs off this one through
                              * tcp_bind_hash_port.  It takes over this entry's
                              * position: it inherits both the back pointer
                              * and the tcp_bind_hash successor.
                              */
 178                         tcp->tcp_bind_hash_port = NULL;
 179                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 180                         tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
 181                         if (tcpnext->tcp_bind_hash != NULL) {
                                     /*
                                      * The inherited successor must now point
                                      * back at the replacement's link field
                                      * rather than at the departing entry's.
                                      */
 182                                 tcpnext->tcp_bind_hash->tcp_ptpbhn =
 183                                     &(tcpnext->tcp_bind_hash);
 184                                 tcp->tcp_bind_hash = NULL;
 185                         }
 186                 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
                             /*
                              * Nothing hangs off this entry, but it has a
                              * tcp_bind_hash successor: splice that successor
                              * to whatever was pointing at this entry.
                              */
 187                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 188                         tcp->tcp_bind_hash = NULL;
 189                 }
                     /*
                      * Redirect the link that referenced tcp to its replacement
                      * (NULL when there was neither a same-port follower nor a
                      * successor), then mark tcp as no longer hashed.
                      */
 190                 *tcp->tcp_ptpbhn = tcpnext;
 191                 tcp->tcp_ptpbhn = NULL;
 192         }
 193         mutex_exit(lockp);
 194 }


 619                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 620                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 621         }
 622 
 623         connp->conn_laddr_v6 = v6addr;
 624         connp->conn_saddr_v6 = v6addr;
 625 
 626         bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
 627 
 628         error = tcp_bind_select_lport(tcp, &requested_port,
 629             bind_to_req_port_only, cr);
 630         if (error != 0) {
 631                 connp->conn_laddr_v6 = ipv6_all_zeros;
 632                 connp->conn_saddr_v6 = ipv6_all_zeros;
 633                 connp->conn_bound_addr_v6 = ipv6_all_zeros;
 634         }
 635         return (error);
 636 }
 637 
 638 /*
 639  * If the "bind_to_req_port_only" parameter is set and the requested port
 640  * number is available, return it; if not, return 0.
 641  *
 642  * If "bind_to_req_port_only" parameter is not set and
 643  * If the requested port number is available, return it.  If not, return
 644  * the first anonymous port we happen across.  If no anonymous ports are
 645  * available, return 0. addr is the requested local address, if any.
 646  *
 647  * In either case, when succeeding update the tcp_t to record the port number
 648  * and insert it in the bind hash table.
 649  *
 650  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 651  * without setting SO_REUSEADDR. This is needed so that they
 652  * can be viewed as two independent transport protocols.
 653  */
 654 in_port_t
 655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 656     int reuseaddr, boolean_t quick_connect,
 657     boolean_t bind_to_req_port_only, boolean_t user_specified)
 658 {
 659         /* number of times we have run around the loop */
 660         int count = 0;
 661         /* maximum number of times to run around the loop */
 662         int loopmax;
 663         conn_t *connp = tcp->tcp_connp;
 664         tcp_stack_t     *tcps = tcp->tcp_tcps;

 665 
 666         /*
 667          * Lookup for free addresses is done in a loop and "loopmax"
 668          * influences how long we spin in the loop
 669          */
 670         if (bind_to_req_port_only) {
 671                 /*
 672                  * If the requested port is busy, don't bother to look
 673                  * for a new one. Setting loop maximum count to 1 has
 674                  * that effect.
 675                  */
 676                 loopmax = 1;
 677         } else {
 678                 /*
 679                  * If the requested port is busy, look for a free one
 680                  * in the anonymous port range.
 681                  * Set loopmax appropriately so that one does not look
 682                  * forever in the case all of the anonymous ports are in use.
 683                  */
 684                 if (connp->conn_anon_priv_bind) {
 685                         /*
 686                          * loopmax =
 687                          *      (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
 688                          */
 689                         loopmax = IPPORT_RESERVED -
 690                             tcps->tcps_min_anonpriv_port;
 691                 } else {
 692                         loopmax = (tcps->tcps_largest_anon_port -
 693                             tcps->tcps_smallest_anon_port + 1);
 694                 }
 695         }
 696         do {
 697                 uint16_t        lport;
 698                 tf_t            *tbf;
 699                 tcp_t           *ltcp;
 700                 conn_t          *lconnp;

 701 
 702                 lport = htons(port);
 703 
 704                 /*
 705                  * Ensure that the tcp_t is not currently in the bind hash.
 706                  * Hold the lock on the hash bucket to ensure that
 707                  * the duplicate check plus the insertion is an atomic
 708                  * operation.
 709                  *
 710                  * This function does an inline lookup on the bind hash list
 711                  * Make sure that we access only members of tcp_t
 712                  * and that we don't look at tcp_tcp, since we are not
 713                  * doing a CONN_INC_REF.
 714                  */
 715                 tcp_bind_hash_remove(tcp);
 716                 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
 717                 mutex_enter(&tbf->tf_lock);
 718                 for (ltcp = tbf->tf_tcp; ltcp != NULL;
 719                     ltcp = ltcp->tcp_bind_hash) {
 720                         if (lport == ltcp->tcp_connp->conn_lport)
 721                                 break;
 722                 }
 723 
 724                 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 725                         boolean_t not_socket;
 726                         boolean_t exclbind;

 727 
 728                         lconnp = ltcp->tcp_connp;
 729 
 730                         /*
 731                          * On a labeled system, we must treat bindings to ports
 732                          * on shared IP addresses by sockets with MAC exemption
 733                          * privilege as being in all zones, as there's
 734                          * otherwise no way to identify the right receiver.
 735                          */
 736                         if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
 737                                 continue;
 738 
 739                         /*
 740                          * If TCP_EXCLBIND is set for either the bound or
 741                          * binding endpoint, the semantics of bind
 742                          * is changed according to the following.
 743                          *
 744                          * spec = specified address (v4 or v6)
 745                          * unspec = unspecified address (v4 or v6)
 746                          * A = specified addresses are different for endpoints


 812                         if (connp->conn_ipversion != lconnp->conn_ipversion &&
 813                             bind_to_req_port_only)
 814                                 continue;
 815 
 816                         /*
 817                          * Ideally, we should make sure that the source
 818                          * address, remote address, and remote port in the
 819                          * four tuple for this tcp-connection is unique.
 820                          * However, trying to find out the local source
 821                          * address would require too much code duplication
 822                          * with IP, since IP needs needs to have that code
 823                          * to support userland TCP implementations.
 824                          */
 825                         if (quick_connect &&
 826                             (ltcp->tcp_state > TCPS_LISTEN) &&
 827                             ((connp->conn_fport != lconnp->conn_fport) ||
 828                             !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 829                             &lconnp->conn_faddr_v6)))
 830                                 continue;
 831 

















 832                         if (!reuseaddr) {
 833                                 /*
 834                                  * No socket option SO_REUSEADDR.
 835                                  * If existing port is bound to
 836                                  * a non-wildcard IP address
 837                                  * and the requesting stream is
 838                                  * bound to a distinct
 839                                  * different IP addresses
 840                                  * (non-wildcard, also), keep
 841                                  * going.
 842                                  */
 843                                 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 844                                     !V6_OR_V4_INADDR_ANY(
 845                                     lconnp->conn_bound_addr_v6) &&
 846                                     !IN6_ARE_ADDR_EQUAL(laddr,
 847                                     &lconnp->conn_bound_addr_v6))
 848                                         continue;
 849                                 if (ltcp->tcp_state >= TCPS_BOUND) {
 850                                         /*
 851                                          * This port is being used and
 852                                          * its state is >= TCPS_BOUND,
 853                                          * so we can't bind to it.
 854                                          */
 855                                         break;
 856                                 }
 857                         } else {
 858                                 /*
 859                                  * socket option SO_REUSEADDR is set on the
 860                                  * binding tcp_t.
 861                                  *
 862                                  * If two streams are bound to
 863                                  * same IP address or both addr
 864                                  * and bound source are wildcards
 865                                  * (INADDR_ANY), we want to stop
 866                                  * searching.
 867                                  * We have found a match of IP source
 868                                  * address and source port, which is
 869                                  * refused regardless of the
 870                                  * SO_REUSEADDR setting, so we break.
 871                                  */
 872                                 if (IN6_ARE_ADDR_EQUAL(laddr,
 873                                     &lconnp->conn_bound_addr_v6) &&
 874                                     (ltcp->tcp_state == TCPS_LISTEN ||
 875                                     ltcp->tcp_state == TCPS_BOUND))
 876                                         break;
 877                         }
 878                 }
 879                 if (ltcp != NULL) {
 880                         /* The port number is busy */
 881                         mutex_exit(&tbf->tf_lock);
 882                 } else {














 883                         /*











 884                          * This port is ours. Insert in fanout and mark as
 885                          * bound to prevent others from getting the port
 886                          * number.
 887                          */
 888                         tcp->tcp_state = TCPS_BOUND;
 889                         DTRACE_TCP6(state__change, void, NULL,
 890                             ip_xmit_attr_t *, connp->conn_ixa,
 891                             void, NULL, tcp_t *, tcp, void, NULL,
 892                             int32_t, TCPS_IDLE);
 893 
 894                         connp->conn_lport = htons(port);
 895 
 896                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
 897                             connp->conn_lport)] == tbf);
 898                         tcp_bind_hash_insert(tbf, tcp, 1);
 899 
 900                         mutex_exit(&tbf->tf_lock);
 901 
 902                         /*
 903                          * We don't want tcp_next_port_to_try to "inherit"


 926                                  */
 927                                 port =
 928                                     tcp_update_next_port(
 929                                     tcps->tcps_next_port_to_try,
 930                                     tcp, B_TRUE);
 931                                 user_specified = B_FALSE;
 932                         } else {
 933                                 port = tcp_update_next_port(port + 1, tcp,
 934                                     B_FALSE);
 935                         }
 936                 }
 937                 if (port == 0)
 938                         break;
 939 
 940                 /*
 941                  * Don't let this loop run forever in the case where
 942                  * all of the anonymous ports are in use.
 943                  */
 944         } while (++count < loopmax);
 945         return (0);


























































































































 946 }


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/suntpi.h>
  37 #include <sys/xti_inet.h>
  38 #include <sys/policy.h>
  39 #include <sys/squeue_impl.h>
  40 #include <sys/squeue.h>
  41 #include <sys/tsol/tnet.h>
  42 
  43 #include <rpc/pmap_prot.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>
  47 #include <inet/tcp.h>
  48 #include <inet/tcp_impl.h>
  49 #include <inet/proto_set.h>
  50 #include <inet/ipsec_impl.h>
  51 
  52 /* Settable in /etc/system */
  53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  54 static uint32_t tcp_random_anon_port = 1;
  55 
  56 static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  57                     cred_t *cr);
  58 static in_port_t        tcp_get_next_priv_port(const tcp_t *);
  59 static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
  60 
  61 /*
  62  * Hash list insertion routine for tcp_t structures. Each hash bucket
  63  * contains a list of tcp_t entries, and each entry is bound to a unique
  64  * port. If there are multiple tcp_t's that are bound to the same port, then
  65  * one of them will be linked into the hash bucket list, and the rest will
  66  * hang off of that one entry. For each port, entries bound to a specific IP
  67  * address will be inserted before those bound to INADDR_ANY.
  68  */
  69 void
  70 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
  71 {
  72         tcp_t   **tcpp;
  73         tcp_t   *tcpnext;
  74         tcp_t   *tcphash;
  75         conn_t  *connp = tcp->tcp_connp;
  76         conn_t  *connext;
  77 
  78         if (tcp->tcp_ptpbhn != NULL) {
  79                 ASSERT(!caller_holds_lock);


 157 tcp_bind_hash_remove(tcp_t *tcp)
 158 {
 159         tcp_t   *tcpnext;
 160         kmutex_t *lockp;
 161         tcp_stack_t     *tcps = tcp->tcp_tcps;
 162         conn_t          *connp = tcp->tcp_connp;
 163 
             /*
              * tcp_ptpbhn is reset to NULL at the end of a removal (see below),
              * so a NULL value here means this tcp_t is not linked into the
              * bind hash and there is nothing to do.  This first test is made
              * without holding the bucket lock.
              */
 164         if (tcp->tcp_ptpbhn == NULL)
 165                 return;
 166 
 167         /*
 168          * Extract the lock pointer in case there are concurrent
 169          * hash_remove's for this instance.
 170          */
 171         ASSERT(connp->conn_lport != 0);
 172         lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
 173             connp->conn_lport)].tf_lock;
 174 
 175         ASSERT(lockp != NULL);
 176         mutex_enter(lockp);
 177 
 178         /* destroy any association with SO_REUSEPORT group */
             /*
              * NOTE(review): a nonzero return from tcp_rg_remove() is treated
              * as "the group is now empty" and triggers tcp_rg_destroy();
              * confirm against tcp_rg_remove()'s definition.  The group is
              * dropped while the bind-hash bucket lock is held.
              */
 179         if (tcp->tcp_rg_bind != NULL) {
 180                 if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
 181                         /* Last one out turns off the lights */
 182                         tcp_rg_destroy(tcp->tcp_rg_bind);
 183                 }
 184                 tcp->tcp_rg_bind = NULL;
 185         }
 186 
             /*
              * Re-test under the bucket lock: the check at the top of the
              * function was unlocked, so the entry may already have been
              * unlinked by the time the lock was acquired.
              */
 187         if (tcp->tcp_ptpbhn) {
 188                 tcpnext = tcp->tcp_bind_hash_port;
 189                 if (tcpnext != NULL) {
                             /*
                              * Another tcp_t hangs off this one through
                              * tcp_bind_hash_port.  It takes over this entry's
                              * position: it inherits both the back pointer
                              * and the tcp_bind_hash successor.
                              */
 190                         tcp->tcp_bind_hash_port = NULL;
 191                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 192                         tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
 193                         if (tcpnext->tcp_bind_hash != NULL) {
                                     /*
                                      * The inherited successor must now point
                                      * back at the replacement's link field
                                      * rather than at the departing entry's.
                                      */
 194                                 tcpnext->tcp_bind_hash->tcp_ptpbhn =
 195                                     &(tcpnext->tcp_bind_hash);
 196                                 tcp->tcp_bind_hash = NULL;
 197                         }
 198                 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
                             /*
                              * Nothing hangs off this entry, but it has a
                              * tcp_bind_hash successor: splice that successor
                              * to whatever was pointing at this entry.
                              */
 199                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 200                         tcp->tcp_bind_hash = NULL;
 201                 }
                     /*
                      * Redirect the link that referenced tcp to its replacement
                      * (NULL when there was neither a same-port follower nor a
                      * successor), then mark tcp as no longer hashed.
                      */
 202                 *tcp->tcp_ptpbhn = tcpnext;
 203                 tcp->tcp_ptpbhn = NULL;
 204         }
 205         mutex_exit(lockp);
 206 }


 631                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 632                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 633         }
 634 
 635         connp->conn_laddr_v6 = v6addr;
 636         connp->conn_saddr_v6 = v6addr;
 637 
 638         bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
 639 
 640         error = tcp_bind_select_lport(tcp, &requested_port,
 641             bind_to_req_port_only, cr);
 642         if (error != 0) {
 643                 connp->conn_laddr_v6 = ipv6_all_zeros;
 644                 connp->conn_saddr_v6 = ipv6_all_zeros;
 645                 connp->conn_bound_addr_v6 = ipv6_all_zeros;
 646         }
 647         return (error);
 648 }
 649 
 650 /*
 651  * If the "bind_to_req_port_only" parameter is set and the requested port
 652  * number is available, return it (else return 0).
 653  *
 654  * If "bind_to_req_port_only" parameter is not set and the requested port
 655  * number is available, return it.  If not, return the first anonymous port we
 656  * happen across.  If no anonymous ports are available, return 0.

 657  *
 658  * In either case, when succeeding update the tcp_t to record the port number
 659  * and insert it in the bind hash table.
 660  *
 661  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 662  * without setting SO_REUSEADDR. This is needed so that they
 663  * can be viewed as two independent transport protocols.
 664  */
 665 in_port_t
 666 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 667     int reuseaddr, boolean_t quick_connect,
 668     boolean_t bind_to_req_port_only, boolean_t user_specified)
 669 {
 670         /* number of times we have run around the loop */
 671         int count = 0;
 672         /* maximum number of times to run around the loop */
 673         int loopmax;
 674         conn_t *connp = tcp->tcp_connp;
 675         tcp_stack_t     *tcps = tcp->tcp_tcps;
 676         boolean_t reuseport = connp->conn_reuseport;
 677 
 678         /*
 679          * Lookup for free addresses is done in a loop and "loopmax"
 680          * influences how long we spin in the loop
 681          */
 682         if (bind_to_req_port_only) {
 683                 /*
 684                  * If the requested port is busy, don't bother to look
 685                  * for a new one. Setting loop maximum count to 1 has
 686                  * that effect.
 687                  */
 688                 loopmax = 1;
 689         } else {
 690                 /*
 691                  * If the requested port is busy, look for a free one
 692                  * in the anonymous port range.
 693                  * Set loopmax appropriately so that one does not look
 694                  * forever in the case all of the anonymous ports are in use.
 695                  */
 696                 if (connp->conn_anon_priv_bind) {
 697                         /*
 698                          * loopmax =
 699                          *      (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
 700                          */
 701                         loopmax = IPPORT_RESERVED -
 702                             tcps->tcps_min_anonpriv_port;
 703                 } else {
 704                         loopmax = (tcps->tcps_largest_anon_port -
 705                             tcps->tcps_smallest_anon_port + 1);
 706                 }
 707         }
 708         do {
 709                 uint16_t        lport;
 710                 tf_t            *tbf;
 711                 tcp_t           *ltcp;
 712                 conn_t          *lconnp;
 713                 boolean_t       attempt_reuse = B_FALSE;
 714 
 715                 lport = htons(port);
 716 
 717                 /*
 718                  * Ensure that the tcp_t is not currently in the bind hash.
 719                  * Hold the lock on the hash bucket to ensure that
 720                  * the duplicate check plus the insertion is an atomic
 721                  * operation.
 722                  *
 723                  * This function does an inline lookup on the bind hash list
 724                  * Make sure that we access only members of tcp_t
 725                  * and that we don't look at tcp_tcp, since we are not
 726                  * doing a CONN_INC_REF.
 727                  */
 728                 tcp_bind_hash_remove(tcp);
 729                 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
 730                 mutex_enter(&tbf->tf_lock);
 731                 for (ltcp = tbf->tf_tcp; ltcp != NULL;
 732                     ltcp = ltcp->tcp_bind_hash) {
 733                         if (lport == ltcp->tcp_connp->conn_lport)
 734                                 break;
 735                 }
 736 
 737                 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 738                         boolean_t not_socket;
 739                         boolean_t exclbind;
 740                         boolean_t addrmatch;
 741 
 742                         lconnp = ltcp->tcp_connp;
 743 
 744                         /*
 745                          * On a labeled system, we must treat bindings to ports
 746                          * on shared IP addresses by sockets with MAC exemption
 747                          * privilege as being in all zones, as there's
 748                          * otherwise no way to identify the right receiver.
 749                          */
 750                         if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
 751                                 continue;
 752 
 753                         /*
 754                          * If TCP_EXCLBIND is set for either the bound or
 755                          * binding endpoint, the semantics of bind
 756                          * is changed according to the following.
 757                          *
 758                          * spec = specified address (v4 or v6)
 759                          * unspec = unspecified address (v4 or v6)
 760                          * A = specified addresses are different for endpoints


 826                         if (connp->conn_ipversion != lconnp->conn_ipversion &&
 827                             bind_to_req_port_only)
 828                                 continue;
 829 
 830                         /*
 831                          * Ideally, we should make sure that the source
 832                          * address, remote address, and remote port in the
 833                          * four tuple for this tcp-connection is unique.
 834                          * However, trying to find out the local source
 835                          * address would require too much code duplication
 836                          * with IP, since IP needs to have that code
 837                          * to support userland TCP implementations.
 838                          */
 839                         if (quick_connect &&
 840                             (ltcp->tcp_state > TCPS_LISTEN) &&
 841                             ((connp->conn_fport != lconnp->conn_fport) ||
 842                             !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 843                             &lconnp->conn_faddr_v6)))
 844                                 continue;
 845 
 846                         addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
 847                             &lconnp->conn_bound_addr_v6);
 848 
 849                         if (addrmatch && reuseport && bind_to_req_port_only &&
 850                             (ltcp->tcp_state == TCPS_BOUND ||
 851                             ltcp->tcp_state == TCPS_LISTEN)) {
 852                                 /*
 853                                  * This entry is bound to the exact same
 854                                  * address and port.  If SO_REUSEPORT is set on
 855                                  * the calling socket, attempt to reuse this
 856                                  * binding if it too had SO_REUSEPORT enabled
 857                                  * when it was bound.
 858                                  */
 859                                 attempt_reuse = (ltcp->tcp_rg_bind != NULL);
 860                                 break;
 861                         }
 862 
 863                         if (!reuseaddr) {
 864                                 /*
 865                                  * No socket option SO_REUSEADDR.  If an
 866                                  * existing port is bound to a non-wildcard IP
 867                                  * address and the requesting stream is bound
 868                                  * to a distinct different IP address
 869                                  * (non-wildcard, also), keep going.



 870                                  */
 871                                 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 872                                     !V6_OR_V4_INADDR_ANY(
 873                                     lconnp->conn_bound_addr_v6) &&
 874                                     !addrmatch)

 875                                         continue;
 876                                 if (ltcp->tcp_state >= TCPS_BOUND) {
 877                                         /*
 878                                          * This port is being used and
 879                                          * its state is >= TCPS_BOUND,
 880                                          * so we can't bind to it.
 881                                          */
 882                                         break;
 883                                 }
 884                         } else {
 885                                 /*
 886                                  * socket option SO_REUSEADDR is set on the
 887                                  * binding tcp_t.
 888                                  *
 889                                  * If two streams are bound to the same IP
 890                                  * address or both addr and bound source are
 891                                  * wildcards (INADDR_ANY), we want to stop
 892                                  * searching.  We have found a match of IP
 893                                  * source address and source port, which is
 894                                  * refused regardless of the SO_REUSEADDR
 895                                  * setting, so we break.


 896                                  */
 897                                 if (addrmatch &&

 898                                     (ltcp->tcp_state == TCPS_LISTEN ||
 899                                     ltcp->tcp_state == TCPS_BOUND))
 900                                         break;
 901                         }
 902                 }
 903                 if (ltcp != NULL && !attempt_reuse) {
 904                         /* The port number is busy */
 905                         mutex_exit(&tbf->tf_lock);
 906                 } else {
 907                         if (attempt_reuse) {
 908                                 int err;
 909                                 struct tcp_rg_s *rg;
 910 
 911                                 ASSERT(ltcp != NULL);
 912                                 ASSERT(ltcp->tcp_rg_bind != NULL);
 913                                 ASSERT(tcp->tcp_rg_bind != NULL);
 914                                 ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
 915 
 916                                 err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
 917                                 if (err != 0) {
 918                                         mutex_exit(&tbf->tf_lock);
 919                                         return (0);
 920                                 }
 921                                 /*
 922                                  * Now that the newly-binding socket has joined
 923                                  * the existing reuseport group on ltcp, it
 924                                  * should clean up its own (empty) group.
 925                                  */
 926                                 rg = tcp->tcp_rg_bind;
 927                                 tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
 928                                 VERIFY(tcp_rg_remove(rg, tcp));
 929                                 tcp_rg_destroy(rg);
 930                         }
 931 
 932                         /*
 933                          * This port is ours. Insert in fanout and mark as
 934                          * bound to prevent others from getting the port
 935                          * number.
 936                          */
 937                         tcp->tcp_state = TCPS_BOUND;
 938                         DTRACE_TCP6(state__change, void, NULL,
 939                             ip_xmit_attr_t *, connp->conn_ixa,
 940                             void, NULL, tcp_t *, tcp, void, NULL,
 941                             int32_t, TCPS_IDLE);
 942 
 943                         connp->conn_lport = htons(port);
 944 
 945                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
 946                             connp->conn_lport)] == tbf);
 947                         tcp_bind_hash_insert(tbf, tcp, 1);
 948 
 949                         mutex_exit(&tbf->tf_lock);
 950 
 951                         /*
 952                          * We don't want tcp_next_port_to_try to "inherit"


 975                                  */
 976                                 port =
 977                                     tcp_update_next_port(
 978                                     tcps->tcps_next_port_to_try,
 979                                     tcp, B_TRUE);
 980                                 user_specified = B_FALSE;
 981                         } else {
 982                                 port = tcp_update_next_port(port + 1, tcp,
 983                                     B_FALSE);
 984                         }
 985                 }
 986                 if (port == 0)
 987                         break;
 988 
 989                 /*
 990                  * Don't let this loop run forever in the case where
 991                  * all of the anonymous ports are in use.
 992                  */
 993         } while (++count < loopmax);
 994         return (0);
 995 }
 996 
/*
 * Max number of members in TCP SO_REUSEPORT group.  tcp_rg_insert() refuses
 * (EINVAL) to grow a group's members array beyond this many slots.
 */
#define TCP_RG_SIZE_MAX         64
/*
 * Step size when expanding members array: the number of slots
 * tcp_rg_insert() adds each time a full array has to be enlarged.
 */
#define TCP_RG_SIZE_STEP        2
1001 
1002 
1003 tcp_rg_t *
1004 tcp_rg_init(tcp_t *tcp)
1005 {
1006         tcp_rg_t *rg;
1007         rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
1008         if (rg == NULL)
1009                 return (NULL);
1010         rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
1011             KM_NOSLEEP|KM_NORMALPRI);
1012         if (rg->tcprg_members == NULL) {
1013                 kmem_free(rg, sizeof (tcp_rg_t));
1014                 return (NULL);
1015         }
1016 
1017         mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
1018         rg->tcprg_size = 2;
1019         rg->tcprg_count = 1;
1020         rg->tcprg_active = 1;
1021         rg->tcprg_members[0] = tcp;
1022         return (rg);
1023 }
1024 
1025 void
1026 tcp_rg_destroy(tcp_rg_t *rg)
1027 {
1028         mutex_enter(&rg->tcprg_lock);
1029         ASSERT(rg->tcprg_count == 0);
1030         ASSERT(rg->tcprg_active == 0);
1031         kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
1032         mutex_destroy(&rg->tcprg_lock);
1033         kmem_free(rg, sizeof (struct tcp_rg_s));
1034 }
1035 
1036 static int
1037 tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
1038 {
1039         mutex_enter(&rg->tcprg_lock);
1040 
1041         VERIFY(rg->tcprg_size > 0);
1042         VERIFY(rg->tcprg_count <= rg->tcprg_size);
1043         if (rg->tcprg_count != 0) {
1044                 cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
1045                 cred_t *newcred = tcp->tcp_connp->conn_cred;
1046 
1047                 if (crgetuid(oldcred) != crgetuid(newcred) ||
1048                     crgetzoneid(oldcred) != crgetzoneid(newcred)) {
1049                         mutex_exit(&rg->tcprg_lock);
1050                         return (EPERM);
1051                 }
1052         }
1053 
1054         if (rg->tcprg_count == rg->tcprg_size) {
1055                 unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
1056                 unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
1057                 tcp_t **newmembers;
1058 
1059                 if (newsize > TCP_RG_SIZE_MAX) {
1060                         mutex_exit(&rg->tcprg_lock);
1061                         return (EINVAL);
1062                 }
1063                 newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
1064                     KM_NOSLEEP|KM_NORMALPRI);
1065                 if (newmembers == NULL) {
1066                         mutex_exit(&rg->tcprg_lock);
1067                         return (ENOMEM);
1068                 }
1069                 bcopy(rg->tcprg_members, newmembers, oldalloc);
1070                 kmem_free(rg->tcprg_members, oldalloc);
1071                 rg->tcprg_members = newmembers;
1072                 rg->tcprg_size = newsize;
1073         }
1074 
1075         rg->tcprg_members[rg->tcprg_count] = tcp;
1076         rg->tcprg_count++;
1077         rg->tcprg_active++;
1078 
1079         mutex_exit(&rg->tcprg_lock);
1080         return (0);
1081 }
1082 
1083 boolean_t
1084 tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
1085 {
1086         int i;
1087         boolean_t is_empty;
1088 
1089         mutex_enter(&rg->tcprg_lock);
1090         for (i = 0; i < rg->tcprg_count; i++) {
1091                 if (rg->tcprg_members[i] == tcp)
1092                         break;
1093         }
1094         /* The item should be present */
1095         ASSERT(i < rg->tcprg_count);
1096         /* Move the last member into this position */
1097         rg->tcprg_count--;
1098         rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
1099         rg->tcprg_members[rg->tcprg_count] = NULL;
1100         if (tcp->tcp_connp->conn_reuseport != 0)
1101                 rg->tcprg_active--;
1102         is_empty = (rg->tcprg_count == 0);
1103         mutex_exit(&rg->tcprg_lock);
1104         return (is_empty);
1105 }
1106 
1107 void
1108 tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
1109 {
1110         mutex_enter(&rg->tcprg_lock);
1111         if (is_active) {
1112                 rg->tcprg_active++;
1113         } else {
1114                 rg->tcprg_active--;
1115         }
1116         mutex_exit(&rg->tcprg_lock);
1117 }