5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #define _SUN_TPI_VERSION 2
34 #include <sys/tihdr.h>
35 #include <sys/suntpi.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 #include <sys/tsol/tnet.h>
41
42 #include <rpc/pmap_prot.h>
43
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 #include <inet/proto_set.h>
49 #include <inet/ipsec_impl.h>
50
51 /* Setable in /etc/system */
52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
53 static uint32_t tcp_random_anon_port = 1;
54
55 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
56 cred_t *cr);
57 static in_port_t tcp_get_next_priv_port(const tcp_t *);
58
59 /*
60 * Hash list insertion routine for tcp_t structures. Each hash bucket
61 * contains a list of tcp_t entries, and each entry is bound to a unique
62 * port. If there are multiple tcp_t's that are bound to the same port, then
63 * one of them will be linked into the hash bucket list, and the rest will
64 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
66 */
67 void
68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
69 {
70 tcp_t **tcpp;
71 tcp_t *tcpnext;
72 tcp_t *tcphash;
73 conn_t *connp = tcp->tcp_connp;
74 conn_t *connext;
75
76 if (tcp->tcp_ptpbhn != NULL) {
77 ASSERT(!caller_holds_lock);
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t *tcpnext;
	kmutex_t *lockp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	conn_t *connp = tcp->tcp_connp;

	/* Not currently in the bind hash; nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	/* Re-check under the bucket lock; a concurrent remove may have run. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Other tcp_t's share this port: promote the next
			 * same-port entry into our place in the bucket chain.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* No same-port list; splice out of the bucket. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		/* Point the predecessor's link at our replacement (or NULL). */
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
619 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
620 connp->conn_incoming_ifindex = connp->conn_bound_if;
621 }
622
623 connp->conn_laddr_v6 = v6addr;
624 connp->conn_saddr_v6 = v6addr;
625
626 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
627
628 error = tcp_bind_select_lport(tcp, &requested_port,
629 bind_to_req_port_only, cr);
630 if (error != 0) {
631 connp->conn_laddr_v6 = ipv6_all_zeros;
632 connp->conn_saddr_v6 = ipv6_all_zeros;
633 connp->conn_bound_addr_v6 = ipv6_all_zeros;
634 }
635 return (error);
636 }
637
638 /*
 * If the "bind_to_req_port_only" parameter is set and the requested port
 * number is available, return it; if not, return 0.
641 *
642 * If "bind_to_req_port_only" parameter is not set and
643 * If the requested port number is available, return it. If not, return
644 * the first anonymous port we happen across. If no anonymous ports are
645 * available, return 0. addr is the requested local address, if any.
646 *
647 * In either case, when succeeding update the tcp_t to record the port number
648 * and insert it in the bind hash table.
649 *
650 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
651 * without setting SO_REUSEADDR. This is needed so that they
652 * can be viewed as two independent transport protocols.
653 */
654 in_port_t
655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
656 int reuseaddr, boolean_t quick_connect,
657 boolean_t bind_to_req_port_only, boolean_t user_specified)
658 {
659 /* number of times we have run around the loop */
660 int count = 0;
661 /* maximum number of times to run around the loop */
662 int loopmax;
663 conn_t *connp = tcp->tcp_connp;
664 tcp_stack_t *tcps = tcp->tcp_tcps;
665
666 /*
667 * Lookup for free addresses is done in a loop and "loopmax"
668 * influences how long we spin in the loop
669 */
670 if (bind_to_req_port_only) {
671 /*
672 * If the requested port is busy, don't bother to look
673 * for a new one. Setting loop maximum count to 1 has
674 * that effect.
675 */
676 loopmax = 1;
677 } else {
678 /*
679 * If the requested port is busy, look for a free one
680 * in the anonymous port range.
681 * Set loopmax appropriately so that one does not look
682 * forever in the case all of the anonymous ports are in use.
683 */
684 if (connp->conn_anon_priv_bind) {
685 /*
686 * loopmax =
687 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
688 */
689 loopmax = IPPORT_RESERVED -
690 tcps->tcps_min_anonpriv_port;
691 } else {
692 loopmax = (tcps->tcps_largest_anon_port -
693 tcps->tcps_smallest_anon_port + 1);
694 }
695 }
696 do {
697 uint16_t lport;
698 tf_t *tbf;
699 tcp_t *ltcp;
700 conn_t *lconnp;
701
702 lport = htons(port);
703
704 /*
705 * Ensure that the tcp_t is not currently in the bind hash.
706 * Hold the lock on the hash bucket to ensure that
707 * the duplicate check plus the insertion is an atomic
708 * operation.
709 *
710 * This function does an inline lookup on the bind hash list
711 * Make sure that we access only members of tcp_t
712 * and that we don't look at tcp_tcp, since we are not
713 * doing a CONN_INC_REF.
714 */
715 tcp_bind_hash_remove(tcp);
716 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
717 mutex_enter(&tbf->tf_lock);
718 for (ltcp = tbf->tf_tcp; ltcp != NULL;
719 ltcp = ltcp->tcp_bind_hash) {
720 if (lport == ltcp->tcp_connp->conn_lport)
721 break;
722 }
723
724 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
725 boolean_t not_socket;
726 boolean_t exclbind;
727
728 lconnp = ltcp->tcp_connp;
729
730 /*
731 * On a labeled system, we must treat bindings to ports
732 * on shared IP addresses by sockets with MAC exemption
733 * privilege as being in all zones, as there's
734 * otherwise no way to identify the right receiver.
735 */
736 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
737 continue;
738
739 /*
740 * If TCP_EXCLBIND is set for either the bound or
741 * binding endpoint, the semantics of bind
742 * is changed according to the following.
743 *
744 * spec = specified address (v4 or v6)
745 * unspec = unspecified address (v4 or v6)
746 * A = specified addresses are different for endpoints
812 if (connp->conn_ipversion != lconnp->conn_ipversion &&
813 bind_to_req_port_only)
814 continue;
815
816 /*
817 * Ideally, we should make sure that the source
818 * address, remote address, and remote port in the
819 * four tuple for this tcp-connection is unique.
820 * However, trying to find out the local source
821 * address would require too much code duplication
822 * with IP, since IP needs needs to have that code
823 * to support userland TCP implementations.
824 */
825 if (quick_connect &&
826 (ltcp->tcp_state > TCPS_LISTEN) &&
827 ((connp->conn_fport != lconnp->conn_fport) ||
828 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
829 &lconnp->conn_faddr_v6)))
830 continue;
831
832 if (!reuseaddr) {
833 /*
834 * No socket option SO_REUSEADDR.
835 * If existing port is bound to
836 * a non-wildcard IP address
837 * and the requesting stream is
838 * bound to a distinct
839 * different IP addresses
840 * (non-wildcard, also), keep
841 * going.
842 */
843 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
844 !V6_OR_V4_INADDR_ANY(
845 lconnp->conn_bound_addr_v6) &&
846 !IN6_ARE_ADDR_EQUAL(laddr,
847 &lconnp->conn_bound_addr_v6))
848 continue;
849 if (ltcp->tcp_state >= TCPS_BOUND) {
850 /*
851 * This port is being used and
852 * its state is >= TCPS_BOUND,
853 * so we can't bind to it.
854 */
855 break;
856 }
857 } else {
858 /*
859 * socket option SO_REUSEADDR is set on the
860 * binding tcp_t.
861 *
862 * If two streams are bound to
863 * same IP address or both addr
864 * and bound source are wildcards
865 * (INADDR_ANY), we want to stop
866 * searching.
867 * We have found a match of IP source
868 * address and source port, which is
869 * refused regardless of the
870 * SO_REUSEADDR setting, so we break.
871 */
872 if (IN6_ARE_ADDR_EQUAL(laddr,
873 &lconnp->conn_bound_addr_v6) &&
874 (ltcp->tcp_state == TCPS_LISTEN ||
875 ltcp->tcp_state == TCPS_BOUND))
876 break;
877 }
878 }
879 if (ltcp != NULL) {
880 /* The port number is busy */
881 mutex_exit(&tbf->tf_lock);
882 } else {
883 /*
884 * This port is ours. Insert in fanout and mark as
885 * bound to prevent others from getting the port
886 * number.
887 */
888 tcp->tcp_state = TCPS_BOUND;
889 DTRACE_TCP6(state__change, void, NULL,
890 ip_xmit_attr_t *, connp->conn_ixa,
891 void, NULL, tcp_t *, tcp, void, NULL,
892 int32_t, TCPS_IDLE);
893
894 connp->conn_lport = htons(port);
895
896 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
897 connp->conn_lport)] == tbf);
898 tcp_bind_hash_insert(tbf, tcp, 1);
899
900 mutex_exit(&tbf->tf_lock);
901
902 /*
903 * We don't want tcp_next_port_to_try to "inherit"
926 */
927 port =
928 tcp_update_next_port(
929 tcps->tcps_next_port_to_try,
930 tcp, B_TRUE);
931 user_specified = B_FALSE;
932 } else {
933 port = tcp_update_next_port(port + 1, tcp,
934 B_FALSE);
935 }
936 }
937 if (port == 0)
938 break;
939
940 /*
941 * Don't let this loop run forever in the case where
942 * all of the anonymous ports are in use.
943 */
944 } while (++count < loopmax);
945 return (0);
946 }
|
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2016 Joyent, Inc.
26 */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/policy.h>
39 #include <sys/squeue_impl.h>
40 #include <sys/squeue.h>
41 #include <sys/tsol/tnet.h>
42
43 #include <rpc/pmap_prot.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
47 #include <inet/tcp.h>
48 #include <inet/tcp_impl.h>
49 #include <inet/proto_set.h>
50 #include <inet/ipsec_impl.h>
51
52 /* Setable in /etc/system */
53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
54 static uint32_t tcp_random_anon_port = 1;
55
56 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
57 cred_t *cr);
58 static in_port_t tcp_get_next_priv_port(const tcp_t *);
59 static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
60
61 /*
62 * Hash list insertion routine for tcp_t structures. Each hash bucket
63 * contains a list of tcp_t entries, and each entry is bound to a unique
64 * port. If there are multiple tcp_t's that are bound to the same port, then
65 * one of them will be linked into the hash bucket list, and the rest will
66 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
68 */
69 void
70 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
71 {
72 tcp_t **tcpp;
73 tcp_t *tcpnext;
74 tcp_t *tcphash;
75 conn_t *connp = tcp->tcp_connp;
76 conn_t *connext;
77
78 if (tcp->tcp_ptpbhn != NULL) {
79 ASSERT(!caller_holds_lock);
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t *tcpnext;
	kmutex_t *lockp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	conn_t *connp = tcp->tcp_connp;

	/* Not currently in the bind hash; nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);

	/* destroy any association with SO_REUSEPORT group */
	if (tcp->tcp_rg_bind != NULL) {
		if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
			/* Last one out turns off the lights */
			tcp_rg_destroy(tcp->tcp_rg_bind);
		}
		tcp->tcp_rg_bind = NULL;
	}

	/* Re-check under the bucket lock; a concurrent remove may have run. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Other tcp_t's share this port: promote the next
			 * same-port entry into our place in the bucket chain.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* No same-port list; splice out of the bucket. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		/* Point the predecessor's link at our replacement (or NULL). */
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
631 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
632 connp->conn_incoming_ifindex = connp->conn_bound_if;
633 }
634
635 connp->conn_laddr_v6 = v6addr;
636 connp->conn_saddr_v6 = v6addr;
637
638 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
639
640 error = tcp_bind_select_lport(tcp, &requested_port,
641 bind_to_req_port_only, cr);
642 if (error != 0) {
643 connp->conn_laddr_v6 = ipv6_all_zeros;
644 connp->conn_saddr_v6 = ipv6_all_zeros;
645 connp->conn_bound_addr_v6 = ipv6_all_zeros;
646 }
647 return (error);
648 }
649
650 /*
651 * If the "bind_to_req_port_only" parameter is set and the requested port
652 * number is available, return it (else return 0).
653 *
654 * If "bind_to_req_port_only" parameter is not set and the requested port
655 * number is available, return it. If not, return the first anonymous port we
656 * happen across. If no anonymous ports are available, return 0.
657 *
658 * In either case, when succeeding update the tcp_t to record the port number
659 * and insert it in the bind hash table.
660 *
661 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
662 * without setting SO_REUSEADDR. This is needed so that they
663 * can be viewed as two independent transport protocols.
664 */
665 in_port_t
666 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
667 int reuseaddr, boolean_t quick_connect,
668 boolean_t bind_to_req_port_only, boolean_t user_specified)
669 {
670 /* number of times we have run around the loop */
671 int count = 0;
672 /* maximum number of times to run around the loop */
673 int loopmax;
674 conn_t *connp = tcp->tcp_connp;
675 tcp_stack_t *tcps = tcp->tcp_tcps;
676 boolean_t reuseport = connp->conn_reuseport;
677
678 /*
679 * Lookup for free addresses is done in a loop and "loopmax"
680 * influences how long we spin in the loop
681 */
682 if (bind_to_req_port_only) {
683 /*
684 * If the requested port is busy, don't bother to look
685 * for a new one. Setting loop maximum count to 1 has
686 * that effect.
687 */
688 loopmax = 1;
689 } else {
690 /*
691 * If the requested port is busy, look for a free one
692 * in the anonymous port range.
693 * Set loopmax appropriately so that one does not look
694 * forever in the case all of the anonymous ports are in use.
695 */
696 if (connp->conn_anon_priv_bind) {
697 /*
698 * loopmax =
699 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
700 */
701 loopmax = IPPORT_RESERVED -
702 tcps->tcps_min_anonpriv_port;
703 } else {
704 loopmax = (tcps->tcps_largest_anon_port -
705 tcps->tcps_smallest_anon_port + 1);
706 }
707 }
708 do {
709 uint16_t lport;
710 tf_t *tbf;
711 tcp_t *ltcp;
712 conn_t *lconnp;
713 boolean_t attempt_reuse = B_FALSE;
714
715 lport = htons(port);
716
717 /*
718 * Ensure that the tcp_t is not currently in the bind hash.
719 * Hold the lock on the hash bucket to ensure that
720 * the duplicate check plus the insertion is an atomic
721 * operation.
722 *
723 * This function does an inline lookup on the bind hash list
724 * Make sure that we access only members of tcp_t
725 * and that we don't look at tcp_tcp, since we are not
726 * doing a CONN_INC_REF.
727 */
728 tcp_bind_hash_remove(tcp);
729 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
730 mutex_enter(&tbf->tf_lock);
731 for (ltcp = tbf->tf_tcp; ltcp != NULL;
732 ltcp = ltcp->tcp_bind_hash) {
733 if (lport == ltcp->tcp_connp->conn_lport)
734 break;
735 }
736
737 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
738 boolean_t not_socket;
739 boolean_t exclbind;
740 boolean_t addrmatch;
741
742 lconnp = ltcp->tcp_connp;
743
744 /*
745 * On a labeled system, we must treat bindings to ports
746 * on shared IP addresses by sockets with MAC exemption
747 * privilege as being in all zones, as there's
748 * otherwise no way to identify the right receiver.
749 */
750 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
751 continue;
752
753 /*
754 * If TCP_EXCLBIND is set for either the bound or
755 * binding endpoint, the semantics of bind
756 * is changed according to the following.
757 *
758 * spec = specified address (v4 or v6)
759 * unspec = unspecified address (v4 or v6)
760 * A = specified addresses are different for endpoints
826 if (connp->conn_ipversion != lconnp->conn_ipversion &&
827 bind_to_req_port_only)
828 continue;
829
830 /*
831 * Ideally, we should make sure that the source
832 * address, remote address, and remote port in the
833 * four tuple for this tcp-connection is unique.
834 * However, trying to find out the local source
835 * address would require too much code duplication
836 * with IP, since IP needs needs to have that code
837 * to support userland TCP implementations.
838 */
839 if (quick_connect &&
840 (ltcp->tcp_state > TCPS_LISTEN) &&
841 ((connp->conn_fport != lconnp->conn_fport) ||
842 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
843 &lconnp->conn_faddr_v6)))
844 continue;
845
846 addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
847 &lconnp->conn_bound_addr_v6);
848
849 if (addrmatch && reuseport && bind_to_req_port_only &&
850 (ltcp->tcp_state == TCPS_BOUND ||
851 ltcp->tcp_state == TCPS_LISTEN)) {
852 /*
853 * This entry is bound to the exact same
854 * address and port. If SO_REUSEPORT is set on
855 * the calling socket, attempt to reuse this
856 * binding if it too had SO_REUSEPORT enabled
857 * when it was bound.
858 */
859 attempt_reuse = (ltcp->tcp_rg_bind != NULL);
860 break;
861 }
862
863 if (!reuseaddr) {
864 /*
865 * No socket option SO_REUSEADDR. If an
866 * existing port is bound to a non-wildcard IP
867 * address and the requesting stream is bound
868 * to a distinct different IP address
869 * (non-wildcard, also), keep going.
870 */
871 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
872 !V6_OR_V4_INADDR_ANY(
873 lconnp->conn_bound_addr_v6) &&
874 !addrmatch)
875 continue;
876 if (ltcp->tcp_state >= TCPS_BOUND) {
877 /*
878 * This port is being used and
879 * its state is >= TCPS_BOUND,
880 * so we can't bind to it.
881 */
882 break;
883 }
884 } else {
885 /*
886 * socket option SO_REUSEADDR is set on the
887 * binding tcp_t.
888 *
889 * If two streams are bound to the same IP
890 * address or both addr and bound source are
891 * wildcards (INADDR_ANY), we want to stop
892 * searching. We have found a match of IP
893 * source address and source port, which is
894 * refused regardless of the SO_REUSEADDR
895 * setting, so we break.
896 */
897 if (addrmatch &&
898 (ltcp->tcp_state == TCPS_LISTEN ||
899 ltcp->tcp_state == TCPS_BOUND))
900 break;
901 }
902 }
903 if (ltcp != NULL && !attempt_reuse) {
904 /* The port number is busy */
905 mutex_exit(&tbf->tf_lock);
906 } else {
907 if (attempt_reuse) {
908 int err;
909 struct tcp_rg_s *rg;
910
911 ASSERT(ltcp != NULL);
912 ASSERT(ltcp->tcp_rg_bind != NULL);
913 ASSERT(tcp->tcp_rg_bind != NULL);
914 ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
915
916 err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
917 if (err != 0) {
918 mutex_exit(&tbf->tf_lock);
919 return (0);
920 }
921 /*
922 * Now that the newly-binding socket has joined
923 * the existing reuseport group on ltcp, it
924 * should clean up its own (empty) group.
925 */
926 rg = tcp->tcp_rg_bind;
927 tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
928 VERIFY(tcp_rg_remove(rg, tcp));
929 tcp_rg_destroy(rg);
930 }
931
932 /*
933 * This port is ours. Insert in fanout and mark as
934 * bound to prevent others from getting the port
935 * number.
936 */
937 tcp->tcp_state = TCPS_BOUND;
938 DTRACE_TCP6(state__change, void, NULL,
939 ip_xmit_attr_t *, connp->conn_ixa,
940 void, NULL, tcp_t *, tcp, void, NULL,
941 int32_t, TCPS_IDLE);
942
943 connp->conn_lport = htons(port);
944
945 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
946 connp->conn_lport)] == tbf);
947 tcp_bind_hash_insert(tbf, tcp, 1);
948
949 mutex_exit(&tbf->tf_lock);
950
951 /*
952 * We don't want tcp_next_port_to_try to "inherit"
975 */
976 port =
977 tcp_update_next_port(
978 tcps->tcps_next_port_to_try,
979 tcp, B_TRUE);
980 user_specified = B_FALSE;
981 } else {
982 port = tcp_update_next_port(port + 1, tcp,
983 B_FALSE);
984 }
985 }
986 if (port == 0)
987 break;
988
989 /*
990 * Don't let this loop run forever in the case where
991 * all of the anonymous ports are in use.
992 */
993 } while (++count < loopmax);
994 return (0);
995 }
996
997 /* Max number of members in TCP SO_REUSEPORT group */
998 #define TCP_RG_SIZE_MAX 64
999 /* Step size when expanding members array */
1000 #define TCP_RG_SIZE_STEP 2
1001
1002
1003 tcp_rg_t *
1004 tcp_rg_init(tcp_t *tcp)
1005 {
1006 tcp_rg_t *rg;
1007 rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
1008 if (rg == NULL)
1009 return (NULL);
1010 rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
1011 KM_NOSLEEP|KM_NORMALPRI);
1012 if (rg->tcprg_members == NULL) {
1013 kmem_free(rg, sizeof (tcp_rg_t));
1014 return (NULL);
1015 }
1016
1017 mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
1018 rg->tcprg_size = 2;
1019 rg->tcprg_count = 1;
1020 rg->tcprg_active = 1;
1021 rg->tcprg_members[0] = tcp;
1022 return (rg);
1023 }
1024
1025 void
1026 tcp_rg_destroy(tcp_rg_t *rg)
1027 {
1028 mutex_enter(&rg->tcprg_lock);
1029 ASSERT(rg->tcprg_count == 0);
1030 ASSERT(rg->tcprg_active == 0);
1031 kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
1032 mutex_destroy(&rg->tcprg_lock);
1033 kmem_free(rg, sizeof (struct tcp_rg_s));
1034 }
1035
1036 static int
1037 tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
1038 {
1039 mutex_enter(&rg->tcprg_lock);
1040
1041 VERIFY(rg->tcprg_size > 0);
1042 VERIFY(rg->tcprg_count <= rg->tcprg_size);
1043 if (rg->tcprg_count != 0) {
1044 cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
1045 cred_t *newcred = tcp->tcp_connp->conn_cred;
1046
1047 if (crgetuid(oldcred) != crgetuid(newcred) ||
1048 crgetzoneid(oldcred) != crgetzoneid(newcred)) {
1049 mutex_exit(&rg->tcprg_lock);
1050 return (EPERM);
1051 }
1052 }
1053
1054 if (rg->tcprg_count == rg->tcprg_size) {
1055 unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
1056 unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
1057 tcp_t **newmembers;
1058
1059 if (newsize > TCP_RG_SIZE_MAX) {
1060 mutex_exit(&rg->tcprg_lock);
1061 return (EINVAL);
1062 }
1063 newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
1064 KM_NOSLEEP|KM_NORMALPRI);
1065 if (newmembers == NULL) {
1066 mutex_exit(&rg->tcprg_lock);
1067 return (ENOMEM);
1068 }
1069 bcopy(rg->tcprg_members, newmembers, oldalloc);
1070 kmem_free(rg->tcprg_members, oldalloc);
1071 rg->tcprg_members = newmembers;
1072 rg->tcprg_size = newsize;
1073 }
1074
1075 rg->tcprg_members[rg->tcprg_count] = tcp;
1076 rg->tcprg_count++;
1077 rg->tcprg_active++;
1078
1079 mutex_exit(&rg->tcprg_lock);
1080 return (0);
1081 }
1082
1083 boolean_t
1084 tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
1085 {
1086 int i;
1087 boolean_t is_empty;
1088
1089 mutex_enter(&rg->tcprg_lock);
1090 for (i = 0; i < rg->tcprg_count; i++) {
1091 if (rg->tcprg_members[i] == tcp)
1092 break;
1093 }
1094 /* The item should be present */
1095 ASSERT(i < rg->tcprg_count);
1096 /* Move the last member into this position */
1097 rg->tcprg_count--;
1098 rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
1099 rg->tcprg_members[rg->tcprg_count] = NULL;
1100 if (tcp->tcp_connp->conn_reuseport != 0)
1101 rg->tcprg_active--;
1102 is_empty = (rg->tcprg_count == 0);
1103 mutex_exit(&rg->tcprg_lock);
1104 return (is_empty);
1105 }
1106
1107 void
1108 tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
1109 {
1110 mutex_enter(&rg->tcprg_lock);
1111 if (is_active) {
1112 rg->tcprg_active++;
1113 } else {
1114 rg->tcprg_active--;
1115 }
1116 mutex_exit(&rg->tcprg_lock);
1117 }
|