OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
*** 20,29 ****
--- 20,30 ----
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
*** 53,62 ****
--- 54,64 ----
static uint32_t tcp_random_anon_port = 1;
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
+ static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
/*
* Hash list insertion routine for tcp_t structures. Each hash bucket
* contains a list of tcp_t entries, and each entry is bound to a unique
* port. If there are multiple tcp_t's that are bound to the same port, then
*** 170,179 ****
--- 172,191 ----
lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
connp->conn_lport)].tf_lock;
ASSERT(lockp != NULL);
mutex_enter(lockp);
+
+ /* destroy any association with SO_REUSEPORT group */
+ if (tcp->tcp_rg_bind != NULL) {
+ if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+ /* Last one out turns off the lights */
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ }
+ tcp->tcp_rg_bind = NULL;
+ }
+
if (tcp->tcp_ptpbhn) {
tcpnext = tcp->tcp_bind_hash_port;
if (tcpnext != NULL) {
tcp->tcp_bind_hash_port = NULL;
tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
*** 634,650 ****
}
return (error);
}
/*
! * If the "bind_to_req_port_only" parameter is set, if the requested port
! * number is available, return it, If not return 0
*
! * If "bind_to_req_port_only" parameter is not set and
! * If the requested port number is available, return it. If not, return
! * the first anonymous port we happen across. If no anonymous ports are
! * available, return 0. addr is the requested local address, if any.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
*
* Note that TCP over IPv4 and IPv6 sockets can use the same port number
--- 646,661 ----
}
return (error);
}
/*
! * If the "bind_to_req_port_only" parameter is set and the requested port
! * number is available, return it (else return 0).
*
! * If "bind_to_req_port_only" parameter is not set and the requested port
! * number is available, return it. If not, return the first anonymous port we
! * happen across. If no anonymous ports are available, return 0.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
*
* Note that TCP over IPv4 and IPv6 sockets can use the same port number
*** 660,669 ****
--- 671,681 ----
int count = 0;
/* maximum number of times to run around the loop */
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t reuseport = connp->conn_reuseport;
/*
* Lookup for free addresses is done in a loop and "loopmax"
* influences how long we spin in the loop
*/
*** 696,705 ****
--- 708,718 ----
do {
uint16_t lport;
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
+ boolean_t attempt_reuse = B_FALSE;
lport = htons(port);
/*
* Ensure that the tcp_t is not currently in the bind hash.
*** 722,731 ****
--- 735,745 ----
}
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
+ boolean_t addrmatch;
lconnp = ltcp->tcp_connp;
/*
* On a labeled system, we must treat bindings to ports
*** 827,852 ****
((connp->conn_fport != lconnp->conn_fport) ||
!IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
&lconnp->conn_faddr_v6)))
continue;
if (!reuseaddr) {
/*
! * No socket option SO_REUSEADDR.
! * If existing port is bound to
! * a non-wildcard IP address
! * and the requesting stream is
! * bound to a distinct
! * different IP addresses
! * (non-wildcard, also), keep
! * going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
! !IN6_ARE_ADDR_EQUAL(laddr,
! &lconnp->conn_bound_addr_v6))
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
* This port is being used and
* its state is >= TCPS_BOUND,
--- 841,879 ----
((connp->conn_fport != lconnp->conn_fport) ||
!IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
&lconnp->conn_faddr_v6)))
continue;
+ addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+ &lconnp->conn_bound_addr_v6);
+
+ if (addrmatch && reuseport && bind_to_req_port_only &&
+ (ltcp->tcp_state == TCPS_BOUND ||
+ ltcp->tcp_state == TCPS_LISTEN)) {
+ /*
+ * This entry is bound to the exact same
+ * address and port. If SO_REUSEPORT is set on
+ * the calling socket, attempt to reuse this
+ * binding if it too had SO_REUSEPORT enabled
+ * when it was bound.
+ */
+ attempt_reuse = (ltcp->tcp_rg_bind != NULL);
+ break;
+ }
+
if (!reuseaddr) {
/*
! * No socket option SO_REUSEADDR. If an
! * existing port is bound to a non-wildcard IP
! * address and the requesting stream is bound
! * to a distinct different IP address
! * (non-wildcard, also), keep going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
! !addrmatch)
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
* This port is being used and
* its state is >= TCPS_BOUND,
*** 857,888 ****
} else {
/*
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
! * If two streams are bound to
! * same IP address or both addr
! * and bound source are wildcards
! * (INADDR_ANY), we want to stop
! * searching.
! * We have found a match of IP source
! * address and source port, which is
! * refused regardless of the
! * SO_REUSEADDR setting, so we break.
*/
! if (IN6_ARE_ADDR_EQUAL(laddr,
! &lconnp->conn_bound_addr_v6) &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
! if (ltcp != NULL) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
/*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
* number.
*/
tcp->tcp_state = TCPS_BOUND;
--- 884,937 ----
} else {
/*
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
! * If two streams are bound to the same IP
! * address or both addr and bound source are
! * wildcards (INADDR_ANY), we want to stop
! * searching. We have found a match of IP
! * source address and source port, which is
! * refused regardless of the SO_REUSEADDR
! * setting, so we break.
*/
! if (addrmatch &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
! if (ltcp != NULL && !attempt_reuse) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
+ if (attempt_reuse) {
+ int err;
+ struct tcp_rg_s *rg;
+
+ ASSERT(ltcp != NULL);
+ ASSERT(ltcp->tcp_rg_bind != NULL);
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+ err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+ if (err != 0) {
+ mutex_exit(&tbf->tf_lock);
+ return (0);
+ }
/*
+ * Now that the newly-binding socket has joined
+ * the existing reuseport group on ltcp, it
+ * should clean up its own (empty) group.
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ }
+
+ /*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
* number.
*/
tcp->tcp_state = TCPS_BOUND;
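For reference, the check above only attempts to join an existing reuseport group when the caller asked for a specific port, the bound addresses match exactly, and both sockets had SO_REUSEPORT set before bind (tcp_rg_insert() additionally requires the same UID and zone). A minimal userland sketch of that pattern, assuming SO_REUSEPORT is exposed through <sys/socket.h> on the target build; the function name and port handling are illustrative only:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Create a TCP listener with SO_REUSEPORT set before bind(), so that
	 * several processes (same UID, same zone) can bind the same port and
	 * share incoming connections.
	 */
	static int
	reuseport_listener(in_port_t port)
	{
		struct sockaddr_in sin;
		int on = 1;
		int s;

		if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
			return (-1);
		/* Must be enabled before bind() for the binding to join a group. */
		if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on,
		    sizeof (on)) != 0) {
			(void) close(s);
			return (-1);
		}
		(void) memset(&sin, 0, sizeof (sin));
		sin.sin_family = AF_INET;
		sin.sin_port = htons(port);
		sin.sin_addr.s_addr = htonl(INADDR_ANY);
		if (bind(s, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
		    listen(s, 128) != 0) {
			(void) close(s);
			return (-1);
		}
		return (s);
	}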
*** 941,946 ****
--- 990,1117 ----
* Don't let this loop run forever in the case where
* all of the anonymous ports are in use.
*/
} while (++count < loopmax);
return (0);
+ }
+
+ /* Max number of members in TCP SO_REUSEPORT group */
+ #define TCP_RG_SIZE_MAX 64
+ /* Step size when expanding members array */
+ #define TCP_RG_SIZE_STEP 2
+
+
+ tcp_rg_t *
+ tcp_rg_init(tcp_t *tcp)
+ {
+ tcp_rg_t *rg;
+ rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
+ if (rg == NULL)
+ return (NULL);
+ rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (rg->tcprg_members == NULL) {
+ kmem_free(rg, sizeof (tcp_rg_t));
+ return (NULL);
+ }
+
+ mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+ rg->tcprg_size = 2;
+ rg->tcprg_count = 1;
+ rg->tcprg_active = 1;
+ rg->tcprg_members[0] = tcp;
+ return (rg);
+ }
+
+ void
+ tcp_rg_destroy(tcp_rg_t *rg)
+ {
+ mutex_enter(&rg->tcprg_lock);
+ ASSERT(rg->tcprg_count == 0);
+ ASSERT(rg->tcprg_active == 0);
+ kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+ mutex_destroy(&rg->tcprg_lock);
+ kmem_free(rg, sizeof (struct tcp_rg_s));
+ }
+
+ static int
+ tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+ {
+ mutex_enter(&rg->tcprg_lock);
+
+ VERIFY(rg->tcprg_size > 0);
+ VERIFY(rg->tcprg_count <= rg->tcprg_size);
+ if (rg->tcprg_count != 0) {
+ cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+ cred_t *newcred = tcp->tcp_connp->conn_cred;
+
+ if (crgetuid(oldcred) != crgetuid(newcred) ||
+ crgetzoneid(oldcred) != crgetzoneid(newcred)) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EPERM);
+ }
+ }
+
+ if (rg->tcprg_count == rg->tcprg_size) {
+ unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
+ unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
+ tcp_t **newmembers;
+
+ if (newsize > TCP_RG_SIZE_MAX) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EINVAL);
+ }
+ newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (newmembers == NULL) {
+ mutex_exit(&rg->tcprg_lock);
+ return (ENOMEM);
+ }
+ bcopy(rg->tcprg_members, newmembers, oldalloc);
+ kmem_free(rg->tcprg_members, oldalloc);
+ rg->tcprg_members = newmembers;
+ rg->tcprg_size = newsize;
+ }
+
+ rg->tcprg_members[rg->tcprg_count] = tcp;
+ rg->tcprg_count++;
+ rg->tcprg_active++;
+
+ mutex_exit(&rg->tcprg_lock);
+ return (0);
+ }
+
+ boolean_t
+ tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+ {
+ int i;
+ boolean_t is_empty;
+
+ mutex_enter(&rg->tcprg_lock);
+ for (i = 0; i < rg->tcprg_count; i++) {
+ if (rg->tcprg_members[i] == tcp)
+ break;
+ }
+ /* The item should be present */
+ ASSERT(i < rg->tcprg_count);
+ /* Move the last member into this position */
+ rg->tcprg_count--;
+ rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+ rg->tcprg_members[rg->tcprg_count] = NULL;
+ if (tcp->tcp_connp->conn_reuseport != 0)
+ rg->tcprg_active--;
+ is_empty = (rg->tcprg_count == 0);
+ mutex_exit(&rg->tcprg_lock);
+ return (is_empty);
+ }
+
+ void
+ tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+ {
+ mutex_enter(&rg->tcprg_lock);
+ if (is_active) {
+ rg->tcprg_active++;
+ } else {
+ rg->tcprg_active--;
+ }
+ mutex_exit(&rg->tcprg_lock);
}
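The tcp_rg_t type used throughout these routines is defined in the TCP headers rather than in this hunk. Inferred from the field usage in tcp_rg_init(), tcp_rg_insert(), and tcp_rg_remove(), its shape is approximately the following; this is a sketch for orientation, not the authoritative definition:

	/*
	 * Approximate layout of the SO_REUSEPORT group state, reconstructed
	 * from how the tcprg_* fields are used above.
	 */
	typedef struct tcp_rg_s {
		kmutex_t	tcprg_lock;	/* protects the fields below */
		uint_t		tcprg_size;	/* allocated slots in tcprg_members[] */
		uint_t		tcprg_count;	/* tcp_t's currently in the group */
		uint_t		tcprg_active;	/* members with SO_REUSEPORT enabled */
		tcp_t		**tcprg_members; /* sockets sharing the bound port */
	} tcp_rg_t;

tcp_rg_setactive(), presumably called when SO_REUSEPORT is toggled on a socket that is already bound, only adjusts tcprg_active; a member leaves the group (and the last one out destroys it) only when it is removed from the bind hash, as shown in the tcp_bind_hash_remove change earlier in this file.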