Print this page
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
        
@@ -20,10 +20,11 @@
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/stream.h>
 #include <sys/strsun.h>
@@ -53,10 +54,11 @@
 static uint32_t tcp_random_anon_port = 1;
 
 static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
                     cred_t *cr);
 static in_port_t        tcp_get_next_priv_port(const tcp_t *);
+/* Forward declaration: add a tcp_t to an SO_REUSEPORT group's member array. */
+static int      tcp_rg_insert(tcp_rg_t *, tcp_t *);
 
 /*
  * Hash list insertion routine for tcp_t structures. Each hash bucket
  * contains a list of tcp_t entries, and each entry is bound to a unique
  * port. If there are multiple tcp_t's that are bound to the same port, then
@@ -170,10 +172,20 @@
         lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
             connp->conn_lport)].tf_lock;
 
         ASSERT(lockp != NULL);
         mutex_enter(lockp);
+
+        /* destroy any association with SO_REUSEPORT group */
+        if (tcp->tcp_rg_bind != NULL) {
+                if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+                        /* Last one out turns off the lights */
+                        tcp_rg_destroy(tcp->tcp_rg_bind);
+                }
+                tcp->tcp_rg_bind = NULL;
+        }
+
         if (tcp->tcp_ptpbhn) {
                 tcpnext = tcp->tcp_bind_hash_port;
                 if (tcpnext != NULL) {
                         tcp->tcp_bind_hash_port = NULL;
                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
@@ -634,17 +646,16 @@
         }
         return (error);
 }
 
 /*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
  *
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it.  If not, return
- * the first anonymous port we happen across.  If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it.  If not, return the first anonymous port we
+ * happen across.  If no anonymous ports are available, return 0.
  *
  * In either case, when succeeding update the tcp_t to record the port number
  * and insert it in the bind hash table.
  *
  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
@@ -660,10 +671,11 @@
         int count = 0;
         /* maximum number of times to run around the loop */
         int loopmax;
         conn_t *connp = tcp->tcp_connp;
         tcp_stack_t     *tcps = tcp->tcp_tcps;
+        boolean_t reuseport = connp->conn_reuseport;
 
         /*
          * Lookup for free addresses is done in a loop and "loopmax"
          * influences how long we spin in the loop
          */
@@ -696,10 +708,11 @@
         do {
                 uint16_t        lport;
                 tf_t            *tbf;
                 tcp_t           *ltcp;
                 conn_t          *lconnp;
+                boolean_t       attempt_reuse = B_FALSE;
 
                 lport = htons(port);
 
                 /*
                  * Ensure that the tcp_t is not currently in the bind hash.
@@ -722,10 +735,11 @@
                 }
 
                 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
                         boolean_t not_socket;
                         boolean_t exclbind;
+                        boolean_t addrmatch;
 
                         lconnp = ltcp->tcp_connp;
 
                         /*
                          * On a labeled system, we must treat bindings to ports
@@ -827,26 +841,39 @@
                             ((connp->conn_fport != lconnp->conn_fport) ||
                             !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
                             &lconnp->conn_faddr_v6)))
                                 continue;
 
+                        addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+                            &lconnp->conn_bound_addr_v6);
+
+                        if (addrmatch && reuseport && bind_to_req_port_only &&
+                            (ltcp->tcp_state == TCPS_BOUND ||
+                            ltcp->tcp_state == TCPS_LISTEN)) {
+                                /*
+                                 * This entry is bound to the exact same
+                                 * address and port.  If SO_REUSEPORT is set on
+                                 * the calling socket, attempt to reuse this
+                                 * binding if it too had SO_REUSEPORT enabled
+                                 * when it was bound.
+                                 */
+                                attempt_reuse = (ltcp->tcp_rg_bind != NULL);
+                                break;
+                        }
+
                         if (!reuseaddr) {
                                 /*
-                                 * No socket option SO_REUSEADDR.
-                                 * If existing port is bound to
-                                 * a non-wildcard IP address
-                                 * and the requesting stream is
-                                 * bound to a distinct
-                                 * different IP addresses
-                                 * (non-wildcard, also), keep
-                                 * going.
+                                 * No socket option SO_REUSEADDR.  If an
+                                 * existing port is bound to a non-wildcard IP
+                                 * address and the requesting stream is bound
+                                 * to a distinct different IP address
+                                 * (non-wildcard, also), keep going.
                                  */
                                 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
                                     !V6_OR_V4_INADDR_ANY(
                                     lconnp->conn_bound_addr_v6) &&
-                                    !IN6_ARE_ADDR_EQUAL(laddr,
-                                    &lconnp->conn_bound_addr_v6))
+                                    !addrmatch)
                                         continue;
                                 if (ltcp->tcp_state >= TCPS_BOUND) {
                                         /*
                                          * This port is being used and
                                          * its state is >= TCPS_BOUND,
@@ -857,32 +884,54 @@
                         } else {
                                 /*
                                  * socket option SO_REUSEADDR is set on the
                                  * binding tcp_t.
                                  *
-                                 * If two streams are bound to
-                                 * same IP address or both addr
-                                 * and bound source are wildcards
-                                 * (INADDR_ANY), we want to stop
-                                 * searching.
-                                 * We have found a match of IP source
-                                 * address and source port, which is
-                                 * refused regardless of the
-                                 * SO_REUSEADDR setting, so we break.
+                                 * If two streams are bound to the same IP
+                                 * address or both addr and bound source are
+                                 * wildcards (INADDR_ANY), we want to stop
+                                 * searching.  We have found a match of IP
+                                 * source address and source port, which is
+                                 * refused regardless of the SO_REUSEADDR
+                                 * setting, so we break.
                                  */
-                                if (IN6_ARE_ADDR_EQUAL(laddr,
-                                    &lconnp->conn_bound_addr_v6) &&
+                                if (addrmatch &&
                                     (ltcp->tcp_state == TCPS_LISTEN ||
                                     ltcp->tcp_state == TCPS_BOUND))
                                         break;
                         }
                 }
-                if (ltcp != NULL) {
+                if (ltcp != NULL && !attempt_reuse) {
                         /* The port number is busy */
                         mutex_exit(&tbf->tf_lock);
                 } else {
+                        if (attempt_reuse) {
+                                int err;
+                                struct tcp_rg_s *rg;
+
+                                ASSERT(ltcp != NULL);
+                                ASSERT(ltcp->tcp_rg_bind != NULL);
+                                ASSERT(tcp->tcp_rg_bind != NULL);
+                                ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+                                err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+                                if (err != 0) {
+                                        mutex_exit(&tbf->tf_lock);
+                                        return (0);
+                                }
                         /*
+                                 * Now that the newly-binding socket has joined
+                                 * the existing reuseport group on ltcp, it
+                                 * should clean up its own (empty) group.
+                                 */
+                                rg = tcp->tcp_rg_bind;
+                                tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+                                VERIFY(tcp_rg_remove(rg, tcp));
+                                tcp_rg_destroy(rg);
+                        }
+
+                        /*
                          * This port is ours. Insert in fanout and mark as
                          * bound to prevent others from getting the port
                          * number.
                          */
                         tcp->tcp_state = TCPS_BOUND;
@@ -941,6 +990,128 @@
                  * Don't let this loop run forever in the case where
                  * all of the anonymous ports are in use.
                  */
         } while (++count < loopmax);
         return (0);
+}
+
+/*
+ * Upper bound on the capacity of a TCP SO_REUSEPORT group's member array;
+ * tcp_rg_insert() fails with EINVAL rather than grow the array past it.
+ */
+#define TCP_RG_SIZE_MAX         64
+/* Number of slots added each time tcp_rg_insert() grows the members array */
+#define TCP_RG_SIZE_STEP        2
+
+
+/*
+ * Allocate a new SO_REUSEPORT group whose only member is 'tcp'.
+ *
+ * The member array starts with room for two entries, the first of which is
+ * taken by 'tcp'; tcprg_count and tcprg_active both start at 1.
+ *
+ * Both allocations use KM_NOSLEEP, so this can fail: NULL is returned if
+ * either the group or its member array cannot be allocated (nothing is
+ * leaked in that case).  Otherwise the new group is returned.
+ */
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+        const unsigned int nslots = 2;
+        tcp_rg_t *rg;
+        tcp_t **slots;
+
+        rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
+        if (rg == NULL)
+                return (NULL);
+
+        slots = kmem_zalloc(nslots * sizeof (tcp_t *),
+            KM_NOSLEEP|KM_NORMALPRI);
+        if (slots == NULL) {
+                /* Undo the first allocation before giving up. */
+                kmem_free(rg, sizeof (tcp_rg_t));
+                return (NULL);
+        }
+        slots[0] = tcp;
+
+        mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+        rg->tcprg_members = slots;
+        rg->tcprg_size = nslots;
+        rg->tcprg_count = 1;
+        rg->tcprg_active = 1;
+        return (rg);
+}
+
+/*
+ * Free an SO_REUSEPORT group and its member array.
+ *
+ * The group must already be empty: tcprg_count and tcprg_active are both
+ * asserted to be zero.  'rg' must not be used after this returns.
+ *
+ * The group lock is taken and then destroyed while still held; it is never
+ * explicitly dropped.  NOTE(review): this presumably relies on
+ * mutex_destroy() accepting a mutex held by the calling thread -- confirm.
+ */
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+        size_t members_sz;
+
+        mutex_enter(&rg->tcprg_lock);
+        ASSERT(rg->tcprg_count == 0);
+        ASSERT(rg->tcprg_active == 0);
+
+        members_sz = rg->tcprg_size * sizeof (tcp_t *);
+        kmem_free(rg->tcprg_members, members_sz);
+
+        mutex_destroy(&rg->tcprg_lock);
+        kmem_free(rg, sizeof (tcp_rg_t));
+}
+
+/*
+ * Add 'tcp' to the SO_REUSEPORT group 'rg', growing the member array by
+ * TCP_RG_SIZE_STEP slots if it is full.  On success tcprg_count and
+ * tcprg_active are each incremented.  All work is done under tcprg_lock.
+ *
+ * Returns:
+ *      0       'tcp' was added
+ *      EPERM   the group is non-empty and the uid or zone id of tcp's
+ *              credential differs from that of the group's first member
+ *      EINVAL  the array is full and growing it would exceed TCP_RG_SIZE_MAX
+ *      ENOMEM  the larger array could not be allocated (KM_NOSLEEP)
+ *
+ * The group is left unmodified on any failure.
+ */
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+        int err = 0;
+
+        mutex_enter(&rg->tcprg_lock);
+
+        VERIFY(rg->tcprg_size > 0);
+        VERIFY(rg->tcprg_count <= rg->tcprg_size);
+
+        /* Newcomers must share uid and zone with the existing membership. */
+        if (rg->tcprg_count != 0) {
+                cred_t *grpcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+                cred_t *ourcred = tcp->tcp_connp->conn_cred;
+
+                if (crgetuid(grpcred) != crgetuid(ourcred) ||
+                    crgetzoneid(grpcred) != crgetzoneid(ourcred)) {
+                        err = EPERM;
+                        goto out;
+                }
+        }
+
+        /* No free slot left: replace the array with a larger copy. */
+        if (rg->tcprg_count == rg->tcprg_size) {
+                unsigned int cursz = rg->tcprg_size * sizeof (tcp_t *);
+                unsigned int grown = rg->tcprg_size + TCP_RG_SIZE_STEP;
+                tcp_t **bigger;
+
+                if (grown > TCP_RG_SIZE_MAX) {
+                        err = EINVAL;
+                        goto out;
+                }
+                bigger = kmem_zalloc(grown * sizeof (tcp_t *),
+                    KM_NOSLEEP|KM_NORMALPRI);
+                if (bigger == NULL) {
+                        err = ENOMEM;
+                        goto out;
+                }
+                bcopy(rg->tcprg_members, bigger, cursz);
+                kmem_free(rg->tcprg_members, cursz);
+                rg->tcprg_members = bigger;
+                rg->tcprg_size = grown;
+        }
+
+        rg->tcprg_members[rg->tcprg_count++] = tcp;
+        rg->tcprg_active++;
+
+out:
+        mutex_exit(&rg->tcprg_lock);
+        return (err);
+}
+
+/*
+ * Remove 'tcp' from the SO_REUSEPORT group 'rg'.
+ *
+ * 'tcp' is expected to be a member (asserted).  Its slot is refilled with
+ * the last member so the occupied slots stay contiguous; member order is
+ * therefore not preserved.  tcprg_active is decremented only when the
+ * departing connection still has conn_reuseport set.  NOTE(review):
+ * presumably a member that cleared the option was already discounted via
+ * tcp_rg_setactive() -- confirm against the callers.
+ *
+ * Returns a true value when the group has no members left, in which case
+ * the caller is expected to dispose of it with tcp_rg_destroy(); the group
+ * is not freed here.
+ */
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+        boolean_t emptied;
+        int i = 0;
+
+        mutex_enter(&rg->tcprg_lock);
+
+        /* Find the slot currently holding tcp. */
+        while (i < rg->tcprg_count && rg->tcprg_members[i] != tcp)
+                i++;
+        /* The item should be present */
+        ASSERT(i < rg->tcprg_count);
+
+        /* Shrink by one, moving the old last member into the vacated slot. */
+        rg->tcprg_members[i] = rg->tcprg_members[--rg->tcprg_count];
+        rg->tcprg_members[rg->tcprg_count] = NULL;
+
+        if (tcp->tcp_connp->conn_reuseport != 0)
+                rg->tcprg_active--;
+
+        emptied = (rg->tcprg_count == 0);
+        mutex_exit(&rg->tcprg_lock);
+        return (emptied);
+}
+
+/*
+ * Adjust the active-member count of SO_REUSEPORT group 'rg' by one, under
+ * tcprg_lock: up when 'is_active' is true, down otherwise.  No bounds
+ * checking is done on the counter.
+ */
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+        mutex_enter(&rg->tcprg_lock);
+        if (!is_active)
+                rg->tcprg_active--;
+        else
+                rg->tcprg_active++;
+        mutex_exit(&rg->tcprg_lock);
 }