Print this page
OS-5598 newproc() performs inadequate clean-up after failed lwp_create() [fix debug build]
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5217 setsockopt(TCP_KEEPCNT) can return EINVAL spuriously
Reviewed by: Dave Pacheco <dap@joyent.com>
OS-4699 lxbrand netty complains about SO_LINGER (really IP_TOS)
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>

@@ -19,10 +19,11 @@
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/stream.h>
 #define _SUN_TPI_VERSION 2

@@ -61,10 +62,11 @@
 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
         },
 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
 { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,

@@ -482,10 +484,108 @@
         mutex_exit(&connp->conn_lock);
         return (retval);
 }
 
 /*
+ * Set a TCP connection's participation in SO_REUSEPORT (reached via
+ * tcp_setsockopt under squeue protection).  Returns 0 on success, else EINVAL
+ * or ENOMEM.  The manipulation of tcp_rg_bind, as part of this operation, is
+ * subject to these constraints:
+ * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
+ *    under the protection of the squeue.
+ * 2. Once the connection has been bound, a non-NULL tcp_rg_bind must not be
+ *    altered until such time as tcp_free() cleans up the connection.
+ * 3. A connection undergoing bind, which matches a connection participating
+ *    in port-reuse, will switch its tcp_rg_bind pointer when it joins the
+ *    group of an existing connection in tcp_bindi().
+ */
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+        tcp_t *tcp = connp->conn_tcp;
+        struct tcp_rg_s *rg;
+
+        if (!IPCL_IS_NONSTR(connp)) {
+                if (do_enable) {
+                        /*
+                         * SO_REUSEPORT cannot be enabled on sockets which have
+                         * fallen back to the STREAMS API.
+                         */
+                        return (EINVAL);
+                } else {
+                        /*
+                         * A connection with SO_REUSEPORT enabled should be
+                         * prevented from falling back to STREAMS mode via
+                         * logic in tcp_fallback.  It is legal, however, for
+                         * fallen-back connections to affirm the disabled state
+                         * of SO_REUSEPORT.
+                         */
+                        ASSERT(connp->conn_reuseport == 0);
+                        return (0);
+                }
+        }
+        if (tcp->tcp_state <= TCPS_CLOSED) {
+                return (EINVAL);
+        }
+        if (connp->conn_reuseport == 0 && do_enable) {
+                /* disabled -> enabled */
+                if (tcp->tcp_rg_bind != NULL) {
+                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+                } else {
+                        /*
+                         * Connection state is not a concern when initially
+                         * populating tcp_rg_bind.  Setting it to non-NULL on a
+                         * bound or listening connection would only mean that
+                         * new reused-port binds become a possibility.
+                         */
+                        if ((rg = tcp_rg_init(tcp)) == NULL) {
+                                return (ENOMEM);
+                        }
+                        tcp->tcp_rg_bind = rg;
+                }
+                connp->conn_reuseport = 1;
+        } else if (connp->conn_reuseport != 0 && !do_enable) {
+                /* enabled -> disabled */
+                ASSERT(tcp->tcp_rg_bind != NULL);
+                if (tcp->tcp_state == TCPS_IDLE) {
+                        /*
+                         * If the connection has not been bound yet, discard
+                         * the reuse group state.  Since disabling SO_REUSEPORT
+                         * on a bound socket will _not_ prevent others from
+                         * reusing the port, the presence of tcp_rg_bind is
+                         * used to determine reuse availability, not
+                         * conn_reuseport.
+                         *
+                         * This allows proper behavior for examples such as:
+                         *
+                         * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
+                         * bind(fd1, &myaddr, ...);
+                         * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
+                         *
+                         * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
+                         * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
+                         *
+                         */
+                        rg = tcp->tcp_rg_bind;
+                        tcp->tcp_rg_bind = NULL;
+                        VERIFY(tcp_rg_remove(rg, tcp));
+                        tcp_rg_destroy(rg);
+                } else {
+                        /*
+                         * If a connection has been bound, it's no longer safe
+                         * to manipulate tcp_rg_bind until connection clean-up
+                         * during tcp_free.  Just mark the member status of the
+                         * connection as inactive.
+                         */
+                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+                }
+                connp->conn_reuseport = 0;
+        }
+        return (0);
+}
+
+/*
  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
  * Parameters are assumed to be verified by the caller.
  */
 /* ARGSUSED */
 int

@@ -651,11 +751,16 @@
                                 }
                                 tcp->tcp_snd_zcopy_aware = 1;
                         }
                         *outlenp = inlen;
                         return (0);
+                case SO_REUSEPORT:
+                        if (!checkonly) {
+                                return (tcp_set_reuseport(connp, *i1 != 0));
                 }
+                        return (0);
+                }
                 break;
         case IPPROTO_TCP:
                 switch (name) {
                 case TCP_NODELAY:
                         if (!checkonly)

@@ -767,18 +872,41 @@
                                 break;
 
                         if (*i1 == 0) {
                                 return (EINVAL);
                         } else if (tcp->tcp_ka_rinterval == 0) {
-                                if ((tcp->tcp_ka_abort_thres / *i1) <
-                                    tcp->tcp_rto_min ||
-                                    (tcp->tcp_ka_abort_thres / *i1) >
-                                    tcp->tcp_rto_max)
-                                        return (EINVAL);
+                                /*
+                                 * When TCP_KEEPCNT is specified without first
+                                 * specifying a TCP_KEEPINTVL, we infer an
+                                 * interval based on a tunable specific to our
+                                 * stack: the tcp_keepalive_abort_interval.
+                                 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
+                                 * the unlikely event that that has been set.)
+                                 * Given the abort interval's default value of
+                                 * 480 seconds, low TCP_KEEPCNT values can
+                                 * result in intervals that exceed the default
+                                 * maximum RTO of 60 seconds.  Rather than
+                                 * fail in these cases, we (implicitly) clamp
+                                 * the interval at the maximum RTO; if the
+                                 * TCP_KEEPCNT is shortly followed by a
+                                 * TCP_KEEPINTVL (as we expect), the abort
+                                 * threshold will be recalculated correctly --
+                                 * and if a TCP_KEEPINTVL is not forthcoming,
+                                 * keep-alive will at least operate reasonably
+                                 * given the underconfigured state.
+                                 */
+                                uint32_t interval;
 
-                                tcp->tcp_ka_rinterval =
-                                    tcp->tcp_ka_abort_thres / *i1;
+                                interval = tcp->tcp_ka_abort_thres / *i1;
+
+                                if (interval < tcp->tcp_rto_min)
+                                        interval = tcp->tcp_rto_min;
+
+                                if (interval > tcp->tcp_rto_max)
+                                        interval = tcp->tcp_rto_max;
+
+                                tcp->tcp_ka_rinterval = interval;
                         } else {
                                 if ((*i1 * tcp->tcp_ka_rinterval) <
                                     tcps->tcps_keepalive_abort_interval_low ||
                                     (*i1 * tcp->tcp_ka_rinterval) >
                                     tcps->tcps_keepalive_abort_interval_high)

@@ -951,14 +1079,10 @@
                 default:
                         break;
                 }
                 break;
         case IPPROTO_IP:
-                if (connp->conn_family != AF_INET) {
-                        *outlenp = 0;
-                        return (EINVAL);
-                }
                 switch (name) {
                 case IP_SEC_OPT:
                         /*
                          * We should not allow policy setting after
                          * we start listening for connections.