Print this page
OS-5598 newproc() performs inadequate clean-up after failed lwp_create() [fix debug build]
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5217 setsockopt(TCP_KEEPCNT) can return EINVAL spuriously
Reviewed by: Dave Pacheco <dap@joyent.com>
OS-4699 lxbrand netty complains about SO_LINGER (really IP_TOS)
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>

*** 19,28 **** --- 19,29 ---- * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2
*** 61,70 **** --- 62,72 ---- { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + { SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
*** 482,491 **** --- 484,591 ---- mutex_exit(&connp->conn_lock); return (retval); } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. + * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + * + * Returns 0 on success, including when the requested state already matches + * the current one. Returns EINVAL when enabling is requested on a connection + * for which IPCL_IS_NONSTR() is false, or when tcp_state is at or below + * TCPS_CLOSED. Returns ENOMEM when tcp_rg_init() returns NULL. + */ + static int + tcp_set_reuseport(conn_t *connp, boolean_t do_enable) + { + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. + */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. 
+ */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); + } + + /* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. */ /* ARGSUSED */ int
*** 651,661 **** --- 751,766 ---- } tcp->tcp_snd_zcopy_aware = 1; } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); } + return (0); + } break; case IPPROTO_TCP: switch (name) { case TCP_NODELAY: if (!checkonly)
*** 767,784 **** break; if (*i1 == 0) { return (EINVAL); } else if (tcp->tcp_ka_rinterval == 0) { ! if ((tcp->tcp_ka_abort_thres / *i1) < ! tcp->tcp_rto_min || ! (tcp->tcp_ka_abort_thres / *i1) > ! tcp->tcp_rto_max) ! return (EINVAL); ! tcp->tcp_ka_rinterval = ! tcp->tcp_ka_abort_thres / *i1; } else { if ((*i1 * tcp->tcp_ka_rinterval) < tcps->tcps_keepalive_abort_interval_low || (*i1 * tcp->tcp_ka_rinterval) > tcps->tcps_keepalive_abort_interval_high) --- 872,912 ---- break; if (*i1 == 0) { return (EINVAL); } else if (tcp->tcp_ka_rinterval == 0) { ! /* ! * When TCP_KEEPCNT is specified without first ! * specifying a TCP_KEEPINTVL, we infer an ! * interval based on a tunable specific to our ! * stack: the tcp_keepalive_abort_interval. ! * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in ! * the unlikely event that that has been set.) ! * Given the abort interval's default value of ! * 480 seconds, low TCP_KEEPCNT values can ! * result in intervals that exceed the default ! * maximum RTO of 60 seconds. Rather than ! * fail in these cases, we (implicitly) clamp ! * the interval at the maximum RTO; if the ! * TCP_KEEPCNT is shortly followed by a ! * TCP_KEEPINTVL (as we expect), the abort ! * threshold will be recalculated correctly -- ! * and if a TCP_KEEPINTVL is not forthcoming, ! * keep-alive will at least operate reasonably ! * given the underconfigured state. ! */ ! uint32_t interval; ! interval = tcp->tcp_ka_abort_thres / *i1; ! ! if (interval < tcp->tcp_rto_min) ! interval = tcp->tcp_rto_min; ! ! if (interval > tcp->tcp_rto_max) ! interval = tcp->tcp_rto_max; ! ! tcp->tcp_ka_rinterval = interval; } else { if ((*i1 * tcp->tcp_ka_rinterval) < tcps->tcps_keepalive_abort_interval_low || (*i1 * tcp->tcp_ka_rinterval) > tcps->tcps_keepalive_abort_interval_high)
*** 951,964 **** default: break; } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* * We should not allow policy setting after * we start listening for connections. --- 1079,1088 ----