Print this page
OS-5598 newproc() performs inadequate clean-up after failed lwp_create() [fix debug build]
OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5217 setsockopt(TCP_KEEPCNT) can return EINVAL spuriously
Reviewed by: Dave Pacheco <dap@joyent.com>
OS-4699 lxbrand netty complains about SO_LINGER (really IP_TOS)
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_opt_data.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_opt_data.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
       24 + * Copyright 2016 Joyent, Inc.
  24   25   */
  25   26  
  26   27  #include <sys/types.h>
  27   28  #include <sys/stream.h>
  28   29  #define _SUN_TPI_VERSION 2
  29   30  #include <sys/tihdr.h>
  30   31  #include <sys/socket.h>
  31   32  #include <sys/xti_xtiopt.h>
  32   33  #include <sys/xti_inet.h>
  33   34  #include <sys/policy.h>
↓ open down ↓ 21 lines elided ↑ open up ↑
  55   56  
  56   57  { SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  57   58          sizeof (struct linger), 0 },
  58   59  
  59   60  { SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  60   61  { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  61   62  { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  62   63  { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  63   64          },
  64   65  { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  65      -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
       66 +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
       67 +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  66   68  { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  67   69  { SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
  68   70  { SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  69   71  { SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
  70   72  { SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  71   73          sizeof (struct timeval), 0 },
  72   74  { SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
  73   75          sizeof (struct timeval), 0 },
  74   76  { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
  75   77          },
↓ open down ↓ 401 lines elided ↑ open up ↑
 477  479                  }
 478  480                  break;
 479  481          }
 480  482          mutex_enter(&connp->conn_lock);
 481  483          retval = conn_opt_get(&coas, level, name, ptr);
 482  484          mutex_exit(&connp->conn_lock);
 483  485          return (retval);
 484  486  }
 485  487  
 486  488  /*
      489 + * Enable or disable (per do_enable) a TCP connection's participation in
      490 + * SO_REUSEPORT.  This operation is performed under the protection of the
      491 + * squeue via tcp_setsockopt.  Returns 0 on success, EINVAL if the change is
      492 + * not permitted for the connection, or ENOMEM if tcp_rg_init() fails.
      493 + * The manipulation of tcp_rg_bind here is subject to these constraints:
      494 + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
      495 + *    under the protection of the squeue.
      496 + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
      497 + *    altered until such time as tcp_free() cleans up the connection.
      498 + * 3. A connection undergoing bind that matches a port-reuse participant will
      499 + *    switch its tcp_rg_bind pointer when it joins that group in tcp_bindi().
      500 + */
      501 +static int
      502 +tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
      503 +{
      504 +        tcp_t *tcp = connp->conn_tcp;
      505 +        struct tcp_rg_s *rg;
      506 +
      507 +        if (!IPCL_IS_NONSTR(connp)) {
      508 +                if (do_enable) {
      509 +                        /*
      510 +                         * SO_REUSEPORT cannot be enabled on sockets which have
      511 +                         * fallen back to the STREAMS API.
      512 +                         */
      513 +                        return (EINVAL);
      514 +                } else {
      515 +                        /*
      516 +                         * A connection with SO_REUSEPORT enabled should be
      517 +                         * prevented from falling back to STREAMS mode via
      518 +                         * logic in tcp_fallback.  It is legal, however, for
      519 +                         * fallen-back connections to affirm the disabled state
      520 +                         * of SO_REUSEPORT.
      521 +                         */
      522 +                        ASSERT(connp->conn_reuseport == 0);
      523 +                        return (0);
      524 +                }
      525 +        }
      526 +        if (tcp->tcp_state <= TCPS_CLOSED) {
      527 +                return (EINVAL);
      528 +        }
      529 +        if (connp->conn_reuseport == 0 && do_enable) {
      530 +                /* disabled -> enabled */
      531 +                if (tcp->tcp_rg_bind != NULL) {
      532 +                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
      533 +                } else {
      534 +                        /*
      535 +                         * Connection state is not a concern when initially
      536 +                         * populating tcp_rg_bind.  Setting it to non-NULL on a
      537 +                         * bound or listening connection would only mean that
      538 +                         * new reused-port binds become a possibility.
      539 +                         */
      540 +                        if ((rg = tcp_rg_init(tcp)) == NULL) {
      541 +                                return (ENOMEM);
      542 +                        }
      543 +                        tcp->tcp_rg_bind = rg;
      544 +                }
      545 +                connp->conn_reuseport = 1;
      546 +        } else if (connp->conn_reuseport != 0 && !do_enable) {
      547 +                /* enabled -> disabled */
      548 +                ASSERT(tcp->tcp_rg_bind != NULL);
      549 +                if (tcp->tcp_state == TCPS_IDLE) {
      550 +                        /*
      551 +                         * If the connection has not been bound yet, discard
      552 +                         * the reuse group state.  Since disabling SO_REUSEPORT
      553 +                         * on a bound socket will _not_ prevent others from
      554 +                         * reusing the port, the presence of tcp_rg_bind is
      555 +                         * used to determine reuse availability, not
      556 +                         * conn_reuseport.
      557 +                         *
      558 +                         * This allows proper behavior for examples such as:
      559 +                         *
      560 +                         * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
      561 +                         * bind(fd1, &myaddr, ...);
      562 +                         * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
      563 +                         *
      564 +                         * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
      565 +                         * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
      566 +                         *
      567 +                         */
      568 +                        rg = tcp->tcp_rg_bind;
      569 +                        tcp->tcp_rg_bind = NULL;
      570 +                        VERIFY(tcp_rg_remove(rg, tcp));
      571 +                        tcp_rg_destroy(rg);
      572 +                } else {
      573 +                        /*
      574 +                         * If a connection has been bound, it's no longer safe
      575 +                         * to manipulate tcp_rg_bind until connection clean-up
      576 +                         * during tcp_free.  Just mark the member status of the
      577 +                         * connection as inactive.
      578 +                         */
      579 +                        tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
      580 +                }
      581 +                connp->conn_reuseport = 0;
      582 +        }
      583 +        return (0);
      584 +}
      585 +
      586 +/*
 487  587   * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 488  588   * Parameters are assumed to be verified by the caller.
 489  589   */
 490  590  /* ARGSUSED */
 491  591  int
 492  592  tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 493  593      uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 494  594      void *thisdg_attrs, cred_t *cr)
 495  595  {
 496  596          tcp_t   *tcp = connp->conn_tcp;
↓ open down ↓ 149 lines elided ↑ open up ↑
 646  746                          if (!checkonly) {
 647  747                                  if (tcp->tcp_loopback ||
 648  748                                      (onoff != 1) || !tcp_zcopy_check(tcp)) {
 649  749                                          *outlenp = 0;
 650  750                                          return (EOPNOTSUPP);
 651  751                                  }
 652  752                                  tcp->tcp_snd_zcopy_aware = 1;
 653  753                          }
 654  754                          *outlenp = inlen;
 655  755                          return (0);
      756 +                case SO_REUSEPORT:
      757 +                        if (!checkonly) {
      758 +                                return (tcp_set_reuseport(connp, *i1 != 0));
      759 +                        }
      760 +                        return (0);
 656  761                  }
 657  762                  break;
 658  763          case IPPROTO_TCP:
 659  764                  switch (name) {
 660  765                  case TCP_NODELAY:
 661  766                          if (!checkonly)
 662  767                                  tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
 663  768                          break;
 664  769                  case TCP_NOTIFY_THRESHOLD:
 665  770                          if (!checkonly)
↓ open down ↓ 96 lines elided ↑ open up ↑
 762  867                   * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
 763  868                   * tcp_ka_cnt.
 764  869                   */
 765  870                  case TCP_KEEPCNT:
 766  871                          if (checkonly)
 767  872                                  break;
 768  873  
 769  874                          if (*i1 == 0) {
 770  875                                  return (EINVAL);
 771  876                          } else if (tcp->tcp_ka_rinterval == 0) {
 772      -                                if ((tcp->tcp_ka_abort_thres / *i1) <
 773      -                                    tcp->tcp_rto_min ||
 774      -                                    (tcp->tcp_ka_abort_thres / *i1) >
 775      -                                    tcp->tcp_rto_max)
 776      -                                        return (EINVAL);
      877 +                                /*
      878 +                                 * When TCP_KEEPCNT is specified without first
      879 +                                 * specifying a TCP_KEEPINTVL, we infer an
      880 +                                 * interval based on a tunable specific to our
      881 +                                 * stack: the tcp_keepalive_abort_interval.
      882 +                                 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
      883 +                                 * the unlikely event that that has been set.)
      884 +                                 * Given the abort interval's default value of
      885 +                                 * 480 seconds, low TCP_KEEPCNT values can
      886 +                                 * result in intervals that exceed the default
      887 +                                 * maximum RTO of 60 seconds.  Rather than
      888 +                                 * fail in these cases, we (implicitly) clamp
      889 +                                 * the interval at the maximum RTO; if the
      890 +                                 * TCP_KEEPCNT is shortly followed by a
      891 +                                 * TCP_KEEPINTVL (as we expect), the abort
      892 +                                 * threshold will be recalculated correctly --
      893 +                                 * and if a TCP_KEEPINTVL is not forthcoming,
      894 +                                 * keep-alive will at least operate reasonably
      895 +                                 * given the underconfigured state.
      896 +                                 */
      897 +                                uint32_t interval;
 777  898  
 778      -                                tcp->tcp_ka_rinterval =
 779      -                                    tcp->tcp_ka_abort_thres / *i1;
      899 +                                interval = tcp->tcp_ka_abort_thres / *i1;
      900 +
      901 +                                if (interval < tcp->tcp_rto_min)
      902 +                                        interval = tcp->tcp_rto_min;
      903 +
      904 +                                if (interval > tcp->tcp_rto_max)
      905 +                                        interval = tcp->tcp_rto_max;
      906 +
      907 +                                tcp->tcp_ka_rinterval = interval;
 780  908                          } else {
 781  909                                  if ((*i1 * tcp->tcp_ka_rinterval) <
 782  910                                      tcps->tcps_keepalive_abort_interval_low ||
 783  911                                      (*i1 * tcp->tcp_ka_rinterval) >
 784  912                                      tcps->tcps_keepalive_abort_interval_high)
 785  913                                          return (EINVAL);
 786  914                                  tcp->tcp_ka_abort_thres =
 787  915                                      (*i1 * tcp->tcp_ka_rinterval);
 788  916                          }
 789  917                          tcp->tcp_ka_cnt = *i1;
↓ open down ↓ 156 lines elided ↑ open up ↑
 946 1074                                  *outlenp = 0;
 947 1075                                  return (EINVAL);
 948 1076                          }
 949 1077                          tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
 950 1078                          break;
 951 1079                  default:
 952 1080                          break;
 953 1081                  }
 954 1082                  break;
 955 1083          case IPPROTO_IP:
 956      -                if (connp->conn_family != AF_INET) {
 957      -                        *outlenp = 0;
 958      -                        return (EINVAL);
 959      -                }
 960 1084                  switch (name) {
 961 1085                  case IP_SEC_OPT:
 962 1086                          /*
 963 1087                           * We should not allow policy setting after
 964 1088                           * we start listening for connections.
 965 1089                           */
 966 1090                          if (tcp->tcp_state == TCPS_LISTEN) {
 967 1091                                  return (EINVAL);
 968 1092                          }
 969 1093                          break;
↓ open down ↓ 119 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX