OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>

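Both tickets concern the TCP SO_REUSEPORT socket option. For orientation only, here is a minimal, hypothetical userland sketch (not part of this webrev) of what the option permits once the change is in place: sockets owned by the same user and zone may bind the same address and port, provided each enables SO_REUSEPORT before calling bind(2). The uid/zone restriction corresponds to the EPERM check in tcp_rg_insert() below; the loopback address and port 8080 are arbitrary.

/*
 * Illustrative sketch only, assuming SO_REUSEPORT is available in
 * <sys/socket.h> on the target system.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

int
main(void)
{
        struct sockaddr_in sin;
        int on = 1;
        int s1 = socket(AF_INET, SOCK_STREAM, 0);
        int s2 = socket(AF_INET, SOCK_STREAM, 0);

        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        sin.sin_port = htons(8080);

        /* Each socket must enable SO_REUSEPORT before bind(2). */
        (void) setsockopt(s1, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
        (void) setsockopt(s2, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));

        /* With this change, both binds succeed and share the port. */
        if (bind(s1, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
            bind(s2, (struct sockaddr *)&sin, sizeof (sin)) != 0)
                return (1);

        (void) listen(s1, 128);
        (void) listen(s2, 128);
        return (0);
}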
          --- old/usr/src/uts/common/inet/tcp/tcp_bind.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_bind.c
... 14 lines elided ...
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       25 + * Copyright 2016 Joyent, Inc.
  25   26   */
  26   27  
  27   28  #include <sys/types.h>
  28   29  #include <sys/stream.h>
  29   30  #include <sys/strsun.h>
  30   31  #include <sys/strsubr.h>
  31   32  #include <sys/stropts.h>
  32   33  #include <sys/strlog.h>
  33   34  #define _SUN_TPI_VERSION 2
  34   35  #include <sys/tihdr.h>
... 13 lines elided ...
  48   49  #include <inet/proto_set.h>
  49   50  #include <inet/ipsec_impl.h>
  50   51  
  51   52  /* Settable in /etc/system */
  52   53  /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  53   54  static uint32_t tcp_random_anon_port = 1;
  54   55  
  55   56  static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  56   57                      cred_t *cr);
  57   58  static in_port_t        tcp_get_next_priv_port(const tcp_t *);
       59 +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
  58   60  
  59   61  /*
  60   62   * Hash list insertion routine for tcp_t structures. Each hash bucket
  61   63   * contains a list of tcp_t entries, and each entry is bound to a unique
  62   64   * port. If there are multiple tcp_t's that are bound to the same port, then
  63   65   * one of them will be linked into the hash bucket list, and the rest will
  64   66   * hang off of that one entry. For each port, entries bound to a specific IP
  65   67   * address will be inserted before those bound to INADDR_ANY.
  66   68   */
  67   69  void
... 97 lines elided ...
 165  167          /*
 166  168           * Extract the lock pointer in case there are concurrent
 167  169           * hash_remove's for this instance.
 168  170           */
 169  171          ASSERT(connp->conn_lport != 0);
 170  172          lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
 171  173              connp->conn_lport)].tf_lock;
 172  174  
 173  175          ASSERT(lockp != NULL);
 174  176          mutex_enter(lockp);
      177 +
      178 +        /* destroy any association with SO_REUSEPORT group */
      179 +        if (tcp->tcp_rg_bind != NULL) {
      180 +                if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
      181 +                        /* Last one out turns off the lights */
      182 +                        tcp_rg_destroy(tcp->tcp_rg_bind);
      183 +                }
      184 +                tcp->tcp_rg_bind = NULL;
      185 +        }
      186 +
 175  187          if (tcp->tcp_ptpbhn) {
 176  188                  tcpnext = tcp->tcp_bind_hash_port;
 177  189                  if (tcpnext != NULL) {
 178  190                          tcp->tcp_bind_hash_port = NULL;
 179  191                          tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 180  192                          tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
 181  193                          if (tcpnext->tcp_bind_hash != NULL) {
 182  194                                  tcpnext->tcp_bind_hash->tcp_ptpbhn =
 183  195                                      &(tcpnext->tcp_bind_hash);
 184  196                                  tcp->tcp_bind_hash = NULL;
... 444 lines elided ...
 629  641              bind_to_req_port_only, cr);
 630  642          if (error != 0) {
 631  643                  connp->conn_laddr_v6 = ipv6_all_zeros;
 632  644                  connp->conn_saddr_v6 = ipv6_all_zeros;
 633  645                  connp->conn_bound_addr_v6 = ipv6_all_zeros;
 634  646          }
 635  647          return (error);
 636  648  }
 637  649  
 638  650  /*
 639      - * If the "bind_to_req_port_only" parameter is set, if the requested port
 640      - * number is available, return it, If not return 0
      651 + * If the "bind_to_req_port_only" parameter is set and the requested port
      652 + * number is available, return it (else return 0).
 641  653   *
 642      - * If "bind_to_req_port_only" parameter is not set and
 643      - * If the requested port number is available, return it.  If not, return
 644      - * the first anonymous port we happen across.  If no anonymous ports are
 645      - * available, return 0. addr is the requested local address, if any.
      654 + * If "bind_to_req_port_only" parameter is not set and the requested port
      655 + * number is available, return it.  If not, return the first anonymous port we
      656 + * happen across.  If no anonymous ports are available, return 0.
 646  657   *
 647  658   * In either case, when succeeding update the tcp_t to record the port number
 648  659   * and insert it in the bind hash table.
 649  660   *
 650  661   * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 651  662   * without setting SO_REUSEADDR. This is needed so that they
 652  663   * can be viewed as two independent transport protocols.
 653  664   */
 654  665  in_port_t
 655  666  tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 656  667      int reuseaddr, boolean_t quick_connect,
 657  668      boolean_t bind_to_req_port_only, boolean_t user_specified)
 658  669  {
 659  670          /* number of times we have run around the loop */
 660  671          int count = 0;
 661  672          /* maximum number of times to run around the loop */
 662  673          int loopmax;
 663  674          conn_t *connp = tcp->tcp_connp;
 664  675          tcp_stack_t     *tcps = tcp->tcp_tcps;
      676 +        boolean_t reuseport = connp->conn_reuseport;
 665  677  
 666  678          /*
 667  679           * Lookup for free addresses is done in a loop and "loopmax"
 668  680           * influences how long we spin in the loop
 669  681           */
 670  682          if (bind_to_req_port_only) {
 671  683                  /*
 672  684                   * If the requested port is busy, don't bother to look
 673  685                   * for a new one. Setting loop maximum count to 1 has
 674  686                   * that effect.
... 16 lines elided ...
 691  703                  } else {
 692  704                          loopmax = (tcps->tcps_largest_anon_port -
 693  705                              tcps->tcps_smallest_anon_port + 1);
 694  706                  }
 695  707          }
 696  708          do {
 697  709                  uint16_t        lport;
 698  710                  tf_t            *tbf;
 699  711                  tcp_t           *ltcp;
 700  712                  conn_t          *lconnp;
      713 +                boolean_t       attempt_reuse = B_FALSE;
 701  714  
 702  715                  lport = htons(port);
 703  716  
 704  717                  /*
 705  718                   * Ensure that the tcp_t is not currently in the bind hash.
 706  719                   * Hold the lock on the hash bucket to ensure that
 707  720                   * the duplicate check plus the insertion is an atomic
 708  721                   * operation.
 709  722                   *
 710  723                   * This function does an inline lookup on the bind hash list
... 6 lines elided ...
 717  730                  mutex_enter(&tbf->tf_lock);
 718  731                  for (ltcp = tbf->tf_tcp; ltcp != NULL;
 719  732                      ltcp = ltcp->tcp_bind_hash) {
 720  733                          if (lport == ltcp->tcp_connp->conn_lport)
 721  734                                  break;
 722  735                  }
 723  736  
 724  737                  for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 725  738                          boolean_t not_socket;
 726  739                          boolean_t exclbind;
      740 +                        boolean_t addrmatch;
 727  741  
 728  742                          lconnp = ltcp->tcp_connp;
 729  743  
 730  744                          /*
 731  745                           * On a labeled system, we must treat bindings to ports
 732  746                           * on shared IP addresses by sockets with MAC exemption
 733  747                           * privilege as being in all zones, as there's
 734  748                           * otherwise no way to identify the right receiver.
 735  749                           */
 736  750                          if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
... 85 lines elided ...
 822  836                           * with IP, since IP needs to have that code
 823  837                           * to support userland TCP implementations.
 824  838                           */
 825  839                          if (quick_connect &&
 826  840                              (ltcp->tcp_state > TCPS_LISTEN) &&
 827  841                              ((connp->conn_fport != lconnp->conn_fport) ||
 828  842                              !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 829  843                              &lconnp->conn_faddr_v6)))
 830  844                                  continue;
 831  845  
      846 +                        addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
      847 +                            &lconnp->conn_bound_addr_v6);
      848 +
      849 +                        if (addrmatch && reuseport && bind_to_req_port_only &&
      850 +                            (ltcp->tcp_state == TCPS_BOUND ||
      851 +                            ltcp->tcp_state == TCPS_LISTEN)) {
      852 +                                /*
      853 +                                 * This entry is bound to the exact same
      854 +                                 * address and port.  If SO_REUSEPORT is set on
      855 +                                 * the calling socket, attempt to reuse this
      856 +                                 * binding if it too had SO_REUSEPORT enabled
      857 +                                 * when it was bound.
      858 +                                 */
      859 +                                attempt_reuse = (ltcp->tcp_rg_bind != NULL);
      860 +                                break;
      861 +                        }
      862 +
 832  863                          if (!reuseaddr) {
 833  864                                  /*
 834      -                                 * No socket option SO_REUSEADDR.
 835      -                                 * If existing port is bound to
 836      -                                 * a non-wildcard IP address
 837      -                                 * and the requesting stream is
 838      -                                 * bound to a distinct
 839      -                                 * different IP addresses
 840      -                                 * (non-wildcard, also), keep
 841      -                                 * going.
      865 +                                 * No socket option SO_REUSEADDR.  If an
      866 +                                 * existing port is bound to a non-wildcard IP
      867 +                                 * address and the requesting stream is bound
      868 +                                 * to a distinct different IP address
      869 +                                 * (non-wildcard, also), keep going.
 842  870                                   */
 843  871                                  if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 844  872                                      !V6_OR_V4_INADDR_ANY(
 845  873                                      lconnp->conn_bound_addr_v6) &&
 846      -                                    !IN6_ARE_ADDR_EQUAL(laddr,
 847      -                                    &lconnp->conn_bound_addr_v6))
      874 +                                    !addrmatch)
 848  875                                          continue;
 849  876                                  if (ltcp->tcp_state >= TCPS_BOUND) {
 850  877                                          /*
 851  878                                           * This port is being used and
 852  879                                           * its state is >= TCPS_BOUND,
 853  880                                           * so we can't bind to it.
 854  881                                           */
 855  882                                          break;
 856  883                                  }
 857  884                          } else {
 858  885                                  /*
 859  886                                   * socket option SO_REUSEADDR is set on the
 860  887                                   * binding tcp_t.
 861  888                                   *
 862      -                                 * If two streams are bound to
 863      -                                 * same IP address or both addr
 864      -                                 * and bound source are wildcards
 865      -                                 * (INADDR_ANY), we want to stop
 866      -                                 * searching.
 867      -                                 * We have found a match of IP source
 868      -                                 * address and source port, which is
 869      -                                 * refused regardless of the
 870      -                                 * SO_REUSEADDR setting, so we break.
      889 +                                 * If two streams are bound to the same IP
      890 +                                 * address or both addr and bound source are
      891 +                                 * wildcards (INADDR_ANY), we want to stop
      892 +                                 * searching.  We have found a match of IP
      893 +                                 * source address and source port, which is
      894 +                                 * refused regardless of the SO_REUSEADDR
      895 +                                 * setting, so we break.
 871  896                                   */
 872      -                                if (IN6_ARE_ADDR_EQUAL(laddr,
 873      -                                    &lconnp->conn_bound_addr_v6) &&
      897 +                                if (addrmatch &&
 874  898                                      (ltcp->tcp_state == TCPS_LISTEN ||
 875  899                                      ltcp->tcp_state == TCPS_BOUND))
 876  900                                          break;
 877  901                          }
 878  902                  }
 879      -                if (ltcp != NULL) {
      903 +                if (ltcp != NULL && !attempt_reuse) {
 880  904                          /* The port number is busy */
 881  905                          mutex_exit(&tbf->tf_lock);
 882  906                  } else {
      907 +                        if (attempt_reuse) {
      908 +                                int err;
      909 +                                struct tcp_rg_s *rg;
      910 +
      911 +                                ASSERT(ltcp != NULL);
      912 +                                ASSERT(ltcp->tcp_rg_bind != NULL);
      913 +                                ASSERT(tcp->tcp_rg_bind != NULL);
      914 +                                ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
      915 +
      916 +                                err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
      917 +                                if (err != 0) {
      918 +                                        mutex_exit(&tbf->tf_lock);
      919 +                                        return (0);
      920 +                                }
      921 +                                /*
      922 +                                 * Now that the newly-binding socket has joined
      923 +                                 * the existing reuseport group on ltcp, it
      924 +                                 * should clean up its own (empty) group.
      925 +                                 */
      926 +                                rg = tcp->tcp_rg_bind;
      927 +                                tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
      928 +                                VERIFY(tcp_rg_remove(rg, tcp));
      929 +                                tcp_rg_destroy(rg);
      930 +                        }
      931 +
 883  932                          /*
 884  933                           * This port is ours. Insert in fanout and mark as
 885  934                           * bound to prevent others from getting the port
 886  935                           * number.
 887  936                           */
 888  937                          tcp->tcp_state = TCPS_BOUND;
 889  938                          DTRACE_TCP6(state__change, void, NULL,
 890  939                              ip_xmit_attr_t *, connp->conn_ixa,
 891  940                              void, NULL, tcp_t *, tcp, void, NULL,
 892  941                              int32_t, TCPS_IDLE);
... 43 lines elided ...
 936  985                  }
 937  986                  if (port == 0)
 938  987                          break;
 939  988  
 940  989                  /*
 941  990                   * Don't let this loop run forever in the case where
 942  991                   * all of the anonymous ports are in use.
 943  992                   */
 944  993          } while (++count < loopmax);
 945  994          return (0);
      995 +}
      996 +
      997 +/* Max number of members in TCP SO_REUSEPORT group */
      998 +#define TCP_RG_SIZE_MAX         64
      999 +/* Step size when expanding members array */
     1000 +#define TCP_RG_SIZE_STEP        2
     1001 +
     1002 +
     1003 +tcp_rg_t *
     1004 +tcp_rg_init(tcp_t *tcp)
     1005 +{
     1006 +        tcp_rg_t *rg;
     1007 +        rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
     1008 +        if (rg == NULL)
     1009 +                return (NULL);
     1010 +        rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
     1011 +            KM_NOSLEEP|KM_NORMALPRI);
     1012 +        if (rg->tcprg_members == NULL) {
     1013 +                kmem_free(rg, sizeof (tcp_rg_t));
     1014 +                return (NULL);
     1015 +        }
     1016 +
     1017 +        mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
     1018 +        rg->tcprg_size = 2;
     1019 +        rg->tcprg_count = 1;
     1020 +        rg->tcprg_active = 1;
     1021 +        rg->tcprg_members[0] = tcp;
     1022 +        return (rg);
     1023 +}
     1024 +
     1025 +void
     1026 +tcp_rg_destroy(tcp_rg_t *rg)
     1027 +{
     1028 +        mutex_enter(&rg->tcprg_lock);
     1029 +        ASSERT(rg->tcprg_count == 0);
     1030 +        ASSERT(rg->tcprg_active == 0);
     1031 +        kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
     1032 +        mutex_destroy(&rg->tcprg_lock);
     1033 +        kmem_free(rg, sizeof (struct tcp_rg_s));
     1034 +}
     1035 +
     1036 +static int
     1037 +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
     1038 +{
     1039 +        mutex_enter(&rg->tcprg_lock);
     1040 +
     1041 +        VERIFY(rg->tcprg_size > 0);
     1042 +        VERIFY(rg->tcprg_count <= rg->tcprg_size);
     1043 +        if (rg->tcprg_count != 0) {
     1044 +                cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
     1045 +                cred_t *newcred = tcp->tcp_connp->conn_cred;
     1046 +
     1047 +                if (crgetuid(oldcred) != crgetuid(newcred) ||
     1048 +                    crgetzoneid(oldcred) != crgetzoneid(newcred)) {
     1049 +                        mutex_exit(&rg->tcprg_lock);
     1050 +                        return (EPERM);
     1051 +                }
     1052 +        }
     1053 +
     1054 +        if (rg->tcprg_count == rg->tcprg_size) {
     1055 +                unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
     1056 +                unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
     1057 +                tcp_t **newmembers;
     1058 +
     1059 +                if (newsize > TCP_RG_SIZE_MAX) {
     1060 +                        mutex_exit(&rg->tcprg_lock);
     1061 +                        return (EINVAL);
     1062 +                }
     1063 +                newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
     1064 +                    KM_NOSLEEP|KM_NORMALPRI);
     1065 +                if (newmembers == NULL) {
     1066 +                        mutex_exit(&rg->tcprg_lock);
     1067 +                        return (ENOMEM);
     1068 +                }
     1069 +                bcopy(rg->tcprg_members, newmembers, oldalloc);
     1070 +                kmem_free(rg->tcprg_members, oldalloc);
     1071 +                rg->tcprg_members = newmembers;
     1072 +                rg->tcprg_size = newsize;
     1073 +        }
     1074 +
     1075 +        rg->tcprg_members[rg->tcprg_count] = tcp;
     1076 +        rg->tcprg_count++;
     1077 +        rg->tcprg_active++;
     1078 +
     1079 +        mutex_exit(&rg->tcprg_lock);
     1080 +        return (0);
     1081 +}
     1082 +
     1083 +boolean_t
     1084 +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
     1085 +{
     1086 +        int i;
     1087 +        boolean_t is_empty;
     1088 +
     1089 +        mutex_enter(&rg->tcprg_lock);
     1090 +        for (i = 0; i < rg->tcprg_count; i++) {
     1091 +                if (rg->tcprg_members[i] == tcp)
     1092 +                        break;
     1093 +        }
     1094 +        /* The item should be present */
     1095 +        ASSERT(i < rg->tcprg_count);
     1096 +        /* Move the last member into this position */
     1097 +        rg->tcprg_count--;
     1098 +        rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
     1099 +        rg->tcprg_members[rg->tcprg_count] = NULL;
     1100 +        if (tcp->tcp_connp->conn_reuseport != 0)
     1101 +                rg->tcprg_active--;
     1102 +        is_empty = (rg->tcprg_count == 0);
     1103 +        mutex_exit(&rg->tcprg_lock);
     1104 +        return (is_empty);
     1105 +}
     1106 +
     1107 +void
     1108 +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
     1109 +{
     1110 +        mutex_enter(&rg->tcprg_lock);
     1111 +        if (is_active) {
     1112 +                rg->tcprg_active++;
     1113 +        } else {
     1114 +                rg->tcprg_active--;
     1115 +        }
     1116 +        mutex_exit(&rg->tcprg_lock);
 946 1117  }
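For OS-5613, the cases of interest are the state changes that occur after a reuse group exists: clearing and re-setting the option (the option-set path, which lives outside this file, is what would be expected to call tcp_rg_setactive()), and removing members until tcp_rg_destroy() runs from tcp_bind_hash_remove() above. The following is a hypothetical userland sequence of the sort such coverage would exercise; the precise observable effect of clearing the option is not defined by this file.

/*
 * Illustrative sketch only: drive the SO_REUSEPORT state changes that
 * the group bookkeeping above must track.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

static int
bound_socket(const struct sockaddr_in *sin, int reuse)
{
        int s = socket(AF_INET, SOCK_STREAM, 0);

        (void) setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &reuse,
            sizeof (reuse));
        if (bind(s, (const struct sockaddr *)sin, sizeof (*sin)) != 0) {
                (void) close(s);
                return (-1);
        }
        return (s);
}

int
main(void)
{
        struct sockaddr_in sin;
        int off = 0;
        int s1, s2;

        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        sin.sin_port = htons(8080);

        /* First bind creates the group (tcp_rg_init()); second joins it. */
        s1 = bound_socket(&sin, 1);
        s2 = bound_socket(&sin, 1);

        /*
         * Clearing the option after bind should reach
         * tcp_rg_setactive(rg, B_FALSE) via the option-set path
         * (not shown in this file).
         */
        (void) setsockopt(s2, SOL_SOCKET, SO_REUSEPORT, &off, sizeof (off));

        /*
         * Closing members drives tcp_rg_remove() from
         * tcp_bind_hash_remove(); the last one out destroys the group.
         */
        (void) close(s2);
        (void) close(s1);
        return (0);
}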
    