Print this page
    
Reduce lint
OS-5007 support SO_ATTACH_FILTER on ICMP sockets
Reviewed by: Cody Mello <melloc@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/ipclassifier.c
          +++ new/usr/src/uts/common/inet/ip/ipclassifier.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
    | 
      ↓ open down ↓ | 
    12 lines elided | 
    
      ↑ open up ↑ | 
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2016 Joyent, Inc.
  23   24   */
  24   25  
  25   26  /*
  26   27   * IP PACKET CLASSIFIER
  27   28   *
  28   29   * The IP packet classifier provides mapping between IP packets and persistent
  29   30   * connection state for connection-oriented protocols. It also provides
  30   31   * interface for managing connection states.
  31   32   *
  32   33   * The connection state is kept in conn_t data structure and contains, among
  33   34   * other things:
  34   35   *
  35   36   *      o local/remote address and ports
  36   37   *      o Transport protocol
  37   38   *      o squeue for the connection (for TCP only)
  38   39   *      o reference counter
  39   40   *      o Connection state
  40   41   *      o hash table linkage
  41   42   *      o interface/ire information
  42   43   *      o credentials
  43   44   *      o ipsec policy
  44   45   *      o send and receive functions.
  45   46   *      o mutex lock.
  46   47   *
  47   48   * Connections use a reference counting scheme. They are freed when the
  48   49   * reference counter drops to zero. A reference is incremented when connection
  49   50   * is placed in a list or table, when incoming packet for the connection arrives
  50   51   * and when connection is processed via squeue (squeue processing may be
  51   52   * asynchronous and the reference protects the connection from being destroyed
  52   53   * before its processing is finished).
  53   54   *
  54   55   * conn_recv is used to pass up packets to the ULP.
  55   56   * For TCP, conn_recv changes: it is tcp_input_listener_unbound initially for
  56   57   * a listener, and changes to tcp_input_listener once the listener has picked
  57   58   * a good squeue. For other cases it is set to tcp_input_data.
  58   59   *
  59   60   * conn_recvicmp is used to pass up ICMP errors to the ULP.
  60   61   *
  61   62   * Classifier uses several hash tables:
  62   63   *
  63   64   *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  64   65   *      ipcl_bind_fanout:       contains all connections in BOUND state
  65   66   *      ipcl_proto_fanout:      IPv4 protocol fanout
  66   67   *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  67   68   *      ipcl_udp_fanout:        contains all UDP connections
  68   69   *      ipcl_iptun_fanout:      contains all IP tunnel connections
  69   70   *      ipcl_globalhash_fanout: contains all connections
  70   71   *
  71   72   * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  72   73   * which need to view all existing connections.
  73   74   *
  74   75   * All tables are protected by per-bucket locks. When both per-bucket lock and
  75   76   * connection lock need to be held, the per-bucket lock should be acquired
  76   77   * first, followed by the connection lock.
  77   78   *
  78   79   * All functions doing search in one of these tables increment a reference
  79   80   * counter on the connection found (if any). This reference should be dropped
  80   81   * when the caller has finished processing the connection.
  81   82   *
  82   83   *
  83   84   * INTERFACES:
  84   85   * ===========
  85   86   *
  86   87   * Connection Lookup:
  87   88   * ------------------
  88   89   *
  89   90   * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  90   91   * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  91   92   *
  92   93   * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  93   94   * it can't find any associated connection. If the connection is found, its
  94   95   * reference counter is incremented.
  95   96   *
  96   97   *      mp:     mblock, containing packet header. The full header should fit
  97   98   *              into a single mblock. It should also contain at least full IP
  98   99   *              and TCP or UDP header.
  99  100   *
 100  101   *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 101  102   *
 102  103   *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 103  104   *               the packet.
 104  105   *
 105  106   *      ira->ira_zoneid: The zone in which the returned connection must be; the
 106  107   *              zoneid corresponding to the ire_zoneid on the IRE located for
 107  108   *              the packet's destination address.
 108  109   *
 109  110   *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 110  111   *              IRAF_TX_SHARED_ADDR flags
 111  112   *
 112  113   *      For TCP connections, the lookup order is as follows:
 113  114   *              5-tuple {src, dst, protocol, local port, remote port}
 114  115   *                      lookup in ipcl_conn_fanout table.
 115  116   *              3-tuple {dst, remote port, protocol} lookup in
 116  117   *                      ipcl_bind_fanout table.
 117  118   *
 118  119   *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 119  120   *      remote port} lookup is done on ipcl_udp_fanout. Note that,
 120  121   *      these interfaces do not handle cases where a packet belongs
 121  122   *      to multiple UDP clients, which is handled in IP itself.
 122  123   *
 123  124   * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 124  125   * determine which actual zone gets the segment.  This is used only in a
 125  126   * labeled environment.  The matching rules are:
 126  127   *
 127  128   *      - If it's not a multilevel port, then the label on the packet selects
 128  129   *        the zone.  Unlabeled packets are delivered to the global zone.
 129  130   *
 130  131   *      - If it's a multilevel port, then only the zone registered to receive
 131  132   *        packets on that port matches.
 132  133   *
 133  134   * Also, in a labeled environment, packet labels need to be checked.  For fully
 134  135   * bound TCP connections, we can assume that the packet label was checked
 135  136   * during connection establishment, and doesn't need to be checked on each
 136  137   * packet.  For others, though, we need to check for strict equality or, for
 137  138   * multilevel ports, membership in the range or set.  This part currently does
 138  139   * a tnrh lookup on each packet, but could be optimized to use cached results
 139  140   * if that were necessary.  (SCTP doesn't come through here, but if it did,
 140  141   * we would apply the same rules as TCP.)
 141  142   *
 142  143   * An implication of the above is that fully-bound TCP sockets must always use
 143  144   * distinct 4-tuples; they can't be discriminated by label alone.
 144  145   *
 145  146   * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 146  147   * as there's no connection set-up handshake and no shared state.
 147  148   *
 148  149   * Labels on looped-back packets within a single zone do not need to be
 149  150   * checked, as all processes in the same zone have the same label.
 150  151   *
 151  152   * Finally, for unlabeled packets received by a labeled system, special rules
 152  153   * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 153  154   * socket in the zone whose label matches the default label of the sender, if
 154  155   * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 155  156   * receiver's label must dominate the sender's default label.
 156  157   *
 157  158   * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 158  159   * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 159  160   *                                       ip_stack);
 160  161   *
 161  162   *      Lookup routine to find an exact match for {src, dst, local port,
 162  163   *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 163  164   *      ports are read from the IP and TCP header respectively.
 164  165   *
 165  166   * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 166  167   *                                       zoneid, ip_stack);
 167  168   * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 168  169   *                                       zoneid, ip_stack);
 169  170   *
 170  171   *      Lookup routine to find a listener with the tuple {lport, laddr,
 171  172   *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 172  173   *      parameter interface index is also compared.
 173  174   *
 174  175   * void ipcl_walk(func, arg, ip_stack)
 175  176   *
 176  177   *      Apply 'func' to every connection available. The 'func' is called as
 177  178   *      (*func)(connp, arg). The walk is non-atomic so connections may be
 178  179   *      created and destroyed during the walk. The CONN_CONDEMNED and
 179  180   *      CONN_INCIPIENT flags ensure that connections which are newly created
 180  181   *      or being destroyed are not selected by the walker.
 181  182   *
 182  183   * Table Updates
 183  184   * -------------
 184  185   *
 185  186   * int ipcl_conn_insert(connp);
 186  187   * int ipcl_conn_insert_v4(connp);
 187  188   * int ipcl_conn_insert_v6(connp);
 188  189   *
 189  190   *      Insert 'connp' in the ipcl_conn_fanout.
 190  191   *      Arguments :
 191  192   *              connp           conn_t to be inserted
 192  193   *
 193  194   *      Return value :
 194  195   *              0               if connp was inserted
 195  196   *              EADDRINUSE      if the connection with the same tuple
 196  197   *                              already exists.
 197  198   *
 198  199   * int ipcl_bind_insert(connp);
 199  200   * int ipcl_bind_insert_v4(connp);
 200  201   * int ipcl_bind_insert_v6(connp);
 201  202   *
 202  203   *      Insert 'connp' in ipcl_bind_fanout.
 203  204   *      Arguments :
 204  205   *              connp           conn_t to be inserted
 205  206   *
 206  207   *
 207  208   * void ipcl_hash_remove(connp);
 208  209   *
 209  210   *      Removes the 'connp' from the connection fanout table.
 210  211   *
 211  212   * Connection Creation/Destruction
 212  213   * -------------------------------
 213  214   *
 214  215   * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 215  216   *
 216  217   *      Creates a new conn based on the type flag, inserts it into
 217  218   *      globalhash table.
 218  219   *
 219  220   *      type:   This flag determines the type of conn_t which needs to be
 220  221   *              created i.e., which kmem_cache it comes from.
 221  222   *              IPCL_TCPCONN    indicates a TCP connection
 222  223   *              IPCL_SCTPCONN   indicates a SCTP connection
 223  224   *              IPCL_UDPCONN    indicates a UDP conn_t.
 224  225   *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 225  226   *              IPCL_RTSCONN    indicates a RTS conn_t.
 226  227   *              IPCL_IPCCONN    indicates all other connections.
 227  228   *
 228  229   * void ipcl_conn_destroy(connp)
 229  230   *
 230  231   *      Destroys the connection state, removes it from the global
 231  232   *      connection hash table and frees its memory.
 232  233   */
 233  234  
 234  235  #include <sys/types.h>
 235  236  #include <sys/stream.h>
 236  237  #include <sys/stropts.h>
 237  238  #include <sys/sysmacros.h>
 238  239  #include <sys/strsubr.h>
 239  240  #include <sys/strsun.h>
 240  241  #define _SUN_TPI_VERSION 2
 241  242  #include <sys/ddi.h>
 242  243  #include <sys/cmn_err.h>
 243  244  #include <sys/debug.h>
 244  245  
 245  246  #include <sys/systm.h>
 246  247  #include <sys/param.h>
 247  248  #include <sys/kmem.h>
 248  249  #include <sys/isa_defs.h>
 249  250  #include <inet/common.h>
 250  251  #include <netinet/ip6.h>
 251  252  #include <netinet/icmp6.h>
 252  253  
 253  254  #include <inet/ip.h>
 254  255  #include <inet/ip_if.h>
 255  256  #include <inet/ip_ire.h>
 256  257  #include <inet/ip6.h>
 257  258  #include <inet/ip_ndp.h>
 258  259  #include <inet/ip_impl.h>
 259  260  #include <inet/udp_impl.h>
 260  261  #include <inet/sctp_ip.h>
 261  262  #include <inet/sctp/sctp_impl.h>
 262  263  #include <inet/rawip_impl.h>
 263  264  #include <inet/rts_impl.h>
 264  265  #include <inet/iptun/iptun_impl.h>
 265  266  
 266  267  #include <sys/cpuvar.h>
 267  268  
 268  269  #include <inet/ipclassifier.h>
 269  270  #include <inet/tcp.h>
 270  271  #include <inet/ipsec_impl.h>
 271  272  
 272  273  #include <sys/tsol/tnet.h>
 273  274  #include <sys/sockio.h>
 274  275  
 275  276  /*
            * Old value for compatibility.  Settable in /etc/system.  ipcl_init()
            * consults it only when ipcl_conn_hash_size is zero.
            */
 276  277  uint_t tcp_conn_hash_size = 0;
 277  278  
 278  279  /*
            * New value.  Zero means choose automatically.  Settable in /etc/system.
            *
            * When both size tunables are zero, ipcl_init() requests
            * (freemem * PAGESIZE) / ipcl_conn_hash_memfactor buckets, capped at
            * ipcl_conn_hash_maxsize.  However the request is obtained, ipcl_init()
            * then maps it onto an entry of the P2Ps() table.
            */
 279  280  uint_t ipcl_conn_hash_size = 0;
 280  281  uint_t ipcl_conn_hash_memfactor = 8192;
 281  282  uint_t ipcl_conn_hash_maxsize = 82500;
 282  283  
 283  284  /*
            * bind/udp fanout table size (number of connf_t buckets); copied into
            * the ip_stack_t by ipcl_init().
            */
 284  285  uint_t ipcl_bind_fanout_size = 512;
 285  286  uint_t ipcl_udp_fanout_size = 16384;
 286  287  
 287  288  /* Raw socket fanout size.  Must be a power of 2. */
 288  289  uint_t ipcl_raw_fanout_size = 256;
 289  290  
 290  291  /*
 291  292   * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 292  293   * expect that most large deployments would have hundreds of tunnels, and
 293  294   * thousands in the extreme case.
 294  295   */
 295  296  uint_t ipcl_iptun_fanout_size = 6143;
 296  297  
 297  298  /*
 298  299   * Power of 2^N Primes useful for hashing for N of 0-28,
 299  300   * these primes are the nearest prime <= 2^N - 2^(N-2).
            *
            * The initializer is indexed by N.  Slots 0-2 hold 0, and one extra 0
            * follows the N = 28 entry.  ipcl_init() starts its search at index 9
            * and treats landing on that trailing 0 as "out of range".
 300  301   */
 301  302  
 302  303  #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
 303  304                  6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
 304  305                  786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
 305  306                  50331599, 100663291, 201326557, 0}
 306  307  
 307  308  /*
 308  309   * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 309  310   * are aligned on cache lines.
            *
            * ipcl_g_init() sizes the TCP, UDP, raw IP and RTS caches as
            * sizeof (itc_t) plus the size of the protocol structure, so the
            * protocol structure begins right after this union.
 310  311   */
 311  312  typedef union itc_s {
            /* The connection itself. */
 312  313          conn_t  itc_conn;
            /*
             * Padding that fixes the union's size.
             * NOTE(review): presumably CACHE_ALIGN() rounds the size of
             * struct conn_s up to a cache-line multiple -- confirm against
             * the macro's definition.
             */
 313  314          char    itcu_filler[CACHE_ALIGN(conn_s)];
 314  315  } itc_t;
 315  316  
           /*
            * kmem caches from which ipcl_conn_create() allocates conn_t's.  The
            * tcp, ip, udp, rawip and rts caches are created in ipcl_g_init() and
            * destroyed in ipcl_g_destroy().  sctp_conn_cache is only declared
            * here (extern); this file allocates from it but does not create or
            * destroy it in the code visible here.
            */
 316  317  struct kmem_cache  *tcp_conn_cache;
 317  318  struct kmem_cache  *ip_conn_cache;
 318  319  extern struct kmem_cache  *sctp_conn_cache;
 319  320  struct kmem_cache  *udp_conn_cache;
 320  321  struct kmem_cache  *rawip_conn_cache;
 321  322  struct kmem_cache  *rts_conn_cache;
 322  323  
           /* TCP helpers implemented outside this file. */
 323  324  extern void     tcp_timermp_free(tcp_t *);
 324  325  extern mblk_t   *tcp_timermp_alloc(int);
 325  326  
           /*
            * Constructor/destructor pairs, one per cache; ipcl_g_init() passes
            * them to kmem_cache_create().
            */
 326  327  static int      ip_conn_constructor(void *, void *, int);
 327  328  static void     ip_conn_destructor(void *, void *);
 328  329  
 329  330  static int      tcp_conn_constructor(void *, void *, int);
 330  331  static void     tcp_conn_destructor(void *, void *);
 331  332  
 332  333  static int      udp_conn_constructor(void *, void *, int);
 333  334  static void     udp_conn_destructor(void *, void *);
 334  335  
 335  336  static int      rawip_conn_constructor(void *, void *, int);
 336  337  static void     rawip_conn_destructor(void *, void *);
 337  338  
 338  339  static int      rts_conn_constructor(void *, void *, int);
 339  340  static void     rts_conn_destructor(void *, void *);
 340  341  
 341  342  /*
 342  343   * Global (for all stack instances) init routine
            *
            * Creates the five conn kmem caches (ip, tcp, udp, rawip, rts), each
            * aligned to CACHE_ALIGN_SIZE and each with its own constructor and
            * destructor.  ip_conn_cache objects are a bare conn_t; the other four
            * are an itc_t followed by the protocol structure (tcp_t, udp_t, icmp_t
            * or rts_t).  ipcl_g_destroy() is the inverse.
 343  344   */
 344  345  void
 345  346  ipcl_g_init(void)
 346  347  {
 347  348          ip_conn_cache = kmem_cache_create("ip_conn_cache",
 348  349              sizeof (conn_t), CACHE_ALIGN_SIZE,
 349  350              ip_conn_constructor, ip_conn_destructor,
 350  351              NULL, NULL, NULL, 0);
 351  352  
                   /* The only cache created with a reclaim callback. */
 352  353          tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 353  354              sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 354  355              tcp_conn_constructor, tcp_conn_destructor,
 355  356              tcp_conn_reclaim, NULL, NULL, 0);
 356  357  
 357  358          udp_conn_cache = kmem_cache_create("udp_conn_cache",
 358  359              sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 359  360              udp_conn_constructor, udp_conn_destructor,
 360  361              NULL, NULL, NULL, 0);
 361  362  
                   /* Raw IP conns carry an icmp_t after the conn. */
 362  363          rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 363  364              sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 364  365              rawip_conn_constructor, rawip_conn_destructor,
 365  366              NULL, NULL, NULL, 0);
 366  367  
 367  368          rts_conn_cache = kmem_cache_create("rts_conn_cache",
 368  369              sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 369  370              rts_conn_constructor, rts_conn_destructor,
 370  371              NULL, NULL, NULL, 0);
 371  372  }
 372  373  
 373  374  /*
 374  375   * ipclassifier initialization routine, sets up hash tables.
            *
            * Fills in the fanout sizes of the given ip_stack_t, then allocates
            * each fanout table (conn, bind, IPv4/IPv6 protocol, rts clients, udp,
            * iptun, raw, globalhash) zero-filled and initializes the connf_lock
            * of every bucket.  ipcl_destroy() undoes all of this.
            *
            *      ipst:   the IP stack instance whose tables are being set up.
 375  376   */
 376  377  void
 377  378  ipcl_init(ip_stack_t *ipst)
 378  379  {
 379  380          int i;
 380  381          int sizes[] = P2Ps();
 381  382  
 382  383          /*
 383  384           * Calculate size of conn fanout table from /etc/system settings
                    *
                    * Precedence: ipcl_conn_hash_size, then the older
                    * tcp_conn_hash_size, then a value derived from free memory and
                    * capped at ipcl_conn_hash_maxsize.
 384  385           */
 385  386          if (ipcl_conn_hash_size != 0) {
 386  387                  ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 387  388          } else if (tcp_conn_hash_size != 0) {
 388  389                  ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 389  390          } else {
 390  391                  extern pgcnt_t freemem;
 391  392  
                           /*
                            * NOTE(review): ipcl_conn_hash_memfactor is a tunable; a
                            * value of zero would make this a division by zero.
                            */
 392  393                  ipst->ips_ipcl_conn_fanout_size =
 393  394                      (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 394  395  
 395  396                  if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 396  397                          ipst->ips_ipcl_conn_fanout_size =
 397  398                              ipcl_conn_hash_maxsize;
 398  399                  }
 399  400          }
 400  401  
                   /*
                    * Replace the requested size with the first table prime, starting
                    * at index 9, that is not smaller than it.  The bound stops the
                    * walk on the table's last slot (the 0 sentinel), so sizes[i]
                    * below is always a valid index.
                    */
 401  402          for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 402  403                  if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 403  404                          break;
 404  405                  }
 405  406          }
 406  407          if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 407  408                  /* Out of range, use the 2^16 value */
 408  409                  ipst->ips_ipcl_conn_fanout_size = sizes[16];
 409  410          }
 410  411  
 411  412          /* Take values from /etc/system */
 412  413          ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 413  414          ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 414  415          ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 415  416          ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 416  417  
                   /* The conn fanout must not have been set up already. */
 417  418          ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 418  419  
                   /*
                    * Each table below follows the same pattern: kmem_zalloc() an
                    * array of connf_t buckets, then mutex_init() every bucket lock.
                    * All allocations use KM_SLEEP and none is checked for NULL.
                    */
 419  420          ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 420  421              ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 421  422  
 422  423          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 423  424                  mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 424  425                      MUTEX_DEFAULT, NULL);
 425  426          }
 426  427  
 427  428          ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 428  429              ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 429  430  
 430  431          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 431  432                  mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 432  433                      MUTEX_DEFAULT, NULL);
 433  434          }
 434  435  
                   /* The protocol fanouts have one bucket per protocol number. */
 435  436          ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 436  437              sizeof (connf_t), KM_SLEEP);
 437  438          for (i = 0; i < IPPROTO_MAX; i++) {
 438  439                  mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 439  440                      MUTEX_DEFAULT, NULL);
 440  441          }
 441  442  
 442  443          ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 443  444              sizeof (connf_t), KM_SLEEP);
 444  445          for (i = 0; i < IPPROTO_MAX; i++) {
 445  446                  mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 446  447                      MUTEX_DEFAULT, NULL);
 447  448          }
 448  449  
                   /* The rts clients list is a single bucket, not an array. */
 449  450          ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 450  451          mutex_init(&ipst->ips_rts_clients->connf_lock,
 451  452              NULL, MUTEX_DEFAULT, NULL);
 452  453  
 453  454          ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 454  455              ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 455  456          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 456  457                  mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 457  458                      MUTEX_DEFAULT, NULL);
 458  459          }
 459  460  
 460  461          ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 461  462              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 462  463          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 463  464                  mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 464  465                      MUTEX_DEFAULT, NULL);
 465  466          }
 466  467  
 467  468          ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 468  469              ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 469  470          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 470  471                  mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 471  472                      MUTEX_DEFAULT, NULL);
 472  473          }
 473  474  
                   /* The global hash has a fixed size, CONN_G_HASH_SIZE. */
 474  475          ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 475  476              sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 476  477          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 477  478                  mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 478  479                      NULL, MUTEX_DEFAULT, NULL);
 479  480          }
 480  481  }
 481  482  
           /*
            * Global teardown: destroys the five conn kmem caches that
            * ipcl_g_init() created (ip, tcp, udp, rawip, rts).
            */
 482  483  void
 483  484  ipcl_g_destroy(void)
 484  485  {
 485  486          kmem_cache_destroy(ip_conn_cache);
 486  487          kmem_cache_destroy(tcp_conn_cache);
 487  488          kmem_cache_destroy(udp_conn_cache);
 488  489          kmem_cache_destroy(rawip_conn_cache);
 489  490          kmem_cache_destroy(rts_conn_cache);
 490  491  }
 491  492  
 492  493  /*
 493  494   * All user-level and kernel use of the stack must be gone
 494  495   * by now.
            *
            * Tears down every fanout table that ipcl_init() built for this stack.
            * Each table gets the same treatment: assert that every bucket is empty
            * (connf_head == NULL), destroy the bucket lock, free the array using
            * the size it was allocated with, and clear the pointer in the
            * ip_stack_t.
            *
            *      ipst:   the IP stack instance being torn down.
 495  496   */
 496  497  void
 497  498  ipcl_destroy(ip_stack_t *ipst)
 498  499  {
 499  500          int i;
 500  501  
 501  502          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 502  503                  ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 503  504                  mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 504  505          }
 505  506          kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 506  507              sizeof (connf_t));
 507  508          ipst->ips_ipcl_conn_fanout = NULL;
 508  509  
 509  510          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 510  511                  ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 511  512                  mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 512  513          }
 513  514          kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 514  515              sizeof (connf_t));
 515  516          ipst->ips_ipcl_bind_fanout = NULL;
 516  517  
 517  518          for (i = 0; i < IPPROTO_MAX; i++) {
 518  519                  ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 519  520                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 520  521          }
 521  522          kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 522  523              IPPROTO_MAX * sizeof (connf_t));
 523  524          ipst->ips_ipcl_proto_fanout_v4 = NULL;
 524  525  
 525  526          for (i = 0; i < IPPROTO_MAX; i++) {
 526  527                  ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 527  528                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 528  529          }
 529  530          kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 530  531              IPPROTO_MAX * sizeof (connf_t));
 531  532          ipst->ips_ipcl_proto_fanout_v6 = NULL;
 532  533  
 533  534          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 534  535                  ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 535  536                  mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 536  537          }
 537  538          kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 538  539              sizeof (connf_t));
 539  540          ipst->ips_ipcl_udp_fanout = NULL;
 540  541  
 541  542          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 542  543                  ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 543  544                  mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 544  545          }
 545  546          kmem_free(ipst->ips_ipcl_iptun_fanout,
 546  547              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 547  548          ipst->ips_ipcl_iptun_fanout = NULL;
 548  549  
 549  550          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 550  551                  ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 551  552                  mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 552  553          }
 553  554          kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 554  555              sizeof (connf_t));
 555  556          ipst->ips_ipcl_raw_fanout = NULL;
 556  557  
 557  558          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 558  559                  ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 559  560                  mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 560  561          }
 561  562          kmem_free(ipst->ips_ipcl_globalhash_fanout,
 562  563              sizeof (connf_t) * CONN_G_HASH_SIZE);
 563  564          ipst->ips_ipcl_globalhash_fanout = NULL;
 564  565  
                   /* The rts clients list is a single connf_t, not an array. */
 565  566          ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 566  567          mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 567  568          kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 568  569          ipst->ips_rts_clients = NULL;
 569  570  }
 570  571  
 571  572  /*
 572  573   * conn creation routine. initialize the conn, sets the reference
 573  574   * and inserts it in the global hash table.
            *
            *      type:   one of IPCL_SCTPCONN, IPCL_TCPCONN, IPCL_UDPCONN,
            *              IPCL_RAWIPCONN, IPCL_RTSCONN or IPCL_IPCCONN; selects the
            *              kmem cache the conn_t is allocated from.
            *      sleep:  kmem flag handed unchanged to kmem_cache_alloc().
            *      ns:     netstack the conn belongs to; a hold is taken on it with
            *              netstack_hold() and stored in conn_netstack.
            *
            * Returns the new conn_t, or NULL if the allocation fails or 'type' is
            * not one of the values above.
            *
            * SCTP conns are set up by sctp_conn_init() and conn_ref is not touched
            * here for them; every other type gets conn_ref = 1.
 574  575   */
 575  576  conn_t *
 576  577  ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 577  578  {
 578  579          conn_t  *connp;
 579  580          struct kmem_cache *conn_cache;
 580  581  
 581  582          switch (type) {
 582  583          case IPCL_SCTPCONN:
 583  584                  if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 584  585                          return (NULL);
 585  586                  sctp_conn_init(connp);
 586  587                  netstack_hold(ns);
 587  588                  connp->conn_netstack = ns;
 588  589                  connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 589  590                  connp->conn_ixa->ixa_conn_id = (long)connp;
 590  591                  ipcl_globalhash_insert(connp);
 591  592                  return (connp);
 592  593  
 593  594          case IPCL_TCPCONN:
 594  595                  conn_cache = tcp_conn_cache;
 595  596                  break;
 596  597  
 597  598          case IPCL_UDPCONN:
 598  599                  conn_cache = udp_conn_cache;
 599  600                  break;
 600  601  
 601  602          case IPCL_RAWIPCONN:
 602  603                  conn_cache = rawip_conn_cache;
 603  604                  break;
 604  605  
 605  606          case IPCL_RTSCONN:
 606  607                  conn_cache = rts_conn_cache;
 607  608                  break;
 608  609  
 609  610          case IPCL_IPCCONN:
 610  611                  conn_cache = ip_conn_cache;
 611  612                  break;
 612  613  
 613  614          default:
                           /*
                            * Unknown type.  ASSERT() compiles away on non-DEBUG
                            * kernels, so return here rather than fall through and
                            * allocate from an uninitialized conn_cache.
                            */
 615  616                  ASSERT(0);
                           return (NULL);
 616  617          }
 617  618  
 618  619          if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 619  620                  return (NULL);
 620  621  
 621  622          connp->conn_ref = 1;
 622  623          netstack_hold(ns);
 623  624          connp->conn_netstack = ns;
 624  625          connp->conn_ixa->ixa_ipst = ns->netstack_ip;
                   /* The conn's own address serves as its transmit-attribute id. */
 625  626          connp->conn_ixa->ixa_conn_id = (long)connp;
 626  627          ipcl_globalhash_insert(connp);
 627  628          return (connp);
 628  629  }
 629  630  
 630  631  void
 631  632  ipcl_conn_destroy(conn_t *connp)
 632  633  {
 633  634          mblk_t  *mp;
 634  635          netstack_t      *ns = connp->conn_netstack;
 635  636  
 636  637          ASSERT(!MUTEX_HELD(&connp->conn_lock));
 637  638          ASSERT(connp->conn_ref == 0);
 638  639          ASSERT(connp->conn_ioctlref == 0);
 639  640  
 640  641          DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 641  642  
 642  643          if (connp->conn_cred != NULL) {
 643  644                  crfree(connp->conn_cred);
 644  645                  connp->conn_cred = NULL;
 645  646                  /* ixa_cred done in ipcl_conn_cleanup below */
 646  647          }
 647  648  
 648  649          if (connp->conn_ht_iphc != NULL) {
 649  650                  kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 650  651                  connp->conn_ht_iphc = NULL;
 651  652                  connp->conn_ht_iphc_allocated = 0;
 652  653                  connp->conn_ht_iphc_len = 0;
 653  654                  connp->conn_ht_ulp = NULL;
 654  655                  connp->conn_ht_ulp_len = 0;
 655  656          }
 656  657          ip_pkt_free(&connp->conn_xmit_ipp);
 657  658  
 658  659          ipcl_globalhash_remove(connp);
 659  660  
 660  661          if (connp->conn_latch != NULL) {
 661  662                  IPLATCH_REFRELE(connp->conn_latch);
 662  663                  connp->conn_latch = NULL;
 663  664          }
 664  665          if (connp->conn_latch_in_policy != NULL) {
 665  666                  IPPOL_REFRELE(connp->conn_latch_in_policy);
 666  667                  connp->conn_latch_in_policy = NULL;
 667  668          }
 668  669          if (connp->conn_latch_in_action != NULL) {
 669  670                  IPACT_REFRELE(connp->conn_latch_in_action);
 670  671                  connp->conn_latch_in_action = NULL;
 671  672          }
 672  673          if (connp->conn_policy != NULL) {
 673  674                  IPPH_REFRELE(connp->conn_policy, ns);
 674  675                  connp->conn_policy = NULL;
 675  676          }
 676  677  
 677  678          if (connp->conn_ipsec_opt_mp != NULL) {
 678  679                  freemsg(connp->conn_ipsec_opt_mp);
 679  680                  connp->conn_ipsec_opt_mp = NULL;
 680  681          }
 681  682  
 682  683          if (connp->conn_flags & IPCL_TCPCONN) {
 683  684                  tcp_t *tcp = connp->conn_tcp;
 684  685  
 685  686                  tcp_free(tcp);
 686  687                  mp = tcp->tcp_timercache;
 687  688  
 688  689                  tcp->tcp_tcps = NULL;
 689  690  
 690  691                  /*
 691  692                   * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 692  693                   * the mblk.
 693  694                   */
 694  695                  if (tcp->tcp_rsrv_mp != NULL) {
 695  696                          freeb(tcp->tcp_rsrv_mp);
 696  697                          tcp->tcp_rsrv_mp = NULL;
 697  698                          mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 698  699                  }
 699  700  
 700  701                  ipcl_conn_cleanup(connp);
 701  702                  connp->conn_flags = IPCL_TCPCONN;
 702  703                  if (ns != NULL) {
 703  704                          ASSERT(tcp->tcp_tcps == NULL);
 704  705                          connp->conn_netstack = NULL;
 705  706                          connp->conn_ixa->ixa_ipst = NULL;
 706  707                          netstack_rele(ns);
 707  708                  }
 708  709  
 709  710                  bzero(tcp, sizeof (tcp_t));
 710  711  
 711  712                  tcp->tcp_timercache = mp;
 712  713                  tcp->tcp_connp = connp;
 713  714                  kmem_cache_free(tcp_conn_cache, connp);
 714  715                  return;
 715  716          }
 716  717  
 717  718          if (connp->conn_flags & IPCL_SCTPCONN) {
 718  719                  ASSERT(ns != NULL);
 719  720                  sctp_free(connp);
 720  721                  return;
 721  722          }
 722  723  
 723  724          ipcl_conn_cleanup(connp);
 724  725          if (ns != NULL) {
 725  726                  connp->conn_netstack = NULL;
 726  727                  connp->conn_ixa->ixa_ipst = NULL;
 727  728                  netstack_rele(ns);
 728  729          }
 729  730  
 730  731          /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 731  732          if (connp->conn_flags & IPCL_UDPCONN) {
 732  733                  connp->conn_flags = IPCL_UDPCONN;
 733  734                  kmem_cache_free(udp_conn_cache, connp);
 734  735          } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 735  736                  connp->conn_flags = IPCL_RAWIPCONN;
 736  737                  connp->conn_proto = IPPROTO_ICMP;
 737  738                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
 738  739                  kmem_cache_free(rawip_conn_cache, connp);
 739  740          } else if (connp->conn_flags & IPCL_RTSCONN) {
 740  741                  connp->conn_flags = IPCL_RTSCONN;
 741  742                  kmem_cache_free(rts_conn_cache, connp);
 742  743          } else {
 743  744                  connp->conn_flags = IPCL_IPCCONN;
 744  745                  ASSERT(connp->conn_flags & IPCL_IPCCONN);
 745  746                  ASSERT(connp->conn_priv == NULL);
 746  747                  kmem_cache_free(ip_conn_cache, connp);
 747  748          }
 748  749  }
 749  750  
 750  751  /*
 751  752   * Running in cluster mode - deregister listener information
 752  753   */
 753  754  static void
 754  755  ipcl_conn_unlisten(conn_t *connp)
 755  756  {
 756  757          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 757  758          ASSERT(connp->conn_lport != 0);
 758  759  
 759  760          if (cl_inet_unlisten != NULL) {
 760  761                  sa_family_t     addr_family;
 761  762                  uint8_t         *laddrp;
 762  763  
 763  764                  if (connp->conn_ipversion == IPV6_VERSION) {
 764  765                          addr_family = AF_INET6;
 765  766                          laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 766  767                  } else {
 767  768                          addr_family = AF_INET;
 768  769                          laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 769  770                  }
 770  771                  (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 771  772                      IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 772  773          }
 773  774          connp->conn_flags &= ~IPCL_CL_LISTENER;
 774  775  }
 775  776  
 776  777  /*
 777  778   * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 778  779   * which table the conn belonged to). So for debugging we can see which hash
 779  780   * table this connection was in.
 780  781   */
 781  782  #define IPCL_HASH_REMOVE(connp) {                                       \
 782  783          connf_t *connfp = (connp)->conn_fanout;                         \
 783  784          ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                     \
 784  785          if (connfp != NULL) {                                           \
 785  786                  mutex_enter(&connfp->connf_lock);                       \
 786  787                  if ((connp)->conn_next != NULL)                         \
 787  788                          (connp)->conn_next->conn_prev =                 \
 788  789                              (connp)->conn_prev;                         \
 789  790                  if ((connp)->conn_prev != NULL)                         \
 790  791                          (connp)->conn_prev->conn_next =                 \
 791  792                              (connp)->conn_next;                         \
 792  793                  else                                                    \
 793  794                          connfp->connf_head = (connp)->conn_next;        \
 794  795                  (connp)->conn_fanout = NULL;                            \
 795  796                  (connp)->conn_next = NULL;                              \
 796  797                  (connp)->conn_prev = NULL;                              \
 797  798                  (connp)->conn_flags |= IPCL_REMOVED;                    \
 798  799                  if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)      \
 799  800                          ipcl_conn_unlisten((connp));                    \
 800  801                  CONN_DEC_REF((connp));                                  \
 801  802                  mutex_exit(&connfp->connf_lock);                        \
 802  803          }                                                               \
 803  804  }
 804  805  
 805  806  void
 806  807  ipcl_hash_remove(conn_t *connp)
 807  808  {
 808  809          uint8_t         protocol = connp->conn_proto;
 809  810  
 810  811          IPCL_HASH_REMOVE(connp);
 811  812          if (protocol == IPPROTO_RSVP)
 812  813                  ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 813  814  }
 814  815  
 815  816  /*
 816  817   * The whole purpose of this function is to allow removal of
 817  818   * a conn_t from the connected hash for timewait reclaim.
 818  819   * This is essentially a TW reclaim fastpath where timewait
 819  820   * collector checks under fanout lock (so no one else can
 820  821   * get access to the conn_t) that refcnt is 2 i.e. one for
 821  822   * TCP and one for the classifier hash list. If ref count
 822  823   * is indeed 2, we can just remove the conn under lock and
 823  824   * avoid cleaning up the conn under squeue. This gives us
 824  825   * improved performance.
 825  826   */
 826  827  void
 827  828  ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 828  829  {
 829  830          ASSERT(MUTEX_HELD(&connfp->connf_lock));
 830  831          ASSERT(MUTEX_HELD(&connp->conn_lock));
 831  832          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 832  833  
 833  834          if ((connp)->conn_next != NULL) {
 834  835                  (connp)->conn_next->conn_prev = (connp)->conn_prev;
 835  836          }
 836  837          if ((connp)->conn_prev != NULL) {
 837  838                  (connp)->conn_prev->conn_next = (connp)->conn_next;
 838  839          } else {
 839  840                  connfp->connf_head = (connp)->conn_next;
 840  841          }
 841  842          (connp)->conn_fanout = NULL;
 842  843          (connp)->conn_next = NULL;
 843  844          (connp)->conn_prev = NULL;
 844  845          (connp)->conn_flags |= IPCL_REMOVED;
 845  846          ASSERT((connp)->conn_ref == 2);
 846  847          (connp)->conn_ref--;
 847  848  }
 848  849  
 849  850  #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 850  851          ASSERT((connp)->conn_fanout == NULL);                           \
 851  852          ASSERT((connp)->conn_next == NULL);                             \
 852  853          ASSERT((connp)->conn_prev == NULL);                             \
 853  854          if ((connfp)->connf_head != NULL) {                             \
 854  855                  (connfp)->connf_head->conn_prev = (connp);              \
 855  856                  (connp)->conn_next = (connfp)->connf_head;              \
 856  857          }                                                               \
 857  858          (connp)->conn_fanout = (connfp);                                \
 858  859          (connfp)->connf_head = (connp);                                 \
 859  860          (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 860  861              IPCL_CONNECTED;                                             \
  
    | 
      ↓ open down ↓ | 
    828 lines elided | 
    
      ↑ open up ↑ | 
  
 861  862          CONN_INC_REF(connp);                                            \
 862  863  }
 863  864  
 864  865  #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 865  866          IPCL_HASH_REMOVE((connp));                                      \
 866  867          mutex_enter(&(connfp)->connf_lock);                             \
 867  868          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 868  869          mutex_exit(&(connfp)->connf_lock);                              \
 869  870  }
 870  871  
 871      -#define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
 872      -        conn_t *pconnp = NULL, *nconnp;                                 \
 873      -        IPCL_HASH_REMOVE((connp));                                      \
 874      -        mutex_enter(&(connfp)->connf_lock);                             \
 875      -        nconnp = (connfp)->connf_head;                                  \
 876      -        while (nconnp != NULL &&                                        \
 877      -            !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {               \
 878      -                pconnp = nconnp;                                        \
 879      -                nconnp = nconnp->conn_next;                             \
 880      -        }                                                               \
 881      -        if (pconnp != NULL) {                                           \
 882      -                pconnp->conn_next = (connp);                            \
 883      -                (connp)->conn_prev = pconnp;                            \
 884      -        } else {                                                        \
 885      -                (connfp)->connf_head = (connp);                         \
 886      -        }                                                               \
 887      -        if (nconnp != NULL) {                                           \
 888      -                (connp)->conn_next = nconnp;                            \
 889      -                nconnp->conn_prev = (connp);                            \
 890      -        }                                                               \
 891      -        (connp)->conn_fanout = (connfp);                                \
 892      -        (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 893      -            IPCL_BOUND;                                                 \
 894      -        CONN_INC_REF(connp);                                            \
 895      -        mutex_exit(&(connfp)->connf_lock);                              \
 896      -}
      872 +/*
      873 + * When inserting bound or wildcard entries into the hash, ordering rules are
      874 + * used to facilitate timely and correct lookups.  The order is as follows:
      875 + * 1. Entries bound to a specific address
      876 + * 2. Entries bound to INADDR_ANY
      877 + * 3. Entries bound to ADDR_UNSPECIFIED
      878 + * Entries in a category which share conn_lport (such as those using
      879 + * SO_REUSEPORT) will be ordered such that the newest inserted is first.
      880 + */
 897  881  
 898      -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 899      -        conn_t **list, *prev, *next;                                    \
 900      -        boolean_t isv4mapped =                                          \
 901      -            IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);              \
 902      -        IPCL_HASH_REMOVE((connp));                                      \
 903      -        mutex_enter(&(connfp)->connf_lock);                             \
 904      -        list = &(connfp)->connf_head;                                   \
 905      -        prev = NULL;                                                    \
 906      -        while ((next = *list) != NULL) {                                \
 907      -                if (isv4mapped &&                                       \
 908      -                    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&    \
 909      -                    connp->conn_zoneid == next->conn_zoneid) {          \
 910      -                        (connp)->conn_next = next;                      \
 911      -                        if (prev != NULL)                               \
 912      -                                prev = next->conn_prev;                 \
 913      -                        next->conn_prev = (connp);                      \
 914      -                        break;                                          \
 915      -                }                                                       \
 916      -                list = &next->conn_next;                                \
 917      -                prev = next;                                            \
 918      -        }                                                               \
 919      -        (connp)->conn_prev = prev;                                      \
 920      -        *list = (connp);                                                \
 921      -        (connp)->conn_fanout = (connfp);                                \
 922      -        (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 923      -            IPCL_BOUND;                                                 \
 924      -        CONN_INC_REF((connp));                                          \
 925      -        mutex_exit(&(connfp)->connf_lock);                              \
      882 +void
      883 +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
      884 +{
      885 +        conn_t *pconnp, *nconnp;
      886 +
      887 +        IPCL_HASH_REMOVE(connp);
      888 +        mutex_enter(&connfp->connf_lock);
      889 +        nconnp = connfp->connf_head;
      890 +        pconnp = NULL;
      891 +        while (nconnp != NULL) {
      892 +                /*
      893 +                 * Walk through entries associated with the fanout until one is
      894 +                 * found which fulfills any of these conditions:
      895 +                 * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
      896 +                 * 2. Listen port the same as connp
      897 +                 */
      898 +                if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
      899 +                    connp->conn_lport == nconnp->conn_lport)
      900 +                        break;
      901 +                pconnp = nconnp;
      902 +                nconnp = nconnp->conn_next;
      903 +        }
      904 +        if (pconnp != NULL) {
      905 +                pconnp->conn_next = connp;
      906 +                connp->conn_prev = pconnp;
      907 +        } else {
      908 +                connfp->connf_head = connp;
      909 +        }
      910 +        if (nconnp != NULL) {
      911 +                connp->conn_next = nconnp;
      912 +                nconnp->conn_prev = connp;
      913 +        }
      914 +        connp->conn_fanout = connfp;
      915 +        connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
      916 +        CONN_INC_REF(connp);
      917 +        mutex_exit(&connfp->connf_lock);
 926  918  }
 927  919  
 928  920  void
 929  921  ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 930  922  {
 931      -        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
      923 +        conn_t *pconnp = NULL, *nconnp;
      924 +        boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
      925 +
      926 +        IPCL_HASH_REMOVE(connp);
      927 +        mutex_enter(&connfp->connf_lock);
      928 +        nconnp = connfp->connf_head;
      929 +        pconnp = NULL;
      930 +        while (nconnp != NULL) {
      931 +                if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
      932 +                    isv4mapped && connp->conn_lport == nconnp->conn_lport)
      933 +                        break;
      934 +                if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
      935 +                    (isv4mapped ||
      936 +                    connp->conn_lport == nconnp->conn_lport))
      937 +                        break;
      938 +
      939 +                pconnp = nconnp;
      940 +                nconnp = nconnp->conn_next;
      941 +        }
      942 +        if (pconnp != NULL) {
      943 +                pconnp->conn_next = connp;
      944 +                connp->conn_prev = pconnp;
      945 +        } else {
      946 +                connfp->connf_head = connp;
      947 +        }
      948 +        if (nconnp != NULL) {
      949 +                connp->conn_next = nconnp;
      950 +                nconnp->conn_prev = connp;
      951 +        }
      952 +        connp->conn_fanout = connfp;
      953 +        connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
      954 +        CONN_INC_REF(connp);
      955 +        mutex_exit(&connfp->connf_lock);
 932  956  }
 933  957  
 934  958  /*
 935  959   * Because the classifier is used to classify inbound packets, the destination
 936  960   * address is meant to be our local tunnel address (tunnel source), and the
 937  961   * source the remote tunnel address (tunnel destination).
 938  962   *
 939  963   * Note that conn_proto can't be used for fanout since the upper protocol
 940  964   * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 941  965   */
 942  966  conn_t *
 943  967  ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 944  968  {
 945  969          connf_t *connfp;
 946  970          conn_t  *connp;
 947  971  
 948  972          /* first look for IPv4 tunnel links */
 949  973          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
 950  974          mutex_enter(&connfp->connf_lock);
 951  975          for (connp = connfp->connf_head; connp != NULL;
 952  976              connp = connp->conn_next) {
 953  977                  if (IPCL_IPTUN_MATCH(connp, *dst, *src))
 954  978                          break;
 955  979          }
 956  980          if (connp != NULL)
 957  981                  goto done;
 958  982  
 959  983          mutex_exit(&connfp->connf_lock);
 960  984  
 961  985          /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
 962  986          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
 963  987              INADDR_ANY)];
 964  988          mutex_enter(&connfp->connf_lock);
 965  989          for (connp = connfp->connf_head; connp != NULL;
 966  990              connp = connp->conn_next) {
 967  991                  if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
 968  992                          break;
 969  993          }
 970  994  done:
 971  995          if (connp != NULL)
 972  996                  CONN_INC_REF(connp);
 973  997          mutex_exit(&connfp->connf_lock);
 974  998          return (connp);
 975  999  }
 976 1000  
 977 1001  conn_t *
 978 1002  ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
 979 1003  {
 980 1004          connf_t *connfp;
 981 1005          conn_t  *connp;
 982 1006  
 983 1007          /* Look for an IPv6 tunnel link */
 984 1008          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
 985 1009          mutex_enter(&connfp->connf_lock);
 986 1010          for (connp = connfp->connf_head; connp != NULL;
 987 1011              connp = connp->conn_next) {
 988 1012                  if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
 989 1013                          CONN_INC_REF(connp);
 990 1014                          break;
 991 1015                  }
 992 1016          }
 993 1017          mutex_exit(&connfp->connf_lock);
 994 1018          return (connp);
 995 1019  }
 996 1020  
 997 1021  /*
 998 1022   * This function is used only for inserting SCTP raw sockets now.
 999 1023   * This may change later.
1000 1024   *
1001 1025   * Note that only one raw socket can be bound to a port.  The param
1002 1026   * lport is in network byte order.
1003 1027   */
1004 1028  static int
1005 1029  ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1006 1030  {
1007 1031          connf_t *connfp;
1008 1032          conn_t  *oconnp;
1009 1033          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1010 1034  
1011 1035          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1012 1036  
1013 1037          /* Check for existing raw socket already bound to the port. */
1014 1038          mutex_enter(&connfp->connf_lock);
1015 1039          for (oconnp = connfp->connf_head; oconnp != NULL;
1016 1040              oconnp = oconnp->conn_next) {
1017 1041                  if (oconnp->conn_lport == lport &&
1018 1042                      oconnp->conn_zoneid == connp->conn_zoneid &&
1019 1043                      oconnp->conn_family == connp->conn_family &&
1020 1044                      ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1021 1045                      IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1022 1046                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1023 1047                      IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1024 1048                      IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1025 1049                      &connp->conn_laddr_v6))) {
1026 1050                          break;
  
    | 
      ↓ open down ↓ | 
    85 lines elided | 
    
      ↑ open up ↑ | 
  
1027 1051                  }
1028 1052          }
1029 1053          mutex_exit(&connfp->connf_lock);
1030 1054          if (oconnp != NULL)
1031 1055                  return (EADDRNOTAVAIL);
1032 1056  
1033 1057          if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1034 1058              IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1035 1059                  if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1036 1060                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1037      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1061 +                        ipcl_hash_insert_wildcard(connfp, connp);
1038 1062                  } else {
1039      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1063 +                        ipcl_hash_insert_bound(connfp, connp);
1040 1064                  }
1041 1065          } else {
1042 1066                  IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1043 1067          }
1044 1068          return (0);
1045 1069  }
1046 1070  
1047 1071  static int
1048 1072  ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 1073  {
1050 1074          connf_t *connfp;
1051 1075          conn_t  *tconnp;
1052 1076          ipaddr_t laddr = connp->conn_laddr_v4;
1053 1077          ipaddr_t faddr = connp->conn_faddr_v4;
1054 1078  
1055 1079          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 1080          mutex_enter(&connfp->connf_lock);
1057 1081          for (tconnp = connfp->connf_head; tconnp != NULL;
1058 1082              tconnp = tconnp->conn_next) {
1059 1083                  if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 1084                          /* A tunnel is already bound to these addresses. */
1061 1085                          mutex_exit(&connfp->connf_lock);
1062 1086                          return (EADDRINUSE);
1063 1087                  }
1064 1088          }
1065 1089          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 1090          mutex_exit(&connfp->connf_lock);
1067 1091          return (0);
1068 1092  }
1069 1093  
1070 1094  static int
1071 1095  ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 1096  {
1073 1097          connf_t *connfp;
1074 1098          conn_t  *tconnp;
1075 1099          in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 1100          in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 1101  
1078 1102          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 1103          mutex_enter(&connfp->connf_lock);
1080 1104          for (tconnp = connfp->connf_head; tconnp != NULL;
1081 1105              tconnp = tconnp->conn_next) {
1082 1106                  if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 1107                          /* A tunnel is already bound to these addresses. */
1084 1108                          mutex_exit(&connfp->connf_lock);
1085 1109                          return (EADDRINUSE);
1086 1110                  }
1087 1111          }
1088 1112          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 1113          mutex_exit(&connfp->connf_lock);
1090 1114          return (0);
1091 1115  }
1092 1116  
1093 1117  /*
1094 1118   * Check for a MAC exemption conflict on a labeled system.  Note that for
1095 1119   * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096 1120   * transport layer.  This check is for binding all other protocols.
1097 1121   *
1098 1122   * Returns true if there's a conflict.
1099 1123   */
1100 1124  static boolean_t
1101 1125  check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 1126  {
1103 1127          connf_t *connfp;
1104 1128          conn_t *tconn;
1105 1129  
1106 1130          connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 1131          mutex_enter(&connfp->connf_lock);
1108 1132          for (tconn = connfp->connf_head; tconn != NULL;
1109 1133              tconn = tconn->conn_next) {
1110 1134                  /* We don't allow v4 fallback for v6 raw socket */
1111 1135                  if (connp->conn_family != tconn->conn_family)
1112 1136                          continue;
1113 1137                  /* If neither is exempt, then there's no conflict */
1114 1138                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 1139                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 1140                          continue;
1117 1141                  /* We are only concerned about sockets for a different zone */
1118 1142                  if (connp->conn_zoneid == tconn->conn_zoneid)
1119 1143                          continue;
1120 1144                  /* If both are bound to different specific addrs, ok */
1121 1145                  if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 1146                      tconn->conn_laddr_v4 != INADDR_ANY &&
1123 1147                      connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 1148                          continue;
1125 1149                  /* These two conflict; fail */
1126 1150                  break;
1127 1151          }
1128 1152          mutex_exit(&connfp->connf_lock);
1129 1153          return (tconn != NULL);
1130 1154  }
1131 1155  
1132 1156  static boolean_t
1133 1157  check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 1158  {
1135 1159          connf_t *connfp;
1136 1160          conn_t *tconn;
1137 1161  
1138 1162          connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139 1163          mutex_enter(&connfp->connf_lock);
1140 1164          for (tconn = connfp->connf_head; tconn != NULL;
1141 1165              tconn = tconn->conn_next) {
1142 1166                  /* We don't allow v4 fallback for v6 raw socket */
1143 1167                  if (connp->conn_family != tconn->conn_family)
1144 1168                          continue;
1145 1169                  /* If neither is exempt, then there's no conflict */
1146 1170                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147 1171                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148 1172                          continue;
1149 1173                  /* We are only concerned about sockets for a different zone */
1150 1174                  if (connp->conn_zoneid == tconn->conn_zoneid)
1151 1175                          continue;
1152 1176                  /* If both are bound to different addrs, ok */
1153 1177                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154 1178                      !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155 1179                      !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156 1180                      &tconn->conn_laddr_v6))
1157 1181                          continue;
1158 1182                  /* These two conflict; fail */
1159 1183                  break;
1160 1184          }
1161 1185          mutex_exit(&connfp->connf_lock);
1162 1186          return (tconn != NULL);
1163 1187  }
1164 1188  
1165 1189  /*
1166 1190   * (v4, v6) bind hash insertion routines
1167 1191   * The caller has set up the conn (conn_proto, conn_laddr_v6, conn_lport)
1168 1192   */
1169 1193  
1170 1194  int
1171 1195  ipcl_bind_insert(conn_t *connp)
1172 1196  {
1173 1197          if (connp->conn_ipversion == IPV6_VERSION)
1174 1198                  return (ipcl_bind_insert_v6(connp));
1175 1199          else
1176 1200                  return (ipcl_bind_insert_v4(connp));
1177 1201  }
1178 1202  
           /*
            * Insert an IPv4 conn into the bind-side fanout table for its protocol.
            *
            *  - Tunnel conns (IPCL_IS_IPTUN) are handed to ipcl_iptun_hash_insert().
            *  - UDP goes into ips_ipcl_udp_fanout, hashed by conn_lport.
            *  - TCP goes into ips_ipcl_bind_fanout, hashed by conn_lport; if the
            *    cl_inet_listen hook is set, the conn is flagged IPCL_CL_LISTENER
            *    and the hook is called.
            *  - SCTP is delegated to ipcl_sctp_hash_insert().
            *  - Any other protocol goes into ips_ipcl_proto_fanout_v4[protocol],
            *    after check_exempt_conflict_v4() on labeled systems.
            *
            * Returns 0 on success, EADDRINUSE when the exempt-conflict check
            * fails, or the value returned by the tunnel/SCTP helper.
            */
1179 1203  int
1180 1204  ipcl_bind_insert_v4(conn_t *connp)
1181 1205  {
1182 1206          connf_t *connfp;
1183 1207          int     ret = 0;
1184 1208          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1185 1209          uint16_t        lport = connp->conn_lport;
1186 1210          uint8_t         protocol = connp->conn_proto;
1187 1211  
1188 1212          if (IPCL_IS_IPTUN(connp))
1189 1213                  return (ipcl_iptun_hash_insert(connp, ipst));
1190 1214  
                   /*
                    * "default" is listed first so that, once the labeled-system
                    * conflict check has passed, it falls through into the
                    * insertion code it shares with UDP.
                    */
1191 1215          switch (protocol) {
1192 1216          default:
1193 1217                  if (is_system_labeled() &&
1194 1218                      check_exempt_conflict_v4(connp, ipst))
1195 1219                          return (EADDRINUSE);
1196 1220                  /* FALLTHROUGH */
1197 1221          case IPPROTO_UDP:
  
    | 
      ↓ open down ↓ | 
    148 lines elided | 
    
      ↑ open up ↑ | 
  
1198 1222                  if (protocol == IPPROTO_UDP) {
1199 1223                          connfp = &ipst->ips_ipcl_udp_fanout[
1200 1224                              IPCL_UDP_HASH(lport, ipst)];
1201 1225                  } else {
1202 1226                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1203 1227                  }
1204 1228  
                           /*
                            * Choose the insertion flavour from how much of the
                            * address tuple is set: foreign address set (connected),
                            * only local address set (bound), or neither (wildcard).
                            */
1205 1229                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1206 1230                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1207 1231                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1208      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1232 +                        ipcl_hash_insert_bound(connfp, connp);
1209 1233                  } else {
1210      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1234 +                        ipcl_hash_insert_wildcard(connfp, connp);
1211 1235                  }
1212 1236                  if (protocol == IPPROTO_RSVP)
1213 1237                          ill_set_inputfn_all(ipst);
1214 1238                  break;
1215 1239  
1216 1240          case IPPROTO_TCP:
1217 1241                  /* Insert it in the Bind Hash */
1218 1242                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1219 1243                  connfp = &ipst->ips_ipcl_bind_fanout[
1220 1244                      IPCL_BIND_HASH(lport, ipst)];
1221 1245                  if (connp->conn_laddr_v4 != INADDR_ANY) {
1222      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1246 +                        ipcl_hash_insert_bound(connfp, connp);
1223 1247                  } else {
1224      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1248 +                        ipcl_hash_insert_wildcard(connfp, connp);
1225 1249                  }
                           /* Optional hook: only called when it has been set */
1226 1250                  if (cl_inet_listen != NULL) {
1227 1251                          ASSERT(connp->conn_ipversion == IPV4_VERSION);
1228 1252                          connp->conn_flags |= IPCL_CL_LISTENER;
1229 1253                          (*cl_inet_listen)(
1230 1254                              connp->conn_netstack->netstack_stackid,
1231 1255                              IPPROTO_TCP, AF_INET,
1232 1256                              (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1233 1257                  }
1234 1258                  break;
1235 1259  
1236 1260          case IPPROTO_SCTP:
1237 1261                  ret = ipcl_sctp_hash_insert(connp, lport);
1238 1262                  break;
1239 1263          }
1240 1264  
1241 1265          return (ret);
1242 1266  }
1243 1267  
           /*
            * IPv6 counterpart of ipcl_bind_insert_v4(): insert a conn into the
            * bind-side fanout table for its protocol.
            *
            *  - Tunnel conns (IPCL_IS_IPTUN) are handed to
            *    ipcl_iptun_hash_insert_v6().
            *  - UDP goes into ips_ipcl_udp_fanout, hashed by conn_lport.
            *  - TCP goes into ips_ipcl_bind_fanout, hashed by conn_lport; if the
            *    cl_inet_listen hook is set, the conn is flagged IPCL_CL_LISTENER
            *    and the hook is called.
            *  - SCTP is delegated to ipcl_sctp_hash_insert().
            *  - Any other protocol goes into ips_ipcl_proto_fanout_v6[protocol],
            *    after check_exempt_conflict_v6() on labeled systems.
            *
            * Returns 0 on success, EADDRINUSE when the exempt-conflict check
            * fails, or the value returned by the tunnel/SCTP helper.
            */
1244 1268  int
1245 1269  ipcl_bind_insert_v6(conn_t *connp)
1246 1270  {
1247 1271          connf_t         *connfp;
1248 1272          int             ret = 0;
1249 1273          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1250 1274          uint16_t        lport = connp->conn_lport;
1251 1275          uint8_t         protocol = connp->conn_proto;
1252 1276  
1253 1277          if (IPCL_IS_IPTUN(connp)) {
1254 1278                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1255 1279          }
1256 1280  
                   /*
                    * "default" is listed first so that, once the labeled-system
                    * conflict check has passed, it falls through into the
                    * insertion code it shares with UDP.
                    */
1257 1281          switch (protocol) {
1258 1282          default:
1259 1283                  if (is_system_labeled() &&
1260 1284                      check_exempt_conflict_v6(connp, ipst))
1261 1285                          return (EADDRINUSE);
1262 1286                  /* FALLTHROUGH */
1263 1287          case IPPROTO_UDP:
  
    | 
      ↓ open down ↓ | 
    29 lines elided | 
    
      ↑ open up ↑ | 
  
1264 1288                  if (protocol == IPPROTO_UDP) {
1265 1289                          connfp = &ipst->ips_ipcl_udp_fanout[
1266 1290                              IPCL_UDP_HASH(lport, ipst)];
1267 1291                  } else {
1268 1292                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1269 1293                  }
1270 1294  
                           /*
                            * Choose the insertion flavour from how much of the
                            * address tuple is set: foreign address set (connected),
                            * only local address set (bound), or neither (wildcard).
                            */
1271 1295                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1272 1296                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1273 1297                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1274      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1298 +                        ipcl_hash_insert_bound(connfp, connp);
1275 1299                  } else {
1276      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1300 +                        ipcl_hash_insert_wildcard(connfp, connp);
1277 1301                  }
1278 1302                  break;
1279 1303  
1280 1304          case IPPROTO_TCP:
1281 1305                  /* Insert it in the Bind Hash */
1282 1306                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1283 1307                  connfp = &ipst->ips_ipcl_bind_fanout[
1284 1308                      IPCL_BIND_HASH(lport, ipst)];
1285 1309                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1286      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1310 +                        ipcl_hash_insert_bound(connfp, connp);
1287 1311                  } else {
1288      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1312 +                        ipcl_hash_insert_wildcard(connfp, connp);
1289 1313                  }
                           /* Optional hook: only called when it has been set */
1290 1314                  if (cl_inet_listen != NULL) {
1291 1315                          sa_family_t     addr_family;
1292 1316                          uint8_t         *laddrp;
1293 1317  
                                   /*
                                    * conn_ipversion can be either version here, so
                                    * report the matching address family and bound
                                    * address.  (Presumably the non-IPv6 case is an
                                    * AF_INET6 socket carrying IPv4 -- TODO confirm.)
                                    */
1294 1318                          if (connp->conn_ipversion == IPV6_VERSION) {
1295 1319                                  addr_family = AF_INET6;
1296 1320                                  laddrp =
1297 1321                                      (uint8_t *)&connp->conn_bound_addr_v6;
1298 1322                          } else {
1299 1323                                  addr_family = AF_INET;
1300 1324                                  laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1301 1325                          }
1302 1326                          connp->conn_flags |= IPCL_CL_LISTENER;
1303 1327                          (*cl_inet_listen)(
1304 1328                              connp->conn_netstack->netstack_stackid,
1305 1329                              IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1306 1330                  }
1307 1331                  break;
1308 1332  
1309 1333          case IPPROTO_SCTP:
1310 1334                  ret = ipcl_sctp_hash_insert(connp, lport);
1311 1335                  break;
1312 1336          }
1313 1337  
1314 1338          return (ret);
1315 1339  }
1316 1340  
1317 1341  /*
1318 1342   * ipcl_conn_hash insertion routines.
1319 1343   * The caller has already set conn_proto and the addresses/ports in the conn_t.
            *
            * ipcl_conn_insert() only dispatches on conn_ipversion to the v6 or v4
            * routine and returns that routine's result.
1320 1344   */
1321 1345  
1322 1346  int
1323 1347  ipcl_conn_insert(conn_t *connp)
1324 1348  {
1325 1349          if (connp->conn_ipversion == IPV6_VERSION)
1326 1350                  return (ipcl_conn_insert_v6(connp));
1327 1351          else
1328 1352                  return (ipcl_conn_insert_v4(connp));
1329 1353  }
1330 1354  
           /*
            * Insert an IPv4 conn into the connection-side fanout for its protocol.
            *
            *  - Tunnel conns (IPCL_IS_IPTUN) are handed to ipcl_iptun_hash_insert().
            *  - TCP goes into ips_ipcl_conn_fanout, hashed by foreign address and
            *    ports, and is rejected if a conn with the same tuple and a matching
            *    zone is already in the bucket.
            *  - SCTP is removed from any hash it is on and then delegated to
            *    ipcl_sctp_hash_insert().
            *  - UDP goes into ips_ipcl_udp_fanout, hashed by conn_lport.
            *  - Any other protocol goes into ips_ipcl_proto_fanout_v4[protocol],
            *    after check_exempt_conflict_v4() on labeled systems.
            *
            * Returns 0 on success, EADDRINUSE for a duplicate TCP tuple or an
            * exempt conflict, or the value returned by the tunnel/SCTP helper.
            */
1331 1355  int
1332 1356  ipcl_conn_insert_v4(conn_t *connp)
1333 1357  {
1334 1358          connf_t         *connfp;
1335 1359          conn_t          *tconnp;
1336 1360          int             ret = 0;
1337 1361          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1338 1362          uint16_t        lport = connp->conn_lport;
1339 1363          uint8_t         protocol = connp->conn_proto;
1340 1364  
1341 1365          if (IPCL_IS_IPTUN(connp))
1342 1366                  return (ipcl_iptun_hash_insert(connp, ipst));
1343 1367  
1344 1368          switch (protocol) {
1345 1369          case IPPROTO_TCP:
1346 1370                  /*
1347 1371                   * For TCP, we check whether the connection tuple already
1348 1372                   * exists before allowing the connection to proceed.  We
1349 1373                   * also allow indexing on the zoneid. This is to allow
1350 1374                   * multiple shared stack zones to have the same tcp
1351 1375                   * connection tuple. In practice this only happens for
1352 1376                   * INADDR_LOOPBACK as it's the only local address which
1353 1377                   * doesn't have to be unique.
1354 1378                   */
1355 1379                  connfp = &ipst->ips_ipcl_conn_fanout[
1356 1380                      IPCL_CONN_HASH(connp->conn_faddr_v4,
1357 1381                      connp->conn_ports, ipst)];
1358 1382                  mutex_enter(&connfp->connf_lock);
1359 1383                  for (tconnp = connfp->connf_head; tconnp != NULL;
1360 1384                      tconnp = tconnp->conn_next) {
1361 1385                          if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1362 1386                              connp->conn_faddr_v4, connp->conn_laddr_v4,
1363 1387                              connp->conn_ports) &&
1364 1388                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1365 1389                                  /* Already have a conn. bail out */
1366 1390                                  mutex_exit(&connfp->connf_lock);
1367 1391                                  return (EADDRINUSE);
1368 1392                          }
1369 1393                  }
1370 1394                  if (connp->conn_fanout != NULL) {
1371 1395                          /*
1372 1396                           * Probably a XTI/TLI application trying to do a
1373 1397                           * rebind. Let it happen.
                                    *
                                    * The bucket lock is released around the removal
                                    * and then retaken.
                                    * NOTE(review): the duplicate scan above is not
                                    * repeated after the lock is retaken; presumably
                                    * callers are serialized elsewhere -- confirm.
1374 1398                           */
1375 1399                          mutex_exit(&connfp->connf_lock);
1376 1400                          IPCL_HASH_REMOVE(connp);
1377 1401                          mutex_enter(&connfp->connf_lock);
1378 1402                  }
1379 1403  
1380 1404                  ASSERT(connp->conn_recv != NULL);
1381 1405                  ASSERT(connp->conn_recvicmp != NULL);
1382 1406  
                           /* connf_lock is already held, so use the _LOCKED form */
1383 1407                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1384 1408                  mutex_exit(&connfp->connf_lock);
1385 1409                  break;
1386 1410  
1387 1411          case IPPROTO_SCTP:
1388 1412                  /*
1389 1413                   * The raw socket may have already been bound, remove it
1390 1414                   * from the hash first.
1391 1415                   */
1392 1416                  IPCL_HASH_REMOVE(connp);
1393 1417                  ret = ipcl_sctp_hash_insert(connp, lport);
1394 1418                  break;
1395 1419  
1396 1420          default:
1397 1421                  /*
1398 1422                   * Check for conflicts among MAC exempt bindings.  For
1399 1423                   * transports with port numbers, this is done by the upper
1400 1424                   * level per-transport binding logic.  For all others, it's
1401 1425                   * done here.
1402 1426                   */
1403 1427                  if (is_system_labeled() &&
1404 1428                      check_exempt_conflict_v4(connp, ipst))
1405 1429                          return (EADDRINUSE);
1406 1430                  /* FALLTHROUGH */
1407 1431  
1408 1432          case IPPROTO_UDP:
  
    | 
      ↓ open down ↓ | 
    110 lines elided | 
    
      ↑ open up ↑ | 
  
1409 1433                  if (protocol == IPPROTO_UDP) {
1410 1434                          connfp = &ipst->ips_ipcl_udp_fanout[
1411 1435                              IPCL_UDP_HASH(lport, ipst)];
1412 1436                  } else {
1413 1437                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1414 1438                  }
1415 1439  
                           /*
                            * Choose the insertion flavour from how much of the
                            * address tuple is set: foreign address set (connected),
                            * only local address set (bound), or neither (wildcard).
                            */
1416 1440                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1417 1441                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1418 1442                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1419      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1443 +                        ipcl_hash_insert_bound(connfp, connp);
1420 1444                  } else {
1421      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1445 +                        ipcl_hash_insert_wildcard(connfp, connp);
1422 1446                  }
1423 1447                  break;
1424 1448          }
1425 1449  
1426 1450          return (ret);
1427 1451  }
1428 1452  
           /*
            * IPv6 counterpart of ipcl_conn_insert_v4(): insert a conn into the
            * connection-side fanout for its protocol.
            *
            *  - Tunnel conns (IPCL_IS_IPTUN) are handed to
            *    ipcl_iptun_hash_insert_v6().
            *  - TCP goes into ips_ipcl_conn_fanout, hashed by foreign address and
            *    ports, and is rejected if the bucket already holds a conn with the
            *    same tuple, a matching zone, and a conn_bound_if that is either 0
            *    or equal to this conn's conn_bound_if.
            *  - SCTP is removed from any hash it is on and then delegated to
            *    ipcl_sctp_hash_insert().
            *  - UDP goes into ips_ipcl_udp_fanout, hashed by conn_lport.
            *  - Any other protocol goes into ips_ipcl_proto_fanout_v6[protocol],
            *    after check_exempt_conflict_v6() on labeled systems.
            *
            * Returns 0 on success, EADDRINUSE for a duplicate TCP tuple or an
            * exempt conflict, or the value returned by the tunnel/SCTP helper.
            */
1429 1453  int
1430 1454  ipcl_conn_insert_v6(conn_t *connp)
1431 1455  {
1432 1456          connf_t         *connfp;
1433 1457          conn_t          *tconnp;
1434 1458          int             ret = 0;
1435 1459          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1436 1460          uint16_t        lport = connp->conn_lport;
1437 1461          uint8_t         protocol = connp->conn_proto;
1438 1462          uint_t          ifindex = connp->conn_bound_if;
1439 1463  
1440 1464          if (IPCL_IS_IPTUN(connp))
1441 1465                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1442 1466  
1443 1467          switch (protocol) {
1444 1468          case IPPROTO_TCP:
1445 1469  
1446 1470                  /*
1447 1471                   * For tcp, we check whether the connection tuple already
1448 1472                   * exists before allowing the connection to proceed.  We
1449 1473                   * also allow indexing on the zoneid. This is to allow
1450 1474                   * multiple shared stack zones to have the same tcp
1451 1475                   * connection tuple. In practice this only happens for
1452 1476                   * ipv6_loopback as it's the only local address which
1453 1477                   * doesn't have to be unique.
1454 1478                   */
1455 1479                  connfp = &ipst->ips_ipcl_conn_fanout[
1456 1480                      IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1457 1481                      ipst)];
1458 1482                  mutex_enter(&connfp->connf_lock);
1459 1483                  for (tconnp = connfp->connf_head; tconnp != NULL;
1460 1484                      tconnp = tconnp->conn_next) {
1461 1485                          /* NOTE: need to match zoneid. Bug in onnv-gate */
1462 1486                          if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1463 1487                              connp->conn_faddr_v6, connp->conn_laddr_v6,
1464 1488                              connp->conn_ports) &&
1465 1489                              (tconnp->conn_bound_if == 0 ||
1466 1490                              tconnp->conn_bound_if == ifindex) &&
1467 1491                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1468 1492                                  /* Already have a conn. bail out */
1469 1493                                  mutex_exit(&connfp->connf_lock);
1470 1494                                  return (EADDRINUSE);
1471 1495                          }
1472 1496                  }
1473 1497                  if (connp->conn_fanout != NULL) {
1474 1498                          /*
1475 1499                           * Probably a XTI/TLI application trying to do a
1476 1500                           * rebind. Let it happen.
                                    *
                                    * The bucket lock is released around the removal
                                    * and then retaken.
                                    * NOTE(review): the duplicate scan above is not
                                    * repeated after the lock is retaken; presumably
                                    * callers are serialized elsewhere -- confirm.
1477 1501                           */
1478 1502                          mutex_exit(&connfp->connf_lock);
1479 1503                          IPCL_HASH_REMOVE(connp);
1480 1504                          mutex_enter(&connfp->connf_lock);
1481 1505                  }
                           /* connf_lock is already held, so use the _LOCKED form */
1482 1506                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1483 1507                  mutex_exit(&connfp->connf_lock);
1484 1508                  break;
1485 1509  
1486 1510          case IPPROTO_SCTP:
                           /* Take the conn off any hash it is on before re-inserting */
1487 1511                  IPCL_HASH_REMOVE(connp);
1488 1512                  ret = ipcl_sctp_hash_insert(connp, lport);
1489 1513                  break;
1490 1514  
1491 1515          default:
1492 1516                  if (is_system_labeled() &&
1493 1517                      check_exempt_conflict_v6(connp, ipst))
1494 1518                          return (EADDRINUSE);
1495 1519                  /* FALLTHROUGH */
1496 1520          case IPPROTO_UDP:
  
    | 
      ↓ open down ↓ | 
    65 lines elided | 
    
      ↑ open up ↑ | 
  
1497 1521                  if (protocol == IPPROTO_UDP) {
1498 1522                          connfp = &ipst->ips_ipcl_udp_fanout[
1499 1523                              IPCL_UDP_HASH(lport, ipst)];
1500 1524                  } else {
1501 1525                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1502 1526                  }
1503 1527  
                           /*
                            * Choose the insertion flavour from how much of the
                            * address tuple is set: foreign address set (connected),
                            * only local address set (bound), or neither (wildcard).
                            */
1504 1528                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1505 1529                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1506 1530                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1507      -                        IPCL_HASH_INSERT_BOUND(connfp, connp);
     1531 +                        ipcl_hash_insert_bound(connfp, connp);
1508 1532                  } else {
1509      -                        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
     1533 +                        ipcl_hash_insert_wildcard(connfp, connp);
1510 1534                  }
1511 1535                  break;
1512 1536          }
1513 1537  
1514 1538          return (ret);
1515 1539  }
1516 1540  
1517 1541  /*
1518 1542   * v4 packet classifying function. Looks up the fanout tables to
1519 1543   * find the conn the packet belongs to. Returns the conn with
1520 1544   * a reference held, NULL otherwise.
1521 1545   *
1522 1546   * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523 1547   * Lookup" comment block are applied.  Labels are also checked as described
1524 1548   * above.  If the packet is from the inside (looped back), and is from the same
1525 1549   * zone, then label checks are omitted.
            *
            * Per protocol:
            *  - TCP: the connected fanout is searched first using the full
            *    address/port tuple; if nothing matches, the bind fanout is searched
            *    by destination address and local port.
            *  - UDP: the UDP fanout is searched by local port.
            *  - ENCAP/IPV6: the lookup is handed to ipcl_iptun_classify_v4().
            *  - Anything else returns NULL.
1526 1550   */
1527 1551  conn_t *
1528 1552  ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1529 1553      ip_recv_attr_t *ira, ip_stack_t *ipst)
1530 1554  {
1531 1555          ipha_t  *ipha;
1532 1556          connf_t *connfp, *bind_connfp;
1533 1557          uint16_t lport;
1534 1558          uint16_t fport;
1535 1559          uint32_t ports;
1536 1560          conn_t  *connp;
1537 1561          uint16_t  *up;
1538 1562          zoneid_t        zoneid = ira->ira_zoneid;
1539 1563  
1540 1564          ipha = (ipha_t *)mp->b_rptr;
                   /*
                    * up addresses the two 16-bit port fields that follow the IP
                    * header: up[0] is used as the foreign (sender's) port and
                    * up[1] as the local port below.
                    */
1541 1565          up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1542 1566  
1543 1567          switch (protocol) {
1544 1568          case IPPROTO_TCP:
                           /* Both ports read as one 32-bit value for hash and match */
1545 1569                  ports = *(uint32_t *)up;
1546 1570                  connfp =
1547 1571                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1548 1572                      ports, ipst)];
1549 1573                  mutex_enter(&connfp->connf_lock);
1550 1574                  for (connp = connfp->connf_head; connp != NULL;
1551 1575                      connp = connp->conn_next) {
1552 1576                          if (IPCL_CONN_MATCH(connp, protocol,
1553 1577                              ipha->ipha_src, ipha->ipha_dst, ports) &&
1554 1578                              (connp->conn_zoneid == zoneid ||
1555 1579                              connp->conn_allzones ||
1556 1580                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1557 1581                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1558 1582                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1559 1583                                  break;
1560 1584                  }
1561 1585  
1562 1586                  if (connp != NULL) {
1563 1587                          /*
1564 1588                           * We have a fully-bound TCP connection.
1565 1589                           *
1566 1590                           * For labeled systems, there's no need to check the
1567 1591                           * label here.  It's known to be good as we checked
1568 1592                           * before allowing the connection to become bound.
1569 1593                           */
1570 1594                          CONN_INC_REF(connp);
1571 1595                          mutex_exit(&connfp->connf_lock);
1572 1596                          return (connp);
1573 1597                  }
1574 1598  
                           /* No connected match: fall back to the bind fanout */
1575 1599                  mutex_exit(&connfp->connf_lock);
1576 1600                  lport = up[1];
1577 1601                  bind_connfp =
1578 1602                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1579 1603                  mutex_enter(&bind_connfp->connf_lock);
1580 1604                  for (connp = bind_connfp->connf_head; connp != NULL;
1581 1605                      connp = connp->conn_next) {
1582 1606                          if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1583 1607                              lport) &&
1584 1608                              (connp->conn_zoneid == zoneid ||
1585 1609                              connp->conn_allzones ||
1586 1610                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1587 1611                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1588 1612                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1589 1613                                  break;
1590 1614                  }
1591 1615  
1592 1616                  /*
1593 1617                   * If the matching connection is SLP on a private address, then
1594 1618                   * the label on the packet must match the local zone's label.
1595 1619                   * Otherwise, it must be in the label range defined by tnrh.
1596 1620                   * This is ensured by tsol_receive_local.
1597 1621                   *
1598 1622                   * Note that we don't check tsol_receive_local for
1599 1623                   * the connected case.
1600 1624                   */
1601 1625                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1602 1626                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1603 1627                      ira, connp)) {
1604 1628                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1605 1629                              char *, "connp(1) could not receive mp(2)",
1606 1630                              conn_t *, connp, mblk_t *, mp);
1607 1631                          connp = NULL;
1608 1632                  }
1609 1633  
1610 1634                  if (connp != NULL) {
1611 1635                          /* Have a listener at least */
1612 1636                          CONN_INC_REF(connp);
1613 1637                          mutex_exit(&bind_connfp->connf_lock);
1614 1638                          return (connp);
1615 1639                  }
1616 1640  
1617 1641                  mutex_exit(&bind_connfp->connf_lock);
1618 1642                  break;
1619 1643  
1620 1644          case IPPROTO_UDP:
1621 1645                  lport = up[1];
1622 1646                  fport = up[0];
1623 1647                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1624 1648                  mutex_enter(&connfp->connf_lock);
                           /*
                            * NOTE(review): unlike the TCP loops above and the UDP
                            * loop in ipcl_classify_v6(), the MAC-exempt clause here
                            * does not also test IRAF_TX_SHARED_ADDR -- confirm that
                            * this is intentional.
                            */
1625 1649                  for (connp = connfp->connf_head; connp != NULL;
1626 1650                      connp = connp->conn_next) {
1627 1651                          if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1628 1652                              fport, ipha->ipha_src) &&
1629 1653                              (connp->conn_zoneid == zoneid ||
1630 1654                              connp->conn_allzones ||
1631 1655                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1632 1656                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1633 1657                                  break;
1634 1658                  }
1635 1659  
                           /* On labeled systems the match must also pass the label check */
1636 1660                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1637 1661                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1638 1662                      ira, connp)) {
1639 1663                          DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1640 1664                              char *, "connp(1) could not receive mp(2)",
1641 1665                              conn_t *, connp, mblk_t *, mp);
1642 1666                          connp = NULL;
1643 1667                  }
1644 1668  
1645 1669                  if (connp != NULL) {
1646 1670                          CONN_INC_REF(connp);
1647 1671                          mutex_exit(&connfp->connf_lock);
1648 1672                          return (connp);
1649 1673                  }
1650 1674  
1651 1675                  /*
1652 1676                   * We shouldn't come here for multicast/broadcast packets
1653 1677                   */
1654 1678                  mutex_exit(&connfp->connf_lock);
1655 1679  
1656 1680                  break;
1657 1681  
1658 1682          case IPPROTO_ENCAP:
1659 1683          case IPPROTO_IPV6:
1660 1684                  return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1661 1685                      &ipha->ipha_dst, ipst));
1662 1686          }
1663 1687  
                   /* No match, or a protocol this function does not classify */
1664 1688          return (NULL);
1665 1689  }
1666 1690  
           /*
            * v6 packet classifying function.  Looks up the fanout tables to find
            * the conn the packet belongs to.  Returns the conn with a reference
            * held, NULL otherwise.
            *
            * Per protocol:
            *  - TCP: the connected fanout is searched first using the full
            *    address/port tuple; if nothing matches, the bind fanout is searched
            *    by destination address and local port.
            *  - UDP: the UDP fanout is searched by local port.
            *  - ENCAP/IPV6: the lookup is handed to ipcl_iptun_classify_v6().
            *  - Anything else returns NULL.
            *
            * On labeled systems (IRAF_SYSTEM_LABELED) a bind-fanout or UDP match is
            * dropped if tsol_receive_local() rejects it; the connected TCP match is
            * not re-checked.
            */
1667 1691  conn_t *
1668 1692  ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1669 1693      ip_recv_attr_t *ira, ip_stack_t *ipst)
1670 1694  {
1671 1695          ip6_t           *ip6h;
1672 1696          connf_t         *connfp, *bind_connfp;
1673 1697          uint16_t        lport;
1674 1698          uint16_t        fport;
1675 1699          tcpha_t         *tcpha;
1676 1700          uint32_t        ports;
1677 1701          conn_t          *connp;
1678 1702          uint16_t        *up;
1679 1703          zoneid_t        zoneid = ira->ira_zoneid;
1680 1704  
1681 1705          ip6h = (ip6_t *)mp->b_rptr;
1682 1706  
1683 1707          switch (protocol) {
1684 1708          case IPPROTO_TCP:
                           /* The TCP header starts hdr_len bytes into the mblk data */
1685 1709                  tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1686 1710                  up = &tcpha->tha_lport;
                           /* Both ports read as one 32-bit value for hash and match */
1687 1711                  ports = *(uint32_t *)up;
1688 1712  
1689 1713                  connfp =
1690 1714                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1691 1715                      ports, ipst)];
1692 1716                  mutex_enter(&connfp->connf_lock);
1693 1717                  for (connp = connfp->connf_head; connp != NULL;
1694 1718                      connp = connp->conn_next) {
1695 1719                          if (IPCL_CONN_MATCH_V6(connp, protocol,
1696 1720                              ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1697 1721                              (connp->conn_zoneid == zoneid ||
1698 1722                              connp->conn_allzones ||
1699 1723                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1700 1724                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1701 1725                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1702 1726                                  break;
1703 1727                  }
1704 1728  
1705 1729                  if (connp != NULL) {
1706 1730                          /*
1707 1731                           * We have a fully-bound TCP connection.
1708 1732                           *
1709 1733                           * For labeled systems, there's no need to check the
1710 1734                           * label here.  It's known to be good as we checked
1711 1735                           * before allowing the connection to become bound.
1712 1736                           */
1713 1737                          CONN_INC_REF(connp);
1714 1738                          mutex_exit(&connfp->connf_lock);
1715 1739                          return (connp);
1716 1740                  }
1717 1741  
                           /* No connected match: fall back to the bind fanout */
1718 1742                  mutex_exit(&connfp->connf_lock);
1719 1743  
1720 1744                  lport = up[1];
1721 1745                  bind_connfp =
1722 1746                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1723 1747                  mutex_enter(&bind_connfp->connf_lock);
1724 1748                  for (connp = bind_connfp->connf_head; connp != NULL;
1725 1749                      connp = connp->conn_next) {
1726 1750                          if (IPCL_BIND_MATCH_V6(connp, protocol,
1727 1751                              ip6h->ip6_dst, lport) &&
1728 1752                              (connp->conn_zoneid == zoneid ||
1729 1753                              connp->conn_allzones ||
1730 1754                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1731 1755                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1732 1756                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1733 1757                                  break;
1734 1758                  }
1735 1759  
                           /* On labeled systems the match must also pass the label check */
1736 1760                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1737 1761                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1738 1762                      ira, connp)) {
1739 1763                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1740 1764                              char *, "connp(1) could not receive mp(2)",
1741 1765                              conn_t *, connp, mblk_t *, mp);
1742 1766                          connp = NULL;
1743 1767                  }
1744 1768  
1745 1769                  if (connp != NULL) {
1746 1770                          /* Have a listener at least */
1747 1771                          CONN_INC_REF(connp);
1748 1772                          mutex_exit(&bind_connfp->connf_lock);
1749 1773                          return (connp);
1750 1774                  }
1751 1775  
1752 1776                  mutex_exit(&bind_connfp->connf_lock);
1753 1777                  break;
1754 1778  
1755 1779          case IPPROTO_UDP:
                           /* up[0] is used as the foreign port, up[1] as the local port */
1756 1780                  up = (uint16_t *)&mp->b_rptr[hdr_len];
1757 1781                  lport = up[1];
1758 1782                  fport = up[0];
1759 1783                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1760 1784                  mutex_enter(&connfp->connf_lock);
1761 1785                  for (connp = connfp->connf_head; connp != NULL;
1762 1786                      connp = connp->conn_next) {
1763 1787                          if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1764 1788                              fport, ip6h->ip6_src) &&
1765 1789                              (connp->conn_zoneid == zoneid ||
1766 1790                              connp->conn_allzones ||
1767 1791                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1768 1792                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1769 1793                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1770 1794                                  break;
1771 1795                  }
1772 1796  
                           /* On labeled systems the match must also pass the label check */
1773 1797                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1774 1798                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1775 1799                      ira, connp)) {
1776 1800                          DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1777 1801                              char *, "connp(1) could not receive mp(2)",
1778 1802                              conn_t *, connp, mblk_t *, mp);
1779 1803                          connp = NULL;
1780 1804                  }
1781 1805  
1782 1806                  if (connp != NULL) {
1783 1807                          CONN_INC_REF(connp);
1784 1808                          mutex_exit(&connfp->connf_lock);
1785 1809                          return (connp);
1786 1810                  }
1787 1811  
1788 1812                  /*
1789 1813                   * We shouldn't come here for multicast/broadcast packets
1790 1814                   */
1791 1815                  mutex_exit(&connfp->connf_lock);
1792 1816                  break;
1793 1817          case IPPROTO_ENCAP:
1794 1818          case IPPROTO_IPV6:
1795 1819                  return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1796 1820                      &ip6h->ip6_dst, ipst));
1797 1821          }
1798 1822  
                   /* No match, or a protocol this function does not classify */
1799 1823          return (NULL);
1800 1824  }
1801 1825  
1802 1826  /*
1803 1827   * wrapper around ipcl_classify_(v4,v6) routines.
            *
            * Picks the address family from the IRAF_IS_IPV4 bit in ira->ira_flags and
            * forwards to ipcl_classify_v4() or ipcl_classify_v6(), supplying the
            * protocol (ira_protocol) and IP header length (ira_ip_hdr_length) taken
            * from ira.  The selected routine's return value is passed back unchanged.
1804 1828   */
1805 1829  conn_t *
1806 1830  ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 1831  {
1808 1832          if (ira->ira_flags & IRAF_IS_IPV4) {
1809 1833                  return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 1834                      ira->ira_ip_hdr_length, ira, ipst));
1811 1835          } else {
1812 1836                  return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 1837                      ira->ira_ip_hdr_length, ira, ipst));
1814 1838          }
1815 1839  }
1816 1840  
1817 1841  /*
1818 1842   * Only used to classify SCTP RAW sockets
            *
            * The lookup is done in two passes over ips_ipcl_raw_fanout:
            *
            *  1. The bucket hashed on the local port.  A conn whose conn_faddr_v6 is
            *     set is compared with IPCL_CONN_MATCH / IPCL_CONN_MATCH_V6, any other
            *     conn with IPCL_BIND_MATCH / IPCL_BIND_MATCH_V6.  A candidate must
            *     also pass the zone test and, when IRAF_SYSTEM_LABELED is set in
            *     ira->ira_flags, tsol_receive_local().
            *  2. The bucket hashed on 0 (wildcard), compared with IPCL_ZONE_MATCH and
            *     IPCL_RAW_MATCH / IPCL_RAW_MATCH_V6.  tsol_receive_local() is not
            *     called in this pass.
            *
            * In both passes a conn whose conn_ipversion differs from the packet's IP
            * version is skipped.  ipha is dereferenced only when IRAF_IS_IPV4 is set,
            * ip6h only when it is not.
            *
            * Returns the matching conn_t with a reference added via CONN_INC_REF, or
            * NULL when neither pass finds a match.
1819 1843   */
1820 1844  conn_t *
1821 1845  ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1822 1846      ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1823 1847  {
1824 1848          connf_t         *connfp;
1825 1849          conn_t          *connp;
1826 1850          in_port_t       lport;
1827 1851          int             ipversion;
1828 1852          const void      *dst;
1829 1853          zoneid_t        zoneid = ira->ira_zoneid;
1830 1854  
                   /* The local port is the second 16-bit half of ports. */
1831 1855          lport = ((uint16_t *)&ports)[1];
1832 1856          if (ira->ira_flags & IRAF_IS_IPV4) {
1833 1857                  dst = (const void *)&ipha->ipha_dst;
1834 1858                  ipversion = IPV4_VERSION;
1835 1859          } else {
1836 1860                  dst = (const void *)&ip6h->ip6_dst;
1837 1861                  ipversion = IPV6_VERSION;
1838 1862          }
1839 1863  
                   /* First pass: the bucket selected by the local port. */
1840 1864          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1841 1865          mutex_enter(&connfp->connf_lock);
1842 1866          for (connp = connfp->connf_head; connp != NULL;
1843 1867              connp = connp->conn_next) {
1844 1868                  /* We don't allow v4 fallback for v6 raw socket. */
1845 1869                  if (ipversion != connp->conn_ipversion)
1846 1870                          continue;
                           /*
                            * A conn with a foreign address recorded is matched against
                            * both packet addresses and ports; otherwise only the
                            * destination address and local port are compared.
                            */
1847 1871                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1848 1872                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1849 1873                          if (ipversion == IPV4_VERSION) {
1850 1874                                  if (!IPCL_CONN_MATCH(connp, protocol,
1851 1875                                      ipha->ipha_src, ipha->ipha_dst, ports))
1852 1876                                          continue;
1853 1877                          } else {
1854 1878                                  if (!IPCL_CONN_MATCH_V6(connp, protocol,
1855 1879                                      ip6h->ip6_src, ip6h->ip6_dst, ports))
1856 1880                                          continue;
1857 1881                          }
1858 1882                  } else {
1859 1883                          if (ipversion == IPV4_VERSION) {
1860 1884                                  if (!IPCL_BIND_MATCH(connp, protocol,
1861 1885                                      ipha->ipha_dst, lport))
1862 1886                                          continue;
1863 1887                          } else {
1864 1888                                  if (!IPCL_BIND_MATCH_V6(connp, protocol,
1865 1889                                      ip6h->ip6_dst, lport))
1866 1890                                          continue;
1867 1891                          }
1868 1892                  }
1869 1893  
                           /*
                            * Accept the conn if it is in the packet's zone, is an
                            * all-zones conn, or has a non-default MAC mode while the
                            * packet carries both IRAF_TX_MAC_EXEMPTABLE and
                            * IRAF_TX_SHARED_ADDR.
                            */
1870 1894                  if (connp->conn_zoneid == zoneid ||
1871 1895                      connp->conn_allzones ||
1872 1896                      ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1873 1897                      (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1874 1898                      (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1875 1899                          break;
1876 1900          }
1877 1901  
                   /*
                    * On a labeled system the candidate is dropped (treated as no match)
                    * when tsol_receive_local() rejects the packet for it.
                    */
1878 1902          if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1879 1903              !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1880 1904                  DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1881 1905                      char *, "connp(1) could not receive mp(2)",
1882 1906                      conn_t *, connp, mblk_t *, mp);
1883 1907                  connp = NULL;
1884 1908          }
1885 1909  
1886 1910          if (connp != NULL)
1887 1911                  goto found;
1888 1912          mutex_exit(&connfp->connf_lock);
1889 1913  
1890 1914          /* Try to look for a wildcard SCTP RAW socket match. */
1891 1915          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1892 1916          mutex_enter(&connfp->connf_lock);
1893 1917          for (connp = connfp->connf_head; connp != NULL;
1894 1918              connp = connp->conn_next) {
1895 1919                  /* We don't allow v4 fallback for v6 raw socket. */
1896 1920                  if (ipversion != connp->conn_ipversion)
1897 1921                          continue;
1898 1922                  if (!IPCL_ZONE_MATCH(connp, zoneid))
1899 1923                          continue;
1900 1924  
1901 1925                  if (ipversion == IPV4_VERSION) {
1902 1926                          if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1903 1927                                  break;
1904 1928                  } else {
1905 1929                          if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1906 1930                                  break;
1907 1931                          }
1908 1932                  }
1909 1933          }
1910 1934  
1911 1935          if (connp != NULL)
1912 1936                  goto found;
1913 1937  
1914 1938          mutex_exit(&connfp->connf_lock);
1915 1939          return (NULL);
1916 1940  
                   /*
                    * Both passes jump here with connfp->connf_lock still held for the
                    * bucket that produced connp; the reference is taken before the lock
                    * is dropped.
                    */
1917 1941  found:
1918 1942          ASSERT(connp != NULL);
1919 1943          CONN_INC_REF(connp);
1920 1944          mutex_exit(&connfp->connf_lock);
1921 1945          return (connp);
1922 1946  }
1923 1947  
           /*
            * Constructor for a combined conn_t/tcp_t buffer (presumably registered as
            * a kmem cache constructor -- confirm at the cache creation site).
            *
            * buf is treated as an itc_t and the tcp_t is taken to start right after it
            * (&itc[1]).  Both structures are zeroed, the conn_t's lock and condition
            * variables are initialized, a timer cache and a zeroed ip_xmit_attr_t
            * (refcnt 1) are allocated with kmflags, and the conn_t and tcp_t are
            * pointed at each other.
            *
            * Returns 0 on success or ENOMEM if either allocation fails.  The timer
            * cache is freed again when the ip_xmit_attr_t allocation fails.
            */
1924 1948  /* ARGSUSED */
1925 1949  static int
1926 1950  tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1927 1951  {
1928 1952          itc_t   *itc = (itc_t *)buf;
1929 1953          conn_t  *connp = &itc->itc_conn;
1930 1954          tcp_t   *tcp = (tcp_t *)&itc[1];
1931 1955  
1932 1956          bzero(connp, sizeof (conn_t));
1933 1957          bzero(tcp, sizeof (tcp_t));
1934 1958  
1935 1959          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1936 1960          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1937 1961          cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1938 1962          tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
                   /*
                    * NOTE(review): this early return leaves conn_lock and both cvs
                    * initialized and conn_ilg_lock not yet initialized.
                    */
1939 1963          if (tcp->tcp_timercache == NULL)
1940 1964                  return (ENOMEM);
1941 1965          connp->conn_tcp = tcp;
1942 1966          connp->conn_flags = IPCL_TCPCONN;
1943 1967          connp->conn_proto = IPPROTO_TCP;
1944 1968          tcp->tcp_connp = connp;
1945 1969          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1946 1970  
1947 1971          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1948 1972          if (connp->conn_ixa == NULL) {
1949 1973                  tcp_timermp_free(tcp);
1950 1974                  return (ENOMEM);
1951 1975          }
                   /* Start the transmit attributes with a single reference. */
1952 1976          connp->conn_ixa->ixa_refcnt = 1;
1953 1977          connp->conn_ixa->ixa_protocol = connp->conn_proto;
1954 1978          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1955 1979          return (0);
1956 1980  }
1957 1981  
           /*
            * Tears down what tcp_conn_constructor() set up: frees the timer cache,
            * destroys conn_lock, conn_cv, conn_sq_cv and conn_ilg_lock, and drops the
            * reference on conn_ixa when one is present.  The conn_t/tcp_t linkage and
            * the IPCL_TCPCONN flag are asserted to still be intact.
            */
1958 1982  /* ARGSUSED */
1959 1983  static void
1960 1984  tcp_conn_destructor(void *buf, void *cdrarg)
1961 1985  {
1962 1986          itc_t   *itc = (itc_t *)buf;
1963 1987          conn_t  *connp = &itc->itc_conn;
1964 1988          tcp_t   *tcp = (tcp_t *)&itc[1];
1965 1989  
1966 1990          ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967 1991          ASSERT(tcp->tcp_connp == connp);
1968 1992          ASSERT(connp->conn_tcp == tcp);
1969 1993          tcp_timermp_free(tcp);
1970 1994          mutex_destroy(&connp->conn_lock);
1971 1995          cv_destroy(&connp->conn_cv);
1972 1996          cv_destroy(&connp->conn_sq_cv);
1973 1997          rw_destroy(&connp->conn_ilg_lock);
1974 1998  
1975 1999          /* Can be NULL if constructor failed */
1976 2000          if (connp->conn_ixa != NULL) {
                           /* Only the constructor's own reference may remain. */
1977 2001                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978 2002                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979 2003                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980 2004                  ixa_refrele(connp->conn_ixa);
1981 2005          }
1982 2006  }
1983 2007  
           /*
            * Constructor for a plain IP conn_t buffer (presumably registered as a kmem
            * cache constructor -- confirm at the cache creation site).
            *
            * Only the conn_t embedded in the itc_t is used; no protocol structure
            * follows it here.  The conn_t is zeroed, conn_lock, conn_cv and
            * conn_ilg_lock are initialized, conn_flags is set to IPCL_IPCCONN and a
            * zeroed ip_xmit_attr_t (refcnt 1) is allocated with kmflags.  Unlike the
            * TCP/UDP/raw IP constructors, conn_proto and ixa_protocol are not set.
            *
            * Returns 0 on success or ENOMEM if the ip_xmit_attr_t allocation fails.
            */
1984 2008  /* ARGSUSED */
1985 2009  static int
1986 2010  ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 2011  {
1988 2012          itc_t   *itc = (itc_t *)buf;
1989 2013          conn_t  *connp = &itc->itc_conn;
1990 2014  
1991 2015          bzero(connp, sizeof (conn_t));
1992 2016          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 2017          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 2018          connp->conn_flags = IPCL_IPCCONN;
1995 2019          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 2020  
1997 2021          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 2022          if (connp->conn_ixa == NULL)
1999 2023                  return (ENOMEM);
2000 2024          connp->conn_ixa->ixa_refcnt = 1;
2001 2025          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 2026          return (0);
2003 2027  }
2004 2028  
           /*
            * Tears down what ip_conn_constructor() set up: destroys conn_lock,
            * conn_cv and conn_ilg_lock and drops the reference on conn_ixa when one
            * is present.  Asserts that the buffer is an IPCL_IPCCONN conn and that
            * conn_priv is NULL.
            */
2005 2029  /* ARGSUSED */
2006 2030  static void
2007 2031  ip_conn_destructor(void *buf, void *cdrarg)
2008 2032  {
2009 2033          itc_t   *itc = (itc_t *)buf;
2010 2034          conn_t  *connp = &itc->itc_conn;
2011 2035  
2012 2036          ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013 2037          ASSERT(connp->conn_priv == NULL);
2014 2038          mutex_destroy(&connp->conn_lock);
2015 2039          cv_destroy(&connp->conn_cv);
2016 2040          rw_destroy(&connp->conn_ilg_lock);
2017 2041  
2018 2042          /* Can be NULL if constructor failed */
2019 2043          if (connp->conn_ixa != NULL) {
                           /* Only the constructor's own reference may remain. */
2020 2044                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021 2045                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022 2046                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023 2047                  ixa_refrele(connp->conn_ixa);
2024 2048          }
2025 2049  }
2026 2050  
           /*
            * Constructor for a combined conn_t/udp_t buffer (presumably registered as
            * a kmem cache constructor -- confirm at the cache creation site).
            *
            * buf is treated as an itc_t and the udp_t is taken to start right after it
            * (&itc[1]).  Both structures are zeroed, conn_lock, conn_cv and
            * conn_ilg_lock are initialized, the conn_t and udp_t are pointed at each
            * other, and a zeroed ip_xmit_attr_t (refcnt 1, protocol IPPROTO_UDP) is
            * allocated with kmflags.
            *
            * Returns 0 on success or ENOMEM if the ip_xmit_attr_t allocation fails.
            */
2027 2051  /* ARGSUSED */
2028 2052  static int
2029 2053  udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 2054  {
2031 2055          itc_t   *itc = (itc_t *)buf;
2032 2056          conn_t  *connp = &itc->itc_conn;
2033 2057          udp_t   *udp = (udp_t *)&itc[1];
2034 2058  
2035 2059          bzero(connp, sizeof (conn_t));
2036 2060          bzero(udp, sizeof (udp_t));
2037 2061  
2038 2062          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039 2063          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040 2064          connp->conn_udp = udp;
2041 2065          connp->conn_flags = IPCL_UDPCONN;
2042 2066          connp->conn_proto = IPPROTO_UDP;
2043 2067          udp->udp_connp = connp;
2044 2068          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045 2069          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046 2070          if (connp->conn_ixa == NULL)
2047 2071                  return (ENOMEM);
2048 2072          connp->conn_ixa->ixa_refcnt = 1;
2049 2073          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050 2074          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051 2075          return (0);
2052 2076  }
2053 2077  
           /*
            * Tears down what udp_conn_constructor() set up: destroys conn_lock,
            * conn_cv and conn_ilg_lock and drops the reference on conn_ixa when one
            * is present.  The conn_t/udp_t linkage and the IPCL_UDPCONN flag are
            * asserted to still be intact.
            */
2054 2078  /* ARGSUSED */
2055 2079  static void
2056 2080  udp_conn_destructor(void *buf, void *cdrarg)
2057 2081  {
2058 2082          itc_t   *itc = (itc_t *)buf;
2059 2083          conn_t  *connp = &itc->itc_conn;
2060 2084          udp_t   *udp = (udp_t *)&itc[1];
2061 2085  
2062 2086          ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 2087          ASSERT(udp->udp_connp == connp);
2064 2088          ASSERT(connp->conn_udp == udp);
2065 2089          mutex_destroy(&connp->conn_lock);
2066 2090          cv_destroy(&connp->conn_cv);
2067 2091          rw_destroy(&connp->conn_ilg_lock);
2068 2092  
2069 2093          /* Can be NULL if constructor failed */
2070 2094          if (connp->conn_ixa != NULL) {
                           /* Only the constructor's own reference may remain. */
2071 2095                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 2096                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 2097                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 2098                  ixa_refrele(connp->conn_ixa);
2075 2099          }
2076 2100  }
2077 2101  
           /*
            * Constructor for a combined conn_t/icmp_t (raw IP) buffer (presumably
            * registered as a kmem cache constructor -- confirm at the cache creation
            * site).
            *
            * buf is treated as an itc_t and the icmp_t is taken to start right after
            * it (&itc[1]).  Both structures are zeroed, conn_lock, conn_cv,
            * icmp_bpf_lock and conn_ilg_lock are initialized, the conn_t and icmp_t
            * are pointed at each other, and a zeroed ip_xmit_attr_t (refcnt 1,
            * protocol IPPROTO_ICMP) is allocated with kmflags.
            *
            * Returns 0 on success or ENOMEM if the ip_xmit_attr_t allocation fails.
            */
2078 2102  /* ARGSUSED */
2079 2103  static int
2080 2104  rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 2105  {
2082 2106          itc_t   *itc = (itc_t *)buf;
2083 2107          conn_t  *connp = &itc->itc_conn;
2084 2108          icmp_t  *icmp = (icmp_t *)&itc[1];
  
    | 
      ↓ open down ↓ | 
    565 lines elided | 
    
      ↑ open up ↑ | 
  
2085 2109  
2086 2110          bzero(connp, sizeof (conn_t));
2087 2111          bzero(icmp, sizeof (icmp_t));
2088 2112  
2089 2113          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090 2114          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091 2115          connp->conn_icmp = icmp;
2092 2116          connp->conn_flags = IPCL_RAWIPCONN;
2093 2117          connp->conn_proto = IPPROTO_ICMP;
2094 2118          icmp->icmp_connp = connp;
                   /* Paired with the rw_destroy() in rawip_conn_destructor(). */
     2119 +        rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
2095 2120          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096 2121          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097 2122          if (connp->conn_ixa == NULL)
2098 2123                  return (ENOMEM);
2099 2124          connp->conn_ixa->ixa_refcnt = 1;
2100 2125          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101 2126          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102 2127          return (0);
2103 2128  }
2104 2129  
           /*
            * Tears down what rawip_conn_constructor() set up: destroys conn_lock,
            * conn_cv, conn_ilg_lock and icmp_bpf_lock and drops the reference on
            * conn_ixa when one is present.  The conn_t/icmp_t linkage and the
            * IPCL_RAWIPCONN flag are asserted to still be intact.
            */
2105 2130  /* ARGSUSED */
2106 2131  static void
2107 2132  rawip_conn_destructor(void *buf, void *cdrarg)
2108 2133  {
  
    | 
      ↓ open down ↓ | 
    4 lines elided | 
    
      ↑ open up ↑ | 
  
2109 2134          itc_t   *itc = (itc_t *)buf;
2110 2135          conn_t  *connp = &itc->itc_conn;
2111 2136          icmp_t  *icmp = (icmp_t *)&itc[1];
2112 2137  
2113 2138          ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114 2139          ASSERT(icmp->icmp_connp == connp);
2115 2140          ASSERT(connp->conn_icmp == icmp);
2116 2141          mutex_destroy(&connp->conn_lock);
2117 2142          cv_destroy(&connp->conn_cv);
2118 2143          rw_destroy(&connp->conn_ilg_lock);
     2144 +        rw_destroy(&icmp->icmp_bpf_lock);
2119 2145  
2120 2146          /* Can be NULL if constructor failed */
2121 2147          if (connp->conn_ixa != NULL) {
                           /* Only the constructor's own reference may remain. */
2122 2148                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123 2149                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124 2150                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125 2151                  ixa_refrele(connp->conn_ixa);
2126 2152          }
2127 2153  }
2128 2154  
           /*
            * Constructor for a combined conn_t/rts_t buffer (presumably registered as
            * a kmem cache constructor -- confirm at the cache creation site).
            *
            * buf is treated as an itc_t and the rts_t is taken to start right after it
            * (&itc[1]).  Both structures are zeroed, conn_lock, conn_cv and
            * conn_ilg_lock are initialized, the conn_t and rts_t are pointed at each
            * other, and a zeroed ip_xmit_attr_t (refcnt 1) is allocated with kmflags.
            * conn_proto and ixa_protocol are not set here.
            *
            * Returns 0 on success or ENOMEM if the ip_xmit_attr_t allocation fails.
            */
2129 2155  /* ARGSUSED */
2130 2156  static int
2131 2157  rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 2158  {
2133 2159          itc_t   *itc = (itc_t *)buf;
2134 2160          conn_t  *connp = &itc->itc_conn;
2135 2161          rts_t   *rts = (rts_t *)&itc[1];
2136 2162  
2137 2163          bzero(connp, sizeof (conn_t));
2138 2164          bzero(rts, sizeof (rts_t));
2139 2165  
2140 2166          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 2167          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 2168          connp->conn_rts = rts;
2143 2169          connp->conn_flags = IPCL_RTSCONN;
2144 2170          rts->rts_connp = connp;
2145 2171          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 2172          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 2173          if (connp->conn_ixa == NULL)
2148 2174                  return (ENOMEM);
2149 2175          connp->conn_ixa->ixa_refcnt = 1;
2150 2176          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 2177          return (0);
2152 2178  }
2153 2179  
           /*
            * Tears down what rts_conn_constructor() set up: destroys conn_lock,
            * conn_cv and conn_ilg_lock and drops the reference on conn_ixa when one
            * is present.  The conn_t/rts_t linkage and the IPCL_RTSCONN flag are
            * asserted to still be intact.
            */
2154 2180  /* ARGSUSED */
2155 2181  static void
2156 2182  rts_conn_destructor(void *buf, void *cdrarg)
2157 2183  {
2158 2184          itc_t   *itc = (itc_t *)buf;
2159 2185          conn_t  *connp = &itc->itc_conn;
2160 2186          rts_t   *rts = (rts_t *)&itc[1];
2161 2187  
2162 2188          ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163 2189          ASSERT(rts->rts_connp == connp);
2164 2190          ASSERT(connp->conn_rts == rts);
2165 2191          mutex_destroy(&connp->conn_lock);
2166 2192          cv_destroy(&connp->conn_cv);
2167 2193          rw_destroy(&connp->conn_ilg_lock);
2168 2194  
2169 2195          /* Can be NULL if constructor failed */
2170 2196          if (connp->conn_ixa != NULL) {
                           /* Only the constructor's own reference may remain. */
2171 2197                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172 2198                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173 2199                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174 2200                  ixa_refrele(connp->conn_ixa);
2175 2201          }
2176 2202  }
2177 2203  
2178 2204  /*
2179 2205   * Called as part of ipcl_conn_destroy to assert and clear any pointers
2180 2206   * in the conn_t.
2181 2207   *
2182 2208   * Below we list all the pointers in the conn_t as a documentation aid.
2183 2209   * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2184 2210   * If you add any pointers to the conn_t please add an ASSERT here
2185 2211   * and #ifdef it out if it can't be actually asserted to be NULL.
2186 2212   * In any case, we bzero most of the conn_t at the end of the function.
            *
            * connp->conn_ixa must be non-NULL and hold exactly one reference (both
            * are asserted).  It is run through ixa_cleanup() and has ixa_flags reset
            * to 0, but is kept attached to the conn_t rather than freed.
2187 2213   */
2188 2214  void
2189 2215  ipcl_conn_cleanup(conn_t *connp)
2190 2216  {
2191 2217          ip_xmit_attr_t  *ixa;
2192 2218  
2193 2219          ASSERT(connp->conn_latch == NULL);
2194 2220          ASSERT(connp->conn_latch_in_policy == NULL);
2195 2221          ASSERT(connp->conn_latch_in_action == NULL);
2196 2222  #ifdef notdef
2197 2223          ASSERT(connp->conn_rq == NULL);
2198 2224          ASSERT(connp->conn_wq == NULL);
2199 2225  #endif
2200 2226          ASSERT(connp->conn_cred == NULL);
2201 2227          ASSERT(connp->conn_g_fanout == NULL);
2202 2228          ASSERT(connp->conn_g_next == NULL);
2203 2229          ASSERT(connp->conn_g_prev == NULL);
2204 2230          ASSERT(connp->conn_policy == NULL);
2205 2231          ASSERT(connp->conn_fanout == NULL);
2206 2232          ASSERT(connp->conn_next == NULL);
2207 2233          ASSERT(connp->conn_prev == NULL);
2208 2234          ASSERT(connp->conn_oper_pending_ill == NULL);
2209 2235          ASSERT(connp->conn_ilg == NULL);
2210 2236          ASSERT(connp->conn_drain_next == NULL);
2211 2237          ASSERT(connp->conn_drain_prev == NULL);
2212 2238  #ifdef notdef
2213 2239          /* conn_idl is not cleared when removed from idl list */
2214 2240          ASSERT(connp->conn_idl == NULL);
2215 2241  #endif
2216 2242          ASSERT(connp->conn_ipsec_opt_mp == NULL);
2217 2243  #ifdef notdef
2218 2244          /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2219 2245          ASSERT(connp->conn_netstack == NULL);
2220 2246  #endif
2221 2247  
2222 2248          ASSERT(connp->conn_helper_info == NULL);
2223 2249          ASSERT(connp->conn_ixa != NULL);
2224 2250          ixa = connp->conn_ixa;
2225 2251          ASSERT(ixa->ixa_refcnt == 1);
2226 2252          /* Need to preserve ixa_protocol */
2227 2253          ixa_cleanup(ixa);
2228 2254          ixa->ixa_flags = 0;
2229 2255  
2230 2256          /* Clear out the conn_t fields that are not preserved */
                   /*
                    * The length is the distance from conn_start_clr to the end of the
                    * conn_t, so everything laid out before conn_start_clr is left
                    * untouched.
                    */
2231 2257          bzero(&connp->conn_start_clr,
2232 2258              sizeof (conn_t) -
2233 2259              ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2234 2260  }
2235 2261  
2236 2262  /*
2237 2263   * All conns are inserted in a global multi-list for the benefit of
2238 2264   * walkers. The walk is guaranteed to walk all open conns at the time
2239 2265   * of the start of the walk exactly once. This property is needed to
2240 2266   * achieve some cleanups during unplumb of interfaces. This is achieved
2241 2267   * as follows.
2242 2268   *
2243 2269   * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244 2270   * call the insert and delete functions below at creation and deletion
2245 2271   * time respectively. The conn never moves or changes its position in this
2246 2272   * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247 2273   * won't increase due to walkers, once the conn deletion has started. Note
2248 2274   * that we can't remove the conn from the global list and then wait for
2249 2275   * the refcnt to drop to zero, since walkers would then see a truncated
2250 2276   * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251 2277   * conns until ip_open is ready to make them globally visible.
2252 2278   * The global round robin multi-list locks are held only to get the
2253 2279   * next member/insertion/deletion and contention should be negligible
2254 2280   * if the multi-list is much greater than the number of cpus.
            *
            * ipcl_globalhash_insert() links connp at the head of one bucket of
            * ips_ipcl_globalhash_fanout, chosen round robin through ips_conn_g_index,
            * and records that bucket in connp->conn_g_fanout.  connp->conn_netstack
            * must already be set, since the ip_stack_t is reached through it.
2255 2281   */
2256 2282  void
2257 2283  ipcl_globalhash_insert(conn_t *connp)
2258 2284  {
2259 2285          int     index;
2260 2286          struct connf_s  *connfp;
2261 2287          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2262 2288  
2263 2289          /*
2264 2290           * No need for atomic here. Approximate even distribution
2265 2291           * in the global lists is sufficient.
2266 2292           */
2267 2293          ipst->ips_conn_g_index++;
                   /*
                    * Masking with (CONN_G_HASH_SIZE - 1) assumes CONN_G_HASH_SIZE is a
                    * power of two -- TODO confirm against its definition.
                    */
2268 2294          index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269 2295  
                   /* The new conn becomes the list head, so it has no predecessor. */
2270 2296          connp->conn_g_prev = NULL;
2271 2297          /*
2272 2298           * Mark as INCIPIENT, so that walkers will ignore this
2273 2299           * for now, till ip_open is ready to make it visible globally.
2274 2300           */
2275 2301          connp->conn_state_flags |= CONN_INCIPIENT;
2276 2302  
2277 2303          connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278 2304          /* Insert at the head of the list */
2279 2305          mutex_enter(&connfp->connf_lock);
2280 2306          connp->conn_g_next = connfp->connf_head;
2281 2307          if (connp->conn_g_next != NULL)
2282 2308                  connp->conn_g_next->conn_g_prev = connp;
2283 2309          connfp->connf_head = connp;
2284 2310  
2285 2311          /* The fanout bucket this conn points to */
2286 2312          connp->conn_g_fanout = connfp;
2287 2313  
2288 2314          mutex_exit(&connfp->connf_lock);
2289 2315  }
2290 2316  
           /*
            * Unlink connp from the global multi-list bucket recorded in
            * connp->conn_g_fanout.  Does nothing when conn_g_fanout is NULL.  The
            * unlink is done under the bucket's connf_lock; afterwards conn_g_next,
            * conn_g_prev and conn_g_fanout are reset to NULL.
            */
2291 2317  void
2292 2318  ipcl_globalhash_remove(conn_t *connp)
2293 2319  {
2294 2320          struct connf_s  *connfp;
2295 2321  
2296 2322          /*
2297 2323           * We were never inserted in the global multi list.
2298 2324           * IPCL_NONE variety is never inserted in the global multilist
2299 2325           * since it is presumed to not need any cleanup and is transient.
2300 2326           */
2301 2327          if (connp->conn_g_fanout == NULL)
2302 2328                  return;
2303 2329  
2304 2330          connfp = connp->conn_g_fanout;
2305 2331          mutex_enter(&connfp->connf_lock);
                   /* A conn without a predecessor is the bucket head. */
2306 2332          if (connp->conn_g_prev != NULL)
2307 2333                  connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 2334          else
2309 2335                  connfp->connf_head = connp->conn_g_next;
2310 2336          if (connp->conn_g_next != NULL)
2311 2337                  connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 2338          mutex_exit(&connfp->connf_lock);
2313 2339  
2314 2340          /* Better to stumble on a null pointer than to corrupt memory */
2315 2341          connp->conn_g_next = NULL;
2316 2342          connp->conn_g_prev = NULL;
2317 2343          connp->conn_g_fanout = NULL;
2318 2344  }
2319 2345  
2320 2346  /*
2321 2347   * Walk the list of all conn_t's in the system, calling the function provided
2322 2348   * with the specified argument for each.
2323 2349   * Applies to both IPv4 and IPv6.
2324 2350   *
2325 2351   * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2326 2352   * conn_oper_pending_ill). To guard against stale pointers
2327 2353   * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2328 2354   * unplumbed or removed. New conn_t's that are created while we are walking
2329 2355   * may be missed by this walk, because they are not necessarily inserted
2330 2356   * at the tail of the list. They are new conn_t's and thus don't have any
2331 2357   * stale pointers. The CONN_CLOSING flag ensures that no new reference
2332 2358   * is created to the struct that is going away.
            *
            * func is invoked as (*func)(connp, arg) with a reference held on connp and
            * with neither the bucket lock nor conn_lock held.  Conns flagged
            * CONN_CONDEMNED or CONN_INCIPIENT are skipped.
2333 2359   */
2334 2360  void
2335 2361  ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2336 2362  {
2337 2363          int     i;
2338 2364          conn_t  *connp;
2339 2365          conn_t  *prev_connp;
2340 2366  
2341 2367          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2342 2368                  mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2343 2369                  prev_connp = NULL;
2344 2370                  connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2345 2371                  while (connp != NULL) {
2346 2372                          mutex_enter(&connp->conn_lock);
                                   /* Skipped conns are passed over without a reference. */
2347 2373                          if (connp->conn_state_flags &
2348 2374                              (CONN_CONDEMNED | CONN_INCIPIENT)) {
2349 2375                                  mutex_exit(&connp->conn_lock);
2350 2376                                  connp = connp->conn_g_next;
2351 2377                                  continue;
2352 2378                          }
                                   /*
                                    * Take a reference, then drop both locks so the
                                    * callback runs without them held.
                                    */
2353 2379                          CONN_INC_REF_LOCKED(connp);
2354 2380                          mutex_exit(&connp->conn_lock);
2355 2381                          mutex_exit(
2356 2382                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357 2383                          (*func)(connp, arg);
                                   /*
                                    * The previously visited conn is released only now,
                                    * while the bucket lock is not held; the current conn
                                    * stays referenced until the walk has moved past it
                                    * (presumably so its conn_g_next stays usable --
                                    * TODO confirm).
                                    */
2358 2384                          if (prev_connp != NULL)
2359 2385                                  CONN_DEC_REF(prev_connp);
2360 2386                          mutex_enter(
2361 2387                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2362 2388                          prev_connp = connp;
2363 2389                          connp = connp->conn_g_next;
2364 2390                  }
2365 2391                  mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
                           /* Release the last conn visited in this bucket. */
2366 2392                  if (prev_connp != NULL)
2367 2393                          CONN_DEC_REF(prev_connp);
2368 2394          }
2369 2395  }
2370 2396  
2371 2397  /*
2372 2398   * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373 2399   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2374 2400   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2375 2401   * (peer tcp in ESTABLISHED state).
            *
            * connp is the conn on whose behalf the lookup is made; only its
            * conn_zoneid is consulted, for the same-zone check.  Returns NULL when no
            * peer is found.
2376 2402   */
2377 2403  conn_t *
2378 2404  ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379 2405      ip_stack_t *ipst)
2380 2406  {
2381 2407          uint32_t ports;
2382 2408          uint16_t *pports = (uint16_t *)&ports;
2383 2409          connf_t *connfp;
2384 2410          conn_t  *tconnp;
2385 2411          boolean_t zone_chk;
2386 2412  
2387 2413          /*
2388 2414           * If either the source or destination address is loopback, then
2389 2415           * both endpoints must be in the same Zone.  Otherwise, both of
2390 2416           * the addresses are system-wide unique (tcp is in ESTABLISHED
2391 2417           * state) and the endpoints may reside in different Zones.
2392 2418           */
2393 2419          zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394 2420              ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395 2421  
                   /* Build ports through the 16-bit alias: fport first, then lport. */
2396 2422          pports[0] = tcpha->tha_fport;
2397 2423          pports[1] = tcpha->tha_lport;
2398 2424  
                   /*
                    * Hash on the header's destination address and match with the
                    * destination and source addresses passed in swapped order; this is
                    * the reverse lookup that finds the peer's entry.
                    */
2399 2425          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400 2426              ports, ipst)];
2401 2427  
2402 2428          mutex_enter(&connfp->connf_lock);
2403 2429          for (tconnp = connfp->connf_head; tconnp != NULL;
2404 2430              tconnp = tconnp->conn_next) {
2405 2431  
2406 2432                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407 2433                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2408 2434                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409 2435                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410 2436  
2411 2437                          ASSERT(tconnp != connp);
                                   /* Take the reference before dropping the bucket lock. */
2412 2438                          CONN_INC_REF(tconnp);
2413 2439                          mutex_exit(&connfp->connf_lock);
2414 2440                          return (tconnp);
2415 2441                  }
2416 2442          }
2417 2443          mutex_exit(&connfp->connf_lock);
2418 2444          return (NULL);
2419 2445  }
2420 2446  
2421 2447  /*
2422 2448   * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423 2449   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2424 2450   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2425 2451   * (peer tcp in ESTABLISHED state).
            *
            * connp: the conn performing the lookup.  Its conn_zoneid is compared
            *        with each candidate when a loopback address is involved, and
            *        it is asserted never to be the conn that is returned.
            * ip6h:  IPv6 header supplying ip6_src and ip6_dst; the hash bucket is
            *        selected from ip6_dst and the packed ports.
            * tcpha: TCP header supplying tha_fport and tha_lport.
            * ipst:  IP stack instance whose ips_ipcl_conn_fanout is searched.
            *
            * Returns NULL if no conn in the selected bucket matches.
2426 2452   */
2427 2453  conn_t *
2428 2454  ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429 2455      ip_stack_t *ipst)
2430 2456  {
2431 2457          uint32_t ports;
2432 2458          uint16_t *pports = (uint16_t *)&ports;
2433 2459          connf_t *connfp;
2434 2460          conn_t  *tconnp;
2435 2461          boolean_t zone_chk;
2436 2462  
2437 2463          /*
2438 2464           * If either the source or destination address is loopback, then
2439 2465           * both endpoints must be in the same Zone.  Otherwise, both of
2440 2466           * the addresses are system-wide unique (tcp is in ESTABLISHED
2441 2467           * state) and the endpoints may reside in different Zones.  We
2442 2468           * don't do Zone check for link local address(es) because the
2443 2469           * current Zone implementation treats each link local address as
2444 2470           * being unique per system node, i.e. they belong to global Zone.
2445 2471           */
2446 2472          zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447 2473              IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448 2474  
                   /*
                    * Build the 32-bit port key handed to the hash and match macros:
                    * tha_fport goes in the first uint16_t slot, tha_lport in the
                    * second.
                    */
2449 2475          pports[0] = tcpha->tha_fport;
2450 2476          pports[1] = tcpha->tha_lport;
2451 2477  
2452 2478          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453 2479              ports, ipst)];
2454 2480  
2455 2481          mutex_enter(&connfp->connf_lock);
2456 2482          for (tconnp = connfp->connf_head; tconnp != NULL;
2457 2483              tconnp = tconnp->conn_next) {
2458 2484  
2459 2485                  /* We skip conn_bound_if check here as this is loopback tcp */
2460 2486                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461 2487                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462 2488                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463 2489                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464 2490  
2465 2491                          ASSERT(tconnp != connp);
                                   /* Take the hold before the bucket lock is dropped. */
2466 2492                          CONN_INC_REF(tconnp);
2467 2493                          mutex_exit(&connfp->connf_lock);
2468 2494                          return (tconnp);
2469 2495                  }
2470 2496          }
2471 2497          mutex_exit(&connfp->connf_lock);
2472 2498          return (NULL);
2473 2499  }
2474 2500  
2475 2501  /*
2476 2502   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477 2503   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478 2504   * Only checks for connected entries i.e. no INADDR_ANY checks.
            *
            * ipha:      IPv4 header supplying ipha_src and ipha_dst; the hash
            *            bucket is selected from ipha_dst and the packed ports.
            * tcpha:     TCP header supplying tha_fport and tha_lport.
            * min_state: a candidate is accepted only if its tcp_state is
            *            greater than or equal to this value.
            * ipst:      IP stack instance whose ips_ipcl_conn_fanout is searched.
            *
            * Returns NULL if no conn in the selected bucket matches.
2479 2505   */
2480 2506  conn_t *
2481 2507  ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482 2508      ip_stack_t *ipst)
2483 2509  {
2484 2510          uint32_t ports;
2485 2511          uint16_t *pports;
2486 2512          connf_t *connfp;
2487 2513          conn_t  *tconnp;
2488 2514  
                   /*
                    * Build the 32-bit port key handed to the hash and match macros:
                    * tha_fport goes in the first uint16_t slot, tha_lport in the
                    * second.
                    */
2489 2515          pports = (uint16_t *)&ports;
2490 2516          pports[0] = tcpha->tha_fport;
2491 2517          pports[1] = tcpha->tha_lport;
2492 2518  
2493 2519          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 2520              ports, ipst)];
2495 2521  
2496 2522          mutex_enter(&connfp->connf_lock);
2497 2523          for (tconnp = connfp->connf_head; tconnp != NULL;
2498 2524              tconnp = tconnp->conn_next) {
2499 2525  
2500 2526                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 2527                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 2528                      tconnp->conn_tcp->tcp_state >= min_state) {
2503 2529  
                                   /* Take the hold before the bucket lock is dropped. */
2504 2530                          CONN_INC_REF(tconnp);
2505 2531                          mutex_exit(&connfp->connf_lock);
2506 2532                          return (tconnp);
2507 2533                  }
2508 2534          }
2509 2535          mutex_exit(&connfp->connf_lock);
2510 2536          return (NULL);
2511 2537  }
2512 2538  
2513 2539  /*
2514 2540   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515 2541   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516 2542   * Only checks for connected entries i.e. no INADDR_ANY checks.
2517 2543   * Match on ifindex in addition to addresses.
            *
            * ip6h:      IPv6 header supplying ip6_src and ip6_dst; the hash
            *            bucket is selected from ip6_dst and the packed ports.
            * tcpha:     TCP header supplying tha_fport and tha_lport.
            * min_state: a candidate is accepted only if its tcp_state is
            *            greater than or equal to this value.
            * ifindex:   a candidate is accepted only if its conn_bound_if is 0
            *            or equal to this value.
            * ipst:      IP stack instance whose ips_ipcl_conn_fanout is searched.
            *
            * Returns NULL if no conn in the selected bucket matches.
2518 2544   */
2519 2545  conn_t *
2520 2546  ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521 2547      uint_t ifindex, ip_stack_t *ipst)
2522 2548  {
2523 2549          tcp_t   *tcp;
2524 2550          uint32_t ports;
2525 2551          uint16_t *pports;
2526 2552          connf_t *connfp;
2527 2553          conn_t  *tconnp;
2528 2554  
                   /*
                    * Build the 32-bit port key handed to the hash and match macros:
                    * tha_fport goes in the first uint16_t slot, tha_lport in the
                    * second.
                    */
2529 2555          pports = (uint16_t *)&ports;
2530 2556          pports[0] = tcpha->tha_fport;
2531 2557          pports[1] = tcpha->tha_lport;
2532 2558  
2533 2559          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534 2560              ports, ipst)];
2535 2561  
2536 2562          mutex_enter(&connfp->connf_lock);
2537 2563          for (tconnp = connfp->connf_head; tconnp != NULL;
2538 2564              tconnp = tconnp->conn_next) {
2539 2565  
2540 2566                  tcp = tconnp->conn_tcp;
2541 2567                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542 2568                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543 2569                      tcp->tcp_state >= min_state &&
2544 2570                      (tconnp->conn_bound_if == 0 ||
2545 2571                      tconnp->conn_bound_if == ifindex)) {
2546 2572  
                                   /* Take the hold before the bucket lock is dropped. */
2547 2573                          CONN_INC_REF(tconnp);
2548 2574                          mutex_exit(&connfp->connf_lock);
2549 2575                          return (tconnp);
2550 2576                  }
2551 2577          }
2552 2578          mutex_exit(&connfp->connf_lock);
2553 2579          return (NULL);
2554 2580  }
2555 2581  
2556 2582  /*
2557 2583   * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558 2584   * a listener when changing state.
            *
            * Walks the ips_ipcl_bind_fanout bucket selected by lport and returns the
            * first conn that satisfies IPCL_BIND_MATCH for {IPPROTO_TCP, laddr,
            * lport}, satisfies IPCL_ZONE_MATCH for zoneid, and whose tcp_listener
            * is NULL.  zoneid must not be ALL_ZONES (asserted).
            *
            * Returns the conn with a reference taken via CONN_INC_REF (the caller
            * is presumably responsible for the matching CONN_DEC_REF), or NULL if
            * laddr is 0 or nothing in the bucket matches.
2559 2585   */
2560 2586  conn_t *
2561 2587  ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562 2588      ip_stack_t *ipst)
2563 2589  {
2564 2590          connf_t         *bind_connfp;
2565 2591          conn_t          *connp;
2566 2592          tcp_t           *tcp;
2567 2593  
2568 2594          /*
2569 2595           * Avoid false matches for packets sent to an IP destination of
2570 2596           * all zeros.
2571 2597           */
2572 2598          if (laddr == 0)
2573 2599                  return (NULL);
2574 2600  
2575 2601          ASSERT(zoneid != ALL_ZONES);
2576 2602  
2577 2603          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 2604          mutex_enter(&bind_connfp->connf_lock);
2579 2605          for (connp = bind_connfp->connf_head; connp != NULL;
2580 2606              connp = connp->conn_next) {
2581 2607                  tcp = connp->conn_tcp;
2582 2608                  if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 2609                      IPCL_ZONE_MATCH(connp, zoneid) &&
2584 2610                      (tcp->tcp_listener == NULL)) {
                                   /* Take the hold before the bucket lock is dropped. */
2585 2611                          CONN_INC_REF(connp);
2586 2612                          mutex_exit(&bind_connfp->connf_lock);
2587 2613                          return (connp);
2588 2614                  }
2589 2615          }
2590 2616          mutex_exit(&bind_connfp->connf_lock);
2591 2617          return (NULL);
2592 2618  }
2593 2619  
2594 2620  /*
2595 2621   * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596 2622   * a listener when changing state.
            *
            * Walks the ips_ipcl_bind_fanout bucket selected by lport and returns the
            * first conn that satisfies IPCL_BIND_MATCH_V6 for {IPPROTO_TCP, *laddr,
            * lport}, satisfies IPCL_ZONE_MATCH for zoneid, has conn_bound_if equal
            * to 0 or to ifindex, and whose tcp_listener is NULL.  zoneid must not
            * be ALL_ZONES (asserted).
            *
            * Returns the conn with a reference taken via CONN_INC_REF (the caller
            * is presumably responsible for the matching CONN_DEC_REF), or NULL if
            * *laddr is the unspecified address or nothing in the bucket matches.
2597 2623   */
2598 2624  conn_t *
2599 2625  ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600 2626      zoneid_t zoneid, ip_stack_t *ipst)
2601 2627  {
2602 2628          connf_t         *bind_connfp;
2603 2629          conn_t          *connp = NULL;
2604 2630          tcp_t           *tcp;
2605 2631  
2606 2632          /*
2607 2633           * Avoid false matches for packets sent to an IP destination of
2608 2634           * all zeros.
2609 2635           */
2610 2636          if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611 2637                  return (NULL);
2612 2638  
2613 2639          ASSERT(zoneid != ALL_ZONES);
2614 2640  
2615 2641          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616 2642          mutex_enter(&bind_connfp->connf_lock);
2617 2643          for (connp = bind_connfp->connf_head; connp != NULL;
2618 2644              connp = connp->conn_next) {
2619 2645                  tcp = connp->conn_tcp;
2620 2646                  if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621 2647                      IPCL_ZONE_MATCH(connp, zoneid) &&
2622 2648                      (connp->conn_bound_if == 0 ||
2623 2649                      connp->conn_bound_if == ifindex) &&
2624 2650                      tcp->tcp_listener == NULL) {
                                   /* Take the hold before the bucket lock is dropped. */
2625 2651                          CONN_INC_REF(connp);
2626 2652                          mutex_exit(&bind_connfp->connf_lock);
2627 2653                          return (connp);
2628 2654                  }
2629 2655          }
2630 2656          mutex_exit(&bind_connfp->connf_lock);
2631 2657          return (NULL);
2632 2658  }
2633 2659  
2634 2660  /*
2635 2661   * ipcl_get_next_conn
2636 2662   *      get the next entry in the conn global list
2637 2663   *      and put a reference on the next_conn.
2638 2664   *      decrement the reference on the current conn.
2639 2665   *
2640 2666   * This is an iterator based walker function that also provides for
2641 2667   * some selection by the caller. It walks through the conn_hash bucket
2642 2668   * searching for the next valid connp in the list, and selects connections
2643 2669   * that are neither condemned nor incipient (CONN_CONDEMNED and
            * CONN_INCIPIENT both clear in conn_state_flags) and that have at least
            * one of the bits in conn_flags set.  It also REFHOLDS the conn
2644 2670   * thus ensuring that the conn exists when the caller uses the conn.
            *
            * connfp:     bucket to walk; if NULL the function returns NULL.
            * connp:      conn returned by the previous call, or NULL to start from
            *             connf_head.  When non-NULL its reference is dropped with
            *             CONN_DEC_REF before returning.
            * conn_flags: mask tested against each candidate's conn_flags.
            *
            * Returns the next selected conn with a reference held, or NULL once
            * the end of the conn_g_next chain is reached.
2645 2671   */
2646 2672  conn_t *
2647 2673  ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2648 2674  {
2649 2675          conn_t  *next_connp;
2650 2676  
2651 2677          if (connfp == NULL)
2652 2678                  return (NULL);
2653 2679  
2654 2680          mutex_enter(&connfp->connf_lock);
2655 2681  
                   /* First call starts at the bucket head; later calls resume after connp. */
2656 2682          next_connp = (connp == NULL) ?
2657 2683              connfp->connf_head : connp->conn_g_next;
2658 2684  
2659 2685          while (next_connp != NULL) {
2660 2686                  mutex_enter(&next_connp->conn_lock);
2661 2687                  if (!(next_connp->conn_flags & conn_flags) ||
2662 2688                      (next_connp->conn_state_flags &
2663 2689                      (CONN_CONDEMNED | CONN_INCIPIENT))) {
2664 2690                          /*
2665 2691                           * This conn has been condemned or
2666 2692                           * is incipient, or the flags don't match
2667 2693                           */
2668 2694                          mutex_exit(&next_connp->conn_lock);
2669 2695                          next_connp = next_connp->conn_g_next;
2670 2696                          continue;
2671 2697                  }
                           /* conn_lock is already held, hence the _LOCKED variant. */
2672 2698                  CONN_INC_REF_LOCKED(next_connp);
2673 2699                  mutex_exit(&next_connp->conn_lock);
2674 2700                  break;
2675 2701          }
2676 2702  
2677 2703          mutex_exit(&connfp->connf_lock);
2678 2704  
                   /* The previous conn's hold is released only after the bucket lock. */
2679 2705          if (connp != NULL)
2680 2706                  CONN_DEC_REF(connp);
2681 2707  
2682 2708          return (next_connp);
2683 2709  }
2684 2710  
2685 2711  #ifdef CONN_DEBUG
2686 2712  /*
2687 2713   * Trace of the last NBUF refhold/refrele
            *
            * Advances conn_trace_last to the next slot of conn_trace_buf, wrapping
            * to 0 when it reaches CONN_TRACE_MAX, and fills that slot's ctb_stack
            * via getpcstack() with CONN_STACK_DEPTH as the limit, storing the
            * value getpcstack() returns in ctb_depth.  The caller must hold
            * conn_lock (asserted).  Always returns 1.
            *
            * NOTE(review): "NBUF" presumably refers to CONN_TRACE_MAX - confirm.
2688 2714   */
2689 2715  int
2690 2716  conn_trace_ref(conn_t *connp)
2691 2717  {
2692 2718          int     last;
2693 2719          conn_trace_t    *ctb;
2694 2720  
2695 2721          ASSERT(MUTEX_HELD(&connp->conn_lock));
                   /* Step to the next slot, wrapping around at CONN_TRACE_MAX. */
2696 2722          last = connp->conn_trace_last;
2697 2723          last++;
2698 2724          if (last == CONN_TRACE_MAX)
2699 2725                  last = 0;
2700 2726  
2701 2727          ctb = &connp->conn_trace_buf[last];
2702 2728          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703 2729          connp->conn_trace_last = last;
2704 2730          return (1);
2705 2731  }
2706 2732  
           /*
            * Records a stack trace in the conn's trace buffer: advances
            * conn_trace_last to the next slot of conn_trace_buf, wrapping to 0
            * when it reaches CONN_TRACE_MAX, and fills that slot's ctb_stack via
            * getpcstack() with CONN_STACK_DEPTH as the limit, storing the value
            * getpcstack() returns in ctb_depth.  The caller must hold conn_lock
            * (asserted).  Always returns 1.
            *
            * NOTE(review): presumably invoked when a conn reference is released;
            * the body does not distinguish a release from a hold - confirm
            * against the callers.
            */
2707 2733  int
2708 2734  conn_untrace_ref(conn_t *connp)
2709 2735  {
2710 2736          int     last;
2711 2737          conn_trace_t    *ctb;
2712 2738  
2713 2739          ASSERT(MUTEX_HELD(&connp->conn_lock));
                   /* Step to the next slot, wrapping around at CONN_TRACE_MAX. */
2714 2740          last = connp->conn_trace_last;
2715 2741          last++;
2716 2742          if (last == CONN_TRACE_MAX)
2717 2743                  last = 0;
2718 2744  
2719 2745          ctb = &connp->conn_trace_buf[last];
2720 2746          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 2747          connp->conn_trace_last = last;
2722 2748          return (1);
2723 2749  }
2724 2750  #endif
  
    | 
      ↓ open down ↓ | 
    596 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX