Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/ipclassifier.c
          +++ new/usr/src/uts/common/inet/ip/ipclassifier.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2016 Joyent, Inc.
  24   24   */
  25   25  
  26   26  /*
  27   27   * IP PACKET CLASSIFIER
  28   28   *
  29   29   * The IP packet classifier provides mapping between IP packets and persistent
  30   30   * connection state for connection-oriented protocols. It also provides
  31   31   * interface for managing connection states.
  32   32   *
  33   33   * The connection state is kept in conn_t data structure and contains, among
  34   34   * other things:
  35   35   *
  36   36   *      o local/remote address and ports
  37   37   *      o Transport protocol
  38   38   *      o squeue for the connection (for TCP only)
  39   39   *      o reference counter
  40   40   *      o Connection state
  41   41   *      o hash table linkage
  42   42   *      o interface/ire information
  43   43   *      o credentials
  44   44   *      o ipsec policy
  45   45   *      o send and receive functions.
  46   46   *      o mutex lock.
  47   47   *
  48   48   * Connections use a reference counting scheme. They are freed when the
  49   49   * reference counter drops to zero. A reference is incremented when connection
  50   50   * is placed in a list or table, when incoming packet for the connection arrives
  51   51   * and when connection is processed via squeue (squeue processing may be
  52   52   * asynchronous and the reference protects the connection from being destroyed
  53   53   * before its processing is finished).
  54   54   *
  55   55   * conn_recv is used to pass up packets to the ULP.
  56   56   * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  57   57   * a listener, and changes to tcp_input_listener as the listener has picked a
  58   58   * good squeue. For other cases it is set to tcp_input_data.
  59   59   *
  60   60   * conn_recvicmp is used to pass up ICMP errors to the ULP.
  61   61   *
  62   62   * Classifier uses several hash tables:
  63   63   *
  64   64   *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  65   65   *      ipcl_bind_fanout:       contains all connections in BOUND state
  66   66   *      ipcl_proto_fanout:      IPv4 protocol fanout
  67   67   *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  68   68   *      ipcl_udp_fanout:        contains all UDP connections
  69   69   *      ipcl_iptun_fanout:      contains all IP tunnel connections
  70   70   *      ipcl_globalhash_fanout: contains all connections
  71   71   *
  72   72   * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  73   73   * which need to view all existing connections.
  74   74   *
  75   75   * All tables are protected by per-bucket locks. When both per-bucket lock and
  76   76   * connection lock need to be held, the per-bucket lock should be acquired
  77   77   * first, followed by the connection lock.
  78   78   *
  79   79   * All functions doing search in one of these tables increment a reference
  80   80   * counter on the connection found (if any). This reference should be dropped
  81   81   * when the caller has finished processing the connection.
  82   82   *
  83   83   *
  84   84   * INTERFACES:
  85   85   * ===========
  86   86   *
  87   87   * Connection Lookup:
  88   88   * ------------------
  89   89   *
  90   90   * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  91   91   * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  92   92   *
  93   93   * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  94   94   * it can't find any associated connection. If the connection is found, its
  95   95   * reference counter is incremented.
  96   96   *
  97   97   *      mp:     mblock, containing packet header. The full header should fit
  98   98   *              into a single mblock. It should also contain at least full IP
  99   99   *              and TCP or UDP header.
 100  100   *
 101  101   *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 102  102   *
 103  103   *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 104  104   *               the packet.
 105  105   *
 106  106   *      ira->ira_zoneid: The zone in which the returned connection must be; the
 107  107   *              zoneid corresponding to the ire_zoneid on the IRE located for
 108  108   *              the packet's destination address.
 109  109   *
 110  110   *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 111  111   *              IRAF_TX_SHARED_ADDR flags
 112  112   *
 113  113   *      For TCP connections, the lookup order is as follows:
 114  114   *              5-tuple {src, dst, protocol, local port, remote port}
 115  115   *                      lookup in ipcl_conn_fanout table.
  116  116   *              3-tuple {dst, local port, protocol} lookup in
 117  117   *                      ipcl_bind_fanout table.
 118  118   *
 119  119   *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 120  120   *      remote port} lookup is done on ipcl_udp_fanout. Note that,
  121  121   *      these interfaces do not handle cases where a packet belongs
 122  122   *      to multiple UDP clients, which is handled in IP itself.
 123  123   *
 124  124   * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 125  125   * determine which actual zone gets the segment.  This is used only in a
 126  126   * labeled environment.  The matching rules are:
 127  127   *
 128  128   *      - If it's not a multilevel port, then the label on the packet selects
 129  129   *        the zone.  Unlabeled packets are delivered to the global zone.
 130  130   *
 131  131   *      - If it's a multilevel port, then only the zone registered to receive
 132  132   *        packets on that port matches.
 133  133   *
 134  134   * Also, in a labeled environment, packet labels need to be checked.  For fully
 135  135   * bound TCP connections, we can assume that the packet label was checked
 136  136   * during connection establishment, and doesn't need to be checked on each
 137  137   * packet.  For others, though, we need to check for strict equality or, for
 138  138   * multilevel ports, membership in the range or set.  This part currently does
 139  139   * a tnrh lookup on each packet, but could be optimized to use cached results
 140  140   * if that were necessary.  (SCTP doesn't come through here, but if it did,
 141  141   * we would apply the same rules as TCP.)
 142  142   *
 143  143   * An implication of the above is that fully-bound TCP sockets must always use
 144  144   * distinct 4-tuples; they can't be discriminated by label alone.
 145  145   *
 146  146   * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 147  147   * as there's no connection set-up handshake and no shared state.
 148  148   *
 149  149   * Labels on looped-back packets within a single zone do not need to be
 150  150   * checked, as all processes in the same zone have the same label.
 151  151   *
 152  152   * Finally, for unlabeled packets received by a labeled system, special rules
 153  153   * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 154  154   * socket in the zone whose label matches the default label of the sender, if
 155  155   * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 156  156   * receiver's label must dominate the sender's default label.
 157  157   *
 158  158   * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 159  159   * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 160  160   *                                       ip_stack);
 161  161   *
  162  162   *      Lookup routine to find an exact match for {src, dst, local port,
  163  163   *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 164  164   *      ports are read from the IP and TCP header respectively.
 165  165   *
 166  166   * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 167  167   *                                       zoneid, ip_stack);
 168  168   * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 169  169   *                                       zoneid, ip_stack);
 170  170   *
 171  171   *      Lookup routine to find a listener with the tuple {lport, laddr,
 172  172   *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 173  173   *      parameter interface index is also compared.
 174  174   *
 175  175   * void ipcl_walk(func, arg, ip_stack)
 176  176   *
 177  177   *      Apply 'func' to every connection available. The 'func' is called as
 178  178   *      (*func)(connp, arg). The walk is non-atomic so connections may be
 179  179   *      created and destroyed during the walk. The CONN_CONDEMNED and
 180  180   *      CONN_INCIPIENT flags ensure that connections which are newly created
 181  181   *      or being destroyed are not selected by the walker.
 182  182   *
 183  183   * Table Updates
 184  184   * -------------
 185  185   *
 186  186   * int ipcl_conn_insert(connp);
 187  187   * int ipcl_conn_insert_v4(connp);
 188  188   * int ipcl_conn_insert_v6(connp);
 189  189   *
 190  190   *      Insert 'connp' in the ipcl_conn_fanout.
  191  191   *      Arguments :
 192  192   *              connp           conn_t to be inserted
 193  193   *
 194  194   *      Return value :
 195  195   *              0               if connp was inserted
 196  196   *              EADDRINUSE      if the connection with the same tuple
 197  197   *                              already exists.
 198  198   *
 199  199   * int ipcl_bind_insert(connp);
 200  200   * int ipcl_bind_insert_v4(connp);
 201  201   * int ipcl_bind_insert_v6(connp);
 202  202   *
 203  203   *      Insert 'connp' in ipcl_bind_fanout.
  204  204   *      Arguments :
 205  205   *              connp           conn_t to be inserted
 206  206   *
 207  207   *
 208  208   * void ipcl_hash_remove(connp);
 209  209   *
 210  210   *      Removes the 'connp' from the connection fanout table.
 211  211   *
 212  212   * Connection Creation/Destruction
 213  213   * -------------------------------
 214  214   *
 215  215   * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 216  216   *
 217  217   *      Creates a new conn based on the type flag, inserts it into
 218  218   *      globalhash table.
 219  219   *
 220  220   *      type:   This flag determines the type of conn_t which needs to be
 221  221   *              created i.e., which kmem_cache it comes from.
 222  222   *              IPCL_TCPCONN    indicates a TCP connection
 223  223   *              IPCL_SCTPCONN   indicates a SCTP connection
 224  224   *              IPCL_UDPCONN    indicates a UDP conn_t.
 225  225   *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 226  226   *              IPCL_RTSCONN    indicates a RTS conn_t.
 227  227   *              IPCL_IPCCONN    indicates all other connections.
 228  228   *
 229  229   * void ipcl_conn_destroy(connp)
 230  230   *
 231  231   *      Destroys the connection state, removes it from the global
 232  232   *      connection hash table and frees its memory.
 233  233   */
 234  234  
 235  235  #include <sys/types.h>
 236  236  #include <sys/stream.h>
 237  237  #include <sys/stropts.h>
 238  238  #include <sys/sysmacros.h>
 239  239  #include <sys/strsubr.h>
 240  240  #include <sys/strsun.h>
 241  241  #define _SUN_TPI_VERSION 2
 242  242  #include <sys/ddi.h>
 243  243  #include <sys/cmn_err.h>
 244  244  #include <sys/debug.h>
 245  245  
 246  246  #include <sys/systm.h>
 247  247  #include <sys/param.h>
 248  248  #include <sys/kmem.h>
 249  249  #include <sys/isa_defs.h>
 250  250  #include <inet/common.h>
 251  251  #include <netinet/ip6.h>
 252  252  #include <netinet/icmp6.h>
 253  253  
 254  254  #include <inet/ip.h>
 255  255  #include <inet/ip_if.h>
 256  256  #include <inet/ip_ire.h>
 257  257  #include <inet/ip6.h>
 258  258  #include <inet/ip_ndp.h>
 259  259  #include <inet/ip_impl.h>
 260  260  #include <inet/udp_impl.h>
 261  261  #include <inet/sctp_ip.h>
 262  262  #include <inet/sctp/sctp_impl.h>
 263  263  #include <inet/rawip_impl.h>
 264  264  #include <inet/rts_impl.h>
 265  265  #include <inet/iptun/iptun_impl.h>
 266  266  
 267  267  #include <sys/cpuvar.h>
 268  268  
 269  269  #include <inet/ipclassifier.h>
 270  270  #include <inet/tcp.h>
 271  271  #include <inet/ipsec_impl.h>
 272  272  
 273  273  #include <sys/tsol/tnet.h>
 274  274  #include <sys/sockio.h>
 275  275  
  276  276  /* Old value for compatibility. Settable in /etc/system */
 277  277  uint_t tcp_conn_hash_size = 0;
 278  278  
  279  279  /* New value. Zero means choose automatically.  Settable in /etc/system */
 280  280  uint_t ipcl_conn_hash_size = 0;
 281  281  uint_t ipcl_conn_hash_memfactor = 8192;
 282  282  uint_t ipcl_conn_hash_maxsize = 82500;
 283  283  
 284  284  /* bind/udp fanout table size */
 285  285  uint_t ipcl_bind_fanout_size = 512;
 286  286  uint_t ipcl_udp_fanout_size = 16384;
 287  287  
 288  288  /* Raw socket fanout size.  Must be a power of 2. */
 289  289  uint_t ipcl_raw_fanout_size = 256;
 290  290  
 291  291  /*
 292  292   * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 293  293   * expect that most large deployments would have hundreds of tunnels, and
 294  294   * thousands in the extreme case.
 295  295   */
 296  296  uint_t ipcl_iptun_fanout_size = 6143;
 297  297  
 298  298  /*
 299  299   * Power of 2^N Primes useful for hashing for N of 0-28,
 300  300   * these primes are the nearest prime <= 2^N - 2^(N-2).
 301  301   */
 302  302  
 303  303  #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
 304  304                  6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
 305  305                  786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
 306  306                  50331599, 100663291, 201326557, 0}
 307  307  
 308  308  /*
 309  309   * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 310  310   * are aligned on cache lines.
 311  311   */
 312  312  typedef union itc_s {
 313  313          conn_t  itc_conn;
 314  314          char    itcu_filler[CACHE_ALIGN(conn_s)];
 315  315  } itc_t;
 316  316  
 317  317  struct kmem_cache  *tcp_conn_cache;
 318  318  struct kmem_cache  *ip_conn_cache;
 319  319  extern struct kmem_cache  *sctp_conn_cache;
 320  320  struct kmem_cache  *udp_conn_cache;
 321  321  struct kmem_cache  *rawip_conn_cache;
 322  322  struct kmem_cache  *rts_conn_cache;
 323  323  
 324  324  extern void     tcp_timermp_free(tcp_t *);
 325  325  extern mblk_t   *tcp_timermp_alloc(int);
 326  326  
 327  327  static int      ip_conn_constructor(void *, void *, int);
 328  328  static void     ip_conn_destructor(void *, void *);
 329  329  
 330  330  static int      tcp_conn_constructor(void *, void *, int);
 331  331  static void     tcp_conn_destructor(void *, void *);
 332  332  
 333  333  static int      udp_conn_constructor(void *, void *, int);
 334  334  static void     udp_conn_destructor(void *, void *);
 335  335  
 336  336  static int      rawip_conn_constructor(void *, void *, int);
 337  337  static void     rawip_conn_destructor(void *, void *);
 338  338  
 339  339  static int      rts_conn_constructor(void *, void *, int);
 340  340  static void     rts_conn_destructor(void *, void *);
 341  341  
 342  342  /*
 343  343   * Global (for all stack instances) init routine
 344  344   */
 345  345  void
 346  346  ipcl_g_init(void)
 347  347  {
 348  348          ip_conn_cache = kmem_cache_create("ip_conn_cache",
 349  349              sizeof (conn_t), CACHE_ALIGN_SIZE,
 350  350              ip_conn_constructor, ip_conn_destructor,
 351  351              NULL, NULL, NULL, 0);
 352  352  
 353  353          tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 354  354              sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 355  355              tcp_conn_constructor, tcp_conn_destructor,
 356  356              tcp_conn_reclaim, NULL, NULL, 0);
 357  357  
 358  358          udp_conn_cache = kmem_cache_create("udp_conn_cache",
 359  359              sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 360  360              udp_conn_constructor, udp_conn_destructor,
 361  361              NULL, NULL, NULL, 0);
 362  362  
 363  363          rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 364  364              sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 365  365              rawip_conn_constructor, rawip_conn_destructor,
 366  366              NULL, NULL, NULL, 0);
 367  367  
 368  368          rts_conn_cache = kmem_cache_create("rts_conn_cache",
 369  369              sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 370  370              rts_conn_constructor, rts_conn_destructor,
 371  371              NULL, NULL, NULL, 0);
 372  372  }
 373  373  
 374  374  /*
 375  375   * ipclassifier intialization routine, sets up hash tables.
 376  376   */
 377  377  void
 378  378  ipcl_init(ip_stack_t *ipst)
 379  379  {
 380  380          int i;
 381  381          int sizes[] = P2Ps();
 382  382  
 383  383          /*
 384  384           * Calculate size of conn fanout table from /etc/system settings
 385  385           */
 386  386          if (ipcl_conn_hash_size != 0) {
 387  387                  ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 388  388          } else if (tcp_conn_hash_size != 0) {
 389  389                  ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 390  390          } else {
 391  391                  extern pgcnt_t freemem;
 392  392  
 393  393                  ipst->ips_ipcl_conn_fanout_size =
 394  394                      (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 395  395  
 396  396                  if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 397  397                          ipst->ips_ipcl_conn_fanout_size =
 398  398                              ipcl_conn_hash_maxsize;
 399  399                  }
 400  400          }
 401  401  
 402  402          for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 403  403                  if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 404  404                          break;
 405  405                  }
 406  406          }
 407  407          if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 408  408                  /* Out of range, use the 2^16 value */
 409  409                  ipst->ips_ipcl_conn_fanout_size = sizes[16];
 410  410          }
 411  411  
 412  412          /* Take values from /etc/system */
 413  413          ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 414  414          ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 415  415          ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 416  416          ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 417  417  
 418  418          ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 419  419  
 420  420          ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 421  421              ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 422  422  
 423  423          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 424  424                  mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 425  425                      MUTEX_DEFAULT, NULL);
 426  426          }
 427  427  
 428  428          ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 429  429              ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 430  430  
 431  431          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 432  432                  mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 433  433                      MUTEX_DEFAULT, NULL);
 434  434          }
 435  435  
 436  436          ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 437  437              sizeof (connf_t), KM_SLEEP);
 438  438          for (i = 0; i < IPPROTO_MAX; i++) {
 439  439                  mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 440  440                      MUTEX_DEFAULT, NULL);
 441  441          }
 442  442  
 443  443          ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 444  444              sizeof (connf_t), KM_SLEEP);
 445  445          for (i = 0; i < IPPROTO_MAX; i++) {
 446  446                  mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 447  447                      MUTEX_DEFAULT, NULL);
 448  448          }
 449  449  
 450  450          ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 451  451          mutex_init(&ipst->ips_rts_clients->connf_lock,
 452  452              NULL, MUTEX_DEFAULT, NULL);
 453  453  
 454  454          ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 455  455              ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 456  456          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 457  457                  mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 458  458                      MUTEX_DEFAULT, NULL);
 459  459          }
 460  460  
 461  461          ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 462  462              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 463  463          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 464  464                  mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 465  465                      MUTEX_DEFAULT, NULL);
 466  466          }
 467  467  
 468  468          ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 469  469              ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 470  470          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 471  471                  mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 472  472                      MUTEX_DEFAULT, NULL);
 473  473          }
 474  474  
 475  475          ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 476  476              sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 477  477          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 478  478                  mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 479  479                      NULL, MUTEX_DEFAULT, NULL);
 480  480          }
 481  481  }
 482  482  
 483  483  void
 484  484  ipcl_g_destroy(void)
 485  485  {
 486  486          kmem_cache_destroy(ip_conn_cache);
 487  487          kmem_cache_destroy(tcp_conn_cache);
 488  488          kmem_cache_destroy(udp_conn_cache);
 489  489          kmem_cache_destroy(rawip_conn_cache);
 490  490          kmem_cache_destroy(rts_conn_cache);
 491  491  }
 492  492  
 493  493  /*
 494  494   * All user-level and kernel use of the stack must be gone
 495  495   * by now.
 496  496   */
 497  497  void
 498  498  ipcl_destroy(ip_stack_t *ipst)
 499  499  {
 500  500          int i;
 501  501  
 502  502          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 503  503                  ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 504  504                  mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 505  505          }
 506  506          kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 507  507              sizeof (connf_t));
 508  508          ipst->ips_ipcl_conn_fanout = NULL;
 509  509  
 510  510          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 511  511                  ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 512  512                  mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 513  513          }
 514  514          kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 515  515              sizeof (connf_t));
 516  516          ipst->ips_ipcl_bind_fanout = NULL;
 517  517  
 518  518          for (i = 0; i < IPPROTO_MAX; i++) {
 519  519                  ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 520  520                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 521  521          }
 522  522          kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 523  523              IPPROTO_MAX * sizeof (connf_t));
 524  524          ipst->ips_ipcl_proto_fanout_v4 = NULL;
 525  525  
 526  526          for (i = 0; i < IPPROTO_MAX; i++) {
 527  527                  ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 528  528                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 529  529          }
 530  530          kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 531  531              IPPROTO_MAX * sizeof (connf_t));
 532  532          ipst->ips_ipcl_proto_fanout_v6 = NULL;
 533  533  
 534  534          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 535  535                  ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 536  536                  mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 537  537          }
 538  538          kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 539  539              sizeof (connf_t));
 540  540          ipst->ips_ipcl_udp_fanout = NULL;
 541  541  
 542  542          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 543  543                  ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 544  544                  mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 545  545          }
 546  546          kmem_free(ipst->ips_ipcl_iptun_fanout,
 547  547              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 548  548          ipst->ips_ipcl_iptun_fanout = NULL;
 549  549  
 550  550          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 551  551                  ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 552  552                  mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 553  553          }
 554  554          kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 555  555              sizeof (connf_t));
 556  556          ipst->ips_ipcl_raw_fanout = NULL;
 557  557  
 558  558          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 559  559                  ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 560  560                  mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 561  561          }
 562  562          kmem_free(ipst->ips_ipcl_globalhash_fanout,
 563  563              sizeof (connf_t) * CONN_G_HASH_SIZE);
 564  564          ipst->ips_ipcl_globalhash_fanout = NULL;
 565  565  
 566  566          ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 567  567          mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 568  568          kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 569  569          ipst->ips_rts_clients = NULL;
 570  570  }
 571  571  
 572  572  /*
 573  573   * conn creation routine. initialize the conn, sets the reference
 574  574   * and inserts it in the global hash table.
 575  575   */
 576  576  conn_t *
 577  577  ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 578  578  {
 579  579          conn_t  *connp;
 580  580          struct kmem_cache *conn_cache;
 581  581  
 582  582          switch (type) {
 583  583          case IPCL_SCTPCONN:
 584  584                  if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 585  585                          return (NULL);
 586  586                  sctp_conn_init(connp);
 587  587                  netstack_hold(ns);
 588  588                  connp->conn_netstack = ns;
 589  589                  connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 590  590                  connp->conn_ixa->ixa_conn_id = (long)connp;
 591  591                  ipcl_globalhash_insert(connp);
 592  592                  return (connp);
 593  593  
 594  594          case IPCL_TCPCONN:
 595  595                  conn_cache = tcp_conn_cache;
 596  596                  break;
 597  597  
 598  598          case IPCL_UDPCONN:
 599  599                  conn_cache = udp_conn_cache;
 600  600                  break;
 601  601  
 602  602          case IPCL_RAWIPCONN:
 603  603                  conn_cache = rawip_conn_cache;
 604  604                  break;
 605  605  
 606  606          case IPCL_RTSCONN:
 607  607                  conn_cache = rts_conn_cache;
 608  608                  break;
 609  609  
 610  610          case IPCL_IPCCONN:
 611  611                  conn_cache = ip_conn_cache;
 612  612                  break;
 613  613  
 614  614          default:
 615  615                  connp = NULL;
 616  616                  ASSERT(0);
 617  617          }
 618  618  
 619  619          if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 620  620                  return (NULL);
 621  621  
 622  622          connp->conn_ref = 1;
 623  623          netstack_hold(ns);
 624  624          connp->conn_netstack = ns;
 625  625          connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 626  626          connp->conn_ixa->ixa_conn_id = (long)connp;
 627  627          ipcl_globalhash_insert(connp);
 628  628          return (connp);
 629  629  }
 630  630  
 631  631  void
 632  632  ipcl_conn_destroy(conn_t *connp)
 633  633  {
 634  634          mblk_t  *mp;
 635  635          netstack_t      *ns = connp->conn_netstack;
 636  636  
 637  637          ASSERT(!MUTEX_HELD(&connp->conn_lock));
 638  638          ASSERT(connp->conn_ref == 0);
 639  639          ASSERT(connp->conn_ioctlref == 0);
 640  640  
 641  641          DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 642  642  
 643  643          if (connp->conn_cred != NULL) {
 644  644                  crfree(connp->conn_cred);
 645  645                  connp->conn_cred = NULL;
 646  646                  /* ixa_cred done in ipcl_conn_cleanup below */
 647  647          }
 648  648  
 649  649          if (connp->conn_ht_iphc != NULL) {
 650  650                  kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 651  651                  connp->conn_ht_iphc = NULL;
 652  652                  connp->conn_ht_iphc_allocated = 0;
 653  653                  connp->conn_ht_iphc_len = 0;
 654  654                  connp->conn_ht_ulp = NULL;
 655  655                  connp->conn_ht_ulp_len = 0;
 656  656          }
 657  657          ip_pkt_free(&connp->conn_xmit_ipp);
 658  658  
 659  659          ipcl_globalhash_remove(connp);
 660  660  
 661  661          if (connp->conn_latch != NULL) {
 662  662                  IPLATCH_REFRELE(connp->conn_latch);
 663  663                  connp->conn_latch = NULL;
 664  664          }
 665  665          if (connp->conn_latch_in_policy != NULL) {
 666  666                  IPPOL_REFRELE(connp->conn_latch_in_policy);
 667  667                  connp->conn_latch_in_policy = NULL;
 668  668          }
 669  669          if (connp->conn_latch_in_action != NULL) {
 670  670                  IPACT_REFRELE(connp->conn_latch_in_action);
 671  671                  connp->conn_latch_in_action = NULL;
 672  672          }
 673  673          if (connp->conn_policy != NULL) {
 674  674                  IPPH_REFRELE(connp->conn_policy, ns);
 675  675                  connp->conn_policy = NULL;
 676  676          }
 677  677  
 678  678          if (connp->conn_ipsec_opt_mp != NULL) {
 679  679                  freemsg(connp->conn_ipsec_opt_mp);
 680  680                  connp->conn_ipsec_opt_mp = NULL;
 681  681          }
 682  682  
 683  683          if (connp->conn_flags & IPCL_TCPCONN) {
 684  684                  tcp_t *tcp = connp->conn_tcp;
 685  685  
 686  686                  tcp_free(tcp);
 687  687                  mp = tcp->tcp_timercache;
 688  688  
 689  689                  tcp->tcp_tcps = NULL;
 690  690  
 691  691                  /*
 692  692                   * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 693  693                   * the mblk.
 694  694                   */
 695  695                  if (tcp->tcp_rsrv_mp != NULL) {
 696  696                          freeb(tcp->tcp_rsrv_mp);
 697  697                          tcp->tcp_rsrv_mp = NULL;
 698  698                          mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 699  699                  }
 700  700  
 701  701                  ipcl_conn_cleanup(connp);
 702  702                  connp->conn_flags = IPCL_TCPCONN;
 703  703                  if (ns != NULL) {
 704  704                          ASSERT(tcp->tcp_tcps == NULL);
 705  705                          connp->conn_netstack = NULL;
 706  706                          connp->conn_ixa->ixa_ipst = NULL;
 707  707                          netstack_rele(ns);
 708  708                  }
 709  709  
 710  710                  bzero(tcp, sizeof (tcp_t));
 711  711  
 712  712                  tcp->tcp_timercache = mp;
 713  713                  tcp->tcp_connp = connp;
 714  714                  kmem_cache_free(tcp_conn_cache, connp);
 715  715                  return;
 716  716          }
 717  717  
 718  718          if (connp->conn_flags & IPCL_SCTPCONN) {
 719  719                  ASSERT(ns != NULL);
 720  720                  sctp_free(connp);
 721  721                  return;
 722  722          }
 723  723  
 724  724          ipcl_conn_cleanup(connp);
 725  725          if (ns != NULL) {
 726  726                  connp->conn_netstack = NULL;
 727  727                  connp->conn_ixa->ixa_ipst = NULL;
 728  728                  netstack_rele(ns);
 729  729          }
 730  730  
 731  731          /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 732  732          if (connp->conn_flags & IPCL_UDPCONN) {
 733  733                  connp->conn_flags = IPCL_UDPCONN;
 734  734                  kmem_cache_free(udp_conn_cache, connp);
 735  735          } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 736  736                  connp->conn_flags = IPCL_RAWIPCONN;
 737  737                  connp->conn_proto = IPPROTO_ICMP;
 738  738                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
 739  739                  kmem_cache_free(rawip_conn_cache, connp);
 740  740          } else if (connp->conn_flags & IPCL_RTSCONN) {
 741  741                  connp->conn_flags = IPCL_RTSCONN;
 742  742                  kmem_cache_free(rts_conn_cache, connp);
 743  743          } else {
 744  744                  connp->conn_flags = IPCL_IPCCONN;
 745  745                  ASSERT(connp->conn_flags & IPCL_IPCCONN);
 746  746                  ASSERT(connp->conn_priv == NULL);
 747  747                  kmem_cache_free(ip_conn_cache, connp);
 748  748          }
 749  749  }
 750  750  
 751  751  /*
 752  752   * Running in cluster mode - deregister listener information
 753  753   */
 754  754  static void
 755  755  ipcl_conn_unlisten(conn_t *connp)
 756  756  {
 757  757          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 758  758          ASSERT(connp->conn_lport != 0);
 759  759  
 760  760          if (cl_inet_unlisten != NULL) {
 761  761                  sa_family_t     addr_family;
 762  762                  uint8_t         *laddrp;
 763  763  
 764  764                  if (connp->conn_ipversion == IPV6_VERSION) {
 765  765                          addr_family = AF_INET6;
 766  766                          laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 767  767                  } else {
 768  768                          addr_family = AF_INET;
 769  769                          laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 770  770                  }
 771  771                  (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 772  772                      IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 773  773          }
 774  774          connp->conn_flags &= ~IPCL_CL_LISTENER;
 775  775  }
 776  776  
 777  777  /*
 778  778   * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 779  779   * which table the conn belonged to). So for debugging we can see which hash
 780  780   * table this connection was in.
 781  781   */
 782  782  #define IPCL_HASH_REMOVE(connp) {                                       \
 783  783          connf_t *connfp = (connp)->conn_fanout;                         \
 784  784          ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                     \
 785  785          if (connfp != NULL) {                                           \
 786  786                  mutex_enter(&connfp->connf_lock);                       \
 787  787                  if ((connp)->conn_next != NULL)                         \
 788  788                          (connp)->conn_next->conn_prev =                 \
 789  789                              (connp)->conn_prev;                         \
 790  790                  if ((connp)->conn_prev != NULL)                         \
 791  791                          (connp)->conn_prev->conn_next =                 \
 792  792                              (connp)->conn_next;                         \
 793  793                  else                                                    \
 794  794                          connfp->connf_head = (connp)->conn_next;        \
 795  795                  (connp)->conn_fanout = NULL;                            \
 796  796                  (connp)->conn_next = NULL;                              \
 797  797                  (connp)->conn_prev = NULL;                              \
 798  798                  (connp)->conn_flags |= IPCL_REMOVED;                    \
 799  799                  if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)      \
 800  800                          ipcl_conn_unlisten((connp));                    \
 801  801                  CONN_DEC_REF((connp));                                  \
 802  802                  mutex_exit(&connfp->connf_lock);                        \
 803  803          }                                                               \
 804  804  }
 805  805  
 806  806  void
 807  807  ipcl_hash_remove(conn_t *connp)
 808  808  {
 809  809          uint8_t         protocol = connp->conn_proto;
 810  810  
 811  811          IPCL_HASH_REMOVE(connp);
 812  812          if (protocol == IPPROTO_RSVP)
 813  813                  ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 814  814  }
 815  815  
 816  816  /*
 817  817   * The whole purpose of this function is allow removal of
 818  818   * a conn_t from the connected hash for timewait reclaim.
 819  819   * This is essentially a TW reclaim fastpath where timewait
 820  820   * collector checks under fanout lock (so no one else can
 821  821   * get access to the conn_t) that refcnt is 2 i.e. one for
 822  822   * TCP and one for the classifier hash list. If ref count
 823  823   * is indeed 2, we can just remove the conn under lock and
 824  824   * avoid cleaning up the conn under squeue. This gives us
 825  825   * improved performance.
 826  826   */
 827  827  void
 828  828  ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 829  829  {
 830  830          ASSERT(MUTEX_HELD(&connfp->connf_lock));
 831  831          ASSERT(MUTEX_HELD(&connp->conn_lock));
 832  832          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 833  833  
 834  834          if ((connp)->conn_next != NULL) {
 835  835                  (connp)->conn_next->conn_prev = (connp)->conn_prev;
 836  836          }
 837  837          if ((connp)->conn_prev != NULL) {
 838  838                  (connp)->conn_prev->conn_next = (connp)->conn_next;
 839  839          } else {
 840  840                  connfp->connf_head = (connp)->conn_next;
 841  841          }
 842  842          (connp)->conn_fanout = NULL;
 843  843          (connp)->conn_next = NULL;
 844  844          (connp)->conn_prev = NULL;
 845  845          (connp)->conn_flags |= IPCL_REMOVED;
 846  846          ASSERT((connp)->conn_ref == 2);
 847  847          (connp)->conn_ref--;
 848  848  }
 849  849  
 850  850  #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 851  851          ASSERT((connp)->conn_fanout == NULL);                           \
 852  852          ASSERT((connp)->conn_next == NULL);                             \
 853  853          ASSERT((connp)->conn_prev == NULL);                             \
 854  854          if ((connfp)->connf_head != NULL) {                             \
 855  855                  (connfp)->connf_head->conn_prev = (connp);              \
 856  856                  (connp)->conn_next = (connfp)->connf_head;              \
 857  857          }                                                               \
 858  858          (connp)->conn_fanout = (connfp);                                \
 859  859          (connfp)->connf_head = (connp);                                 \
 860  860          (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 861  861              IPCL_CONNECTED;                                             \
 862  862          CONN_INC_REF(connp);                                            \
 863  863  }
 864  864  
 865  865  #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 866  866          IPCL_HASH_REMOVE((connp));                                      \
 867  867          mutex_enter(&(connfp)->connf_lock);                             \
 868  868          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 869  869          mutex_exit(&(connfp)->connf_lock);                              \
 870  870  }
 871  871  
 872  872  /*
 873  873   * When inserting bound or wildcard entries into the hash, ordering rules are
 874  874   * used to facilitate timely and correct lookups.  The order is as follows:
 875  875   * 1. Entries bound to a specific address
 876  876   * 2. Entries bound to INADDR_ANY
 877  877   * 3. Entries bound to ADDR_UNSPECIFIED
 878  878   * Entries in a category which share conn_lport (such as those using
 879  879   * SO_REUSEPORT) will be ordered such that the newest inserted is first.
 880  880   */
 881  881  
 882  882  void
 883  883  ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
 884  884  {
 885  885          conn_t *pconnp, *nconnp;
 886  886  
 887  887          IPCL_HASH_REMOVE(connp);
 888  888          mutex_enter(&connfp->connf_lock);
 889  889          nconnp = connfp->connf_head;
 890  890          pconnp = NULL;
 891  891          while (nconnp != NULL) {
 892  892                  /*
 893  893                   * Walk though entries associated with the fanout until one is
 894  894                   * found which fulfills any of these conditions:
 895  895                   * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
 896  896                   * 2. Listen port the same as connp
 897  897                   */
 898  898                  if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
 899  899                      connp->conn_lport == nconnp->conn_lport)
 900  900                          break;
 901  901                  pconnp = nconnp;
 902  902                  nconnp = nconnp->conn_next;
 903  903          }
 904  904          if (pconnp != NULL) {
 905  905                  pconnp->conn_next = connp;
 906  906                  connp->conn_prev = pconnp;
 907  907          } else {
 908  908                  connfp->connf_head = connp;
 909  909          }
 910  910          if (nconnp != NULL) {
 911  911                  connp->conn_next = nconnp;
 912  912                  nconnp->conn_prev = connp;
  
    | 
      ↓ open down ↓ | 
    912 lines elided | 
    
      ↑ open up ↑ | 
  
 913  913          }
 914  914          connp->conn_fanout = connfp;
 915  915          connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
 916  916          CONN_INC_REF(connp);
 917  917          mutex_exit(&connfp->connf_lock);
 918  918  }
 919  919  
 920  920  void
 921  921  ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 922  922  {
 923      -        conn_t **list, *prev, *next;
 924  923          conn_t *pconnp = NULL, *nconnp;
 925  924          boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
 926  925  
 927  926          IPCL_HASH_REMOVE(connp);
 928  927          mutex_enter(&connfp->connf_lock);
 929  928          nconnp = connfp->connf_head;
 930  929          pconnp = NULL;
 931  930          while (nconnp != NULL) {
 932  931                  if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
 933  932                      isv4mapped && connp->conn_lport == nconnp->conn_lport)
 934  933                          break;
 935  934                  if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
 936  935                      (isv4mapped ||
 937  936                      connp->conn_lport == nconnp->conn_lport))
 938  937                          break;
 939  938  
 940  939                  pconnp = nconnp;
 941  940                  nconnp = nconnp->conn_next;
 942  941          }
 943  942          if (pconnp != NULL) {
 944  943                  pconnp->conn_next = connp;
 945  944                  connp->conn_prev = pconnp;
 946  945          } else {
 947  946                  connfp->connf_head = connp;
 948  947          }
 949  948          if (nconnp != NULL) {
 950  949                  connp->conn_next = nconnp;
 951  950                  nconnp->conn_prev = connp;
 952  951          }
 953  952          connp->conn_fanout = connfp;
 954  953          connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
 955  954          CONN_INC_REF(connp);
 956  955          mutex_exit(&connfp->connf_lock);
 957  956  }
 958  957  
 959  958  /*
 960  959   * Because the classifier is used to classify inbound packets, the destination
 961  960   * address is meant to be our local tunnel address (tunnel source), and the
 962  961   * source the remote tunnel address (tunnel destination).
 963  962   *
 964  963   * Note that conn_proto can't be used for fanout since the upper protocol
 965  964   * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 966  965   */
 967  966  conn_t *
 968  967  ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 969  968  {
 970  969          connf_t *connfp;
 971  970          conn_t  *connp;
 972  971  
 973  972          /* first look for IPv4 tunnel links */
 974  973          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
 975  974          mutex_enter(&connfp->connf_lock);
 976  975          for (connp = connfp->connf_head; connp != NULL;
 977  976              connp = connp->conn_next) {
 978  977                  if (IPCL_IPTUN_MATCH(connp, *dst, *src))
 979  978                          break;
 980  979          }
 981  980          if (connp != NULL)
 982  981                  goto done;
 983  982  
 984  983          mutex_exit(&connfp->connf_lock);
 985  984  
 986  985          /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
 987  986          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
 988  987              INADDR_ANY)];
 989  988          mutex_enter(&connfp->connf_lock);
 990  989          for (connp = connfp->connf_head; connp != NULL;
 991  990              connp = connp->conn_next) {
 992  991                  if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
 993  992                          break;
 994  993          }
 995  994  done:
 996  995          if (connp != NULL)
 997  996                  CONN_INC_REF(connp);
 998  997          mutex_exit(&connfp->connf_lock);
 999  998          return (connp);
1000  999  }
1001 1000  
1002 1001  conn_t *
1003 1002  ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1004 1003  {
1005 1004          connf_t *connfp;
1006 1005          conn_t  *connp;
1007 1006  
1008 1007          /* Look for an IPv6 tunnel link */
1009 1008          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1010 1009          mutex_enter(&connfp->connf_lock);
1011 1010          for (connp = connfp->connf_head; connp != NULL;
1012 1011              connp = connp->conn_next) {
1013 1012                  if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1014 1013                          CONN_INC_REF(connp);
1015 1014                          break;
1016 1015                  }
1017 1016          }
1018 1017          mutex_exit(&connfp->connf_lock);
1019 1018          return (connp);
1020 1019  }
1021 1020  
1022 1021  /*
1023 1022   * This function is used only for inserting SCTP raw socket now.
1024 1023   * This may change later.
1025 1024   *
1026 1025   * Note that only one raw socket can be bound to a port.  The param
1027 1026   * lport is in network byte order.
1028 1027   */
1029 1028  static int
1030 1029  ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1031 1030  {
1032 1031          connf_t *connfp;
1033 1032          conn_t  *oconnp;
1034 1033          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1035 1034  
1036 1035          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1037 1036  
1038 1037          /* Check for existing raw socket already bound to the port. */
1039 1038          mutex_enter(&connfp->connf_lock);
1040 1039          for (oconnp = connfp->connf_head; oconnp != NULL;
1041 1040              oconnp = oconnp->conn_next) {
1042 1041                  if (oconnp->conn_lport == lport &&
1043 1042                      oconnp->conn_zoneid == connp->conn_zoneid &&
1044 1043                      oconnp->conn_family == connp->conn_family &&
1045 1044                      ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1046 1045                      IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1047 1046                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1048 1047                      IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1049 1048                      IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1050 1049                      &connp->conn_laddr_v6))) {
1051 1050                          break;
1052 1051                  }
1053 1052          }
1054 1053          mutex_exit(&connfp->connf_lock);
1055 1054          if (oconnp != NULL)
1056 1055                  return (EADDRNOTAVAIL);
1057 1056  
1058 1057          if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1059 1058              IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1060 1059                  if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1061 1060                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1062 1061                          ipcl_hash_insert_wildcard(connfp, connp);
1063 1062                  } else {
1064 1063                          ipcl_hash_insert_bound(connfp, connp);
1065 1064                  }
1066 1065          } else {
1067 1066                  IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1068 1067          }
1069 1068          return (0);
1070 1069  }
1071 1070  
1072 1071  static int
1073 1072  ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1074 1073  {
1075 1074          connf_t *connfp;
1076 1075          conn_t  *tconnp;
1077 1076          ipaddr_t laddr = connp->conn_laddr_v4;
1078 1077          ipaddr_t faddr = connp->conn_faddr_v4;
1079 1078  
1080 1079          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1081 1080          mutex_enter(&connfp->connf_lock);
1082 1081          for (tconnp = connfp->connf_head; tconnp != NULL;
1083 1082              tconnp = tconnp->conn_next) {
1084 1083                  if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1085 1084                          /* A tunnel is already bound to these addresses. */
1086 1085                          mutex_exit(&connfp->connf_lock);
1087 1086                          return (EADDRINUSE);
1088 1087                  }
1089 1088          }
1090 1089          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 1090          mutex_exit(&connfp->connf_lock);
1092 1091          return (0);
1093 1092  }
1094 1093  
1095 1094  static int
1096 1095  ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1097 1096  {
1098 1097          connf_t *connfp;
1099 1098          conn_t  *tconnp;
1100 1099          in6_addr_t *laddr = &connp->conn_laddr_v6;
1101 1100          in6_addr_t *faddr = &connp->conn_faddr_v6;
1102 1101  
1103 1102          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1104 1103          mutex_enter(&connfp->connf_lock);
1105 1104          for (tconnp = connfp->connf_head; tconnp != NULL;
1106 1105              tconnp = tconnp->conn_next) {
1107 1106                  if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1108 1107                          /* A tunnel is already bound to these addresses. */
1109 1108                          mutex_exit(&connfp->connf_lock);
1110 1109                          return (EADDRINUSE);
1111 1110                  }
1112 1111          }
1113 1112          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1114 1113          mutex_exit(&connfp->connf_lock);
1115 1114          return (0);
1116 1115  }
1117 1116  
1118 1117  /*
1119 1118   * Check for a MAC exemption conflict on a labeled system.  Note that for
1120 1119   * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1121 1120   * transport layer.  This check is for binding all other protocols.
1122 1121   *
1123 1122   * Returns true if there's a conflict.
1124 1123   */
1125 1124  static boolean_t
1126 1125  check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1127 1126  {
1128 1127          connf_t *connfp;
1129 1128          conn_t *tconn;
1130 1129  
1131 1130          connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1132 1131          mutex_enter(&connfp->connf_lock);
1133 1132          for (tconn = connfp->connf_head; tconn != NULL;
1134 1133              tconn = tconn->conn_next) {
1135 1134                  /* We don't allow v4 fallback for v6 raw socket */
1136 1135                  if (connp->conn_family != tconn->conn_family)
1137 1136                          continue;
1138 1137                  /* If neither is exempt, then there's no conflict */
1139 1138                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1140 1139                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1141 1140                          continue;
1142 1141                  /* We are only concerned about sockets for a different zone */
1143 1142                  if (connp->conn_zoneid == tconn->conn_zoneid)
1144 1143                          continue;
1145 1144                  /* If both are bound to different specific addrs, ok */
1146 1145                  if (connp->conn_laddr_v4 != INADDR_ANY &&
1147 1146                      tconn->conn_laddr_v4 != INADDR_ANY &&
1148 1147                      connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1149 1148                          continue;
1150 1149                  /* These two conflict; fail */
1151 1150                  break;
1152 1151          }
1153 1152          mutex_exit(&connfp->connf_lock);
1154 1153          return (tconn != NULL);
1155 1154  }
1156 1155  
1157 1156  static boolean_t
1158 1157  check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1159 1158  {
1160 1159          connf_t *connfp;
1161 1160          conn_t *tconn;
1162 1161  
1163 1162          connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1164 1163          mutex_enter(&connfp->connf_lock);
1165 1164          for (tconn = connfp->connf_head; tconn != NULL;
1166 1165              tconn = tconn->conn_next) {
1167 1166                  /* We don't allow v4 fallback for v6 raw socket */
1168 1167                  if (connp->conn_family != tconn->conn_family)
1169 1168                          continue;
1170 1169                  /* If neither is exempt, then there's no conflict */
1171 1170                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1172 1171                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1173 1172                          continue;
1174 1173                  /* We are only concerned about sockets for a different zone */
1175 1174                  if (connp->conn_zoneid == tconn->conn_zoneid)
1176 1175                          continue;
1177 1176                  /* If both are bound to different addrs, ok */
1178 1177                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1179 1178                      !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1180 1179                      !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1181 1180                      &tconn->conn_laddr_v6))
1182 1181                          continue;
1183 1182                  /* These two conflict; fail */
1184 1183                  break;
1185 1184          }
1186 1185          mutex_exit(&connfp->connf_lock);
1187 1186          return (tconn != NULL);
1188 1187  }
1189 1188  
1190 1189  /*
1191 1190   * (v4, v6) bind hash insertion routines
1192 1191   * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1193 1192   */
1194 1193  
1195 1194  int
1196 1195  ipcl_bind_insert(conn_t *connp)
1197 1196  {
1198 1197          if (connp->conn_ipversion == IPV6_VERSION)
1199 1198                  return (ipcl_bind_insert_v6(connp));
1200 1199          else
1201 1200                  return (ipcl_bind_insert_v4(connp));
1202 1201  }
1203 1202  
1204 1203  int
1205 1204  ipcl_bind_insert_v4(conn_t *connp)
1206 1205  {
1207 1206          connf_t *connfp;
1208 1207          int     ret = 0;
1209 1208          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1210 1209          uint16_t        lport = connp->conn_lport;
1211 1210          uint8_t         protocol = connp->conn_proto;
1212 1211  
1213 1212          if (IPCL_IS_IPTUN(connp))
1214 1213                  return (ipcl_iptun_hash_insert(connp, ipst));
1215 1214  
1216 1215          switch (protocol) {
1217 1216          default:
1218 1217                  if (is_system_labeled() &&
1219 1218                      check_exempt_conflict_v4(connp, ipst))
1220 1219                          return (EADDRINUSE);
1221 1220                  /* FALLTHROUGH */
1222 1221          case IPPROTO_UDP:
1223 1222                  if (protocol == IPPROTO_UDP) {
1224 1223                          connfp = &ipst->ips_ipcl_udp_fanout[
1225 1224                              IPCL_UDP_HASH(lport, ipst)];
1226 1225                  } else {
1227 1226                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1228 1227                  }
1229 1228  
1230 1229                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1231 1230                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1232 1231                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1233 1232                          ipcl_hash_insert_bound(connfp, connp);
1234 1233                  } else {
1235 1234                          ipcl_hash_insert_wildcard(connfp, connp);
1236 1235                  }
1237 1236                  if (protocol == IPPROTO_RSVP)
1238 1237                          ill_set_inputfn_all(ipst);
1239 1238                  break;
1240 1239  
1241 1240          case IPPROTO_TCP:
1242 1241                  /* Insert it in the Bind Hash */
1243 1242                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1244 1243                  connfp = &ipst->ips_ipcl_bind_fanout[
1245 1244                      IPCL_BIND_HASH(lport, ipst)];
1246 1245                  if (connp->conn_laddr_v4 != INADDR_ANY) {
1247 1246                          ipcl_hash_insert_bound(connfp, connp);
1248 1247                  } else {
1249 1248                          ipcl_hash_insert_wildcard(connfp, connp);
1250 1249                  }
1251 1250                  if (cl_inet_listen != NULL) {
1252 1251                          ASSERT(connp->conn_ipversion == IPV4_VERSION);
1253 1252                          connp->conn_flags |= IPCL_CL_LISTENER;
1254 1253                          (*cl_inet_listen)(
1255 1254                              connp->conn_netstack->netstack_stackid,
1256 1255                              IPPROTO_TCP, AF_INET,
1257 1256                              (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1258 1257                  }
1259 1258                  break;
1260 1259  
1261 1260          case IPPROTO_SCTP:
1262 1261                  ret = ipcl_sctp_hash_insert(connp, lport);
1263 1262                  break;
1264 1263          }
1265 1264  
1266 1265          return (ret);
1267 1266  }
1268 1267  
1269 1268  int
1270 1269  ipcl_bind_insert_v6(conn_t *connp)
1271 1270  {
1272 1271          connf_t         *connfp;
1273 1272          int             ret = 0;
1274 1273          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1275 1274          uint16_t        lport = connp->conn_lport;
1276 1275          uint8_t         protocol = connp->conn_proto;
1277 1276  
1278 1277          if (IPCL_IS_IPTUN(connp)) {
1279 1278                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1280 1279          }
1281 1280  
1282 1281          switch (protocol) {
1283 1282          default:
1284 1283                  if (is_system_labeled() &&
1285 1284                      check_exempt_conflict_v6(connp, ipst))
1286 1285                          return (EADDRINUSE);
1287 1286                  /* FALLTHROUGH */
1288 1287          case IPPROTO_UDP:
1289 1288                  if (protocol == IPPROTO_UDP) {
1290 1289                          connfp = &ipst->ips_ipcl_udp_fanout[
1291 1290                              IPCL_UDP_HASH(lport, ipst)];
1292 1291                  } else {
1293 1292                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1294 1293                  }
1295 1294  
1296 1295                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1297 1296                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1298 1297                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1299 1298                          ipcl_hash_insert_bound(connfp, connp);
1300 1299                  } else {
1301 1300                          ipcl_hash_insert_wildcard(connfp, connp);
1302 1301                  }
1303 1302                  break;
1304 1303  
1305 1304          case IPPROTO_TCP:
1306 1305                  /* Insert it in the Bind Hash */
1307 1306                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1308 1307                  connfp = &ipst->ips_ipcl_bind_fanout[
1309 1308                      IPCL_BIND_HASH(lport, ipst)];
1310 1309                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1311 1310                          ipcl_hash_insert_bound(connfp, connp);
1312 1311                  } else {
1313 1312                          ipcl_hash_insert_wildcard(connfp, connp);
1314 1313                  }
1315 1314                  if (cl_inet_listen != NULL) {
1316 1315                          sa_family_t     addr_family;
1317 1316                          uint8_t         *laddrp;
1318 1317  
1319 1318                          if (connp->conn_ipversion == IPV6_VERSION) {
1320 1319                                  addr_family = AF_INET6;
1321 1320                                  laddrp =
1322 1321                                      (uint8_t *)&connp->conn_bound_addr_v6;
1323 1322                          } else {
1324 1323                                  addr_family = AF_INET;
1325 1324                                  laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1326 1325                          }
1327 1326                          connp->conn_flags |= IPCL_CL_LISTENER;
1328 1327                          (*cl_inet_listen)(
1329 1328                              connp->conn_netstack->netstack_stackid,
1330 1329                              IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1331 1330                  }
1332 1331                  break;
1333 1332  
1334 1333          case IPPROTO_SCTP:
1335 1334                  ret = ipcl_sctp_hash_insert(connp, lport);
1336 1335                  break;
1337 1336          }
1338 1337  
1339 1338          return (ret);
1340 1339  }
1341 1340  
1342 1341  /*
1343 1342   * ipcl_conn_hash insertion routines.
1344 1343   * The caller has already set conn_proto and the addresses/ports in the conn_t.
1345 1344   */
1346 1345  
1347 1346  int
1348 1347  ipcl_conn_insert(conn_t *connp)
1349 1348  {
1350 1349          if (connp->conn_ipversion == IPV6_VERSION)
1351 1350                  return (ipcl_conn_insert_v6(connp));
1352 1351          else
1353 1352                  return (ipcl_conn_insert_v4(connp));
1354 1353  }
1355 1354  
1356 1355  int
1357 1356  ipcl_conn_insert_v4(conn_t *connp)
1358 1357  {
1359 1358          connf_t         *connfp;
1360 1359          conn_t          *tconnp;
1361 1360          int             ret = 0;
1362 1361          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1363 1362          uint16_t        lport = connp->conn_lport;
1364 1363          uint8_t         protocol = connp->conn_proto;
1365 1364  
1366 1365          if (IPCL_IS_IPTUN(connp))
1367 1366                  return (ipcl_iptun_hash_insert(connp, ipst));
1368 1367  
1369 1368          switch (protocol) {
1370 1369          case IPPROTO_TCP:
1371 1370                  /*
1372 1371                   * For TCP, we check whether the connection tuple already
1373 1372                   * exists before allowing the connection to proceed.  We
1374 1373                   * also allow indexing on the zoneid. This is to allow
1375 1374                   * multiple shared stack zones to have the same tcp
1376 1375                   * connection tuple. In practice this only happens for
1377 1376                   * INADDR_LOOPBACK as it's the only local address which
1378 1377                   * doesn't have to be unique.
1379 1378                   */
1380 1379                  connfp = &ipst->ips_ipcl_conn_fanout[
1381 1380                      IPCL_CONN_HASH(connp->conn_faddr_v4,
1382 1381                      connp->conn_ports, ipst)];
1383 1382                  mutex_enter(&connfp->connf_lock);
1384 1383                  for (tconnp = connfp->connf_head; tconnp != NULL;
1385 1384                      tconnp = tconnp->conn_next) {
1386 1385                          if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1387 1386                              connp->conn_faddr_v4, connp->conn_laddr_v4,
1388 1387                              connp->conn_ports) &&
1389 1388                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1390 1389                                  /* Already have a conn. bail out */
1391 1390                                  mutex_exit(&connfp->connf_lock);
1392 1391                                  return (EADDRINUSE);
1393 1392                          }
1394 1393                  }
1395 1394                  if (connp->conn_fanout != NULL) {
1396 1395                          /*
1397 1396                           * Probably a XTI/TLI application trying to do a
1398 1397                           * rebind. Let it happen.
1399 1398                           */
1400 1399                          mutex_exit(&connfp->connf_lock);
1401 1400                          IPCL_HASH_REMOVE(connp);
1402 1401                          mutex_enter(&connfp->connf_lock);
1403 1402                  }
1404 1403  
1405 1404                  ASSERT(connp->conn_recv != NULL);
1406 1405                  ASSERT(connp->conn_recvicmp != NULL);
1407 1406  
1408 1407                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1409 1408                  mutex_exit(&connfp->connf_lock);
1410 1409                  break;
1411 1410  
1412 1411          case IPPROTO_SCTP:
1413 1412                  /*
1414 1413                   * The raw socket may have already been bound, remove it
1415 1414                   * from the hash first.
1416 1415                   */
1417 1416                  IPCL_HASH_REMOVE(connp);
1418 1417                  ret = ipcl_sctp_hash_insert(connp, lport);
1419 1418                  break;
1420 1419  
1421 1420          default:
1422 1421                  /*
1423 1422                   * Check for conflicts among MAC exempt bindings.  For
1424 1423                   * transports with port numbers, this is done by the upper
1425 1424                   * level per-transport binding logic.  For all others, it's
1426 1425                   * done here.
1427 1426                   */
1428 1427                  if (is_system_labeled() &&
1429 1428                      check_exempt_conflict_v4(connp, ipst))
1430 1429                          return (EADDRINUSE);
1431 1430                  /* FALLTHROUGH */
1432 1431  
1433 1432          case IPPROTO_UDP:
1434 1433                  if (protocol == IPPROTO_UDP) {
1435 1434                          connfp = &ipst->ips_ipcl_udp_fanout[
1436 1435                              IPCL_UDP_HASH(lport, ipst)];
1437 1436                  } else {
1438 1437                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1439 1438                  }
1440 1439  
1441 1440                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1442 1441                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1443 1442                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1444 1443                          ipcl_hash_insert_bound(connfp, connp);
1445 1444                  } else {
1446 1445                          ipcl_hash_insert_wildcard(connfp, connp);
1447 1446                  }
1448 1447                  break;
1449 1448          }
1450 1449  
1451 1450          return (ret);
1452 1451  }
1453 1452  
1454 1453  int
1455 1454  ipcl_conn_insert_v6(conn_t *connp)
1456 1455  {
1457 1456          connf_t         *connfp;
1458 1457          conn_t          *tconnp;
1459 1458          int             ret = 0;
1460 1459          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1461 1460          uint16_t        lport = connp->conn_lport;
1462 1461          uint8_t         protocol = connp->conn_proto;
1463 1462          uint_t          ifindex = connp->conn_bound_if;
1464 1463  
1465 1464          if (IPCL_IS_IPTUN(connp))
1466 1465                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1467 1466  
1468 1467          switch (protocol) {
1469 1468          case IPPROTO_TCP:
1470 1469  
1471 1470                  /*
1472 1471                   * For tcp, we check whether the connection tuple already
1473 1472                   * exists before allowing the connection to proceed.  We
1474 1473                   * also allow indexing on the zoneid. This is to allow
1475 1474                   * multiple shared stack zones to have the same tcp
1476 1475                   * connection tuple. In practice this only happens for
1477 1476                   * ipv6_loopback as it's the only local address which
1478 1477                   * doesn't have to be unique.
1479 1478                   */
1480 1479                  connfp = &ipst->ips_ipcl_conn_fanout[
1481 1480                      IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1482 1481                      ipst)];
1483 1482                  mutex_enter(&connfp->connf_lock);
1484 1483                  for (tconnp = connfp->connf_head; tconnp != NULL;
1485 1484                      tconnp = tconnp->conn_next) {
1486 1485                          /* NOTE: need to match zoneid. Bug in onnv-gate */
1487 1486                          if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1488 1487                              connp->conn_faddr_v6, connp->conn_laddr_v6,
1489 1488                              connp->conn_ports) &&
1490 1489                              (tconnp->conn_bound_if == 0 ||
1491 1490                              tconnp->conn_bound_if == ifindex) &&
1492 1491                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1493 1492                                  /* Already have a conn. bail out */
1494 1493                                  mutex_exit(&connfp->connf_lock);
1495 1494                                  return (EADDRINUSE);
1496 1495                          }
1497 1496                  }
1498 1497                  if (connp->conn_fanout != NULL) {
1499 1498                          /*
1500 1499                           * Probably a XTI/TLI application trying to do a
1501 1500                           * rebind. Let it happen.
1502 1501                           */
1503 1502                          mutex_exit(&connfp->connf_lock);
1504 1503                          IPCL_HASH_REMOVE(connp);
1505 1504                          mutex_enter(&connfp->connf_lock);
1506 1505                  }
1507 1506                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1508 1507                  mutex_exit(&connfp->connf_lock);
1509 1508                  break;
1510 1509  
1511 1510          case IPPROTO_SCTP:
1512 1511                  IPCL_HASH_REMOVE(connp);
1513 1512                  ret = ipcl_sctp_hash_insert(connp, lport);
1514 1513                  break;
1515 1514  
1516 1515          default:
1517 1516                  if (is_system_labeled() &&
1518 1517                      check_exempt_conflict_v6(connp, ipst))
1519 1518                          return (EADDRINUSE);
1520 1519                  /* FALLTHROUGH */
1521 1520          case IPPROTO_UDP:
1522 1521                  if (protocol == IPPROTO_UDP) {
1523 1522                          connfp = &ipst->ips_ipcl_udp_fanout[
1524 1523                              IPCL_UDP_HASH(lport, ipst)];
1525 1524                  } else {
1526 1525                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1527 1526                  }
1528 1527  
1529 1528                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1530 1529                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1531 1530                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1532 1531                          ipcl_hash_insert_bound(connfp, connp);
1533 1532                  } else {
1534 1533                          ipcl_hash_insert_wildcard(connfp, connp);
1535 1534                  }
1536 1535                  break;
1537 1536          }
1538 1537  
1539 1538          return (ret);
1540 1539  }
1541 1540  
1542 1541  /*
1543 1542   * v4 packet classifying function. looks up the fanout table to
1544 1543   * find the conn, the packet belongs to. returns the conn with
1545 1544   * the reference held, null otherwise.
1546 1545   *
1547 1546   * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1548 1547   * Lookup" comment block are applied.  Labels are also checked as described
1549 1548   * above.  If the packet is from the inside (looped back), and is from the same
1550 1549   * zone, then label checks are omitted.
1551 1550   */
1552 1551  conn_t *
1553 1552  ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1554 1553      ip_recv_attr_t *ira, ip_stack_t *ipst)
1555 1554  {
1556 1555          ipha_t  *ipha;
1557 1556          connf_t *connfp, *bind_connfp;
1558 1557          uint16_t lport;
1559 1558          uint16_t fport;
1560 1559          uint32_t ports;
1561 1560          conn_t  *connp;
1562 1561          uint16_t  *up;
1563 1562          zoneid_t        zoneid = ira->ira_zoneid;
1564 1563  
1565 1564          ipha = (ipha_t *)mp->b_rptr;
1566 1565          up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1567 1566  
1568 1567          switch (protocol) {
1569 1568          case IPPROTO_TCP:
1570 1569                  ports = *(uint32_t *)up;
1571 1570                  connfp =
1572 1571                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1573 1572                      ports, ipst)];
1574 1573                  mutex_enter(&connfp->connf_lock);
1575 1574                  for (connp = connfp->connf_head; connp != NULL;
1576 1575                      connp = connp->conn_next) {
1577 1576                          if (IPCL_CONN_MATCH(connp, protocol,
1578 1577                              ipha->ipha_src, ipha->ipha_dst, ports) &&
1579 1578                              (connp->conn_zoneid == zoneid ||
1580 1579                              connp->conn_allzones ||
1581 1580                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1582 1581                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1583 1582                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1584 1583                                  break;
1585 1584                  }
1586 1585  
1587 1586                  if (connp != NULL) {
1588 1587                          /*
1589 1588                           * We have a fully-bound TCP connection.
1590 1589                           *
1591 1590                           * For labeled systems, there's no need to check the
1592 1591                           * label here.  It's known to be good as we checked
1593 1592                           * before allowing the connection to become bound.
1594 1593                           */
1595 1594                          CONN_INC_REF(connp);
1596 1595                          mutex_exit(&connfp->connf_lock);
1597 1596                          return (connp);
1598 1597                  }
1599 1598  
1600 1599                  mutex_exit(&connfp->connf_lock);
1601 1600                  lport = up[1];
1602 1601                  bind_connfp =
1603 1602                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1604 1603                  mutex_enter(&bind_connfp->connf_lock);
1605 1604                  for (connp = bind_connfp->connf_head; connp != NULL;
1606 1605                      connp = connp->conn_next) {
1607 1606                          if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1608 1607                              lport) &&
1609 1608                              (connp->conn_zoneid == zoneid ||
1610 1609                              connp->conn_allzones ||
1611 1610                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1612 1611                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1613 1612                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1614 1613                                  break;
1615 1614                  }
1616 1615  
1617 1616                  /*
1618 1617                   * If the matching connection is SLP on a private address, then
1619 1618                   * the label on the packet must match the local zone's label.
1620 1619                   * Otherwise, it must be in the label range defined by tnrh.
1621 1620                   * This is ensured by tsol_receive_local.
1622 1621                   *
1623 1622                   * Note that we don't check tsol_receive_local for
1624 1623                   * the connected case.
1625 1624                   */
1626 1625                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1627 1626                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1628 1627                      ira, connp)) {
1629 1628                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1630 1629                              char *, "connp(1) could not receive mp(2)",
1631 1630                              conn_t *, connp, mblk_t *, mp);
1632 1631                          connp = NULL;
1633 1632                  }
1634 1633  
1635 1634                  if (connp != NULL) {
1636 1635                          /* Have a listener at least */
1637 1636                          CONN_INC_REF(connp);
1638 1637                          mutex_exit(&bind_connfp->connf_lock);
1639 1638                          return (connp);
1640 1639                  }
1641 1640  
1642 1641                  mutex_exit(&bind_connfp->connf_lock);
1643 1642                  break;
1644 1643  
1645 1644          case IPPROTO_UDP:
1646 1645                  lport = up[1];
1647 1646                  fport = up[0];
1648 1647                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1649 1648                  mutex_enter(&connfp->connf_lock);
1650 1649                  for (connp = connfp->connf_head; connp != NULL;
1651 1650                      connp = connp->conn_next) {
1652 1651                          if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1653 1652                              fport, ipha->ipha_src) &&
1654 1653                              (connp->conn_zoneid == zoneid ||
1655 1654                              connp->conn_allzones ||
1656 1655                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1657 1656                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1658 1657                                  break;
1659 1658                  }
1660 1659  
1661 1660                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1662 1661                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1663 1662                      ira, connp)) {
1664 1663                          DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1665 1664                              char *, "connp(1) could not receive mp(2)",
1666 1665                              conn_t *, connp, mblk_t *, mp);
1667 1666                          connp = NULL;
1668 1667                  }
1669 1668  
1670 1669                  if (connp != NULL) {
1671 1670                          CONN_INC_REF(connp);
1672 1671                          mutex_exit(&connfp->connf_lock);
1673 1672                          return (connp);
1674 1673                  }
1675 1674  
1676 1675                  /*
1677 1676                   * We shouldn't come here for multicast/broadcast packets
1678 1677                   */
1679 1678                  mutex_exit(&connfp->connf_lock);
1680 1679  
1681 1680                  break;
1682 1681  
1683 1682          case IPPROTO_ENCAP:
1684 1683          case IPPROTO_IPV6:
1685 1684                  return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1686 1685                      &ipha->ipha_dst, ipst));
1687 1686          }
1688 1687  
1689 1688          return (NULL);
1690 1689  }
1691 1690  
1692 1691  conn_t *
1693 1692  ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1694 1693      ip_recv_attr_t *ira, ip_stack_t *ipst)
1695 1694  {
1696 1695          ip6_t           *ip6h;
1697 1696          connf_t         *connfp, *bind_connfp;
1698 1697          uint16_t        lport;
1699 1698          uint16_t        fport;
1700 1699          tcpha_t         *tcpha;
1701 1700          uint32_t        ports;
1702 1701          conn_t          *connp;
1703 1702          uint16_t        *up;
1704 1703          zoneid_t        zoneid = ira->ira_zoneid;
1705 1704  
1706 1705          ip6h = (ip6_t *)mp->b_rptr;
1707 1706  
1708 1707          switch (protocol) {
1709 1708          case IPPROTO_TCP:
1710 1709                  tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1711 1710                  up = &tcpha->tha_lport;
1712 1711                  ports = *(uint32_t *)up;
1713 1712  
1714 1713                  connfp =
1715 1714                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1716 1715                      ports, ipst)];
1717 1716                  mutex_enter(&connfp->connf_lock);
1718 1717                  for (connp = connfp->connf_head; connp != NULL;
1719 1718                      connp = connp->conn_next) {
1720 1719                          if (IPCL_CONN_MATCH_V6(connp, protocol,
1721 1720                              ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1722 1721                              (connp->conn_zoneid == zoneid ||
1723 1722                              connp->conn_allzones ||
1724 1723                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1725 1724                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1726 1725                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1727 1726                                  break;
1728 1727                  }
1729 1728  
1730 1729                  if (connp != NULL) {
1731 1730                          /*
1732 1731                           * We have a fully-bound TCP connection.
1733 1732                           *
1734 1733                           * For labeled systems, there's no need to check the
1735 1734                           * label here.  It's known to be good as we checked
1736 1735                           * before allowing the connection to become bound.
1737 1736                           */
1738 1737                          CONN_INC_REF(connp);
1739 1738                          mutex_exit(&connfp->connf_lock);
1740 1739                          return (connp);
1741 1740                  }
1742 1741  
1743 1742                  mutex_exit(&connfp->connf_lock);
1744 1743  
1745 1744                  lport = up[1];
1746 1745                  bind_connfp =
1747 1746                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1748 1747                  mutex_enter(&bind_connfp->connf_lock);
1749 1748                  for (connp = bind_connfp->connf_head; connp != NULL;
1750 1749                      connp = connp->conn_next) {
1751 1750                          if (IPCL_BIND_MATCH_V6(connp, protocol,
1752 1751                              ip6h->ip6_dst, lport) &&
1753 1752                              (connp->conn_zoneid == zoneid ||
1754 1753                              connp->conn_allzones ||
1755 1754                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1756 1755                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1757 1756                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1758 1757                                  break;
1759 1758                  }
1760 1759  
1761 1760                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1762 1761                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1763 1762                      ira, connp)) {
1764 1763                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1765 1764                              char *, "connp(1) could not receive mp(2)",
1766 1765                              conn_t *, connp, mblk_t *, mp);
1767 1766                          connp = NULL;
1768 1767                  }
1769 1768  
1770 1769                  if (connp != NULL) {
1771 1770                          /* Have a listner at least */
1772 1771                          CONN_INC_REF(connp);
1773 1772                          mutex_exit(&bind_connfp->connf_lock);
1774 1773                          return (connp);
1775 1774                  }
1776 1775  
1777 1776                  mutex_exit(&bind_connfp->connf_lock);
1778 1777                  break;
1779 1778  
1780 1779          case IPPROTO_UDP:
1781 1780                  up = (uint16_t *)&mp->b_rptr[hdr_len];
1782 1781                  lport = up[1];
1783 1782                  fport = up[0];
1784 1783                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1785 1784                  mutex_enter(&connfp->connf_lock);
1786 1785                  for (connp = connfp->connf_head; connp != NULL;
1787 1786                      connp = connp->conn_next) {
1788 1787                          if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1789 1788                              fport, ip6h->ip6_src) &&
1790 1789                              (connp->conn_zoneid == zoneid ||
1791 1790                              connp->conn_allzones ||
1792 1791                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1793 1792                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1794 1793                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1795 1794                                  break;
1796 1795                  }
1797 1796  
1798 1797                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1799 1798                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1800 1799                      ira, connp)) {
1801 1800                          DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1802 1801                              char *, "connp(1) could not receive mp(2)",
1803 1802                              conn_t *, connp, mblk_t *, mp);
1804 1803                          connp = NULL;
1805 1804                  }
1806 1805  
1807 1806                  if (connp != NULL) {
1808 1807                          CONN_INC_REF(connp);
1809 1808                          mutex_exit(&connfp->connf_lock);
1810 1809                          return (connp);
1811 1810                  }
1812 1811  
1813 1812                  /*
1814 1813                   * We shouldn't come here for multicast/broadcast packets
1815 1814                   */
1816 1815                  mutex_exit(&connfp->connf_lock);
1817 1816                  break;
1818 1817          case IPPROTO_ENCAP:
1819 1818          case IPPROTO_IPV6:
1820 1819                  return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1821 1820                      &ip6h->ip6_dst, ipst));
1822 1821          }
1823 1822  
1824 1823          return (NULL);
1825 1824  }
1826 1825  
1827 1826  /*
1828 1827   * wrapper around ipcl_classify_(v4,v6) routines.
1829 1828   */
1830 1829  conn_t *
1831 1830  ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1832 1831  {
1833 1832          if (ira->ira_flags & IRAF_IS_IPV4) {
1834 1833                  return (ipcl_classify_v4(mp, ira->ira_protocol,
1835 1834                      ira->ira_ip_hdr_length, ira, ipst));
1836 1835          } else {
1837 1836                  return (ipcl_classify_v6(mp, ira->ira_protocol,
1838 1837                      ira->ira_ip_hdr_length, ira, ipst));
1839 1838          }
1840 1839  }
1841 1840  
1842 1841  /*
1843 1842   * Only used to classify SCTP RAW sockets
1844 1843   */
1845 1844  conn_t *
1846 1845  ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1847 1846      ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1848 1847  {
1849 1848          connf_t         *connfp;
1850 1849          conn_t          *connp;
1851 1850          in_port_t       lport;
1852 1851          int             ipversion;
1853 1852          const void      *dst;
1854 1853          zoneid_t        zoneid = ira->ira_zoneid;
1855 1854  
1856 1855          lport = ((uint16_t *)&ports)[1];
1857 1856          if (ira->ira_flags & IRAF_IS_IPV4) {
1858 1857                  dst = (const void *)&ipha->ipha_dst;
1859 1858                  ipversion = IPV4_VERSION;
1860 1859          } else {
1861 1860                  dst = (const void *)&ip6h->ip6_dst;
1862 1861                  ipversion = IPV6_VERSION;
1863 1862          }
1864 1863  
1865 1864          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1866 1865          mutex_enter(&connfp->connf_lock);
1867 1866          for (connp = connfp->connf_head; connp != NULL;
1868 1867              connp = connp->conn_next) {
1869 1868                  /* We don't allow v4 fallback for v6 raw socket. */
1870 1869                  if (ipversion != connp->conn_ipversion)
1871 1870                          continue;
1872 1871                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1873 1872                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1874 1873                          if (ipversion == IPV4_VERSION) {
1875 1874                                  if (!IPCL_CONN_MATCH(connp, protocol,
1876 1875                                      ipha->ipha_src, ipha->ipha_dst, ports))
1877 1876                                          continue;
1878 1877                          } else {
1879 1878                                  if (!IPCL_CONN_MATCH_V6(connp, protocol,
1880 1879                                      ip6h->ip6_src, ip6h->ip6_dst, ports))
1881 1880                                          continue;
1882 1881                          }
1883 1882                  } else {
1884 1883                          if (ipversion == IPV4_VERSION) {
1885 1884                                  if (!IPCL_BIND_MATCH(connp, protocol,
1886 1885                                      ipha->ipha_dst, lport))
1887 1886                                          continue;
1888 1887                          } else {
1889 1888                                  if (!IPCL_BIND_MATCH_V6(connp, protocol,
1890 1889                                      ip6h->ip6_dst, lport))
1891 1890                                          continue;
1892 1891                          }
1893 1892                  }
1894 1893  
1895 1894                  if (connp->conn_zoneid == zoneid ||
1896 1895                      connp->conn_allzones ||
1897 1896                      ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1898 1897                      (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1899 1898                      (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1900 1899                          break;
1901 1900          }
1902 1901  
1903 1902          if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1904 1903              !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1905 1904                  DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1906 1905                      char *, "connp(1) could not receive mp(2)",
1907 1906                      conn_t *, connp, mblk_t *, mp);
1908 1907                  connp = NULL;
1909 1908          }
1910 1909  
1911 1910          if (connp != NULL)
1912 1911                  goto found;
1913 1912          mutex_exit(&connfp->connf_lock);
1914 1913  
1915 1914          /* Try to look for a wildcard SCTP RAW socket match. */
1916 1915          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1917 1916          mutex_enter(&connfp->connf_lock);
1918 1917          for (connp = connfp->connf_head; connp != NULL;
1919 1918              connp = connp->conn_next) {
1920 1919                  /* We don't allow v4 fallback for v6 raw socket. */
1921 1920                  if (ipversion != connp->conn_ipversion)
1922 1921                          continue;
1923 1922                  if (!IPCL_ZONE_MATCH(connp, zoneid))
1924 1923                          continue;
1925 1924  
1926 1925                  if (ipversion == IPV4_VERSION) {
1927 1926                          if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1928 1927                                  break;
1929 1928                  } else {
1930 1929                          if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1931 1930                                  break;
1932 1931                          }
1933 1932                  }
1934 1933          }
1935 1934  
1936 1935          if (connp != NULL)
1937 1936                  goto found;
1938 1937  
1939 1938          mutex_exit(&connfp->connf_lock);
1940 1939          return (NULL);
1941 1940  
1942 1941  found:
1943 1942          ASSERT(connp != NULL);
1944 1943          CONN_INC_REF(connp);
1945 1944          mutex_exit(&connfp->connf_lock);
1946 1945          return (connp);
1947 1946  }
1948 1947  
1949 1948  /* ARGSUSED */
1950 1949  static int
1951 1950  tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1952 1951  {
                   /*
                    * Prepare a combined conn_t + tcp_t buffer for first use.
                    * buf is treated as an itc_t; the tcp_t lives immediately
                    * after it (&itc[1]).  kmflags is passed through to the two
                    * allocations below.  Returns 0 on success or ENOMEM if
                    * either allocation fails.
                    * NOTE(review): the (buf, cdrarg, kmflags) signature suggests
                    * this is a kmem cache constructor -- confirm at the
                    * cache creation site.
                    */
1953 1952          itc_t   *itc = (itc_t *)buf;
1954 1953          conn_t  *connp = &itc->itc_conn;
1955 1954          tcp_t   *tcp = (tcp_t *)&itc[1];
1956 1955  
1957 1956          bzero(connp, sizeof (conn_t));
1958 1957          bzero(tcp, sizeof (tcp_t));
1959 1958  
1960 1959          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1961 1960          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1962 1961          cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1963 1962          tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
                   /*
                    * NOTE(review): the lock and cvs initialised above are not
                    * torn down on this early return; confirm whether the
                    * caller runs the destructor after a failed constructor.
                    */
1964 1963          if (tcp->tcp_timercache == NULL)
1965 1964                  return (ENOMEM);
                   /* Cross-link the conn_t and tcp_t and tag the conn as TCP. */
1966 1965          connp->conn_tcp = tcp;
1967 1966          connp->conn_flags = IPCL_TCPCONN;
1968 1967          connp->conn_proto = IPPROTO_TCP;
1969 1968          tcp->tcp_connp = connp;
1970 1969          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1971 1970  
1972 1971          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1973 1972          if (connp->conn_ixa == NULL) {
                           /* Undo the timer cache allocation made above. */
1974 1973                  tcp_timermp_free(tcp);
1975 1974                  return (ENOMEM);
1976 1975          }
                   /* The conn holds the single initial reference on its ixa. */
1977 1976          connp->conn_ixa->ixa_refcnt = 1;
1978 1977          connp->conn_ixa->ixa_protocol = connp->conn_proto;
1979 1978          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1980 1979          return (0);
1981 1980  }
1982 1981  
1983 1982  /* ARGSUSED */
1984 1983  static void
1985 1984  tcp_conn_destructor(void *buf, void *cdrarg)
1986 1985  {
                   /*
                    * Tear down what tcp_conn_constructor set up: free the timer
                    * cache, destroy the conn's mutex, condition variables and
                    * ilg rwlock, and drop the conn's reference on its
                    * ip_xmit_attr_t (if one was allocated).
                    */
1987 1986          itc_t   *itc = (itc_t *)buf;
1988 1987          conn_t  *connp = &itc->itc_conn;
1989 1988          tcp_t   *tcp = (tcp_t *)&itc[1];
1990 1989  
                   /* The conn_t/tcp_t cross-links must still be intact. */
1991 1990          ASSERT(connp->conn_flags & IPCL_TCPCONN);
1992 1991          ASSERT(tcp->tcp_connp == connp);
1993 1992          ASSERT(connp->conn_tcp == tcp);
1994 1993          tcp_timermp_free(tcp);
1995 1994          mutex_destroy(&connp->conn_lock);
1996 1995          cv_destroy(&connp->conn_cv);
1997 1996          cv_destroy(&connp->conn_sq_cv);
1998 1997          rw_destroy(&connp->conn_ilg_lock);
1999 1998  
2000 1999          /* Can be NULL if constructor failed */
2001 2000          if (connp->conn_ixa != NULL) {
                           /* Only the conn's own reference may remain. */
2002 2001                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2003 2002                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2004 2003                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2005 2004                  ixa_refrele(connp->conn_ixa);
2006 2005          }
2007 2006  }
2008 2007  
2009 2008  /* ARGSUSED */
2010 2009  static int
2011 2010  ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2012 2011  {
                   /*
                    * Prepare a plain IP conn_t (no protocol-specific structure
                    * is set up here).  Zeroes the conn_t, initialises its
                    * mutex, condition variable and ilg rwlock, tags it
                    * IPCL_IPCCONN and allocates its ip_xmit_attr_t with kmflags.
                    * Returns 0 on success or ENOMEM if that allocation fails.
                    */
2013 2012          itc_t   *itc = (itc_t *)buf;
2014 2013          conn_t  *connp = &itc->itc_conn;
2015 2014  
2016 2015          bzero(connp, sizeof (conn_t));
2017 2016          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2018 2017          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2019 2018          connp->conn_flags = IPCL_IPCCONN;
2020 2019          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2021 2020  
2022 2021          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2023 2022          if (connp->conn_ixa == NULL)
2024 2023                  return (ENOMEM);
                   /*
                    * The conn holds the single initial reference on its ixa.
                    * Unlike the TCP/UDP/raw constructors, ixa_protocol is not
                    * set here; it stays zero from kmem_zalloc.
                    */
2025 2024          connp->conn_ixa->ixa_refcnt = 1;
2026 2025          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2027 2026          return (0);
2028 2027  }
2029 2028  
2030 2029  /* ARGSUSED */
2031 2030  static void
2032 2031  ip_conn_destructor(void *buf, void *cdrarg)
2033 2032  {
                   /*
                    * Tear down what ip_conn_constructor set up: destroy the
                    * conn's mutex, condition variable and ilg rwlock, and drop
                    * the conn's reference on its ip_xmit_attr_t (if any).
                    */
2034 2033          itc_t   *itc = (itc_t *)buf;
2035 2034          conn_t  *connp = &itc->itc_conn;
2036 2035  
2037 2036          ASSERT(connp->conn_flags & IPCL_IPCCONN);
                   /* No protocol-private state may still be attached. */
2038 2037          ASSERT(connp->conn_priv == NULL);
2039 2038          mutex_destroy(&connp->conn_lock);
2040 2039          cv_destroy(&connp->conn_cv);
2041 2040          rw_destroy(&connp->conn_ilg_lock);
2042 2041  
2043 2042          /* Can be NULL if constructor failed */
2044 2043          if (connp->conn_ixa != NULL) {
                           /* Only the conn's own reference may remain. */
2045 2044                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2046 2045                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2047 2046                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2048 2047                  ixa_refrele(connp->conn_ixa);
2049 2048          }
2050 2049  }
2051 2050  
2052 2051  /* ARGSUSED */
2053 2052  static int
2054 2053  udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2055 2054  {
                   /*
                    * Prepare a combined conn_t + udp_t buffer for first use.
                    * The udp_t lives immediately after the itc_t (&itc[1]).
                    * Returns 0 on success or ENOMEM if the ip_xmit_attr_t
                    * allocation (made with kmflags) fails.
                    */
2056 2055          itc_t   *itc = (itc_t *)buf;
2057 2056          conn_t  *connp = &itc->itc_conn;
2058 2057          udp_t   *udp = (udp_t *)&itc[1];
2059 2058  
2060 2059          bzero(connp, sizeof (conn_t));
2061 2060          bzero(udp, sizeof (udp_t));
2062 2061  
2063 2062          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2064 2063          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
                   /* Cross-link the conn_t and udp_t and tag the conn as UDP. */
2065 2064          connp->conn_udp = udp;
2066 2065          connp->conn_flags = IPCL_UDPCONN;
2067 2066          connp->conn_proto = IPPROTO_UDP;
2068 2067          udp->udp_connp = connp;
2069 2068          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2070 2069          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2071 2070          if (connp->conn_ixa == NULL)
2072 2071                  return (ENOMEM);
                   /* The conn holds the single initial reference on its ixa. */
2073 2072          connp->conn_ixa->ixa_refcnt = 1;
2074 2073          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2075 2074          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2076 2075          return (0);
2077 2076  }
2078 2077  
2079 2078  /* ARGSUSED */
2080 2079  static void
2081 2080  udp_conn_destructor(void *buf, void *cdrarg)
2082 2081  {
                   /*
                    * Tear down what udp_conn_constructor set up: destroy the
                    * conn's mutex, condition variable and ilg rwlock, and drop
                    * the conn's reference on its ip_xmit_attr_t (if any).
                    */
2083 2082          itc_t   *itc = (itc_t *)buf;
2084 2083          conn_t  *connp = &itc->itc_conn;
2085 2084          udp_t   *udp = (udp_t *)&itc[1];
2086 2085  
                   /* The conn_t/udp_t cross-links must still be intact. */
2087 2086          ASSERT(connp->conn_flags & IPCL_UDPCONN);
2088 2087          ASSERT(udp->udp_connp == connp);
2089 2088          ASSERT(connp->conn_udp == udp);
2090 2089          mutex_destroy(&connp->conn_lock);
2091 2090          cv_destroy(&connp->conn_cv);
2092 2091          rw_destroy(&connp->conn_ilg_lock);
2093 2092  
2094 2093          /* Can be NULL if constructor failed */
2095 2094          if (connp->conn_ixa != NULL) {
                           /* Only the conn's own reference may remain. */
2096 2095                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2097 2096                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2098 2097                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2099 2098                  ixa_refrele(connp->conn_ixa);
2100 2099          }
2101 2100  }
2102 2101  
2103 2102  /* ARGSUSED */
2104 2103  static int
2105 2104  rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2106 2105  {
                   /*
                    * Prepare a combined conn_t + icmp_t buffer for first use.
                    * The icmp_t lives immediately after the itc_t (&itc[1]).
                    * Returns 0 on success or ENOMEM if the ip_xmit_attr_t
                    * allocation (made with kmflags) fails.
                    */
2107 2106          itc_t   *itc = (itc_t *)buf;
2108 2107          conn_t  *connp = &itc->itc_conn;
2109 2108          icmp_t  *icmp = (icmp_t *)&itc[1];
2110 2109  
2111 2110          bzero(connp, sizeof (conn_t));
2112 2111          bzero(icmp, sizeof (icmp_t));
2113 2112  
2114 2113          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2115 2114          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
                   /*
                    * Cross-link the conn_t and icmp_t and tag the conn as raw
                    * IP; the protocol is initialised to IPPROTO_ICMP here.
                    */
2116 2115          connp->conn_icmp = icmp;
2117 2116          connp->conn_flags = IPCL_RAWIPCONN;
2118 2117          connp->conn_proto = IPPROTO_ICMP;
2119 2118          icmp->icmp_connp = connp;
2120 2119          rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
2121 2120          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2122 2121          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2123 2122          if (connp->conn_ixa == NULL)
2124 2123                  return (ENOMEM);
                   /* The conn holds the single initial reference on its ixa. */
2125 2124          connp->conn_ixa->ixa_refcnt = 1;
2126 2125          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2127 2126          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2128 2127          return (0);
2129 2128  }
2130 2129  
2131 2130  /* ARGSUSED */
2132 2131  static void
2133 2132  rawip_conn_destructor(void *buf, void *cdrarg)
2134 2133  {
                   /*
                    * Tear down what rawip_conn_constructor set up: destroy the
                    * conn's mutex, condition variable and ilg rwlock plus the
                    * icmp_t's bpf rwlock, and drop the conn's reference on its
                    * ip_xmit_attr_t (if any).
                    */
2135 2134          itc_t   *itc = (itc_t *)buf;
2136 2135          conn_t  *connp = &itc->itc_conn;
2137 2136          icmp_t  *icmp = (icmp_t *)&itc[1];
2138 2137  
                   /* The conn_t/icmp_t cross-links must still be intact. */
2139 2138          ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2140 2139          ASSERT(icmp->icmp_connp == connp);
2141 2140          ASSERT(connp->conn_icmp == icmp);
2142 2141          mutex_destroy(&connp->conn_lock);
2143 2142          cv_destroy(&connp->conn_cv);
2144 2143          rw_destroy(&connp->conn_ilg_lock);
2145 2144          rw_destroy(&icmp->icmp_bpf_lock);
2146 2145  
2147 2146          /* Can be NULL if constructor failed */
2148 2147          if (connp->conn_ixa != NULL) {
                           /* Only the conn's own reference may remain. */
2149 2148                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2150 2149                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2151 2150                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2152 2151                  ixa_refrele(connp->conn_ixa);
2153 2152          }
2154 2153  }
2155 2154  
2156 2155  /* ARGSUSED */
2157 2156  static int
2158 2157  rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2159 2158  {
                   /*
                    * Prepare a combined conn_t + rts_t buffer for first use.
                    * The rts_t lives immediately after the itc_t (&itc[1]).
                    * Returns 0 on success or ENOMEM if the ip_xmit_attr_t
                    * allocation (made with kmflags) fails.
                    */
2160 2159          itc_t   *itc = (itc_t *)buf;
2161 2160          conn_t  *connp = &itc->itc_conn;
2162 2161          rts_t   *rts = (rts_t *)&itc[1];
2163 2162  
2164 2163          bzero(connp, sizeof (conn_t));
2165 2164          bzero(rts, sizeof (rts_t));
2166 2165  
2167 2166          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2168 2167          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
                   /*
                    * Cross-link the conn_t and rts_t.  No conn_proto or
                    * ixa_protocol is assigned in this constructor; both stay
                    * zero.
                    */
2169 2168          connp->conn_rts = rts;
2170 2169          connp->conn_flags = IPCL_RTSCONN;
2171 2170          rts->rts_connp = connp;
2172 2171          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2173 2172          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2174 2173          if (connp->conn_ixa == NULL)
2175 2174                  return (ENOMEM);
                   /* The conn holds the single initial reference on its ixa. */
2176 2175          connp->conn_ixa->ixa_refcnt = 1;
2177 2176          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2178 2177          return (0);
2179 2178  }
2180 2179  
2181 2180  /* ARGSUSED */
2182 2181  static void
2183 2182  rts_conn_destructor(void *buf, void *cdrarg)
2184 2183  {
                   /*
                    * Tear down what rts_conn_constructor set up: destroy the
                    * conn's mutex, condition variable and ilg rwlock, and drop
                    * the conn's reference on its ip_xmit_attr_t (if any).
                    */
2185 2184          itc_t   *itc = (itc_t *)buf;
2186 2185          conn_t  *connp = &itc->itc_conn;
2187 2186          rts_t   *rts = (rts_t *)&itc[1];
2188 2187  
                   /* The conn_t/rts_t cross-links must still be intact. */
2189 2188          ASSERT(connp->conn_flags & IPCL_RTSCONN);
2190 2189          ASSERT(rts->rts_connp == connp);
2191 2190          ASSERT(connp->conn_rts == rts);
2192 2191          mutex_destroy(&connp->conn_lock);
2193 2192          cv_destroy(&connp->conn_cv);
2194 2193          rw_destroy(&connp->conn_ilg_lock);
2195 2194  
2196 2195          /* Can be NULL if constructor failed */
2197 2196          if (connp->conn_ixa != NULL) {
                           /* Only the conn's own reference may remain. */
2198 2197                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2199 2198                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2200 2199                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2201 2200                  ixa_refrele(connp->conn_ixa);
2202 2201          }
2203 2202  }
2204 2203  
2205 2204  /*
2206 2205   * Called as part of ipcl_conn_destroy to assert and clear any pointers
2207 2206   * in the conn_t.
2208 2207   *
2209 2208   * Below we list all the pointers in the conn_t as a documentation aid.
2210 2209   * The ones that we cannot ASSERT to be NULL are #ifdef'ed out.
2211 2210   * If you add any pointers to the conn_t please add an ASSERT here
2212 2211   * and #ifdef it out if it can't be actually asserted to be NULL.
2213 2212   * In any case, we bzero most of the conn_t at the end of the function.
            *
            * The conn's ip_xmit_attr_t is kept (it must be non-NULL with a
            * reference count of 1); it is cleaned with ixa_cleanup() and its
            * flags are reset to 0.
2214 2213   */
2215 2214  void
2216 2215  ipcl_conn_cleanup(conn_t *connp)
2217 2216  {
2218 2217          ip_xmit_attr_t  *ixa;
2219 2218  
2220 2219          ASSERT(connp->conn_latch == NULL);
2221 2220          ASSERT(connp->conn_latch_in_policy == NULL);
2222 2221          ASSERT(connp->conn_latch_in_action == NULL);
2223 2222  #ifdef notdef
2224 2223          ASSERT(connp->conn_rq == NULL);
2225 2224          ASSERT(connp->conn_wq == NULL);
2226 2225  #endif
2227 2226          ASSERT(connp->conn_cred == NULL);
2228 2227          ASSERT(connp->conn_g_fanout == NULL);
2229 2228          ASSERT(connp->conn_g_next == NULL);
2230 2229          ASSERT(connp->conn_g_prev == NULL);
2231 2230          ASSERT(connp->conn_policy == NULL);
2232 2231          ASSERT(connp->conn_fanout == NULL);
2233 2232          ASSERT(connp->conn_next == NULL);
2234 2233          ASSERT(connp->conn_prev == NULL);
2235 2234          ASSERT(connp->conn_oper_pending_ill == NULL);
2236 2235          ASSERT(connp->conn_ilg == NULL);
2237 2236          ASSERT(connp->conn_drain_next == NULL);
2238 2237          ASSERT(connp->conn_drain_prev == NULL);
2239 2238  #ifdef notdef
2240 2239          /* conn_idl is not cleared when removed from idl list */
2241 2240          ASSERT(connp->conn_idl == NULL);
2242 2241  #endif
2243 2242          ASSERT(connp->conn_ipsec_opt_mp == NULL);
2244 2243  #ifdef notdef
2245 2244          /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2246 2245          ASSERT(connp->conn_netstack == NULL);
2247 2246  #endif
2248 2247  
2249 2248          ASSERT(connp->conn_helper_info == NULL);
2250 2249          ASSERT(connp->conn_ixa != NULL);
2251 2250          ixa = connp->conn_ixa;
2252 2251          ASSERT(ixa->ixa_refcnt == 1);
2253 2252          /* Need to preserve ixa_protocol */
2254 2253          ixa_cleanup(ixa);
2255 2254          ixa->ixa_flags = 0;
2256 2255  
2257 2256          /* Clear out the conn_t fields that are not preserved */
                   /*
                    * The length is sizeof (conn_t) minus the byte offset of
                    * conn_start_clr, i.e. everything from conn_start_clr to the
                    * end of the structure is zeroed; fields before it are kept.
                    */
2258 2257          bzero(&connp->conn_start_clr,
2259 2258              sizeof (conn_t) -
2260 2259              ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2261 2260  }
2262 2261  
2263 2262  /*
2264 2263   * All conns are inserted in a global multi-list for the benefit of
2265 2264   * walkers. The walk is guaranteed to walk all open conns at the time
2266 2265   * of the start of the walk exactly once. This property is needed to
2267 2266   * achieve some cleanups during unplumb of interfaces. This is achieved
2268 2267   * as follows.
2269 2268   *
2270 2269   * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2271 2270   * call the insert and delete functions below at creation and deletion
2272 2271   * time respectively. The conn never moves or changes its position in this
2273 2272   * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2274 2273   * won't increase due to walkers, once the conn deletion has started. Note
2275 2274   * that we can't remove the conn from the global list and then wait for
2276 2275   * the refcnt to drop to zero, since walkers would then see a truncated
2277 2276   * list. CONN_INCIPIENT ensures that walkers don't start looking at
2278 2277   * conns until ip_open is ready to make them globally visible.
2279 2278   * The global round robin multi-list locks are held only to get the
2280 2279   * next member/insertion/deletion and contention should be negligible
2281 2280   * if the multi-list is much greater than the number of cpus.
            *
            * ipcl_globalhash_insert() links connp at the head of one bucket of
            * the per-stack global hash (ips_ipcl_globalhash_fanout), chosen
            * round-robin, and records that bucket in conn_g_fanout.
2282 2281   */
2283 2282  void
2284 2283  ipcl_globalhash_insert(conn_t *connp)
2285 2284  {
2286 2285          int     index;
2287 2286          struct connf_s  *connfp;
2288 2287          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2289 2288  
2290 2289          /*
2291 2290           * No need for atomic here. Approximate even distribution
2292 2291           * in the global lists is sufficient.
2293 2292           */
2294 2293          ipst->ips_conn_g_index++;
                   /*
                    * Masking with (CONN_G_HASH_SIZE - 1) presumably relies on
                    * CONN_G_HASH_SIZE being a power of two -- TODO confirm.
                    */
2295 2294          index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2296 2295  
2297 2296          connp->conn_g_prev = NULL;
2298 2297          /*
2299 2298           * Mark as INCIPIENT, so that walkers will ignore this
2300 2299           * for now, till ip_open is ready to make it visible globally.
2301 2300           */
2302 2301          connp->conn_state_flags |= CONN_INCIPIENT;
2303 2302  
2304 2303          connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2305 2304          /* Insert at the head of the list */
2306 2305          mutex_enter(&connfp->connf_lock);
2307 2306          connp->conn_g_next = connfp->connf_head;
2308 2307          if (connp->conn_g_next != NULL)
2309 2308                  connp->conn_g_next->conn_g_prev = connp;
2310 2309          connfp->connf_head = connp;
2311 2310  
2312 2311          /* The fanout bucket this conn points to */
2313 2312          connp->conn_g_fanout = connfp;
2314 2313  
2315 2314          mutex_exit(&connfp->connf_lock);
2316 2315  }
2317 2316  
2318 2317  void
2319 2318  ipcl_globalhash_remove(conn_t *connp)
2320 2319  {
                   /*
                    * Unlink connp from the global hash bucket recorded in
                    * conn_g_fanout (a no-op if it was never inserted), then
                    * clear its global-list pointers.
                    */
2321 2320          struct connf_s  *connfp;
2322 2321  
2323 2322          /*
2324 2323           * We were never inserted in the global multi list.
2325 2324           * IPCL_NONE variety is never inserted in the global multilist
2326 2325           * since it is presumed to not need any cleanup and is transient.
2327 2326           */
2328 2327          if (connp->conn_g_fanout == NULL)
2329 2328                  return;
2330 2329  
2331 2330          connfp = connp->conn_g_fanout;
2332 2331          mutex_enter(&connfp->connf_lock);
                   /* Doubly-linked unlink; a NULL prev means connp is the head. */
2333 2332          if (connp->conn_g_prev != NULL)
2334 2333                  connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2335 2334          else
2336 2335                  connfp->connf_head = connp->conn_g_next;
2337 2336          if (connp->conn_g_next != NULL)
2338 2337                  connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2339 2338          mutex_exit(&connfp->connf_lock);
2340 2339  
2341 2340          /* Better to stumble on a null pointer than to corrupt memory */
2342 2341          connp->conn_g_next = NULL;
2343 2342          connp->conn_g_prev = NULL;
2344 2343          connp->conn_g_fanout = NULL;
2345 2344  }
2346 2345  
2347 2346  /*
2348 2347   * Walk the list of all conn_t's in the system, calling the function provided
2349 2348   * with the specified argument for each.
2350 2349   * Applies to both IPv4 and IPv6.
2351 2350   *
2352 2351   * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2353 2352   * conn_oper_pending_ill). To guard against stale pointers
2354 2353   * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2355 2354   * unplumbed or removed. New conn_t's that are created while we are walking
2356 2355   * may be missed by this walk, because they are not necessarily inserted
2357 2356   * at the tail of the list. They are new conn_t's and thus don't have any
2358 2357   * stale pointers. The CONN_CLOSING flag ensures that no new reference
2359 2358   * is created to the struct that is going away.
            *
            * func is invoked as (*func)(connp, arg) with a reference held on
            * connp and with no bucket lock or conn_lock held.  Conns marked
            * CONN_CONDEMNED or CONN_INCIPIENT are skipped.
2360 2359   */
2361 2360  void
2362 2361  ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2363 2362  {
2364 2363          int     i;
2365 2364          conn_t  *connp;
2366 2365          conn_t  *prev_connp;
2367 2366  
2368 2367          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2369 2368                  mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2370 2369                  prev_connp = NULL;
2371 2370                  connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2372 2371                  while (connp != NULL) {
2373 2372                          mutex_enter(&connp->conn_lock);
2374 2373                          if (connp->conn_state_flags &
2375 2374                              (CONN_CONDEMNED | CONN_INCIPIENT)) {
2376 2375                                  mutex_exit(&connp->conn_lock);
2377 2376                                  connp = connp->conn_g_next;
2378 2377                                  continue;
2379 2378                          }
                                   /*
                                    * Take the reference while conn_lock is held,
                                    * then drop both locks before the callback.
                                    */
2380 2379                          CONN_INC_REF_LOCKED(connp);
2381 2380                          mutex_exit(&connp->conn_lock);
2382 2381                          mutex_exit(
2383 2382                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2384 2383                          (*func)(connp, arg);
                                   /*
                                    * The previous conn's reference is released
                                    * only now, with the bucket lock dropped.
                                    */
2385 2384                          if (prev_connp != NULL)
2386 2385                                  CONN_DEC_REF(prev_connp);
2387 2386                          mutex_enter(
2388 2387                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
                                   /*
                                    * connp is still referenced here, so its
                                    * conn_g_next is read under the bucket lock
                                    * while connp itself cannot go away.
                                    */
2389 2388                          prev_connp = connp;
2390 2389                          connp = connp->conn_g_next;
2391 2390                  }
2392 2391                  mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
                           /* Release the reference on the last conn visited. */
2393 2392                  if (prev_connp != NULL)
2394 2393                          CONN_DEC_REF(prev_connp);
2395 2394          }
2396 2395  }
2397 2396  
2398 2397  /*
2399 2398   * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2400 2399   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2401 2400   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2402 2401   * (peer tcp in ESTABLISHED state).
            *
            * connp is the caller's own conn (used only for the zone comparison);
            * ipha and tcpha are the IP and TCP headers supplying the addresses
            * and ports.  Returns NULL if no matching peer is found.
2403 2402   */
2404 2403  conn_t *
2405 2404  ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2406 2405      ip_stack_t *ipst)
2407 2406  {
2408 2407          uint32_t ports;
2409 2408          uint16_t *pports = (uint16_t *)&ports;
2410 2409          connf_t *connfp;
2411 2410          conn_t  *tconnp;
2412 2411          boolean_t zone_chk;
2413 2412  
2414 2413          /*
2415 2414           * If either the source or destination address is loopback, then
2416 2415           * both endpoints must be in the same Zone.  Otherwise, both of
2417 2416           * the addresses are system-wide unique (tcp is in ESTABLISHED
2418 2417           * state) and the endpoints may reside in different Zones.
2419 2418           */
2420 2419          zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2421 2420              ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2422 2421  
                   /* Pack the two 16-bit ports into one 32-bit lookup key. */
2423 2422          pports[0] = tcpha->tha_fport;
2424 2423          pports[1] = tcpha->tha_lport;
2425 2424  
2426 2425          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2427 2426              ports, ipst)];
2428 2427  
2429 2428          mutex_enter(&connfp->connf_lock);
2430 2429          for (tconnp = connfp->connf_head; tconnp != NULL;
2431 2430              tconnp = tconnp->conn_next) {
2432 2431  
2433 2432                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2434 2433                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2435 2434                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2436 2435                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2437 2436  
2438 2437                          ASSERT(tconnp != connp);
                                   /* Take the reference before dropping the lock. */
2439 2438                          CONN_INC_REF(tconnp);
2440 2439                          mutex_exit(&connfp->connf_lock);
2441 2440                          return (tconnp);
2442 2441                  }
2443 2442          }
2444 2443          mutex_exit(&connfp->connf_lock);
2445 2444          return (NULL);
2446 2445  }
2447 2446  
2448 2447  /*
2449 2448   * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2450 2449   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2451 2450   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2452 2451   * (peer tcp in ESTABLISHED state).
            *
            * connp is the caller's own conn (used only for the zone comparison);
            * ip6h and tcpha are the IPv6 and TCP headers supplying the addresses
            * and ports.  Returns NULL if no matching peer is found.
2453 2452   */
2454 2453  conn_t *
2455 2454  ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2456 2455      ip_stack_t *ipst)
2457 2456  {
2458 2457          uint32_t ports;
2459 2458          uint16_t *pports = (uint16_t *)&ports;
2460 2459          connf_t *connfp;
2461 2460          conn_t  *tconnp;
2462 2461          boolean_t zone_chk;
2463 2462  
2464 2463          /*
2465 2464           * If either the source or destination address is loopback, then
2466 2465           * both endpoints must be in the same Zone.  Otherwise, both of
2467 2466           * the addresses are system-wide unique (tcp is in ESTABLISHED
2468 2467           * state) and the endpoints may reside in different Zones.  We
2469 2468           * don't do Zone check for link local address(es) because the
2470 2469           * current Zone implementation treats each link local address as
2471 2470           * being unique per system node, i.e. they belong to global Zone.
2472 2471           */
2473 2472          zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2474 2473              IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2475 2474  
                   /* Pack the two 16-bit ports into one 32-bit lookup key. */
2476 2475          pports[0] = tcpha->tha_fport;
2477 2476          pports[1] = tcpha->tha_lport;
2478 2477  
2479 2478          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2480 2479              ports, ipst)];
2481 2480  
2482 2481          mutex_enter(&connfp->connf_lock);
2483 2482          for (tconnp = connfp->connf_head; tconnp != NULL;
2484 2483              tconnp = tconnp->conn_next) {
2485 2484  
2486 2485                  /* We skip conn_bound_if check here as this is loopback tcp */
2487 2486                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2488 2487                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2489 2488                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2490 2489                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2491 2490  
2492 2491                          ASSERT(tconnp != connp);
                                   /* Take the reference before dropping the lock. */
2493 2492                          CONN_INC_REF(tconnp);
2494 2493                          mutex_exit(&connfp->connf_lock);
2495 2494                          return (tconnp);
2496 2495                  }
2497 2496          }
2498 2497          mutex_exit(&connfp->connf_lock);
2499 2498          return (NULL);
2500 2499  }
2501 2500  
2502 2501  /*
2503 2502   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2504 2503   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2505 2504   * Only checks for connected entries i.e. no INADDR_ANY checks.
            *
            * A candidate is accepted only if its tcp_state is >= min_state.
            * Returns NULL if no entry in the hash bucket matches.
2506 2505   */
2507 2506  conn_t *
2508 2507  ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2509 2508      ip_stack_t *ipst)
2510 2509  {
2511 2510          uint32_t ports;
2512 2511          uint16_t *pports;
2513 2512          connf_t *connfp;
2514 2513          conn_t  *tconnp;
2515 2514  
                   /* Pack the two 16-bit ports into one 32-bit lookup key. */
2516 2515          pports = (uint16_t *)&ports;
2517 2516          pports[0] = tcpha->tha_fport;
2518 2517          pports[1] = tcpha->tha_lport;
2519 2518  
2520 2519          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2521 2520              ports, ipst)];
2522 2521  
2523 2522          mutex_enter(&connfp->connf_lock);
2524 2523          for (tconnp = connfp->connf_head; tconnp != NULL;
2525 2524              tconnp = tconnp->conn_next) {
2526 2525  
2527 2526                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2528 2527                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2529 2528                      tconnp->conn_tcp->tcp_state >= min_state) {
2530 2529  
                                   /* Take the reference before dropping the lock. */
2531 2530                          CONN_INC_REF(tconnp);
2532 2531                          mutex_exit(&connfp->connf_lock);
2533 2532                          return (tconnp);
2534 2533                  }
2535 2534          }
2536 2535          mutex_exit(&connfp->connf_lock);
2537 2536          return (NULL);
2538 2537  }
2539 2538  
2540 2539  /*
2541 2540   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2542 2541   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2543 2542   * Only checks for connected entries i.e. no INADDR_ANY checks.
2544 2543   * Match on ifindex in addition to addresses.
            *
            * A candidate is accepted only if its tcp_state is >= min_state.
            * Returns NULL if no entry in the hash bucket matches.
2545 2544   */
2546 2545  conn_t *
2547 2546  ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2548 2547      uint_t ifindex, ip_stack_t *ipst)
2549 2548  {
2550 2549          tcp_t   *tcp;
2551 2550          uint32_t ports;
2552 2551          uint16_t *pports;
2553 2552          connf_t *connfp;
2554 2553          conn_t  *tconnp;
2555 2554  
                   /* Pack the two 16-bit ports into one 32-bit lookup key. */
2556 2555          pports = (uint16_t *)&ports;
2557 2556          pports[0] = tcpha->tha_fport;
2558 2557          pports[1] = tcpha->tha_lport;
2559 2558  
2560 2559          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2561 2560              ports, ipst)];
2562 2561  
2563 2562          mutex_enter(&connfp->connf_lock);
2564 2563          for (tconnp = connfp->connf_head; tconnp != NULL;
2565 2564              tconnp = tconnp->conn_next) {
2566 2565  
2567 2566                  tcp = tconnp->conn_tcp;
                           /*
                            * A conn with conn_bound_if == 0 matches any ifindex;
                            * otherwise its conn_bound_if must equal ifindex.
                            */
2568 2567                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2569 2568                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2570 2569                      tcp->tcp_state >= min_state &&
2571 2570                      (tconnp->conn_bound_if == 0 ||
2572 2571                      tconnp->conn_bound_if == ifindex)) {
2573 2572  
                                   /* Take the reference before dropping the lock. */
2574 2573                          CONN_INC_REF(tconnp);
2575 2574                          mutex_exit(&connfp->connf_lock);
2576 2575                          return (tconnp);
2577 2576                  }
2578 2577          }
2579 2578          mutex_exit(&connfp->connf_lock);
2580 2579          return (NULL);
2581 2580  }
2582 2581  
2583 2582  /*
2584 2583   * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2585 2584   * a listener when changing state.
            *
            * Searches the bind fanout bucket for lport for a TCP conn bound to
            * {laddr, lport} that matches zoneid and whose tcp_listener is NULL.
            * Returns the conn with a reference held (CONN_INC_REF), or NULL if
            * laddr is 0 or nothing matches.  zoneid must not be ALL_ZONES.
2586 2585   */
2587 2586  conn_t *
2588 2587  ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2589 2588      ip_stack_t *ipst)
2590 2589  {
2591 2590          connf_t         *bind_connfp;
2592 2591          conn_t          *connp;
2593 2592          tcp_t           *tcp;
2594 2593  
2595 2594          /*
2596 2595           * Avoid false matches for packets sent to an IP destination of
2597 2596           * all zeros.
2598 2597           */
2599 2598          if (laddr == 0)
2600 2599                  return (NULL);
2601 2600  
2602 2601          ASSERT(zoneid != ALL_ZONES);
2603 2602  
2604 2603          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2605 2604          mutex_enter(&bind_connfp->connf_lock);
2606 2605          for (connp = bind_connfp->connf_head; connp != NULL;
2607 2606              connp = connp->conn_next) {
                           /*
                            * tcp is only dereferenced after IPCL_BIND_MATCH has
                            * accepted the conn for IPPROTO_TCP (&& short-circuit).
                            */
2608 2607                  tcp = connp->conn_tcp;
2609 2608                  if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2610 2609                      IPCL_ZONE_MATCH(connp, zoneid) &&
2611 2610                      (tcp->tcp_listener == NULL)) {
                                   /* Take the reference before dropping the lock. */
2612 2611                          CONN_INC_REF(connp);
2613 2612                          mutex_exit(&bind_connfp->connf_lock);
2614 2613                          return (connp);
2615 2614                  }
2616 2615          }
2617 2616          mutex_exit(&bind_connfp->connf_lock);
2618 2617          return (NULL);
2619 2618  }
2620 2619  
2621 2620  /*
2622 2621   * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2623 2622   * a listener when changing state.
2624 2623   */
2625 2624  conn_t *
2626 2625  ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2627 2626      zoneid_t zoneid, ip_stack_t *ipst)
2628 2627  {
2629 2628          connf_t         *bind_connfp;
2630 2629          conn_t          *connp = NULL;
2631 2630          tcp_t           *tcp;
2632 2631  
2633 2632          /*
2634 2633           * Avoid false matches for packets sent to an IP destination of
2635 2634           * all zeros.
2636 2635           */
2637 2636          if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2638 2637                  return (NULL);
2639 2638  
2640 2639          ASSERT(zoneid != ALL_ZONES);
2641 2640  
2642 2641          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2643 2642          mutex_enter(&bind_connfp->connf_lock);
2644 2643          for (connp = bind_connfp->connf_head; connp != NULL;
2645 2644              connp = connp->conn_next) {
2646 2645                  tcp = connp->conn_tcp;
2647 2646                  if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2648 2647                      IPCL_ZONE_MATCH(connp, zoneid) &&
2649 2648                      (connp->conn_bound_if == 0 ||
2650 2649                      connp->conn_bound_if == ifindex) &&
2651 2650                      tcp->tcp_listener == NULL) {
2652 2651                          CONN_INC_REF(connp);
2653 2652                          mutex_exit(&bind_connfp->connf_lock);
2654 2653                          return (connp);
2655 2654                  }
2656 2655          }
2657 2656          mutex_exit(&bind_connfp->connf_lock);
2658 2657          return (NULL);
2659 2658  }
2660 2659  
2661 2660  /*
2662 2661   * ipcl_get_next_conn
2663 2662   *      get the next entry in the conn global list
2664 2663   *      and put a reference on the next_conn.
2665 2664   *      decrement the reference on the current conn.
2666 2665   *
2667 2666   * This is an iterator based walker function that also provides for
2668 2667   * some selection by the caller. It walks through the conn_hash bucket
2669 2668   * searching for the next valid connp in the list, and selects connections
2670 2669   * that are neither closed nor condemned. It also REFHOLDS the conn
2671 2670   * thus ensuring that the conn exists when the caller uses the conn.
2672 2671   */
2673 2672  conn_t *
2674 2673  ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2675 2674  {
2676 2675          conn_t  *next_connp;
2677 2676  
2678 2677          if (connfp == NULL)
2679 2678                  return (NULL);
2680 2679  
2681 2680          mutex_enter(&connfp->connf_lock);
2682 2681  
2683 2682          next_connp = (connp == NULL) ?
2684 2683              connfp->connf_head : connp->conn_g_next;
2685 2684  
2686 2685          while (next_connp != NULL) {
2687 2686                  mutex_enter(&next_connp->conn_lock);
2688 2687                  if (!(next_connp->conn_flags & conn_flags) ||
2689 2688                      (next_connp->conn_state_flags &
2690 2689                      (CONN_CONDEMNED | CONN_INCIPIENT))) {
2691 2690                          /*
2692 2691                           * This conn has been condemned or
2693 2692                           * is closing, or the flags don't match
2694 2693                           */
2695 2694                          mutex_exit(&next_connp->conn_lock);
2696 2695                          next_connp = next_connp->conn_g_next;
2697 2696                          continue;
2698 2697                  }
2699 2698                  CONN_INC_REF_LOCKED(next_connp);
2700 2699                  mutex_exit(&next_connp->conn_lock);
2701 2700                  break;
2702 2701          }
2703 2702  
2704 2703          mutex_exit(&connfp->connf_lock);
2705 2704  
2706 2705          if (connp != NULL)
2707 2706                  CONN_DEC_REF(connp);
2708 2707  
2709 2708          return (next_connp);
2710 2709  }
2711 2710  
2712 2711  #ifdef CONN_DEBUG
2713 2712  /*
2714 2713   * Trace of the last NBUF refhold/refrele
2715 2714   */
2716 2715  int
2717 2716  conn_trace_ref(conn_t *connp)
2718 2717  {
2719 2718          int     last;
2720 2719          conn_trace_t    *ctb;
2721 2720  
2722 2721          ASSERT(MUTEX_HELD(&connp->conn_lock));
2723 2722          last = connp->conn_trace_last;
2724 2723          last++;
2725 2724          if (last == CONN_TRACE_MAX)
2726 2725                  last = 0;
2727 2726  
2728 2727          ctb = &connp->conn_trace_buf[last];
2729 2728          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2730 2729          connp->conn_trace_last = last;
2731 2730          return (1);
2732 2731  }
2733 2732  
2734 2733  int
2735 2734  conn_untrace_ref(conn_t *connp)
2736 2735  {
2737 2736          int     last;
2738 2737          conn_trace_t    *ctb;
2739 2738  
2740 2739          ASSERT(MUTEX_HELD(&connp->conn_lock));
2741 2740          last = connp->conn_trace_last;
2742 2741          last++;
2743 2742          if (last == CONN_TRACE_MAX)
2744 2743                  last = 0;
2745 2744  
2746 2745          ctb = &connp->conn_trace_buf[last];
2747 2746          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2748 2747          connp->conn_trace_last = last;
2749 2748          return (1);
2750 2749  }
2751 2750  #endif
  
    | 
      ↓ open down ↓ | 
    1818 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX