1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
  24  * Copyright 2022 Joyent, Inc.
  25  */
  26 
  27 /*
  28  * IP PACKET CLASSIFIER
  29  *
  30  * The IP packet classifier provides mapping between IP packets and persistent
  31  * connection state for connection-oriented protocols. It also provides
  32  * interface for managing connection states.
  33  *
  34  * The connection state is kept in conn_t data structure and contains, among
  35  * other things:
  36  *
  37  *      o local/remote address and ports
  38  *      o Transport protocol
  39  *      o squeue for the connection (for TCP only)
  40  *      o reference counter
  41  *      o Connection state
  42  *      o hash table linkage
  43  *      o interface/ire information
  44  *      o credentials
  45  *      o ipsec policy
  46  *      o send and receive functions.
  47  *      o mutex lock.
  48  *
  49  * Connections use a reference counting scheme. They are freed when the
  50  * reference counter drops to zero. A reference is incremented when connection
  51  * is placed in a list or table, when incoming packet for the connection arrives
  52  * and when connection is processed via squeue (squeue processing may be
  53  * asynchronous and the reference protects the connection from being destroyed
  54  * before its processing is finished).
  55  *
  56  * conn_recv is used to pass up packets to the ULP.
  57  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  58  * a listener, and changes to tcp_input_listener as the listener has picked a
  59  * good squeue. For other cases it is set to tcp_input_data.
  60  *
  61  * conn_recvicmp is used to pass up ICMP errors to the ULP.
  62  *
  63  * Classifier uses several hash tables:
  64  *
  65  *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  66  *      ipcl_bind_fanout:       contains all connections in BOUND state
  67  *      ipcl_proto_fanout:      IPv4 protocol fanout
  68  *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  69  *      ipcl_udp_fanout:        contains all UDP connections
  70  *      ipcl_iptun_fanout:      contains all IP tunnel connections
  71  *      ipcl_globalhash_fanout: contains all connections
  72  *
  73  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  74  * which need to view all existing connections.
  75  *
  76  * All tables are protected by per-bucket locks. When both per-bucket lock and
  77  * connection lock need to be held, the per-bucket lock should be acquired
  78  * first, followed by the connection lock.
  79  *
  80  * All functions doing search in one of these tables increment a reference
  81  * counter on the connection found (if any). This reference should be dropped
  82  * when the caller has finished processing the connection.
  83  *
  84  *
  85  * INTERFACES:
  86  * ===========
  87  *
  88  * Connection Lookup:
  89  * ------------------
  90  *
  91  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  92  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  93  *
  94  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  95  * it can't find any associated connection. If the connection is found, its
  96  * reference counter is incremented.
  97  *
  98  *      mp:     mblock, containing packet header. The full header should fit
  99  *              into a single mblock. It should also contain at least full IP
 100  *              and TCP or UDP header.
 101  *
 102  *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 103  *
 104  *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 105  *               the packet.
 106  *
 107  *      ira->ira_zoneid: The zone in which the returned connection must be; the
 108  *              zoneid corresponding to the ire_zoneid on the IRE located for
 109  *              the packet's destination address.
 110  *
 111  *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 112  *              IRAF_TX_SHARED_ADDR flags
 113  *
 114  *      For TCP connections, the lookup order is as follows:
 115  *              5-tuple {src, dst, protocol, local port, remote port}
 116  *                      lookup in ipcl_conn_fanout table.
 117  *              3-tuple {dst, remote port, protocol} lookup in
 118  *                      ipcl_bind_fanout table.
 119  *
 120  *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 121  *      remote port} lookup is done on ipcl_udp_fanout. Note that,
 122  *      these interfaces do not handle cases where a packet belongs
 123  *      to multiple UDP clients, which is handled in IP itself.
 124  *
 125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 126  * determine which actual zone gets the segment.  This is used only in a
 127  * labeled environment.  The matching rules are:
 128  *
 129  *      - If it's not a multilevel port, then the label on the packet selects
 130  *        the zone.  Unlabeled packets are delivered to the global zone.
 131  *
 132  *      - If it's a multilevel port, then only the zone registered to receive
 133  *        packets on that port matches.
 134  *
 135  * Also, in a labeled environment, packet labels need to be checked.  For fully
 136  * bound TCP connections, we can assume that the packet label was checked
 137  * during connection establishment, and doesn't need to be checked on each
 138  * packet.  For others, though, we need to check for strict equality or, for
 139  * multilevel ports, membership in the range or set.  This part currently does
 140  * a tnrh lookup on each packet, but could be optimized to use cached results
 141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
 142  * we would apply the same rules as TCP.)
 143  *
 144  * An implication of the above is that fully-bound TCP sockets must always use
 145  * distinct 4-tuples; they can't be discriminated by label alone.
 146  *
 147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 148  * as there's no connection set-up handshake and no shared state.
 149  *
 150  * Labels on looped-back packets within a single zone do not need to be
 151  * checked, as all processes in the same zone have the same label.
 152  *
 153  * Finally, for unlabeled packets received by a labeled system, special rules
 154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 155  * socket in the zone whose label matches the default label of the sender, if
 156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 157  * receiver's label must dominate the sender's default label.
 158  *
 159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 161  *                                       ip_stack);
 162  *
 163  *      Lookup routine to find an exact match for {src, dst, local port,
 164  *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 165  *      ports are read from the IP and TCP header respectively.
 166  *
 167  * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 168  *                                       zoneid, ip_stack);
 169  * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 170  *                                       zoneid, ip_stack);
 171  *
 172  *      Lookup routine to find a listener with the tuple {lport, laddr,
 173  *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 174  *      parameter interface index is also compared.
 175  *
 176  * void ipcl_walk(func, arg, ip_stack)
 177  *
 178  *      Apply 'func' to every connection available. The 'func' is called as
 179  *      (*func)(connp, arg). The walk is non-atomic so connections may be
 180  *      created and destroyed during the walk. The CONN_CONDEMNED and
 181  *      CONN_INCIPIENT flags ensure that connections which are newly created
 182  *      or being destroyed are not selected by the walker.
 183  *
 184  * Table Updates
 185  * -------------
 186  *
 187  * int ipcl_conn_insert(connp);
 188  * int ipcl_conn_insert_v4(connp);
 189  * int ipcl_conn_insert_v6(connp);
 190  *
 191  *      Insert 'connp' in the ipcl_conn_fanout.
 192  *      Arguments :
 193  *              connp           conn_t to be inserted
 194  *
 195  *      Return value :
 196  *              0               if connp was inserted
 197  *              EADDRINUSE      if the connection with the same tuple
 198  *                              already exists.
 199  *
 200  * int ipcl_bind_insert(connp);
 201  * int ipcl_bind_insert_v4(connp);
 202  * int ipcl_bind_insert_v6(connp);
 203  *
 204  *      Insert 'connp' in ipcl_bind_fanout.
 205  *      Arguments :
 206  *              connp           conn_t to be inserted
 207  *
 208  *
 209  * void ipcl_hash_remove(connp);
 210  *
 211  *      Removes the 'connp' from the connection fanout table.
 212  *
 213  * Connection Creation/Destruction
 214  * -------------------------------
 215  *
 216  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 217  *
 218  *      Creates a new conn based on the type flag, inserts it into
 219  *      globalhash table.
 220  *
 221  *      type:   This flag determines the type of conn_t which needs to be
 222  *              created i.e., which kmem_cache it comes from.
 223  *              IPCL_TCPCONN    indicates a TCP connection
 224  *              IPCL_SCTPCONN   indicates a SCTP connection
 225  *              IPCL_UDPCONN    indicates a UDP conn_t.
 226  *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 227  *              IPCL_RTSCONN    indicates a RTS conn_t.
 228  *              IPCL_IPCCONN    indicates all other connections.
 229  *
 230  * void ipcl_conn_destroy(connp)
 231  *
 232  *      Destroys the connection state, removes it from the global
 233  *      connection hash table and frees its memory.
 234  */
 235 
 236 #include <sys/types.h>
 237 #include <sys/stream.h>
 238 #include <sys/stropts.h>
 239 #include <sys/sysmacros.h>
 240 #include <sys/strsubr.h>
 241 #include <sys/strsun.h>
 242 #define _SUN_TPI_VERSION 2
 243 #include <sys/ddi.h>
 244 #include <sys/cmn_err.h>
 245 #include <sys/debug.h>
 246 
 247 #include <sys/systm.h>
 248 #include <sys/param.h>
 249 #include <sys/kmem.h>
 250 #include <sys/isa_defs.h>
 251 #include <inet/common.h>
 252 #include <netinet/ip6.h>
 253 #include <netinet/icmp6.h>
 254 
 255 #include <inet/ip.h>
 256 #include <inet/ip_if.h>
 257 #include <inet/ip_ire.h>
 258 #include <inet/ip6.h>
 259 #include <inet/ip_ndp.h>
 260 #include <inet/ip_impl.h>
 261 #include <inet/udp_impl.h>
 262 #include <inet/sctp_ip.h>
 263 #include <inet/sctp/sctp_impl.h>
 264 #include <inet/rawip_impl.h>
 265 #include <inet/rts_impl.h>
 266 #include <inet/iptun/iptun_impl.h>
 267 
 268 #include <sys/cpuvar.h>
 269 
 270 #include <inet/ipclassifier.h>
 271 #include <inet/tcp.h>
 272 #include <inet/ipsec_impl.h>
 273 
 274 #include <sys/tsol/tnet.h>
 275 #include <sys/sockio.h>
 276 
/*
 * Old value for compatibility. Settable in /etc/system.
 * ipcl_init() only looks at this when ipcl_conn_hash_size is zero.
 */
uint_t tcp_conn_hash_size = 0;

/*
 * New value. Zero means choose automatically.  Settable in /etc/system.
 *
 * When both ipcl_conn_hash_size and tcp_conn_hash_size are zero, ipcl_init()
 * derives the conn fanout size from free memory: (free bytes) divided by
 * ipcl_conn_hash_memfactor, capped at ipcl_conn_hash_maxsize.  Whatever value
 * is chosen is then rounded up to an entry of the P2Ps() prime table.
 */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/*
 * bind/udp fanout table size.  ipcl_init() copies these into the per-stack
 * ip_stack_t unmodified.
 */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/*
 * Raw socket fanout size.  Must be a power of 2.
 * NOTE(review): nothing in ipcl_init() verifies the power-of-2 requirement.
 */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;
 298 
/*
 * Table of primes useful for hashing, indexed by N for N of 0-28: entry N is
 * the nearest prime <= 2^N - 2^(N-2).  Entries 0-2 are 0 (no prime is listed
 * for them), and the final 0 is an end-of-table sentinel: ipcl_init() treats
 * a selected value of 0 as "out of range" and falls back to entry 16.
 */

#define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
                6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
                786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
                50331599, 100663291, 201326557, 0}
 308 
/*
 * Wrapper union to ensure that the conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.  The filler pads the union out to
 * CACHE_ALIGN(conn_s) bytes; ipcl_g_init() sizes the per-protocol cache
 * objects as sizeof (itc_t) plus the size of the protocol structure.
 */
typedef union itc_s {
        conn_t  itc_conn;       /* the connection state itself */
        char    itcu_filler[CACHE_ALIGN(conn_s)];       /* padding only */
} itc_t;
 317 
/*
 * kmem caches that conn_t objects are allocated from, one per connection
 * type.  All except sctp_conn_cache are created in ipcl_g_init() and
 * destroyed in ipcl_g_destroy(); sctp_conn_cache is only referenced here
 * (declared extern) and is used by ipcl_conn_create() for IPCL_SCTPCONN.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk helpers, implemented outside of this file. */
extern void     tcp_timermp_free(tcp_t *);
extern mblk_t   *tcp_timermp_alloc(int);

/*
 * kmem cache constructor/destructor pairs, registered with the matching
 * cache in ipcl_g_init().
 */
static int      ip_conn_constructor(void *, void *, int);
static void     ip_conn_destructor(void *, void *);

static int      tcp_conn_constructor(void *, void *, int);
static void     tcp_conn_destructor(void *, void *);

static int      udp_conn_constructor(void *, void *, int);
static void     udp_conn_destructor(void *, void *);

static int      rawip_conn_constructor(void *, void *, int);
static void     rawip_conn_destructor(void *, void *);

static int      rts_conn_constructor(void *, void *, int);
static void     rts_conn_destructor(void *, void *);
 342 
 343 /*
 344  * Global (for all stack instances) init routine
 345  */
 346 void
 347 ipcl_g_init(void)
 348 {
 349         ip_conn_cache = kmem_cache_create("ip_conn_cache",
 350             sizeof (conn_t), CACHE_ALIGN_SIZE,
 351             ip_conn_constructor, ip_conn_destructor,
 352             NULL, NULL, NULL, 0);
 353 
 354         tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 355             sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 356             tcp_conn_constructor, tcp_conn_destructor,
 357             tcp_conn_reclaim, NULL, NULL, 0);
 358 
 359         udp_conn_cache = kmem_cache_create("udp_conn_cache",
 360             sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 361             udp_conn_constructor, udp_conn_destructor,
 362             NULL, NULL, NULL, 0);
 363 
 364         rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 365             sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 366             rawip_conn_constructor, rawip_conn_destructor,
 367             NULL, NULL, NULL, 0);
 368 
 369         rts_conn_cache = kmem_cache_create("rts_conn_cache",
 370             sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 371             rts_conn_constructor, rts_conn_destructor,
 372             NULL, NULL, NULL, 0);
 373 }
 374 
 375 /*
 376  * ipclassifier intialization routine, sets up hash tables.
 377  */
 378 void
 379 ipcl_init(ip_stack_t *ipst)
 380 {
 381         int i;
 382         int sizes[] = P2Ps();
 383 
 384         /*
 385          * Calculate size of conn fanout table from /etc/system settings
 386          */
 387         if (ipcl_conn_hash_size != 0) {
 388                 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 389         } else if (tcp_conn_hash_size != 0) {
 390                 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 391         } else {
 392                 extern pgcnt_t freemem;
 393 
 394                 ipst->ips_ipcl_conn_fanout_size =
 395                     (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 396 
 397                 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 398                         ipst->ips_ipcl_conn_fanout_size =
 399                             ipcl_conn_hash_maxsize;
 400                 }
 401         }
 402 
 403         for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 404                 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 405                         break;
 406                 }
 407         }
 408         if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 409                 /* Out of range, use the 2^16 value */
 410                 ipst->ips_ipcl_conn_fanout_size = sizes[16];
 411         }
 412 
 413         /* Take values from /etc/system */
 414         ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 415         ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 416         ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 417         ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 418 
 419         ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 420 
 421         ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 422             ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 423 
 424         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 425                 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 426                     MUTEX_DEFAULT, NULL);
 427         }
 428 
 429         ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 430             ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 431 
 432         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 433                 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 434                     MUTEX_DEFAULT, NULL);
 435         }
 436 
 437         ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 438             sizeof (connf_t), KM_SLEEP);
 439         for (i = 0; i < IPPROTO_MAX; i++) {
 440                 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 441                     MUTEX_DEFAULT, NULL);
 442         }
 443 
 444         ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 445             sizeof (connf_t), KM_SLEEP);
 446         for (i = 0; i < IPPROTO_MAX; i++) {
 447                 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 448                     MUTEX_DEFAULT, NULL);
 449         }
 450 
 451         ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 452         mutex_init(&ipst->ips_rts_clients->connf_lock,
 453             NULL, MUTEX_DEFAULT, NULL);
 454 
 455         ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 456             ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 457         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 458                 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 459                     MUTEX_DEFAULT, NULL);
 460         }
 461 
 462         ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 463             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 464         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 465                 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 466                     MUTEX_DEFAULT, NULL);
 467         }
 468 
 469         ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 470             ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 471         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 472                 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 473                     MUTEX_DEFAULT, NULL);
 474         }
 475 
 476         ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 477             sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 478         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 479                 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 480                     NULL, MUTEX_DEFAULT, NULL);
 481         }
 482 }
 483 
 484 void
 485 ipcl_g_destroy(void)
 486 {
 487         kmem_cache_destroy(ip_conn_cache);
 488         kmem_cache_destroy(tcp_conn_cache);
 489         kmem_cache_destroy(udp_conn_cache);
 490         kmem_cache_destroy(rawip_conn_cache);
 491         kmem_cache_destroy(rts_conn_cache);
 492 }
 493 
 494 /*
 495  * All user-level and kernel use of the stack must be gone
 496  * by now.
 497  */
 498 void
 499 ipcl_destroy(ip_stack_t *ipst)
 500 {
 501         int i;
 502 
 503         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 504                 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 505                 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 506         }
 507         kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 508             sizeof (connf_t));
 509         ipst->ips_ipcl_conn_fanout = NULL;
 510 
 511         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 512                 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 513                 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 514         }
 515         kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 516             sizeof (connf_t));
 517         ipst->ips_ipcl_bind_fanout = NULL;
 518 
 519         for (i = 0; i < IPPROTO_MAX; i++) {
 520                 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 521                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 522         }
 523         kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 524             IPPROTO_MAX * sizeof (connf_t));
 525         ipst->ips_ipcl_proto_fanout_v4 = NULL;
 526 
 527         for (i = 0; i < IPPROTO_MAX; i++) {
 528                 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 529                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 530         }
 531         kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 532             IPPROTO_MAX * sizeof (connf_t));
 533         ipst->ips_ipcl_proto_fanout_v6 = NULL;
 534 
 535         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 536                 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 537                 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 538         }
 539         kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 540             sizeof (connf_t));
 541         ipst->ips_ipcl_udp_fanout = NULL;
 542 
 543         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 544                 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 545                 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 546         }
 547         kmem_free(ipst->ips_ipcl_iptun_fanout,
 548             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 549         ipst->ips_ipcl_iptun_fanout = NULL;
 550 
 551         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 552                 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 553                 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 554         }
 555         kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 556             sizeof (connf_t));
 557         ipst->ips_ipcl_raw_fanout = NULL;
 558 
 559         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 560                 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 561                 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 562         }
 563         kmem_free(ipst->ips_ipcl_globalhash_fanout,
 564             sizeof (connf_t) * CONN_G_HASH_SIZE);
 565         ipst->ips_ipcl_globalhash_fanout = NULL;
 566 
 567         ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 568         mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 569         kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 570         ipst->ips_rts_clients = NULL;
 571 }
 572 
 573 /*
 574  * conn creation routine. initialize the conn, sets the reference
 575  * and inserts it in the global hash table.
 576  */
 577 conn_t *
 578 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 579 {
 580         conn_t  *connp;
 581         struct kmem_cache *conn_cache;
 582 
 583         switch (type) {
 584         case IPCL_SCTPCONN:
 585                 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 586                         return (NULL);
 587                 sctp_conn_init(connp);
 588                 netstack_hold(ns);
 589                 connp->conn_netstack = ns;
 590                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 591                 connp->conn_ixa->ixa_conn_id = (long)connp;
 592                 ipcl_globalhash_insert(connp);
 593                 return (connp);
 594 
 595         case IPCL_TCPCONN:
 596                 conn_cache = tcp_conn_cache;
 597                 break;
 598 
 599         case IPCL_UDPCONN:
 600                 conn_cache = udp_conn_cache;
 601                 break;
 602 
 603         case IPCL_RAWIPCONN:
 604                 conn_cache = rawip_conn_cache;
 605                 break;
 606 
 607         case IPCL_RTSCONN:
 608                 conn_cache = rts_conn_cache;
 609                 break;
 610 
 611         case IPCL_IPCCONN:
 612                 conn_cache = ip_conn_cache;
 613                 break;
 614 
 615         default:
 616                 conn_cache = NULL;
 617                 connp = NULL;
 618                 ASSERT(0);
 619         }
 620 
 621         if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 622                 return (NULL);
 623 
 624         connp->conn_ref = 1;
 625         netstack_hold(ns);
 626         connp->conn_netstack = ns;
 627         connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 628         connp->conn_ixa->ixa_conn_id = (long)connp;
 629         ipcl_globalhash_insert(connp);
 630         return (connp);
 631 }
 632 
/*
 * Final teardown of a conn_t.  Releases what the conn still holds (its
 * credential, header template, transmit packet options, IPsec latch/policy
 * state and cached IPsec option message), removes it from the global hash
 * table, drops the netstack hold and returns the memory to the kmem cache
 * selected by the type bit in conn_flags.  SCTP conns are handed to
 * sctp_free() instead of being freed here.
 *
 * The caller must not hold conn_lock, and both conn_ref and conn_ioctlref
 * must already be zero (asserted).
 */
void
ipcl_conn_destroy(conn_t *connp)
{
        mblk_t  *mp;
        netstack_t      *ns = connp->conn_netstack;

        ASSERT(!MUTEX_HELD(&connp->conn_lock));
        ASSERT(connp->conn_ref == 0);
        ASSERT(connp->conn_ioctlref == 0);

        DTRACE_PROBE1(conn__destroy, conn_t *, connp);

        if (connp->conn_cred != NULL) {
                crfree(connp->conn_cred);
                connp->conn_cred = NULL;
                /* ixa_cred done in ipcl_conn_cleanup below */
        }

        /* Free the header template buffer and reset everything derived from it. */
        if (connp->conn_ht_iphc != NULL) {
                kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
                connp->conn_ht_iphc = NULL;
                connp->conn_ht_iphc_allocated = 0;
                connp->conn_ht_iphc_len = 0;
                connp->conn_ht_ulp = NULL;
                connp->conn_ht_ulp_len = 0;
        }
        ip_pkt_free(&connp->conn_xmit_ipp);

        ipcl_globalhash_remove(connp);

        /* Drop the references this conn holds on IPsec state. */
        if (connp->conn_latch != NULL) {
                IPLATCH_REFRELE(connp->conn_latch);
                connp->conn_latch = NULL;
        }
        if (connp->conn_latch_in_policy != NULL) {
                IPPOL_REFRELE(connp->conn_latch_in_policy);
                connp->conn_latch_in_policy = NULL;
        }
        if (connp->conn_latch_in_action != NULL) {
                IPACT_REFRELE(connp->conn_latch_in_action);
                connp->conn_latch_in_action = NULL;
        }
        if (connp->conn_policy != NULL) {
                IPPH_REFRELE(connp->conn_policy, ns);
                connp->conn_policy = NULL;
        }

        if (connp->conn_ipsec_opt_mp != NULL) {
                freemsg(connp->conn_ipsec_opt_mp);
                connp->conn_ipsec_opt_mp = NULL;
        }

        if (connp->conn_flags & IPCL_TCPCONN) {
                tcp_t *tcp = connp->conn_tcp;

                tcp_free(tcp);
                /* Remember the timer cache so it survives the bzero() below. */
                mp = tcp->tcp_timercache;

                tcp->tcp_tcps = NULL;

                /*
                 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
                 * the mblk.
                 */
                if (tcp->tcp_rsrv_mp != NULL) {
                        freeb(tcp->tcp_rsrv_mp);
                        tcp->tcp_rsrv_mp = NULL;
                        mutex_destroy(&tcp->tcp_rsrv_mp_lock);
                }

                ipcl_conn_cleanup(connp);
                /* Leave only the type bit set before the object is cached. */
                connp->conn_flags = IPCL_TCPCONN;
                if (ns != NULL) {
                        ASSERT(tcp->tcp_tcps == NULL);
                        connp->conn_netstack = NULL;
                        connp->conn_ixa->ixa_ipst = NULL;
                        netstack_rele(ns);
                }

                /*
                 * Wipe the tcp_t, then put back the two fields that are kept
                 * across the free: the timer cache saved above and the back
                 * pointer to the conn.
                 */
                bzero(tcp, sizeof (tcp_t));

                tcp->tcp_timercache = mp;
                tcp->tcp_connp = connp;
                kmem_cache_free(tcp_conn_cache, connp);
                return;
        }

        /* SCTP does its own cleanup and freeing of the conn. */
        if (connp->conn_flags & IPCL_SCTPCONN) {
                ASSERT(ns != NULL);
                sctp_free(connp);
                return;
        }

        ipcl_conn_cleanup(connp);
        if (ns != NULL) {
                connp->conn_netstack = NULL;
                connp->conn_ixa->ixa_ipst = NULL;
                netstack_rele(ns);
        }

        /*
         * Reset conn_flags to just the type bit and return the conn to the
         * cache it was allocated from.
         */
        /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
        if (connp->conn_flags & IPCL_UDPCONN) {
                connp->conn_flags = IPCL_UDPCONN;
                kmem_cache_free(udp_conn_cache, connp);
        } else if (connp->conn_flags & IPCL_RAWIPCONN) {
                connp->conn_flags = IPCL_RAWIPCONN;
                /* Raw conns also get their protocol reset to ICMP. */
                connp->conn_proto = IPPROTO_ICMP;
                connp->conn_ixa->ixa_protocol = connp->conn_proto;
                kmem_cache_free(rawip_conn_cache, connp);
        } else if (connp->conn_flags & IPCL_RTSCONN) {
                connp->conn_flags = IPCL_RTSCONN;
                kmem_cache_free(rts_conn_cache, connp);
        } else {
                /* Anything else is treated as a plain IP conn. */
                connp->conn_flags = IPCL_IPCCONN;
                ASSERT(connp->conn_flags & IPCL_IPCCONN);
                ASSERT(connp->conn_priv == NULL);
                kmem_cache_free(ip_conn_cache, connp);
        }
}
 752 
 753 /*
 754  * Running in cluster mode - deregister listener information
 755  */
 756 static void
 757 ipcl_conn_unlisten(conn_t *connp)
 758 {
 759         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 760         ASSERT(connp->conn_lport != 0);
 761 
 762         if (cl_inet_unlisten != NULL) {
 763                 sa_family_t     addr_family;
 764                 uint8_t         *laddrp;
 765 
 766                 if (connp->conn_ipversion == IPV6_VERSION) {
 767                         addr_family = AF_INET6;
 768                         laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 769                 } else {
 770                         addr_family = AF_INET;
 771                         laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 772                 }
 773                 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 774                     IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 775         }
 776         connp->conn_flags &= ~IPCL_CL_LISTENER;
 777 }
 778 
 779 /*
 780  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 781  * which table the conn belonged to). So for debugging we can see which hash
 782  * table this connection was in.
 783  */
 784 #define IPCL_HASH_REMOVE(connp) {                                       \
 785         connf_t *connfp = (connp)->conn_fanout;                              \
 786         ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                      \
 787         if (connfp != NULL) {                                           \
 788                 mutex_enter(&connfp->connf_lock);                        \
 789                 if ((connp)->conn_next != NULL)                              \
 790                         (connp)->conn_next->conn_prev =                   \
 791                             (connp)->conn_prev;                              \
 792                 if ((connp)->conn_prev != NULL)                              \
 793                         (connp)->conn_prev->conn_next =                   \
 794                             (connp)->conn_next;                              \
 795                 else                                                    \
 796                         connfp->connf_head = (connp)->conn_next;  \
 797                 (connp)->conn_fanout = NULL;                         \
 798                 (connp)->conn_next = NULL;                           \
 799                 (connp)->conn_prev = NULL;                           \
 800                 (connp)->conn_flags |= IPCL_REMOVED;                 \
 801                 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)       \
 802                         ipcl_conn_unlisten((connp));                    \
 803                 CONN_DEC_REF((connp));                                  \
 804                 mutex_exit(&connfp->connf_lock);                 \
 805         }                                                               \
 806 }
 807 
 808 void
 809 ipcl_hash_remove(conn_t *connp)
 810 {
 811         uint8_t         protocol = connp->conn_proto;
 812 
 813         IPCL_HASH_REMOVE(connp);
 814         if (protocol == IPPROTO_RSVP)
 815                 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 816 }
 817 
 818 /*
 819  * The whole purpose of this function is allow removal of
 820  * a conn_t from the connected hash for timewait reclaim.
 821  * This is essentially a TW reclaim fastpath where timewait
 822  * collector checks under fanout lock (so no one else can
 823  * get access to the conn_t) that refcnt is 2 i.e. one for
 824  * TCP and one for the classifier hash list. If ref count
 825  * is indeed 2, we can just remove the conn under lock and
 826  * avoid cleaning up the conn under squeue. This gives us
 827  * improved performance.
 828  */
 829 void
 830 ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 831 {
 832         ASSERT(MUTEX_HELD(&connfp->connf_lock));
 833         ASSERT(MUTEX_HELD(&connp->conn_lock));
 834         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 835 
 836         if ((connp)->conn_next != NULL) {
 837                 (connp)->conn_next->conn_prev = (connp)->conn_prev;
 838         }
 839         if ((connp)->conn_prev != NULL) {
 840                 (connp)->conn_prev->conn_next = (connp)->conn_next;
 841         } else {
 842                 connfp->connf_head = (connp)->conn_next;
 843         }
 844         (connp)->conn_fanout = NULL;
 845         (connp)->conn_next = NULL;
 846         (connp)->conn_prev = NULL;
 847         (connp)->conn_flags |= IPCL_REMOVED;
 848         ASSERT((connp)->conn_ref == 2);
 849         (connp)->conn_ref--;
 850 }
 851 
 852 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 853         ASSERT((connp)->conn_fanout == NULL);                                \
 854         ASSERT((connp)->conn_next == NULL);                          \
 855         ASSERT((connp)->conn_prev == NULL);                          \
 856         if ((connfp)->connf_head != NULL) {                          \
 857                 (connfp)->connf_head->conn_prev = (connp);                \
 858                 (connp)->conn_next = (connfp)->connf_head;                \
 859         }                                                               \
 860         (connp)->conn_fanout = (connfp);                             \
 861         (connfp)->connf_head = (connp);                                      \
 862         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 863             IPCL_CONNECTED;                                             \
 864         CONN_INC_REF(connp);                                            \
 865 }
 866 
 867 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 868         IPCL_HASH_REMOVE((connp));                                      \
 869         mutex_enter(&(connfp)->connf_lock);                              \
 870         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 871         mutex_exit(&(connfp)->connf_lock);                               \
 872 }
 873 
 874 #define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
 875         conn_t *pconnp = NULL, *nconnp;                                 \
 876         IPCL_HASH_REMOVE((connp));                                      \
 877         mutex_enter(&(connfp)->connf_lock);                              \
 878         nconnp = (connfp)->connf_head;                                       \
 879         while (nconnp != NULL &&                                        \
 880             !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {            \
 881                 pconnp = nconnp;                                        \
 882                 nconnp = nconnp->conn_next;                          \
 883         }                                                               \
 884         if (pconnp != NULL) {                                           \
 885                 pconnp->conn_next = (connp);                         \
 886                 (connp)->conn_prev = pconnp;                         \
 887         } else {                                                        \
 888                 (connfp)->connf_head = (connp);                              \
 889         }                                                               \
 890         if (nconnp != NULL) {                                           \
 891                 (connp)->conn_next = nconnp;                         \
 892                 nconnp->conn_prev = (connp);                         \
 893         }                                                               \
 894         (connp)->conn_fanout = (connfp);                             \
 895         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 896             IPCL_BOUND;                                                 \
 897         CONN_INC_REF(connp);                                            \
 898         mutex_exit(&(connfp)->connf_lock);                               \
 899 }
 900 
 901 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 902         conn_t **list, *prev, *next;                                    \
 903         boolean_t isv4mapped =                                          \
 904             IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);               \
 905         IPCL_HASH_REMOVE((connp));                                      \
 906         mutex_enter(&(connfp)->connf_lock);                              \
 907         list = &(connfp)->connf_head;                                    \
 908         prev = NULL;                                                    \
 909         while ((next = *list) != NULL) {                                \
 910                 if (isv4mapped &&                                       \
 911                     IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&     \
 912                     connp->conn_zoneid == next->conn_zoneid) {            \
 913                         (connp)->conn_next = next;                   \
 914                         if (prev != NULL)                               \
 915                                 prev = next->conn_prev;                      \
 916                         next->conn_prev = (connp);                   \
 917                         break;                                          \
 918                 }                                                       \
 919                 list = &next->conn_next;                         \
 920                 prev = next;                                            \
 921         }                                                               \
 922         (connp)->conn_prev = prev;                                   \
 923         *list = (connp);                                                \
 924         (connp)->conn_fanout = (connfp);                             \
 925         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 926             IPCL_BOUND;                                                 \
 927         CONN_INC_REF((connp));                                          \
 928         mutex_exit(&(connfp)->connf_lock);                               \
 929 }
 930 
 931 void
 932 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 933 {
 934         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
 935 }
 936 
 937 /*
 938  * Because the classifier is used to classify inbound packets, the destination
 939  * address is meant to be our local tunnel address (tunnel source), and the
 940  * source the remote tunnel address (tunnel destination).
 941  *
 942  * Note that conn_proto can't be used for fanout since the upper protocol
 943  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 944  */
 945 conn_t *
 946 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 947 {
 948         connf_t *connfp;
 949         conn_t  *connp;
 950 
 951         /* first look for IPv4 tunnel links */
 952         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
 953         mutex_enter(&connfp->connf_lock);
 954         for (connp = connfp->connf_head; connp != NULL;
 955             connp = connp->conn_next) {
 956                 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
 957                         break;
 958         }
 959         if (connp != NULL)
 960                 goto done;
 961 
 962         mutex_exit(&connfp->connf_lock);
 963 
 964         /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
 965         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
 966             INADDR_ANY)];
 967         mutex_enter(&connfp->connf_lock);
 968         for (connp = connfp->connf_head; connp != NULL;
 969             connp = connp->conn_next) {
 970                 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
 971                         break;
 972         }
 973 done:
 974         if (connp != NULL)
 975                 CONN_INC_REF(connp);
 976         mutex_exit(&connfp->connf_lock);
 977         return (connp);
 978 }
 979 
 980 conn_t *
 981 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
 982 {
 983         connf_t *connfp;
 984         conn_t  *connp;
 985 
 986         /* Look for an IPv6 tunnel link */
 987         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
 988         mutex_enter(&connfp->connf_lock);
 989         for (connp = connfp->connf_head; connp != NULL;
 990             connp = connp->conn_next) {
 991                 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
 992                         CONN_INC_REF(connp);
 993                         break;
 994                 }
 995         }
 996         mutex_exit(&connfp->connf_lock);
 997         return (connp);
 998 }
 999 
1000 /*
1001  * This function is used only for inserting SCTP raw socket now.
1002  * This may change later.
1003  *
1004  * Note that only one raw socket can be bound to a port.  The param
1005  * lport is in network byte order.
1006  */
1007 static int
1008 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1009 {
1010         connf_t *connfp;
1011         conn_t  *oconnp;
1012         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1013 
1014         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1015 
1016         /* Check for existing raw socket already bound to the port. */
1017         mutex_enter(&connfp->connf_lock);
1018         for (oconnp = connfp->connf_head; oconnp != NULL;
1019             oconnp = oconnp->conn_next) {
1020                 if (oconnp->conn_lport == lport &&
1021                     oconnp->conn_zoneid == connp->conn_zoneid &&
1022                     oconnp->conn_family == connp->conn_family &&
1023                     ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1024                     IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1025                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1026                     IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1027                     IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1028                     &connp->conn_laddr_v6))) {
1029                         break;
1030                 }
1031         }
1032         mutex_exit(&connfp->connf_lock);
1033         if (oconnp != NULL)
1034                 return (EADDRNOTAVAIL);
1035 
1036         if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1037             IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1038                 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1039                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1040                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1041                 } else {
1042                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1043                 }
1044         } else {
1045                 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1046         }
1047         return (0);
1048 }
1049 
1050 static int
1051 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1052 {
1053         connf_t *connfp;
1054         conn_t  *tconnp;
1055         ipaddr_t laddr = connp->conn_laddr_v4;
1056         ipaddr_t faddr = connp->conn_faddr_v4;
1057 
1058         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1059         mutex_enter(&connfp->connf_lock);
1060         for (tconnp = connfp->connf_head; tconnp != NULL;
1061             tconnp = tconnp->conn_next) {
1062                 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1063                         /* A tunnel is already bound to these addresses. */
1064                         mutex_exit(&connfp->connf_lock);
1065                         return (EADDRINUSE);
1066                 }
1067         }
1068         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1069         mutex_exit(&connfp->connf_lock);
1070         return (0);
1071 }
1072 
1073 static int
1074 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1075 {
1076         connf_t *connfp;
1077         conn_t  *tconnp;
1078         in6_addr_t *laddr = &connp->conn_laddr_v6;
1079         in6_addr_t *faddr = &connp->conn_faddr_v6;
1080 
1081         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1082         mutex_enter(&connfp->connf_lock);
1083         for (tconnp = connfp->connf_head; tconnp != NULL;
1084             tconnp = tconnp->conn_next) {
1085                 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1086                         /* A tunnel is already bound to these addresses. */
1087                         mutex_exit(&connfp->connf_lock);
1088                         return (EADDRINUSE);
1089                 }
1090         }
1091         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1092         mutex_exit(&connfp->connf_lock);
1093         return (0);
1094 }
1095 
1096 /*
1097  * Check for a MAC exemption conflict on a labeled system.  Note that for
1098  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1099  * transport layer.  This check is for binding all other protocols.
1100  *
1101  * Returns true if there's a conflict.
1102  */
1103 static boolean_t
1104 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1105 {
1106         connf_t *connfp;
1107         conn_t *tconn;
1108 
1109         connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1110         mutex_enter(&connfp->connf_lock);
1111         for (tconn = connfp->connf_head; tconn != NULL;
1112             tconn = tconn->conn_next) {
1113                 /* We don't allow v4 fallback for v6 raw socket */
1114                 if (connp->conn_family != tconn->conn_family)
1115                         continue;
1116                 /* If neither is exempt, then there's no conflict */
1117                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1118                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1119                         continue;
1120                 /* We are only concerned about sockets for a different zone */
1121                 if (connp->conn_zoneid == tconn->conn_zoneid)
1122                         continue;
1123                 /* If both are bound to different specific addrs, ok */
1124                 if (connp->conn_laddr_v4 != INADDR_ANY &&
1125                     tconn->conn_laddr_v4 != INADDR_ANY &&
1126                     connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1127                         continue;
1128                 /* These two conflict; fail */
1129                 break;
1130         }
1131         mutex_exit(&connfp->connf_lock);
1132         return (tconn != NULL);
1133 }
1134 
1135 static boolean_t
1136 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1137 {
1138         connf_t *connfp;
1139         conn_t *tconn;
1140 
1141         connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1142         mutex_enter(&connfp->connf_lock);
1143         for (tconn = connfp->connf_head; tconn != NULL;
1144             tconn = tconn->conn_next) {
1145                 /* We don't allow v4 fallback for v6 raw socket */
1146                 if (connp->conn_family != tconn->conn_family)
1147                         continue;
1148                 /* If neither is exempt, then there's no conflict */
1149                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1150                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1151                         continue;
1152                 /* We are only concerned about sockets for a different zone */
1153                 if (connp->conn_zoneid == tconn->conn_zoneid)
1154                         continue;
1155                 /* If both are bound to different addrs, ok */
1156                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1157                     !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1158                     !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1159                     &tconn->conn_laddr_v6))
1160                         continue;
1161                 /* These two conflict; fail */
1162                 break;
1163         }
1164         mutex_exit(&connfp->connf_lock);
1165         return (tconn != NULL);
1166 }
1167 
1168 /*
1169  * (v4, v6) bind hash insertion routines
1170  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1171  */
1172 
1173 int
1174 ipcl_bind_insert(conn_t *connp)
1175 {
1176         if (connp->conn_ipversion == IPV6_VERSION)
1177                 return (ipcl_bind_insert_v6(connp));
1178         else
1179                 return (ipcl_bind_insert_v4(connp));
1180 }
1181 
1182 int
1183 ipcl_bind_insert_v4(conn_t *connp)
1184 {
1185         connf_t *connfp;
1186         int     ret = 0;
1187         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1188         uint16_t        lport = connp->conn_lport;
1189         uint8_t         protocol = connp->conn_proto;
1190 
1191         if (IPCL_IS_IPTUN(connp))
1192                 return (ipcl_iptun_hash_insert(connp, ipst));
1193 
1194         switch (protocol) {
1195         default:
1196                 if (is_system_labeled() &&
1197                     check_exempt_conflict_v4(connp, ipst))
1198                         return (EADDRINUSE);
1199                 /* FALLTHROUGH */
1200         case IPPROTO_UDP:
1201                 if (protocol == IPPROTO_UDP) {
1202                         connfp = &ipst->ips_ipcl_udp_fanout[
1203                             IPCL_UDP_HASH(lport, ipst)];
1204                 } else {
1205                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1206                 }
1207 
1208                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1209                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1210                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1211                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1212                 } else {
1213                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1214                 }
1215                 if (protocol == IPPROTO_RSVP)
1216                         ill_set_inputfn_all(ipst);
1217                 break;
1218 
1219         case IPPROTO_TCP:
1220                 /* Insert it in the Bind Hash */
1221                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1222                 connfp = &ipst->ips_ipcl_bind_fanout[
1223                     IPCL_BIND_HASH(lport, ipst)];
1224                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1225                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1226                 } else {
1227                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1228                 }
1229                 if (cl_inet_listen != NULL) {
1230                         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1231                         connp->conn_flags |= IPCL_CL_LISTENER;
1232                         (*cl_inet_listen)(
1233                             connp->conn_netstack->netstack_stackid,
1234                             IPPROTO_TCP, AF_INET,
1235                             (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1236                 }
1237                 break;
1238 
1239         case IPPROTO_SCTP:
1240                 ret = ipcl_sctp_hash_insert(connp, lport);
1241                 break;
1242         }
1243 
1244         return (ret);
1245 }
1246 
1247 int
1248 ipcl_bind_insert_v6(conn_t *connp)
1249 {
1250         connf_t         *connfp;
1251         int             ret = 0;
1252         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1253         uint16_t        lport = connp->conn_lport;
1254         uint8_t         protocol = connp->conn_proto;
1255 
1256         if (IPCL_IS_IPTUN(connp)) {
1257                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1258         }
1259 
1260         switch (protocol) {
1261         default:
1262                 if (is_system_labeled() &&
1263                     check_exempt_conflict_v6(connp, ipst))
1264                         return (EADDRINUSE);
1265                 /* FALLTHROUGH */
1266         case IPPROTO_UDP:
1267                 if (protocol == IPPROTO_UDP) {
1268                         connfp = &ipst->ips_ipcl_udp_fanout[
1269                             IPCL_UDP_HASH(lport, ipst)];
1270                 } else {
1271                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1272                 }
1273 
1274                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1275                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1276                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1277                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1278                 } else {
1279                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1280                 }
1281                 break;
1282 
1283         case IPPROTO_TCP:
1284                 /* Insert it in the Bind Hash */
1285                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1286                 connfp = &ipst->ips_ipcl_bind_fanout[
1287                     IPCL_BIND_HASH(lport, ipst)];
1288                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1289                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1290                 } else {
1291                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1292                 }
1293                 if (cl_inet_listen != NULL) {
1294                         sa_family_t     addr_family;
1295                         uint8_t         *laddrp;
1296 
1297                         if (connp->conn_ipversion == IPV6_VERSION) {
1298                                 addr_family = AF_INET6;
1299                                 laddrp =
1300                                     (uint8_t *)&connp->conn_bound_addr_v6;
1301                         } else {
1302                                 addr_family = AF_INET;
1303                                 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1304                         }
1305                         connp->conn_flags |= IPCL_CL_LISTENER;
1306                         (*cl_inet_listen)(
1307                             connp->conn_netstack->netstack_stackid,
1308                             IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1309                 }
1310                 break;
1311 
1312         case IPPROTO_SCTP:
1313                 ret = ipcl_sctp_hash_insert(connp, lport);
1314                 break;
1315         }
1316 
1317         return (ret);
1318 }
1319 
1320 /*
1321  * ipcl_conn_hash insertion routines.
1322  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1323  */
1324 
1325 int
1326 ipcl_conn_insert(conn_t *connp)
1327 {
1328         if (connp->conn_ipversion == IPV6_VERSION)
1329                 return (ipcl_conn_insert_v6(connp));
1330         else
1331                 return (ipcl_conn_insert_v4(connp));
1332 }
1333 
1334 int
1335 ipcl_conn_insert_v4(conn_t *connp)
1336 {
1337         connf_t         *connfp;
1338         conn_t          *tconnp;
1339         int             ret = 0;
1340         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1341         uint16_t        lport = connp->conn_lport;
1342         uint8_t         protocol = connp->conn_proto;
1343 
1344         if (IPCL_IS_IPTUN(connp))
1345                 return (ipcl_iptun_hash_insert(connp, ipst));
1346 
1347         switch (protocol) {
1348         case IPPROTO_TCP:
1349                 /*
1350                  * For TCP, we check whether the connection tuple already
1351                  * exists before allowing the connection to proceed.  We
1352                  * also allow indexing on the zoneid. This is to allow
1353                  * multiple shared stack zones to have the same tcp
1354                  * connection tuple. In practice this only happens for
1355                  * INADDR_LOOPBACK as it's the only local address which
1356                  * doesn't have to be unique.
1357                  */
1358                 connfp = &ipst->ips_ipcl_conn_fanout[
1359                     IPCL_CONN_HASH(connp->conn_faddr_v4,
1360                     connp->conn_ports, ipst)];
1361                 mutex_enter(&connfp->connf_lock);
1362                 for (tconnp = connfp->connf_head; tconnp != NULL;
1363                     tconnp = tconnp->conn_next) {
1364                         if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1365                             connp->conn_faddr_v4, connp->conn_laddr_v4,
1366                             connp->conn_ports) &&
1367                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1368                                 /* Already have a conn. bail out */
1369                                 mutex_exit(&connfp->connf_lock);
1370                                 return (EADDRINUSE);
1371                         }
1372                 }
1373                 if (connp->conn_fanout != NULL) {
1374                         /*
1375                          * Probably a XTI/TLI application trying to do a
1376                          * rebind. Let it happen.
1377                          */
1378                         mutex_exit(&connfp->connf_lock);
1379                         IPCL_HASH_REMOVE(connp);
1380                         mutex_enter(&connfp->connf_lock);
1381                 }
1382 
1383                 ASSERT(connp->conn_recv != NULL);
1384                 ASSERT(connp->conn_recvicmp != NULL);
1385 
1386                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1387                 mutex_exit(&connfp->connf_lock);
1388                 break;
1389 
1390         case IPPROTO_SCTP:
1391                 /*
1392                  * The raw socket may have already been bound, remove it
1393                  * from the hash first.
1394                  */
1395                 IPCL_HASH_REMOVE(connp);
1396                 ret = ipcl_sctp_hash_insert(connp, lport);
1397                 break;
1398 
1399         default:
1400                 /*
1401                  * Check for conflicts among MAC exempt bindings.  For
1402                  * transports with port numbers, this is done by the upper
1403                  * level per-transport binding logic.  For all others, it's
1404                  * done here.
1405                  */
1406                 if (is_system_labeled() &&
1407                     check_exempt_conflict_v4(connp, ipst))
1408                         return (EADDRINUSE);
1409                 /* FALLTHROUGH */
1410 
1411         case IPPROTO_UDP:
1412                 if (protocol == IPPROTO_UDP) {
1413                         connfp = &ipst->ips_ipcl_udp_fanout[
1414                             IPCL_UDP_HASH(lport, ipst)];
1415                 } else {
1416                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1417                 }
1418 
1419                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1420                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1421                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1422                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1423                 } else {
1424                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1425                 }
1426                 break;
1427         }
1428 
1429         return (ret);
1430 }
1431 
1432 int
1433 ipcl_conn_insert_v6(conn_t *connp)
1434 {
1435         connf_t         *connfp;
1436         conn_t          *tconnp;
1437         int             ret = 0;
1438         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1439         uint16_t        lport = connp->conn_lport;
1440         uint8_t         protocol = connp->conn_proto;
1441         uint_t          ifindex = connp->conn_bound_if;
1442 
1443         if (IPCL_IS_IPTUN(connp))
1444                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1445 
1446         switch (protocol) {
1447         case IPPROTO_TCP:
1448 
1449                 /*
1450                  * For tcp, we check whether the connection tuple already
1451                  * exists before allowing the connection to proceed.  We
1452                  * also allow indexing on the zoneid. This is to allow
1453                  * multiple shared stack zones to have the same tcp
1454                  * connection tuple. In practice this only happens for
1455                  * ipv6_loopback as it's the only local address which
1456                  * doesn't have to be unique.
1457                  */
1458                 connfp = &ipst->ips_ipcl_conn_fanout[
1459                     IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1460                     ipst)];
1461                 mutex_enter(&connfp->connf_lock);
1462                 for (tconnp = connfp->connf_head; tconnp != NULL;
1463                     tconnp = tconnp->conn_next) {
1464                         /* NOTE: need to match zoneid. Bug in onnv-gate */
1465                         if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1466                             connp->conn_faddr_v6, connp->conn_laddr_v6,
1467                             connp->conn_ports) &&
1468                             (tconnp->conn_bound_if == 0 ||
1469                             tconnp->conn_bound_if == ifindex) &&
1470                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1471                                 /* Already have a conn. bail out */
1472                                 mutex_exit(&connfp->connf_lock);
1473                                 return (EADDRINUSE);
1474                         }
1475                 }
1476                 if (connp->conn_fanout != NULL) {
1477                         /*
1478                          * Probably a XTI/TLI application trying to do a
1479                          * rebind. Let it happen.
1480                          */
1481                         mutex_exit(&connfp->connf_lock);
1482                         IPCL_HASH_REMOVE(connp);
1483                         mutex_enter(&connfp->connf_lock);
1484                 }
1485                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1486                 mutex_exit(&connfp->connf_lock);
1487                 break;
1488 
1489         case IPPROTO_SCTP:
1490                 IPCL_HASH_REMOVE(connp);
1491                 ret = ipcl_sctp_hash_insert(connp, lport);
1492                 break;
1493 
1494         default:
1495                 if (is_system_labeled() &&
1496                     check_exempt_conflict_v6(connp, ipst))
1497                         return (EADDRINUSE);
1498                 /* FALLTHROUGH */
1499         case IPPROTO_UDP:
1500                 if (protocol == IPPROTO_UDP) {
1501                         connfp = &ipst->ips_ipcl_udp_fanout[
1502                             IPCL_UDP_HASH(lport, ipst)];
1503                 } else {
1504                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1505                 }
1506 
1507                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1508                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1509                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1510                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1511                 } else {
1512                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1513                 }
1514                 break;
1515         }
1516 
1517         return (ret);
1518 }
1519 
1520 /*
1521  * v4 packet classifying function. looks up the fanout table to
1522  * find the conn, the packet belongs to. returns the conn with
1523  * the reference held, null otherwise.
1524  *
1525  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1526  * Lookup" comment block are applied.  Labels are also checked as described
1527  * above.  If the packet is from the inside (looped back), and is from the same
1528  * zone, then label checks are omitted.
1529  */
1530 conn_t *
1531 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1532     ip_recv_attr_t *ira, ip_stack_t *ipst)
1533 {
1534         ipha_t  *ipha;
1535         connf_t *connfp, *bind_connfp;
1536         uint16_t lport;
1537         uint16_t fport;
1538         uint32_t ports;
1539         conn_t  *connp;
1540         uint16_t  *up;
1541         zoneid_t        zoneid = ira->ira_zoneid;
1542 
1543         ipha = (ipha_t *)mp->b_rptr;
1544         up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1545 
1546         switch (protocol) {
1547         case IPPROTO_TCP:
1548                 ports = *(uint32_t *)up;
1549                 connfp =
1550                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1551                     ports, ipst)];
1552                 mutex_enter(&connfp->connf_lock);
1553                 for (connp = connfp->connf_head; connp != NULL;
1554                     connp = connp->conn_next) {
1555                         if (IPCL_CONN_MATCH(connp, protocol,
1556                             ipha->ipha_src, ipha->ipha_dst, ports) &&
1557                             (connp->conn_zoneid == zoneid ||
1558                             connp->conn_allzones ||
1559                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1560                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1561                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1562                                 break;
1563                 }
1564 
1565                 if (connp != NULL) {
1566                         /*
1567                          * We have a fully-bound TCP connection.
1568                          *
1569                          * For labeled systems, there's no need to check the
1570                          * label here.  It's known to be good as we checked
1571                          * before allowing the connection to become bound.
1572                          */
1573                         CONN_INC_REF(connp);
1574                         mutex_exit(&connfp->connf_lock);
1575                         return (connp);
1576                 }
1577 
1578                 mutex_exit(&connfp->connf_lock);
1579                 lport = up[1];
1580                 bind_connfp =
1581                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1582                 mutex_enter(&bind_connfp->connf_lock);
1583                 for (connp = bind_connfp->connf_head; connp != NULL;
1584                     connp = connp->conn_next) {
1585                         if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1586                             lport) &&
1587                             (connp->conn_zoneid == zoneid ||
1588                             connp->conn_allzones ||
1589                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1590                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1591                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1592                                 break;
1593                 }
1594 
1595                 /*
1596                  * If the matching connection is SLP on a private address, then
1597                  * the label on the packet must match the local zone's label.
1598                  * Otherwise, it must be in the label range defined by tnrh.
1599                  * This is ensured by tsol_receive_local.
1600                  *
1601                  * Note that we don't check tsol_receive_local for
1602                  * the connected case.
1603                  */
1604                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1605                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1606                     ira, connp)) {
1607                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1608                             char *, "connp(1) could not receive mp(2)",
1609                             conn_t *, connp, mblk_t *, mp);
1610                         connp = NULL;
1611                 }
1612 
1613                 if (connp != NULL) {
1614                         /* Have a listener at least */
1615                         CONN_INC_REF(connp);
1616                         mutex_exit(&bind_connfp->connf_lock);
1617                         return (connp);
1618                 }
1619 
1620                 mutex_exit(&bind_connfp->connf_lock);
1621                 break;
1622 
1623         case IPPROTO_UDP:
1624                 lport = up[1];
1625                 fport = up[0];
1626                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1627                 mutex_enter(&connfp->connf_lock);
1628                 for (connp = connfp->connf_head; connp != NULL;
1629                     connp = connp->conn_next) {
1630                         if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1631                             fport, ipha->ipha_src) &&
1632                             (connp->conn_zoneid == zoneid ||
1633                             connp->conn_allzones ||
1634                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1635                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1636                                 break;
1637                 }
1638 
1639                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1640                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1641                     ira, connp)) {
1642                         DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1643                             char *, "connp(1) could not receive mp(2)",
1644                             conn_t *, connp, mblk_t *, mp);
1645                         connp = NULL;
1646                 }
1647 
1648                 if (connp != NULL) {
1649                         CONN_INC_REF(connp);
1650                         mutex_exit(&connfp->connf_lock);
1651                         return (connp);
1652                 }
1653 
1654                 /*
1655                  * We shouldn't come here for multicast/broadcast packets
1656                  */
1657                 mutex_exit(&connfp->connf_lock);
1658 
1659                 break;
1660 
1661         case IPPROTO_ENCAP:
1662         case IPPROTO_IPV6:
1663                 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1664                     &ipha->ipha_dst, ipst));
1665         }
1666 
1667         return (NULL);
1668 }
1669 
1670 conn_t *
1671 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1672     ip_recv_attr_t *ira, ip_stack_t *ipst)
1673 {
1674         ip6_t           *ip6h;
1675         connf_t         *connfp, *bind_connfp;
1676         uint16_t        lport;
1677         uint16_t        fport;
1678         tcpha_t         *tcpha;
1679         uint32_t        ports;
1680         conn_t          *connp;
1681         uint16_t        *up;
1682         zoneid_t        zoneid = ira->ira_zoneid;
1683 
1684         ip6h = (ip6_t *)mp->b_rptr;
1685 
1686         switch (protocol) {
1687         case IPPROTO_TCP:
1688                 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1689                 up = &tcpha->tha_lport;
1690                 ports = *(uint32_t *)up;
1691 
1692                 connfp =
1693                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1694                     ports, ipst)];
1695                 mutex_enter(&connfp->connf_lock);
1696                 for (connp = connfp->connf_head; connp != NULL;
1697                     connp = connp->conn_next) {
1698                         if (IPCL_CONN_MATCH_V6(connp, protocol,
1699                             ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1700                             (connp->conn_zoneid == zoneid ||
1701                             connp->conn_allzones ||
1702                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1703                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1704                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1705                                 break;
1706                 }
1707 
1708                 if (connp != NULL) {
1709                         /*
1710                          * We have a fully-bound TCP connection.
1711                          *
1712                          * For labeled systems, there's no need to check the
1713                          * label here.  It's known to be good as we checked
1714                          * before allowing the connection to become bound.
1715                          */
1716                         CONN_INC_REF(connp);
1717                         mutex_exit(&connfp->connf_lock);
1718                         return (connp);
1719                 }
1720 
1721                 mutex_exit(&connfp->connf_lock);
1722 
1723                 lport = up[1];
1724                 bind_connfp =
1725                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1726                 mutex_enter(&bind_connfp->connf_lock);
1727                 for (connp = bind_connfp->connf_head; connp != NULL;
1728                     connp = connp->conn_next) {
1729                         if (IPCL_BIND_MATCH_V6(connp, protocol,
1730                             ip6h->ip6_dst, lport) &&
1731                             (connp->conn_zoneid == zoneid ||
1732                             connp->conn_allzones ||
1733                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1734                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1735                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1736                                 break;
1737                 }
1738 
1739                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1740                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1741                     ira, connp)) {
1742                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1743                             char *, "connp(1) could not receive mp(2)",
1744                             conn_t *, connp, mblk_t *, mp);
1745                         connp = NULL;
1746                 }
1747 
1748                 if (connp != NULL) {
1749                         /* Have a listner at least */
1750                         CONN_INC_REF(connp);
1751                         mutex_exit(&bind_connfp->connf_lock);
1752                         return (connp);
1753                 }
1754 
1755                 mutex_exit(&bind_connfp->connf_lock);
1756                 break;
1757 
1758         case IPPROTO_UDP:
1759                 up = (uint16_t *)&mp->b_rptr[hdr_len];
1760                 lport = up[1];
1761                 fport = up[0];
1762                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1763                 mutex_enter(&connfp->connf_lock);
1764                 for (connp = connfp->connf_head; connp != NULL;
1765                     connp = connp->conn_next) {
1766                         if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1767                             fport, ip6h->ip6_src) &&
1768                             (connp->conn_zoneid == zoneid ||
1769                             connp->conn_allzones ||
1770                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1771                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1772                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1773                                 break;
1774                 }
1775 
1776                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1777                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1778                     ira, connp)) {
1779                         DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1780                             char *, "connp(1) could not receive mp(2)",
1781                             conn_t *, connp, mblk_t *, mp);
1782                         connp = NULL;
1783                 }
1784 
1785                 if (connp != NULL) {
1786                         CONN_INC_REF(connp);
1787                         mutex_exit(&connfp->connf_lock);
1788                         return (connp);
1789                 }
1790 
1791                 /*
1792                  * We shouldn't come here for multicast/broadcast packets
1793                  */
1794                 mutex_exit(&connfp->connf_lock);
1795                 break;
1796         case IPPROTO_ENCAP:
1797         case IPPROTO_IPV6:
1798                 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1799                     &ip6h->ip6_dst, ipst));
1800         }
1801 
1802         return (NULL);
1803 }
1804 
1805 /*
1806  * wrapper around ipcl_classify_(v4,v6) routines.
1807  */
1808 conn_t *
1809 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1810 {
1811         if (ira->ira_flags & IRAF_IS_IPV4) {
1812                 return (ipcl_classify_v4(mp, ira->ira_protocol,
1813                     ira->ira_ip_hdr_length, ira, ipst));
1814         } else {
1815                 return (ipcl_classify_v6(mp, ira->ira_protocol,
1816                     ira->ira_ip_hdr_length, ira, ipst));
1817         }
1818 }
1819 
1820 /*
1821  * Only used to classify SCTP RAW sockets
1822  */
1823 conn_t *
1824 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1825     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1826 {
1827         connf_t         *connfp;
1828         conn_t          *connp;
1829         in_port_t       lport;
1830         int             ipversion;
1831         const void      *dst;
1832         zoneid_t        zoneid = ira->ira_zoneid;
1833 
1834         lport = ((uint16_t *)&ports)[1];
1835         if (ira->ira_flags & IRAF_IS_IPV4) {
1836                 dst = (const void *)&ipha->ipha_dst;
1837                 ipversion = IPV4_VERSION;
1838         } else {
1839                 dst = (const void *)&ip6h->ip6_dst;
1840                 ipversion = IPV6_VERSION;
1841         }
1842 
1843         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1844         mutex_enter(&connfp->connf_lock);
1845         for (connp = connfp->connf_head; connp != NULL;
1846             connp = connp->conn_next) {
1847                 /* We don't allow v4 fallback for v6 raw socket. */
1848                 if (ipversion != connp->conn_ipversion)
1849                         continue;
1850                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1851                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1852                         if (ipversion == IPV4_VERSION) {
1853                                 if (!IPCL_CONN_MATCH(connp, protocol,
1854                                     ipha->ipha_src, ipha->ipha_dst, ports))
1855                                         continue;
1856                         } else {
1857                                 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1858                                     ip6h->ip6_src, ip6h->ip6_dst, ports))
1859                                         continue;
1860                         }
1861                 } else {
1862                         if (ipversion == IPV4_VERSION) {
1863                                 if (!IPCL_BIND_MATCH(connp, protocol,
1864                                     ipha->ipha_dst, lport))
1865                                         continue;
1866                         } else {
1867                                 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1868                                     ip6h->ip6_dst, lport))
1869                                         continue;
1870                         }
1871                 }
1872 
1873                 if (connp->conn_zoneid == zoneid ||
1874                     connp->conn_allzones ||
1875                     ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1876                     (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1877                     (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1878                         break;
1879         }
1880 
1881         if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1882             !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1883                 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1884                     char *, "connp(1) could not receive mp(2)",
1885                     conn_t *, connp, mblk_t *, mp);
1886                 connp = NULL;
1887         }
1888 
1889         if (connp != NULL)
1890                 goto found;
1891         mutex_exit(&connfp->connf_lock);
1892 
1893         /* Try to look for a wildcard SCTP RAW socket match. */
1894         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1895         mutex_enter(&connfp->connf_lock);
1896         for (connp = connfp->connf_head; connp != NULL;
1897             connp = connp->conn_next) {
1898                 /* We don't allow v4 fallback for v6 raw socket. */
1899                 if (ipversion != connp->conn_ipversion)
1900                         continue;
1901                 if (!IPCL_ZONE_MATCH(connp, zoneid))
1902                         continue;
1903 
1904                 if (ipversion == IPV4_VERSION) {
1905                         if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1906                                 break;
1907                 } else {
1908                         if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1909                                 break;
1910                         }
1911                 }
1912         }
1913 
1914         if (connp != NULL)
1915                 goto found;
1916 
1917         mutex_exit(&connfp->connf_lock);
1918         return (NULL);
1919 
1920 found:
1921         ASSERT(connp != NULL);
1922         CONN_INC_REF(connp);
1923         mutex_exit(&connfp->connf_lock);
1924         return (connp);
1925 }
1926 
1927 /* ARGSUSED */
1928 static int
1929 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1930 {
1931         itc_t   *itc = (itc_t *)buf;
1932         conn_t  *connp = &itc->itc_conn;
1933         tcp_t   *tcp = (tcp_t *)&itc[1];
1934 
1935         bzero(connp, sizeof (conn_t));
1936         bzero(tcp, sizeof (tcp_t));
1937 
1938         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1939         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1940         cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1941         tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1942         if (tcp->tcp_timercache == NULL)
1943                 return (ENOMEM);
1944         connp->conn_tcp = tcp;
1945         connp->conn_flags = IPCL_TCPCONN;
1946         connp->conn_proto = IPPROTO_TCP;
1947         tcp->tcp_connp = connp;
1948         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1949 
1950         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1951         if (connp->conn_ixa == NULL) {
1952                 tcp_timermp_free(tcp);
1953                 return (ENOMEM);
1954         }
1955         connp->conn_ixa->ixa_refcnt = 1;
1956         connp->conn_ixa->ixa_protocol = connp->conn_proto;
1957         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1958         return (0);
1959 }
1960 
1961 /* ARGSUSED */
1962 static void
1963 tcp_conn_destructor(void *buf, void *cdrarg)
1964 {
1965         itc_t   *itc = (itc_t *)buf;
1966         conn_t  *connp = &itc->itc_conn;
1967         tcp_t   *tcp = (tcp_t *)&itc[1];
1968 
1969         ASSERT(connp->conn_flags & IPCL_TCPCONN);
1970         ASSERT(tcp->tcp_connp == connp);
1971         ASSERT(connp->conn_tcp == tcp);
1972         tcp_timermp_free(tcp);
1973         mutex_destroy(&connp->conn_lock);
1974         cv_destroy(&connp->conn_cv);
1975         cv_destroy(&connp->conn_sq_cv);
1976         rw_destroy(&connp->conn_ilg_lock);
1977 
1978         /* Can be NULL if constructor failed */
1979         if (connp->conn_ixa != NULL) {
1980                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1981                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1982                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1983                 ixa_refrele(connp->conn_ixa);
1984         }
1985 }
1986 
1987 /* ARGSUSED */
1988 static int
1989 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1990 {
1991         itc_t   *itc = (itc_t *)buf;
1992         conn_t  *connp = &itc->itc_conn;
1993 
1994         bzero(connp, sizeof (conn_t));
1995         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1996         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1997         connp->conn_flags = IPCL_IPCCONN;
1998         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1999 
2000         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2001         if (connp->conn_ixa == NULL)
2002                 return (ENOMEM);
2003         connp->conn_ixa->ixa_refcnt = 1;
2004         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2005         return (0);
2006 }
2007 
2008 /* ARGSUSED */
2009 static void
2010 ip_conn_destructor(void *buf, void *cdrarg)
2011 {
2012         itc_t   *itc = (itc_t *)buf;
2013         conn_t  *connp = &itc->itc_conn;
2014 
2015         ASSERT(connp->conn_flags & IPCL_IPCCONN);
2016         ASSERT(connp->conn_priv == NULL);
2017         mutex_destroy(&connp->conn_lock);
2018         cv_destroy(&connp->conn_cv);
2019         rw_destroy(&connp->conn_ilg_lock);
2020 
2021         /* Can be NULL if constructor failed */
2022         if (connp->conn_ixa != NULL) {
2023                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2024                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2025                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2026                 ixa_refrele(connp->conn_ixa);
2027         }
2028 }
2029 
2030 /* ARGSUSED */
2031 static int
2032 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2033 {
2034         itc_t   *itc = (itc_t *)buf;
2035         conn_t  *connp = &itc->itc_conn;
2036         udp_t   *udp = (udp_t *)&itc[1];
2037 
2038         bzero(connp, sizeof (conn_t));
2039         bzero(udp, sizeof (udp_t));
2040 
2041         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2042         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2043         connp->conn_udp = udp;
2044         connp->conn_flags = IPCL_UDPCONN;
2045         connp->conn_proto = IPPROTO_UDP;
2046         udp->udp_connp = connp;
2047         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2048         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2049         if (connp->conn_ixa == NULL)
2050                 return (ENOMEM);
2051         connp->conn_ixa->ixa_refcnt = 1;
2052         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2053         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2054         return (0);
2055 }
2056 
2057 /* ARGSUSED */
2058 static void
2059 udp_conn_destructor(void *buf, void *cdrarg)
2060 {
2061         itc_t   *itc = (itc_t *)buf;
2062         conn_t  *connp = &itc->itc_conn;
2063         udp_t   *udp = (udp_t *)&itc[1];
2064 
2065         ASSERT(connp->conn_flags & IPCL_UDPCONN);
2066         ASSERT(udp->udp_connp == connp);
2067         ASSERT(connp->conn_udp == udp);
2068         mutex_destroy(&connp->conn_lock);
2069         cv_destroy(&connp->conn_cv);
2070         rw_destroy(&connp->conn_ilg_lock);
2071 
2072         /* Can be NULL if constructor failed */
2073         if (connp->conn_ixa != NULL) {
2074                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2075                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2076                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2077                 ixa_refrele(connp->conn_ixa);
2078         }
2079 }
2080 
2081 /* ARGSUSED */
2082 static int
2083 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2084 {
2085         itc_t   *itc = (itc_t *)buf;
2086         conn_t  *connp = &itc->itc_conn;
2087         icmp_t  *icmp = (icmp_t *)&itc[1];
2088 
2089         bzero(connp, sizeof (conn_t));
2090         bzero(icmp, sizeof (icmp_t));
2091 
2092         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2093         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2094         connp->conn_icmp = icmp;
2095         connp->conn_flags = IPCL_RAWIPCONN;
2096         connp->conn_proto = IPPROTO_ICMP;
2097         icmp->icmp_connp = connp;
2098         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2099         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2100         if (connp->conn_ixa == NULL)
2101                 return (ENOMEM);
2102         connp->conn_ixa->ixa_refcnt = 1;
2103         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2104         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2105         return (0);
2106 }
2107 
2108 /* ARGSUSED */
2109 static void
2110 rawip_conn_destructor(void *buf, void *cdrarg)
2111 {
2112         itc_t   *itc = (itc_t *)buf;
2113         conn_t  *connp = &itc->itc_conn;
2114         icmp_t  *icmp = (icmp_t *)&itc[1];
2115 
2116         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2117         ASSERT(icmp->icmp_connp == connp);
2118         ASSERT(connp->conn_icmp == icmp);
2119         mutex_destroy(&connp->conn_lock);
2120         cv_destroy(&connp->conn_cv);
2121         rw_destroy(&connp->conn_ilg_lock);
2122 
2123         /* Can be NULL if constructor failed */
2124         if (connp->conn_ixa != NULL) {
2125                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2126                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2127                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2128                 ixa_refrele(connp->conn_ixa);
2129         }
2130 }
2131 
2132 /* ARGSUSED */
2133 static int
2134 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2135 {
2136         itc_t   *itc = (itc_t *)buf;
2137         conn_t  *connp = &itc->itc_conn;
2138         rts_t   *rts = (rts_t *)&itc[1];
2139 
2140         bzero(connp, sizeof (conn_t));
2141         bzero(rts, sizeof (rts_t));
2142 
2143         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2144         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2145         connp->conn_rts = rts;
2146         connp->conn_flags = IPCL_RTSCONN;
2147         rts->rts_connp = connp;
2148         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2149         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2150         if (connp->conn_ixa == NULL)
2151                 return (ENOMEM);
2152         connp->conn_ixa->ixa_refcnt = 1;
2153         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2154         return (0);
2155 }
2156 
2157 /* ARGSUSED */
2158 static void
2159 rts_conn_destructor(void *buf, void *cdrarg)
2160 {
2161         itc_t   *itc = (itc_t *)buf;
2162         conn_t  *connp = &itc->itc_conn;
2163         rts_t   *rts = (rts_t *)&itc[1];
2164 
2165         ASSERT(connp->conn_flags & IPCL_RTSCONN);
2166         ASSERT(rts->rts_connp == connp);
2167         ASSERT(connp->conn_rts == rts);
2168         mutex_destroy(&connp->conn_lock);
2169         cv_destroy(&connp->conn_cv);
2170         rw_destroy(&connp->conn_ilg_lock);
2171 
2172         /* Can be NULL if constructor failed */
2173         if (connp->conn_ixa != NULL) {
2174                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2175                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2176                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2177                 ixa_refrele(connp->conn_ixa);
2178         }
2179 }
2180 
2181 /*
2182  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2183  * in the conn_t.
2184  *
2185  * Below we list all the pointers in the conn_t as a documentation aid.
2186  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2187  * If you add any pointers to the conn_t please add an ASSERT here
2188  * and #ifdef it out if it can't be actually asserted to be NULL.
2189  * In any case, we bzero most of the conn_t at the end of the function.
2190  */
2191 void
2192 ipcl_conn_cleanup(conn_t *connp)
2193 {
2194         ip_xmit_attr_t  *ixa;
2195 
2196         ASSERT(connp->conn_latch == NULL);
2197         ASSERT(connp->conn_latch_in_policy == NULL);
2198         ASSERT(connp->conn_latch_in_action == NULL);
2199 #ifdef notdef
2200         ASSERT(connp->conn_rq == NULL);
2201         ASSERT(connp->conn_wq == NULL);
2202 #endif
2203         ASSERT(connp->conn_cred == NULL);
2204         ASSERT(connp->conn_g_fanout == NULL);
2205         ASSERT(connp->conn_g_next == NULL);
2206         ASSERT(connp->conn_g_prev == NULL);
2207         ASSERT(connp->conn_policy == NULL);
2208         ASSERT(connp->conn_fanout == NULL);
2209         ASSERT(connp->conn_next == NULL);
2210         ASSERT(connp->conn_prev == NULL);
2211         ASSERT(connp->conn_oper_pending_ill == NULL);
2212         ASSERT(connp->conn_ilg == NULL);
2213         ASSERT(connp->conn_drain_next == NULL);
2214         ASSERT(connp->conn_drain_prev == NULL);
2215 #ifdef notdef
2216         /* conn_idl is not cleared when removed from idl list */
2217         ASSERT(connp->conn_idl == NULL);
2218 #endif
2219         ASSERT(connp->conn_ipsec_opt_mp == NULL);
2220 #ifdef notdef
2221         /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2222         ASSERT(connp->conn_netstack == NULL);
2223 #endif
2224 
2225         ASSERT(connp->conn_helper_info == NULL);
2226         ASSERT(connp->conn_ixa != NULL);
2227         ixa = connp->conn_ixa;
2228         ASSERT(ixa->ixa_refcnt == 1);
2229         /* Need to preserve ixa_protocol */
2230         ixa_cleanup(ixa);
2231         ixa->ixa_flags = 0;
2232 
2233         /* Clear out the conn_t fields that are not preserved */
2234         bzero(&connp->conn_start_clr,
2235             sizeof (conn_t) -
2236             ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2237 }
2238 
2239 /*
2240  * All conns are inserted in a global multi-list for the benefit of
2241  * walkers. The walk is guaranteed to walk all open conns at the time
2242  * of the start of the walk exactly once. This property is needed to
2243  * achieve some cleanups during unplumb of interfaces. This is achieved
2244  * as follows.
2245  *
2246  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2247  * call the insert and delete functions below at creation and deletion
2248  * time respectively. The conn never moves or changes its position in this
2249  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2250  * won't increase due to walkers, once the conn deletion has started. Note
2251  * that we can't remove the conn from the global list and then wait for
2252  * the refcnt to drop to zero, since walkers would then see a truncated
2253  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2254  * conns until ip_open is ready to make them globally visible.
2255  * The global round robin multi-list locks are held only to get the
2256  * next member/insertion/deletion and contention should be negligible
2257  * if the multi-list is much greater than the number of cpus.
2258  */
2259 void
2260 ipcl_globalhash_insert(conn_t *connp)
2261 {
2262         int     index;
2263         struct connf_s  *connfp;
2264         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2265 
2266         /*
2267          * No need for atomic here. Approximate even distribution
2268          * in the global lists is sufficient.
2269          */
2270         ipst->ips_conn_g_index++;
2271         index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2272 
2273         connp->conn_g_prev = NULL;
2274         /*
2275          * Mark as INCIPIENT, so that walkers will ignore this
2276          * for now, till ip_open is ready to make it visible globally.
2277          */
2278         connp->conn_state_flags |= CONN_INCIPIENT;
2279 
2280         connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2281         /* Insert at the head of the list */
2282         mutex_enter(&connfp->connf_lock);
2283         connp->conn_g_next = connfp->connf_head;
2284         if (connp->conn_g_next != NULL)
2285                 connp->conn_g_next->conn_g_prev = connp;
2286         connfp->connf_head = connp;
2287 
2288         /* The fanout bucket this conn points to */
2289         connp->conn_g_fanout = connfp;
2290 
2291         mutex_exit(&connfp->connf_lock);
2292 }
2293 
2294 void
2295 ipcl_globalhash_remove(conn_t *connp)
2296 {
2297         struct connf_s  *connfp;
2298 
2299         /*
2300          * We were never inserted in the global multi list.
2301          * IPCL_NONE variety is never inserted in the global multilist
2302          * since it is presumed to not need any cleanup and is transient.
2303          */
2304         if (connp->conn_g_fanout == NULL)
2305                 return;
2306 
2307         connfp = connp->conn_g_fanout;
2308         mutex_enter(&connfp->connf_lock);
2309         if (connp->conn_g_prev != NULL)
2310                 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2311         else
2312                 connfp->connf_head = connp->conn_g_next;
2313         if (connp->conn_g_next != NULL)
2314                 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2315         mutex_exit(&connfp->connf_lock);
2316 
2317         /* Better to stumble on a null pointer than to corrupt memory */
2318         connp->conn_g_next = NULL;
2319         connp->conn_g_prev = NULL;
2320         connp->conn_g_fanout = NULL;
2321 }
2322 
2323 /*
2324  * Walk the list of all conn_t's in the system, calling the function provided
2325  * With the specified argument for each.
2326  * Applies to both IPv4 and IPv6.
2327  *
2328  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2329  * conn_oper_pending_ill). To guard against stale pointers
2330  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2331  * unplumbed or removed. New conn_t's that are created while we are walking
2332  * may be missed by this walk, because they are not necessarily inserted
2333  * at the tail of the list. They are new conn_t's and thus don't have any
2334  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2335  * is created to the struct that is going away.
2336  */
2337 void
2338 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2339 {
2340         int     i;
2341         conn_t  *connp;
2342         conn_t  *prev_connp;
2343 
2344         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2345                 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2346                 prev_connp = NULL;
2347                 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2348                 while (connp != NULL) {
2349                         mutex_enter(&connp->conn_lock);
2350                         if (connp->conn_state_flags &
2351                             (CONN_CONDEMNED | CONN_INCIPIENT)) {
2352                                 mutex_exit(&connp->conn_lock);
2353                                 connp = connp->conn_g_next;
2354                                 continue;
2355                         }
2356                         CONN_INC_REF_LOCKED(connp);
2357                         mutex_exit(&connp->conn_lock);
2358                         mutex_exit(
2359                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2360                         (*func)(connp, arg);
2361                         if (prev_connp != NULL)
2362                                 CONN_DEC_REF(prev_connp);
2363                         mutex_enter(
2364                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2365                         prev_connp = connp;
2366                         connp = connp->conn_g_next;
2367                 }
2368                 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2369                 if (prev_connp != NULL)
2370                         CONN_DEC_REF(prev_connp);
2371         }
2372 }
2373 
2374 /*
2375  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2376  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2377  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2378  * (peer tcp in ESTABLISHED state).
2379  */
2380 conn_t *
2381 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2382     ip_stack_t *ipst)
2383 {
2384         uint32_t ports;
2385         uint16_t *pports = (uint16_t *)&ports;
2386         connf_t *connfp;
2387         conn_t  *tconnp;
2388         boolean_t zone_chk;
2389 
2390         /*
2391          * If either the source of destination address is loopback, then
2392          * both endpoints must be in the same Zone.  Otherwise, both of
2393          * the addresses are system-wide unique (tcp is in ESTABLISHED
2394          * state) and the endpoints may reside in different Zones.
2395          */
2396         zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2397             ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2398 
2399         pports[0] = tcpha->tha_fport;
2400         pports[1] = tcpha->tha_lport;
2401 
2402         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2403             ports, ipst)];
2404 
2405         mutex_enter(&connfp->connf_lock);
2406         for (tconnp = connfp->connf_head; tconnp != NULL;
2407             tconnp = tconnp->conn_next) {
2408 
2409                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2410                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2411                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2412                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2413 
2414                         ASSERT(tconnp != connp);
2415                         CONN_INC_REF(tconnp);
2416                         mutex_exit(&connfp->connf_lock);
2417                         return (tconnp);
2418                 }
2419         }
2420         mutex_exit(&connfp->connf_lock);
2421         return (NULL);
2422 }
2423 
2424 /*
2425  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2426  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2427  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2428  * (peer tcp in ESTABLISHED state).
2429  */
2430 conn_t *
2431 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2432     ip_stack_t *ipst)
2433 {
2434         uint32_t ports;
2435         uint16_t *pports = (uint16_t *)&ports;
2436         connf_t *connfp;
2437         conn_t  *tconnp;
2438         boolean_t zone_chk;
2439 
2440         /*
2441          * If either the source of destination address is loopback, then
2442          * both endpoints must be in the same Zone.  Otherwise, both of
2443          * the addresses are system-wide unique (tcp is in ESTABLISHED
2444          * state) and the endpoints may reside in different Zones.  We
2445          * don't do Zone check for link local address(es) because the
2446          * current Zone implementation treats each link local address as
2447          * being unique per system node, i.e. they belong to global Zone.
2448          */
2449         zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2450             IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2451 
2452         pports[0] = tcpha->tha_fport;
2453         pports[1] = tcpha->tha_lport;
2454 
2455         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2456             ports, ipst)];
2457 
2458         mutex_enter(&connfp->connf_lock);
2459         for (tconnp = connfp->connf_head; tconnp != NULL;
2460             tconnp = tconnp->conn_next) {
2461 
2462                 /* We skip conn_bound_if check here as this is loopback tcp */
2463                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2464                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2465                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2466                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2467 
2468                         ASSERT(tconnp != connp);
2469                         CONN_INC_REF(tconnp);
2470                         mutex_exit(&connfp->connf_lock);
2471                         return (tconnp);
2472                 }
2473         }
2474         mutex_exit(&connfp->connf_lock);
2475         return (NULL);
2476 }
2477 
2478 /*
2479  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2480  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2481  * Only checks for connected entries i.e. no INADDR_ANY checks.
2482  */
2483 conn_t *
2484 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2485     ip_stack_t *ipst)
2486 {
2487         uint32_t ports;
2488         uint16_t *pports;
2489         connf_t *connfp;
2490         conn_t  *tconnp;
2491 
2492         pports = (uint16_t *)&ports;
2493         pports[0] = tcpha->tha_fport;
2494         pports[1] = tcpha->tha_lport;
2495 
2496         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2497             ports, ipst)];
2498 
2499         mutex_enter(&connfp->connf_lock);
2500         for (tconnp = connfp->connf_head; tconnp != NULL;
2501             tconnp = tconnp->conn_next) {
2502 
2503                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2504                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2505                     tconnp->conn_tcp->tcp_state >= min_state) {
2506 
2507                         CONN_INC_REF(tconnp);
2508                         mutex_exit(&connfp->connf_lock);
2509                         return (tconnp);
2510                 }
2511         }
2512         mutex_exit(&connfp->connf_lock);
2513         return (NULL);
2514 }
2515 
2516 /*
2517  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2518  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2519  * Only checks for connected entries i.e. no INADDR_ANY checks.
2520  * Match on ifindex in addition to addresses.
2521  */
2522 conn_t *
2523 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2524     uint_t ifindex, ip_stack_t *ipst)
2525 {
2526         tcp_t   *tcp;
2527         uint32_t ports;
2528         uint16_t *pports;
2529         connf_t *connfp;
2530         conn_t  *tconnp;
2531 
2532         pports = (uint16_t *)&ports;
2533         pports[0] = tcpha->tha_fport;
2534         pports[1] = tcpha->tha_lport;
2535 
2536         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2537             ports, ipst)];
2538 
2539         mutex_enter(&connfp->connf_lock);
2540         for (tconnp = connfp->connf_head; tconnp != NULL;
2541             tconnp = tconnp->conn_next) {
2542 
2543                 tcp = tconnp->conn_tcp;
2544                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2545                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2546                     tcp->tcp_state >= min_state &&
2547                     (tconnp->conn_bound_if == 0 ||
2548                     tconnp->conn_bound_if == ifindex)) {
2549 
2550                         CONN_INC_REF(tconnp);
2551                         mutex_exit(&connfp->connf_lock);
2552                         return (tconnp);
2553                 }
2554         }
2555         mutex_exit(&connfp->connf_lock);
2556         return (NULL);
2557 }
2558 
2559 /*
2560  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2561  * a listener when changing state.
2562  */
2563 conn_t *
2564 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2565     ip_stack_t *ipst)
2566 {
2567         connf_t         *bind_connfp;
2568         conn_t          *connp;
2569         tcp_t           *tcp;
2570 
2571         /*
2572          * Avoid false matches for packets sent to an IP destination of
2573          * all zeros.
2574          */
2575         if (laddr == 0)
2576                 return (NULL);
2577 
2578         ASSERT(zoneid != ALL_ZONES);
2579 
2580         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2581         mutex_enter(&bind_connfp->connf_lock);
2582         for (connp = bind_connfp->connf_head; connp != NULL;
2583             connp = connp->conn_next) {
2584                 tcp = connp->conn_tcp;
2585                 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2586                     IPCL_ZONE_MATCH(connp, zoneid) &&
2587                     (tcp->tcp_listener == NULL)) {
2588                         CONN_INC_REF(connp);
2589                         mutex_exit(&bind_connfp->connf_lock);
2590                         return (connp);
2591                 }
2592         }
2593         mutex_exit(&bind_connfp->connf_lock);
2594         return (NULL);
2595 }
2596 
2597 /*
2598  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2599  * a listener when changing state.
2600  */
2601 conn_t *
2602 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2603     zoneid_t zoneid, ip_stack_t *ipst)
2604 {
2605         connf_t         *bind_connfp;
2606         conn_t          *connp = NULL;
2607         tcp_t           *tcp;
2608 
2609         /*
2610          * Avoid false matches for packets sent to an IP destination of
2611          * all zeros.
2612          */
2613         if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2614                 return (NULL);
2615 
2616         ASSERT(zoneid != ALL_ZONES);
2617 
2618         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2619         mutex_enter(&bind_connfp->connf_lock);
2620         for (connp = bind_connfp->connf_head; connp != NULL;
2621             connp = connp->conn_next) {
2622                 tcp = connp->conn_tcp;
2623                 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2624                     IPCL_ZONE_MATCH(connp, zoneid) &&
2625                     (connp->conn_bound_if == 0 ||
2626                     connp->conn_bound_if == ifindex) &&
2627                     tcp->tcp_listener == NULL) {
2628                         CONN_INC_REF(connp);
2629                         mutex_exit(&bind_connfp->connf_lock);
2630                         return (connp);
2631                 }
2632         }
2633         mutex_exit(&bind_connfp->connf_lock);
2634         return (NULL);
2635 }
2636 
2637 /*
2638  * ipcl_get_next_conn
2639  *      get the next entry in the conn global list
2640  *      and put a reference on the next_conn.
2641  *      decrement the reference on the current conn.
2642  *
2643  * This is an iterator based walker function that also provides for
2644  * some selection by the caller. It walks through the conn_hash bucket
2645  * searching for the next valid connp in the list, and selects connections
2646  * that are neither closed nor condemned. It also REFHOLDS the conn
2647  * thus ensuring that the conn exists when the caller uses the conn.
2648  */
2649 conn_t *
2650 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2651 {
2652         conn_t  *next_connp;
2653 
2654         if (connfp == NULL)
2655                 return (NULL);
2656 
2657         mutex_enter(&connfp->connf_lock);
2658 
2659         next_connp = (connp == NULL) ?
2660             connfp->connf_head : connp->conn_g_next;
2661 
2662         while (next_connp != NULL) {
2663                 mutex_enter(&next_connp->conn_lock);
2664                 if (!(next_connp->conn_flags & conn_flags) ||
2665                     (next_connp->conn_state_flags &
2666                     (CONN_CONDEMNED | CONN_INCIPIENT))) {
2667                         /*
2668                          * This conn has been condemned or
2669                          * is closing, or the flags don't match
2670                          */
2671                         mutex_exit(&next_connp->conn_lock);
2672                         next_connp = next_connp->conn_g_next;
2673                         continue;
2674                 }
2675                 CONN_INC_REF_LOCKED(next_connp);
2676                 mutex_exit(&next_connp->conn_lock);
2677                 break;
2678         }
2679 
2680         mutex_exit(&connfp->connf_lock);
2681 
2682         if (connp != NULL)
2683                 CONN_DEC_REF(connp);
2684 
2685         return (next_connp);
2686 }
2687 
2688 #ifdef CONN_DEBUG
2689 /*
2690  * Trace of the last NBUF refhold/refrele
2691  */
2692 int
2693 conn_trace_ref(conn_t *connp)
2694 {
2695         int     last;
2696         conn_trace_t    *ctb;
2697 
2698         ASSERT(MUTEX_HELD(&connp->conn_lock));
2699         last = connp->conn_trace_last;
2700         last++;
2701         if (last == CONN_TRACE_MAX)
2702                 last = 0;
2703 
2704         ctb = &connp->conn_trace_buf[last];
2705         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2706         connp->conn_trace_last = last;
2707         return (1);
2708 }
2709 
2710 int
2711 conn_untrace_ref(conn_t *connp)
2712 {
2713         int     last;
2714         conn_trace_t    *ctb;
2715 
2716         ASSERT(MUTEX_HELD(&connp->conn_lock));
2717         last = connp->conn_trace_last;
2718         last++;
2719         if (last == CONN_TRACE_MAX)
2720                 last = 0;
2721 
2722         ctb = &connp->conn_trace_buf[last];
2723         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2724         connp->conn_trace_last = last;
2725         return (1);
2726 }
2727 #endif
2728 
2729 mib2_socketInfoEntry_t *
2730 conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
2731 {
2732         vnode_t *vn = NULL;
2733         vattr_t attr;
2734         uint64_t flags = 0;
2735         sock_upcalls_t *upcalls;
2736         sock_upper_handle_t upper_handle;
2737 
2738         /*
2739          * If the connection is closing, it is not safe to make an upcall or
2740          * access the stream associated with the connection.
2741          * The callers of this function have a reference on connp itself
2742          * so, as long as it is not closing, it's safe to continue.
2743          */
2744         mutex_enter(&connp->conn_lock);
2745 
2746         if ((connp->conn_state_flags & CONN_CLOSING)) {
2747                 mutex_exit(&connp->conn_lock);
2748                 return (NULL);
2749         }
2750 
2751         /*
2752          * Continue to hold conn_lock because we don't want to race with an
2753          * in-progress close, which will have set-to-NULL (and destroyed
2754          * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
2755          *
2756          * There is still a race with an in-progress OPEN, however, where
2757          * conn_upper_handle and conn_upcalls are being assigned (in multiple
2758          * codepaths) WITHOUT conn_lock being held.  We address that race
2759          * HERE, however, given that both are going from NULL to non-NULL,
2760          * if we lose the race, we don't get any data for the in-progress-OPEN
2761          * socket.
2762          */
2763 
2764         upcalls = connp->conn_upcalls;
2765         upper_handle = connp->conn_upper_handle;
2766         /* Check BOTH for non-NULL before attempting an upcall. */
2767         if (upper_handle != NULL && upcalls != NULL) {
2768                 /* su_get_vnode() returns one with VN_HOLD() already done. */
2769                 vn = upcalls->su_get_vnode(upper_handle);
2770         } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
2771                 vn = STREAM(connp->conn_rq)->sd_pvnode;
2772                 if (vn != NULL)
2773                         VN_HOLD(vn);
2774                 flags |= MIB2_SOCKINFO_STREAM;
2775         }
2776 
2777         mutex_exit(&connp->conn_lock);
2778 
2779         if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
2780                 if (vn != NULL)
2781                         VN_RELE(vn);
2782                 return (NULL);
2783         }
2784 
2785         VN_RELE(vn);
2786 
2787         bzero(sie, sizeof (*sie));
2788 
2789         sie->sie_flags = flags;
2790         sie->sie_inode = attr.va_nodeid;
2791         sie->sie_dev = attr.va_rdev;
2792 
2793         return (sie);
2794 }