/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/mutex.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/sunddi.h>
#include <sys/kobj.h>
#include <sys/disp.h>
#include <vm/seg_kmem.h>
#include <sys/zone.h>
#include <sys/netstack.h>

/*
 * The key the zones framework uses to tell us about new zones, which
 * we use to create new stacks.
 */
static zone_key_t netstack_zone_key;

static int      netstack_initialized = 0;

/*
 * Track the registered netstacks.
 * The global lock protects
 * - ns_reg
 * - the list starting at netstack_head and following the netstack_next
 *   pointers.
 */
static kmutex_t netstack_g_lock;

/*
 * Registry of modules with their create/shutdown/destroy functions.
 */
static struct netstack_registry ns_reg[NS_MAX];

/*
 * Global list of existing stacks.  We use this when a new zone with
 * an exclusive IP instance is created.
 *
 * Note that in some cases a netstack_t needs to stay around after the zone
 * has gone away. This is because there might be outstanding references
 * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
 * structure and all the foo_stack_t's hanging off of it will be cleaned up
 * when the last reference to it is dropped.
 * However, the same zone might be rebooted. That is handled using the
 * assumption that the zones framework picks a new zoneid each time a zone
 * is (re)booted. We assert for that condition in netstack_zone_create().
 * Thus the old netstack_t can take its time for things to time out.
 */
static netstack_t *netstack_head;

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
 */
struct shared_zone_list {
        struct shared_zone_list *sz_next;
        zoneid_t                sz_zoneid;
};

struct shared_kstat_list {
        struct shared_kstat_list *sk_next;
        kstat_t                  *sk_kstat;
};

static kmutex_t netstack_shared_lock;   /* protects the following two */
static struct shared_zone_list  *netstack_shared_zones;
static struct shared_kstat_list *netstack_shared_kstats;

static void     *netstack_zone_create(zoneid_t zoneid);
static void     netstack_zone_shutdown(zoneid_t zoneid, void *arg);
static void     netstack_zone_destroy(zoneid_t zoneid, void *arg);

static void     netstack_shared_zone_add(zoneid_t zoneid);
static void     netstack_shared_zone_remove(zoneid_t zoneid);
static void     netstack_shared_kstat_add(kstat_t *ks);
static void     netstack_shared_kstat_remove(kstat_t *ks);

typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);

static void     apply_all_netstacks(int, applyfn_t *);
static void     apply_all_modules(netstack_t *, applyfn_t *);
static void     apply_all_modules_reverse(netstack_t *, applyfn_t *);
static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
    kmutex_t *);

static void netstack_hold_locked(netstack_t *);

static ksema_t netstack_reap_limiter;
/*
 * Hard-coded constant; it is not tunable in real time, but exposing it
 * as an /etc/system tunable is better than nothing.
 */
uint_t netstack_outstanding_reaps = 1024;

void
netstack_init(void)
{
        mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);

        sema_init(&netstack_reap_limiter, netstack_outstanding_reaps, NULL,
            SEMA_DRIVER, NULL);

        netstack_initialized = 1;

        /*
         * We want to be informed each time a zone is created or
         * destroyed in the kernel, so we can maintain the
         * stack instance information.
         */
        zone_key_create(&netstack_zone_key, netstack_zone_create,
            netstack_zone_shutdown, netstack_zone_destroy);
}

/*
 * Register a new module with the framework.
 * This registers interest in changes to the set of netstacks.
 * The createfn and destroyfn are required, but the shutdownfn can be
 * NULL.
 * Note that due to the current zsd implementation, when the create
 * function is called the zone isn't fully present; functions like
 * zone_find_by_* will fail, hence the create function cannot use many
 * zones kernel functions including zcmn_err().
 */
void
netstack_register(int moduleid,
    void *(*module_create)(netstackid_t, netstack_t *),
    void (*module_shutdown)(netstackid_t, void *),
    void (*module_destroy)(netstackid_t, void *))
{
        netstack_t *ns;

        ASSERT(netstack_initialized);
        ASSERT(moduleid >= 0 && moduleid < NS_MAX);
        ASSERT(module_create != NULL);

        /*
         * Make instances created after this point in time run the create
         * callback.
         */
        mutex_enter(&netstack_g_lock);
        ASSERT(ns_reg[moduleid].nr_create == NULL);
        ASSERT(ns_reg[moduleid].nr_flags == 0);
        ns_reg[moduleid].nr_create = module_create;
        ns_reg[moduleid].nr_shutdown = module_shutdown;
        ns_reg[moduleid].nr_destroy = module_destroy;
        ns_reg[moduleid].nr_flags = NRF_REGISTERED;

        /*
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_CREATE_NEEDED for each of those.
         * netstacks which have been deleted will have NSS_CREATE_COMPLETED
         * set, but check NSF_CLOSING to be sure.
         */
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);
                if (!(ns->netstack_flags & NSF_CLOSING) &&
                    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
                        nms->nms_flags |= NSS_CREATE_NEEDED;
                        DTRACE_PROBE2(netstack__create__needed,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }
        mutex_exit(&netstack_g_lock);

        /*
         * At this point in time a new instance can be created or an instance
         * can be destroyed, or some other module can register or unregister.
         * Make sure we either run all the create functions for this moduleid
         * or we wait for any other creators for this moduleid.
         */
        apply_all_netstacks(moduleid, netstack_apply_create);
}
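
/*
 * A minimal usage sketch (illustrative only, not part of the original
 * source): a hypothetical module "foo" would typically register from its
 * _init() routine.  NS_FOO and foo_stack_t are assumed names:
 *
 *      static void *
 *      foo_stack_create(netstackid_t stackid, netstack_t *ns)
 *      {
 *              foo_stack_t *fs = kmem_zalloc(sizeof (*fs), KM_SLEEP);
 *
 *              fs->fs_stackid = stackid;
 *              return (fs);
 *      }
 *
 *      static void
 *      foo_stack_destroy(netstackid_t stackid, void *arg)
 *      {
 *              kmem_free(arg, sizeof (foo_stack_t));
 *      }
 *
 *      netstack_register(NS_FOO, foo_stack_create, NULL, foo_stack_destroy);
 *
 * The matching _fini() would call netstack_unregister(NS_FOO), which runs
 * the shutdown/destroy callbacks for all extant stacks before returning.
 */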

void
netstack_unregister(int moduleid)
{
        netstack_t *ns;

        ASSERT(moduleid >= 0 && moduleid < NS_MAX);

        ASSERT(ns_reg[moduleid].nr_create != NULL);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);

        mutex_enter(&netstack_g_lock);
        /*
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
         * That ensures that when we return all the callbacks for existing
         * instances have completed. And since we set NRF_DYING no new
         * instances can use this module.
         */
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                boolean_t created = B_FALSE;
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);

                /*
                 * We need to be careful here. We could actually have a
                 * netstack being created as we speak, waiting for us to let
                 * go of this lock so it can proceed. It may have set
                 * NSS_CREATE_NEEDED, but not have gotten to the point of
                 * completing it yet. If NSS_CREATE_NEEDED is set, we can
                 * safely just remove it here and never create the module.
                 * However, if NSS_CREATE_INPROGRESS is set, we still need to
                 * flag this module for shutdown and deletion, just as though
                 * it had reached NSS_CREATE_COMPLETED.
                 *
                 * It is safe to do that because of two different guarantees
                 * that exist in the system. The first is that before we do a
                 * create, shutdown, or destroy, we ensure that nothing else is
                 * in progress in the system for this netstack and wait for it
                 * to complete. Secondly, because the zone is being created, we
                 * know that the following call to apply_all_netstacks() will
                 * block on the zone finishing its initialization.
                 */
                if (nms->nms_flags & NSS_CREATE_NEEDED)
                        nms->nms_flags &= ~NSS_CREATE_NEEDED;

                if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
                    nms->nms_flags & NSS_CREATE_COMPLETED)
                        created = B_TRUE;

                if (ns_reg[moduleid].nr_shutdown != NULL && created &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, moduleid);
                }
                if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
                    ns_reg[moduleid].nr_destroy != NULL && created &&
                    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
                        nms->nms_flags |= NSS_DESTROY_NEEDED;
                        DTRACE_PROBE2(netstack__destroy__needed,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }
        /*
         * Prevent any new netstack from calling the registered create
         * function, while keeping the function pointers in place until the
         * shutdown and destroy callbacks are complete.
         */
        ns_reg[moduleid].nr_flags |= NRF_DYING;
        mutex_exit(&netstack_g_lock);

        apply_all_netstacks(moduleid, netstack_apply_shutdown);
        apply_all_netstacks(moduleid, netstack_apply_destroy);

        /*
         * Clear the nms_flags so that we can handle this module
         * being loaded again.
         * Also remove the registered functions.
         */
        mutex_enter(&netstack_g_lock);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);
                if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
                        nms->nms_flags = 0;
                        DTRACE_PROBE2(netstack__destroy__done,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }

        ns_reg[moduleid].nr_create = NULL;
        ns_reg[moduleid].nr_shutdown = NULL;
        ns_reg[moduleid].nr_destroy = NULL;
        ns_reg[moduleid].nr_flags = 0;
        mutex_exit(&netstack_g_lock);
}

/*
 * Lookup and/or allocate a netstack for this zone.
 */
static void *
netstack_zone_create(zoneid_t zoneid)
{
        netstackid_t stackid;
        netstack_t *ns;
        netstack_t **nsp;
        zone_t  *zone;
        int i;

        ASSERT(netstack_initialized);

        zone = zone_find_by_id_nolock(zoneid);
        ASSERT(zone != NULL);

        if (zone->zone_flags & ZF_NET_EXCL) {
                stackid = zoneid;
        } else {
                /* Use the stack instance for the global zone */
                stackid = GLOBAL_NETSTACKID;
        }

        /* Allocate even if it isn't needed; simplifies locking */
        ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);

        /* Check whether a matching stack instance already exists */
        mutex_enter(&netstack_g_lock);
        for (nsp = &netstack_head; *nsp != NULL;
            nsp = &((*nsp)->netstack_next)) {
                if ((*nsp)->netstack_stackid == stackid) {
                        /*
                         * Should never find a pre-existing exclusive stack
                         */
                        VERIFY(stackid == GLOBAL_NETSTACKID);
                        kmem_free(ns, sizeof (netstack_t));
                        ns = *nsp;
                        mutex_enter(&ns->netstack_lock);
                        ns->netstack_numzones++;
                        mutex_exit(&ns->netstack_lock);
                        mutex_exit(&netstack_g_lock);
                        DTRACE_PROBE1(netstack__inc__numzones,
                            netstack_t *, ns);
                        /* Record that we have a new shared stack zone */
                        netstack_shared_zone_add(zoneid);
                        zone->zone_netstack = ns;
                        return (ns);
                }
        }
        /* Not found */
        mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
        ns->netstack_stackid = zoneid;
        ns->netstack_numzones = 1;
        ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
        ns->netstack_flags = NSF_UNINIT;
        *nsp = ns;
        zone->zone_netstack = ns;

        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a CREATE running so that
         * any netstack_register/netstack_unregister waits for
         * the existing create callbacks to complete in moduleid order.
         */
        ns->netstack_flags |= NSF_ZONE_CREATE;

        /*
         * Determine the set of module create functions that need to be
         * called before we drop the lock.
         * Set NSS_CREATE_NEEDED for each of those.
         * Skip any with NRF_DYING set, since those are in the process of
         * going away, by checking for flags being exactly NRF_REGISTERED.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);

                if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
                    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
                        nms->nms_flags |= NSS_CREATE_NEEDED;
                        DTRACE_PROBE2(netstack__create__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        apply_all_modules(ns, netstack_apply_create);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ns->netstack_flags &= ~NSF_UNINIT;
        ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
        ns->netstack_flags &= ~NSF_ZONE_CREATE;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);

        return (ns);
}

/* ARGSUSED */
static void
netstack_zone_shutdown(zoneid_t zoneid, void *arg)
{
        netstack_t *ns = (netstack_t *)arg;
        int i;

        ASSERT(arg != NULL);

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_numzones > 0);
        if (ns->netstack_numzones != 1) {
                /* Stack instance being used by another zone */
                mutex_exit(&ns->netstack_lock);
                ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
                return;
        }
        mutex_exit(&ns->netstack_lock);

        mutex_enter(&netstack_g_lock);
        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a SHUTDOWN running so that
         * any netstack_register/netstack_unregister waits for
         * the existing shutdown callbacks to complete in moduleid order.
         */
        ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
        ns->netstack_flags |= NSF_ZONE_SHUTDOWN;

        /*
         * Determine the set of modules that need their shutdown functions
         * called before we drop the lock.
         * Set NSS_SHUTDOWN_NEEDED for each of those.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_shutdown != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        /*
         * Call the shutdown function for all registered modules for this
         * netstack.
         */
        apply_all_modules_reverse(ns, netstack_apply_shutdown);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
        ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);
}

/*
 * Common routine to release a zone.
 * If this was the last zone using the stack instance, then the refcnt
 * dropping to zero will free the stack instance.
 */
/* ARGSUSED */
static void
netstack_zone_destroy(zoneid_t zoneid, void *arg)
{
        netstack_t *ns = (netstack_t *)arg;

        ASSERT(arg != NULL);

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_numzones > 0);
        ns->netstack_numzones--;
        if (ns->netstack_numzones != 0) {
                /* Stack instance being used by another zone */
                mutex_exit(&ns->netstack_lock);
                ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
                /* Record that a shared stack zone has gone away */
                netstack_shared_zone_remove(zoneid);
                return;
        }
        /*
         * Set CLOSING so that netstack_find_by will not find it.
         */
        ns->netstack_flags |= NSF_CLOSING;
        mutex_exit(&ns->netstack_lock);
        DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
        /* No other thread can call zone_destroy for this stack */

        /*
         * Decrease refcnt to account for the reference taken in
         * netstack_zone_create().
         */
        netstack_rele(ns);
}

/*
 * Called when the reference count drops to zero.
 * Call the destroy functions for each registered module.
 */
static void
netstack_stack_inactive(netstack_t *ns)
{
        int i;

        mutex_enter(&netstack_g_lock);
        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a DESTROY running so that
         * any netstack_register/netstack_unregister waits for
         * the existing destroy callbacks to complete in reverse moduleid
         * order.
         */
        ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
        ns->netstack_flags |= NSF_ZONE_DESTROY;
        /*
         * If the shutdown callback wasn't called earlier (e.g., if this is
         * a netstack shared between multiple zones), then we schedule it now.
         *
         * Determine the set of modules whose callbacks still need to run
         * before we drop the lock.
         * Set NSS_DESTROY_NEEDED for each of those. That ensures that when
         * we return, all the callbacks for existing instances have completed.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_shutdown != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, i);
                }

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_destroy != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
                        nms->nms_flags |= NSS_DESTROY_NEEDED;
                        DTRACE_PROBE2(netstack__destroy__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        /*
         * Call the shutdown and destroy functions for all registered modules
         * for this netstack.
         *
         * Since there are some ordering dependencies between the modules we
         * tear them down in the reverse of the order used to create them.
         *
         * Since a netstack_t is never reused (when a zone is rebooted it gets
         * a new zoneid == netstackid, i.e., a new netstack_t is allocated) we
         * leave nms_flags the way it is, i.e., with NSS_DESTROY_COMPLETED set.
         * That is different than in the netstack_unregister() case.
         */
        apply_all_modules_reverse(ns, netstack_apply_shutdown);
        apply_all_modules_reverse(ns, netstack_apply_destroy);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
        ns->netstack_flags &= ~NSF_ZONE_DESTROY;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);
}

/*
 * Apply a function to all netstacks for a particular moduleid.
 *
 * If there is any zone activity (due to a zone being created, shutdown,
 * or destroyed) we wait for that to complete before we proceed. This ensures
 * that the moduleids are processed in order when a zone is created or
 * destroyed.
 *
 * The applyfn has to drop netstack_g_lock if it does some work.
 * In that case we don't follow netstack_next after reacquiring the lock,
 * even if it would be possible to do so without any hazards. This is
 * because we want the design to allow the list of netstacks threaded
 * by netstack_next to change in any arbitrary way while the lock was
 * dropped.
 *
 * It is safe to restart the loop at netstack_head since the applyfn
 * changes netstack_m_state as it processes things, so a subsequent
 * pass through will have no effect in applyfn, and the loop terminates
 * in at worst O(N^2) steps.
 */
static void
apply_all_netstacks(int moduleid, applyfn_t *applyfn)
{
        netstack_t *ns;

        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        while (ns != NULL) {
                if (wait_for_zone_creator(ns, &netstack_g_lock)) {
                        /* Lock dropped - restart at head */
                        ns = netstack_head;
                } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
                        /* Lock dropped - restart at head */
                        ns = netstack_head;
                } else {
                        ns = ns->netstack_next;
                }
        }
        mutex_exit(&netstack_g_lock);
}

/*
 * Apply a function to all moduleids for a particular netstack.
 *
 * Since the netstack linkage doesn't matter in this case we can
 * ignore whether the function drops the lock.
 */
static void
apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
{
        int i;

        mutex_enter(&netstack_g_lock);
        for (i = 0; i < NS_MAX; i++) {
                /*
                 * We don't care whether the lock was dropped
                 * since we are not iterating over netstack_head.
                 */
                (void) (applyfn)(&netstack_g_lock, ns, i);
        }
        mutex_exit(&netstack_g_lock);
}

/* Like the above but in reverse moduleid order */
static void
apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
{
        int i;

        mutex_enter(&netstack_g_lock);
        for (i = NS_MAX - 1; i >= 0; i--) {
                /*
                 * We don't care whether the lock was dropped
                 * since we are not iterating over netstack_head.
                 */
                (void) (applyfn)(&netstack_g_lock, ns, i);
        }
        mutex_exit(&netstack_g_lock);
}

/*
 * Call the create function for the ns and moduleid if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the create function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        void *result;
        netstackid_t stackid;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_CREATE_NEEDED) {
                nms->nms_flags &= ~NSS_CREATE_NEEDED;
                nms->nms_flags |= NSS_CREATE_INPROGRESS;
                DTRACE_PROBE2(netstack__create__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_create != NULL);
                stackid = ns->netstack_stackid;
                DTRACE_PROBE2(netstack__create__start,
                    netstackid_t, stackid,
                    netstack_t *, ns);
                result = (ns_reg[moduleid].nr_create)(stackid, ns);
                DTRACE_PROBE2(netstack__create__end,
                    void *, result, netstack_t *, ns);

                ASSERT(result != NULL);
                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                ns->netstack_modules[moduleid] = result;
                nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
                nms->nms_flags |= NSS_CREATE_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__create__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the shutdown function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void *netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
                nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
                nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(netstack__shutdown__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__shutdown__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__shutdown__end,
                    netstack_t *, ns);

                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
                nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__shutdown__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the destroy function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void *netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_DESTROY_NEEDED) {
                nms->nms_flags &= ~NSS_DESTROY_NEEDED;
                nms->nms_flags |= NSS_DESTROY_INPROGRESS;
                DTRACE_PROBE2(netstack__destroy__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_destroy != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__destroy__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__destroy__end,
                    netstack_t *, ns);

                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                ns->netstack_modules[moduleid] = NULL;
                nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
                nms->nms_flags |= NSS_DESTROY_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__destroy__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * If somebody is creating the netstack (due to a new zone being created)
 * then we wait for them to complete. This ensures that any additional
 * netstack_register() doesn't cause the create functions to run out of
 * order.
 * Note that we do not need such a global wait in the case of the shutdown
 * and destroy callbacks, since in that case it is sufficient for both
 * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        mutex_enter(&ns->netstack_lock);
        while (ns->netstack_flags & NSF_ZONE_CREATE) {
                DTRACE_PROBE1(netstack__wait__zone__inprogress,
                    netstack_t *, ns);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&ns->netstack_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        mutex_exit(&ns->netstack_lock);
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
 * combination.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (nms->nms_flags & NSS_ALL_INPROGRESS) {
                DTRACE_PROBE2(netstack__wait__nms__inprogress,
                    netstack_t *, ns, nm_state_t *, nms);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&nms->nms_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        return (dropped);
}

/*
 * Get the stack instance used in the caller's zone.
 * Increases the reference count; the caller must do a netstack_rele().
 * It can't be called after zone_destroy() has started.
 */
netstack_t *
netstack_get_current(void)
{
        netstack_t *ns;

        ns = curproc->p_zone->zone_netstack;
        ASSERT(ns != NULL);
        return (netstack_hold_if_active(ns));
}
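
/*
 * A minimal usage sketch (illustrative only): callers bracket their use of
 * per-stack state with a hold and a rele.  NS_FOO and foo_stack_t are
 * assumed names:
 *
 *      netstack_t *ns = netstack_get_current();
 *
 *      if (ns != NULL) {
 *              foo_stack_t *fs = ns->netstack_modules[NS_FOO];
 *
 *              (use fs; the stack can't be reaped while the hold is kept)
 *              netstack_rele(ns);
 *      }
 */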

/*
 * Find a stack instance given the cred.
 * The modules use this to allow for a future in which something other
 * than the zoneid is used to determine the stack.
 */
netstack_t *
netstack_find_by_cred(const cred_t *cr)
{
        zoneid_t zoneid = crgetzoneid(cr);

        /* Handle the case when cr_zone is NULL */
        if (zoneid == (zoneid_t)-1)
                zoneid = GLOBAL_ZONEID;

        /* For performance ... */
        if (curproc->p_zone->zone_id == zoneid)
                return (netstack_get_current());
        else
                return (netstack_find_by_zoneid(zoneid));
}

/*
 * Find a stack instance given the zoneid.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized and closing ones.
 */
netstack_t *
netstack_find_by_zoneid(zoneid_t zoneid)
{
        netstack_t *ns;
        zone_t *zone;

        zone = zone_find_by_id(zoneid);

        if (zone == NULL)
                return (NULL);

        ASSERT(zone->zone_netstack != NULL);
        ns = netstack_hold_if_active(zone->zone_netstack);

        zone_rele(zone);
        return (ns);
}

/*
 * Find a stack instance given the zoneid. Can only be called from
 * the create callback. See the comments in zone_find_by_id_nolock why
 * that limitation exists.
 *
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized and closing ones.
 */
netstack_t *
netstack_find_by_zoneid_nolock(zoneid_t zoneid)
{
        zone_t *zone;

        zone = zone_find_by_id_nolock(zoneid);

        if (zone == NULL)
                return (NULL);

        ASSERT(zone->zone_netstack != NULL);
        /* zone_find_by_id_nolock does not have a hold on the zone */
        return (netstack_hold_if_active(zone->zone_netstack));
}

/*
 * Find a stack instance given the stackid; only an exact match is
 * returned.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * Skip the uninitialized and closing ones.
 */
netstack_t *
netstack_find_by_stackid(netstackid_t stackid)
{
        netstack_t *ns;

        mutex_enter(&netstack_g_lock);
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                /* Can't use hold_if_active because of stackid check. */
                mutex_enter(&ns->netstack_lock);
                if (ns->netstack_stackid == stackid &&
                    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
                        netstack_hold_locked(ns);
                        mutex_exit(&ns->netstack_lock);
                        mutex_exit(&netstack_g_lock);
                        return (ns);
                }
                mutex_exit(&ns->netstack_lock);
        }
        mutex_exit(&netstack_g_lock);
        return (NULL);
}

boolean_t
netstack_inuse_by_stackid(netstackid_t stackid)
{
        netstack_t *ns;
        boolean_t rval = B_FALSE;

        mutex_enter(&netstack_g_lock);

        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                if (ns->netstack_stackid == stackid) {
                        rval = B_TRUE;
                        break;
                }
        }

        mutex_exit(&netstack_g_lock);

        return (rval);
}


static void
netstack_reap(void *arg)
{
        netstack_t **nsp, *ns = (netstack_t *)arg;
        boolean_t found;
        int i;

        /*
         * Time to call the destroy functions and free up
         * the structure
         */
        netstack_stack_inactive(ns);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        /* Finally remove from list of netstacks */
        mutex_enter(&netstack_g_lock);
        found = B_FALSE;
        for (nsp = &netstack_head; *nsp != NULL;
            nsp = &(*nsp)->netstack_next) {
                if (*nsp == ns) {
                        *nsp = ns->netstack_next;
                        ns->netstack_next = NULL;
                        found = B_TRUE;
                        break;
                }
        }
        ASSERT(found);
        mutex_exit(&netstack_g_lock);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        ASSERT(ns->netstack_flags & NSF_CLOSING);

        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_destroy(&nms->nms_cv);
        }
        mutex_destroy(&ns->netstack_lock);
        cv_destroy(&ns->netstack_cv);
        kmem_free(ns, sizeof (*ns));
        /* Allow another reap to be scheduled. */
        sema_v(&netstack_reap_limiter);
}

void
netstack_rele(netstack_t *ns)
{
        int refcnt, numzones;

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_refcnt > 0);
        ns->netstack_refcnt--;
        /*
         * As we drop the lock additional netstack_rele()s can come in
         * and decrement the refcnt to zero and free the netstack_t.
         * Store pointers in local variables and if we were not the last
         * then don't reference the netstack_t after that.
         */
        refcnt = ns->netstack_refcnt;
        numzones = ns->netstack_numzones;
        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
        mutex_exit(&ns->netstack_lock);

        if (refcnt == 0 && numzones == 0) {
                /*
                 * Because there are possibilities of re-entrancy in various
                 * netstack structures by callers, which might cause a lock up
                 * due to odd reference models, or other factors, we choose to
                 * schedule the actual deletion of this netstack as a deferred
                 * task on the system taskq.  This way, any such reference
                 * models won't trip over themselves.
                 *
                 * Assume we aren't in a high-priority interrupt context, so
                 * we can use KM_SLEEP and semaphores.
                 */
                if (sema_tryp(&netstack_reap_limiter) == 0) {
                        /*
                         * Indicate we're slamming against a limit.
                         */
                        hrtime_t measurement = gethrtime();

                        sema_p(&netstack_reap_limiter);
                        /* Capture delay in ns. */
                        DTRACE_PROBE1(netstack__reap__rate__limited,
                            hrtime_t, gethrtime() - measurement);
                }

                /* TQ_SLEEP should prevent taskq_dispatch() from failing. */
                (void) taskq_dispatch(system_taskq, netstack_reap, ns,
                    TQ_SLEEP);
        }
}

static void
netstack_hold_locked(netstack_t *ns)
{
        ASSERT(MUTEX_HELD(&ns->netstack_lock));
        ns->netstack_refcnt++;
        ASSERT(ns->netstack_refcnt > 0);
        DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
}

/*
 * If the passed-in netstack isn't active (i.e. it's uninitialized or closing),
 * return NULL, otherwise return it with its reference held.  Common code
 * for many netstack_find*() functions.
 */
netstack_t *
netstack_hold_if_active(netstack_t *ns)
{
        netstack_t *retval;

        mutex_enter(&ns->netstack_lock);
        if (ns->netstack_flags & (NSF_UNINIT | NSF_CLOSING)) {
                retval = NULL;
        } else {
                netstack_hold_locked(ns);
                retval = ns;
        }
        mutex_exit(&ns->netstack_lock);

        return (retval);
}

void
netstack_hold(netstack_t *ns)
{
        mutex_enter(&ns->netstack_lock);
        netstack_hold_locked(ns);
        mutex_exit(&ns->netstack_lock);
}

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
 */
kstat_t *
kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
    char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
    netstackid_t ks_netstackid)
{
        kstat_t *ks;

        if (ks_netstackid == GLOBAL_NETSTACKID) {
                ks = kstat_create_zone(ks_module, ks_instance, ks_name,
                    ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
                if (ks != NULL)
                        netstack_shared_kstat_add(ks);
                return (ks);
        } else {
                zoneid_t zoneid = ks_netstackid;

                return (kstat_create_zone(ks_module, ks_instance, ks_name,
                    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
        }
}

void
kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
{
        if (ks_netstackid == GLOBAL_NETSTACKID) {
                netstack_shared_kstat_remove(ks);
        }
        kstat_delete(ks);
}
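
/*
 * A minimal usage sketch (illustrative only, with assumed names): a
 * per-stack kstat is created and deleted with the stack's netstackid, so
 * zones sharing the global stack are handled transparently:
 *
 *      kstat_t *ks = kstat_create_netstack("foo", 0, "foostat", "net",
 *          KSTAT_TYPE_NAMED, 1, 0, stackid);
 *
 *      if (ks != NULL)
 *              kstat_install(ks);
 *      ...
 *      kstat_delete_netstack(ks, stackid);
 */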

static void
netstack_shared_zone_add(zoneid_t zoneid)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list *sk;

        sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
        sz->sz_zoneid = zoneid;

        /* Insert in list */
        mutex_enter(&netstack_shared_lock);
        sz->sz_next = netstack_shared_zones;
        netstack_shared_zones = sz;

        /*
         * Perform kstat_zone_add for each existing shared stack kstat.
         * Note: Holds netstack_shared_lock lock across kstat_zone_add.
         */
        for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
                kstat_zone_add(sk->sk_kstat, zoneid);
        }
        mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_zone_remove(zoneid_t zoneid)
{
        struct shared_zone_list **szp, *sz;
        struct shared_kstat_list *sk;

        /* Find in list */
        mutex_enter(&netstack_shared_lock);
        sz = NULL;
        for (szp = &netstack_shared_zones; *szp != NULL;
            szp = &((*szp)->sz_next)) {
                if ((*szp)->sz_zoneid == zoneid) {
                        sz = *szp;
                        break;
                }
        }
        /* We must find it */
        ASSERT(sz != NULL);
        *szp = sz->sz_next;
        sz->sz_next = NULL;

        /*
         * Perform kstat_zone_remove for each existing shared stack kstat.
         * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
         */
        for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
                kstat_zone_remove(sk->sk_kstat, zoneid);
        }
        mutex_exit(&netstack_shared_lock);

        kmem_free(sz, sizeof (*sz));
}

static void
netstack_shared_kstat_add(kstat_t *ks)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list *sk;

        sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
        sk->sk_kstat = ks;

        /* Insert in list */
        mutex_enter(&netstack_shared_lock);
        sk->sk_next = netstack_shared_kstats;
        netstack_shared_kstats = sk;

        /*
         * Perform kstat_zone_add for each existing shared stack zone.
         * Note: Holds netstack_shared_lock lock across kstat_zone_add.
         */
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                kstat_zone_add(ks, sz->sz_zoneid);
        }
        mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_kstat_remove(kstat_t *ks)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list **skp, *sk;

        /* Find in list */
        mutex_enter(&netstack_shared_lock);
        sk = NULL;
        for (skp = &netstack_shared_kstats; *skp != NULL;
            skp = &((*skp)->sk_next)) {
                if ((*skp)->sk_kstat == ks) {
                        sk = *skp;
                        break;
                }
        }
        /* Must find it */
        ASSERT(sk != NULL);
        *skp = sk->sk_next;
        sk->sk_next = NULL;
        /*
         * Perform kstat_zone_remove for each existing shared stack zone.
         * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
         */
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                kstat_zone_remove(ks, sz->sz_zoneid);
        }
        mutex_exit(&netstack_shared_lock);
        kmem_free(sk, sizeof (*sk));
}

/*
 * Return true if the zoneid uses the shared (global) stack.
 */
static boolean_t
netstack_find_shared_zoneid(zoneid_t zoneid)
{
        struct shared_zone_list *sz;

        mutex_enter(&netstack_shared_lock);
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                if (sz->sz_zoneid == zoneid) {
                        mutex_exit(&netstack_shared_lock);
                        return (B_TRUE);
                }
        }
        mutex_exit(&netstack_shared_lock);
        return (B_FALSE);
}

/*
 * Hide the fact that zoneids and netstackids are allocated from
 * the same space in the current implementation.
 * We currently do not check that the stackid/zoneids are valid, since
 * there is no need for that, but callers should only pass in valid ids.
 */
zoneid_t
netstackid_to_zoneid(netstackid_t stackid)
{
        return (stackid);
}

netstackid_t
zoneid_to_netstackid(zoneid_t zoneid)
{
        if (netstack_find_shared_zoneid(zoneid))
                return (GLOBAL_NETSTACKID);
        else
                return (zoneid);
}

zoneid_t
netstack_get_zoneid(netstack_t *ns)
{
        return (netstackid_to_zoneid(ns->netstack_stackid));
}

/*
 * Simplistic support for walking all the handles.
 * Example usage:
 *      netstack_handle_t nh;
 *      netstack_t *ns;
 *
 *      netstack_next_init(&nh);
 *      while ((ns = netstack_next(&nh)) != NULL) {
 *              do something;
 *              netstack_rele(ns);
 *      }
 *      netstack_next_fini(&nh);
 */
void
netstack_next_init(netstack_handle_t *handle)
{
        *handle = 0;
}

/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
}

netstack_t *
netstack_next(netstack_handle_t *handle)
{
        netstack_t *ns;
        int i, end;

        end = *handle;

        /* Walk the list, skipping the first *handle instances */
        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        for (i = 0; i < end; i++) {
                if (ns == NULL)
                        break;
                ns = ns->netstack_next;
        }
        /*
         * Skip those that aren't really here (uninitialized or closing).
         * Can't use hold_if_active because of "end" tracking.
         */
        while (ns != NULL) {
                mutex_enter(&ns->netstack_lock);
                if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
                        *handle = end + 1;
                        netstack_hold_locked(ns);
                        mutex_exit(&ns->netstack_lock);
                        break;
                }
                mutex_exit(&ns->netstack_lock);
                end++;
                ns = ns->netstack_next;
        }
        mutex_exit(&netstack_g_lock);
        return (ns);
}