/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016, Joyent, Inc.  All rights reserved.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/mutex.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/sunddi.h>
#include <sys/kobj.h>
#include <sys/disp.h>
#include <vm/seg_kmem.h>
#include <sys/zone.h>
#include <sys/netstack.h>

/*
 * The zone key with which the zones framework tells us about new zones;
 * we use those notifications to create new stacks.
 */
static zone_key_t netstack_zone_key;

static int      netstack_initialized = 0;

/*
 * Track the registered netstacks.
 * The global lock protects
 * - ns_reg
 * - the list starting at netstack_head and following the netstack_next
 *   pointers.
 */
static kmutex_t netstack_g_lock;

/*
 * Registry of netstacks with their create/shutdown/destroy functions.
 */
static struct netstack_registry ns_reg[NS_MAX];

/*
 * Global list of existing stacks.  We use this when a new zone with
 * an exclusive IP instance is created.
 *
 * Note that in some cases a netstack_t needs to stay around after the zone
 * has gone away. This is because there might be outstanding references
 * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
 * structure and all the foo_stack_t's hanging off of it will be cleaned up
 * when the last reference to it is dropped.
 * However, the same zone might be rebooted. That is handled using the
 * assumption that the zones framework picks a new zoneid each time a zone
 * is (re)booted. We assert for that condition in netstack_zone_create().
 * Thus the old netstack_t can take its time for things to time out.
 */
static netstack_t *netstack_head;

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
 */
struct shared_zone_list {
        struct shared_zone_list *sz_next;
        zoneid_t                sz_zoneid;
};

struct shared_kstat_list {
        struct shared_kstat_list *sk_next;
        kstat_t                  *sk_kstat;
};

static kmutex_t netstack_shared_lock;   /* protects the following two */
static struct shared_zone_list  *netstack_shared_zones;
static struct shared_kstat_list *netstack_shared_kstats;

static void     *netstack_zone_create(zoneid_t zoneid);
static void     netstack_zone_shutdown(zoneid_t zoneid, void *arg);
static void     netstack_zone_destroy(zoneid_t zoneid, void *arg);

static void     netstack_shared_zone_add(zoneid_t zoneid);
static void     netstack_shared_zone_remove(zoneid_t zoneid);
static void     netstack_shared_kstat_add(kstat_t *ks);
static void     netstack_shared_kstat_remove(kstat_t *ks);

typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);

static void     apply_all_netstacks(int, applyfn_t *);
static void     apply_all_modules(netstack_t *, applyfn_t *);
static void     apply_all_modules_reverse(netstack_t *, applyfn_t *);
static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
    kmutex_t *);

static void netstack_reap_work(netstack_t *, boolean_t);
ksema_t netstack_reap_limiter;

void
netstack_init(void)
{
        mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);

        /* XXX KEBE SAYS hard-coded constant needs to be fixed. */
        sema_init(&netstack_reap_limiter, 1024, NULL, SEMA_DRIVER, NULL);

        netstack_initialized = 1;

        /*
         * We want to be informed each time a zone is created or
         * destroyed in the kernel, so we can maintain the
         * stack instance information.
         */
        zone_key_create(&netstack_zone_key, netstack_zone_create,
            netstack_zone_shutdown, netstack_zone_destroy);
}

/*
 * Register a new module with the framework.
 * This registers interest in changes to the set of netstacks.
 * The createfn and destroyfn are required, but the shutdownfn can be
 * NULL.
 * Note that due to the current zsd implementation, when the create
 * function is called the zone isn't fully present; thus functions
 * like zone_find_by_* will fail, hence the create function cannot
 * use many zones kernel functions including zcmn_err().
 */
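/*
 * As an illustration only (a sketch, not part of this file): a
 * hypothetical "foo" module with moduleid NS_FOO might register like
 * this; NS_FOO, foo_stack_t, and the foo_* functions are assumed names.
 *
 *      static void *
 *      foo_stack_init(netstackid_t stackid, netstack_t *ns)
 *      {
 *              foo_stack_t *fs = kmem_zalloc(sizeof (*fs), KM_SLEEP);
 *
 *              fs->fs_netstack = ns;
 *              return (fs);
 *      }
 *
 *      static void
 *      foo_stack_fini(netstackid_t stackid, void *arg)
 *      {
 *              kmem_free(arg, sizeof (foo_stack_t));
 *      }
 *
 *      netstack_register(NS_FOO, foo_stack_init, NULL, foo_stack_fini);
 */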
void
netstack_register(int moduleid,
    void *(*module_create)(netstackid_t, netstack_t *),
    void (*module_shutdown)(netstackid_t, void *),
    void (*module_destroy)(netstackid_t, void *))
{
        netstack_t *ns;

        ASSERT(netstack_initialized);
        ASSERT(moduleid >= 0 && moduleid < NS_MAX);
        ASSERT(module_create != NULL);

        /*
         * Make instances created after this point in time run the create
         * callback.
         */
        mutex_enter(&netstack_g_lock);
        ASSERT(ns_reg[moduleid].nr_create == NULL);
        ASSERT(ns_reg[moduleid].nr_flags == 0);
        ns_reg[moduleid].nr_create = module_create;
        ns_reg[moduleid].nr_shutdown = module_shutdown;
        ns_reg[moduleid].nr_destroy = module_destroy;
        ns_reg[moduleid].nr_flags = NRF_REGISTERED;

        /*
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_CREATE_NEEDED for each of those.
         * netstacks which have been deleted will have NSS_CREATE_COMPLETED
         * set, but check NSF_CLOSING to be sure.
         */
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);
                if (!(ns->netstack_flags & NSF_CLOSING) &&
                    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
                        nms->nms_flags |= NSS_CREATE_NEEDED;
                        DTRACE_PROBE2(netstack__create__needed,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }
        mutex_exit(&netstack_g_lock);

        /*
         * At this point in time a new instance can be created or an instance
         * can be destroyed, or some other module can register or unregister.
         * Make sure we either run all the create functions for this moduleid
         * or we wait for any other creators for this moduleid.
         */
        apply_all_netstacks(moduleid, netstack_apply_create);
}

void
netstack_unregister(int moduleid)
{
        netstack_t *ns;

        ASSERT(moduleid >= 0 && moduleid < NS_MAX);

        ASSERT(ns_reg[moduleid].nr_create != NULL);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);

        mutex_enter(&netstack_g_lock);
        /*
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
         * That ensures that when we return all the callbacks for existing
         * instances have completed. And since we set NRF_DYING no new
         * instances can use this module.
         */
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                boolean_t created = B_FALSE;
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);

                /*
                 * We need to be careful here. We could actually have a netstack
                 * being created as we speak waiting for us to let go of this
                 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
                 * have gotten to the point of completing it yet. If
                 * NSS_CREATE_NEEDED, we can safely just remove it here and
                 * never create the module. However, if NSS_CREATE_INPROGRESS is
                 * set, we need to still flag this module for shutdown and
                 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
                 *
                 * It is safe to do that because of two different guarantees
                 * that exist in the system. The first is that before we do a
                 * create, shutdown, or destroy, we ensure that nothing else is
                 * in progress in the system for this netstack and wait for it
                 * to complete. Secondly, because the zone is being created, we
                 * know that the following call to apply_all_netstacks will
                 * block on the zone finishing its initialization.
                 */
                if (nms->nms_flags & NSS_CREATE_NEEDED)
                        nms->nms_flags &= ~NSS_CREATE_NEEDED;

                if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
                    nms->nms_flags & NSS_CREATE_COMPLETED)
                        created = B_TRUE;

                if (ns_reg[moduleid].nr_shutdown != NULL && created &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, moduleid);
                }
                if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
                    ns_reg[moduleid].nr_destroy != NULL && created &&
                    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
                        nms->nms_flags |= NSS_DESTROY_NEEDED;
                        DTRACE_PROBE2(netstack__destroy__needed,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }
        /*
         * Prevent any new netstack from calling the registered create
         * function, while keeping the function pointers in place until the
         * shutdown and destroy callbacks are complete.
         */
        ns_reg[moduleid].nr_flags |= NRF_DYING;
        mutex_exit(&netstack_g_lock);

        apply_all_netstacks(moduleid, netstack_apply_shutdown);
        apply_all_netstacks(moduleid, netstack_apply_destroy);

        /*
         * Clear the nms_flags so that we can handle this module
         * being loaded again.
         * Also remove the registered functions.
         */
        mutex_enter(&netstack_g_lock);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
        ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                nm_state_t *nms = &ns->netstack_m_state[moduleid];

                mutex_enter(&ns->netstack_lock);
                if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
                        nms->nms_flags = 0;
                        DTRACE_PROBE2(netstack__destroy__done,
                            netstack_t *, ns, int, moduleid);
                }
                mutex_exit(&ns->netstack_lock);
        }

        ns_reg[moduleid].nr_create = NULL;
        ns_reg[moduleid].nr_shutdown = NULL;
        ns_reg[moduleid].nr_destroy = NULL;
        ns_reg[moduleid].nr_flags = 0;
        mutex_exit(&netstack_g_lock);
}

/*
 * Lookup and/or allocate a netstack for this zone.
 */
static void *
netstack_zone_create(zoneid_t zoneid)
{
        netstackid_t stackid;
        netstack_t *ns;
        netstack_t **nsp;
        zone_t  *zone;
        int i;

        ASSERT(netstack_initialized);

        zone = zone_find_by_id_nolock(zoneid);
        ASSERT(zone != NULL);

        if (zone->zone_flags & ZF_NET_EXCL) {
                stackid = zoneid;
        } else {
                /* Look for the stack instance for the global zone */
                stackid = GLOBAL_NETSTACKID;
        }

        /* Allocate even if it isn't needed; simplifies locking */
        ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);

        /* Look if there is a matching stack instance */
        mutex_enter(&netstack_g_lock);
        for (nsp = &netstack_head; *nsp != NULL;
            nsp = &((*nsp)->netstack_next)) {
                if ((*nsp)->netstack_stackid == stackid) {
                        /*
                         * Should never find a pre-existing exclusive stack
                         */
                        VERIFY(stackid == GLOBAL_NETSTACKID);
                        kmem_free(ns, sizeof (netstack_t));
                        ns = *nsp;
                        mutex_enter(&ns->netstack_lock);
                        ns->netstack_numzones++;
                        mutex_exit(&ns->netstack_lock);
                        mutex_exit(&netstack_g_lock);
                        DTRACE_PROBE1(netstack__inc__numzones,
                            netstack_t *, ns);
                        /* Record that we have a new shared stack zone */
                        netstack_shared_zone_add(zoneid);
                        zone->zone_netstack = ns;
                        return (ns);
                }
        }
        /* Not found */
        mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
        ns->netstack_stackid = zoneid;
        ns->netstack_numzones = 1;
        ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
        ns->netstack_flags = NSF_UNINIT;
        *nsp = ns;
        zone->zone_netstack = ns;

        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a CREATE running so
         * any netstack_register/netstack_unregister waits for
         * the existing create callbacks to complete in moduleid order
         */
        ns->netstack_flags |= NSF_ZONE_CREATE;

        /*
         * Determine the set of module create functions that need to be
         * called before we drop the lock.
         * Set NSS_CREATE_NEEDED for each of those.
         * Skip any with NRF_DYING set, since those are in the process of
         * going away, by checking for flags being exactly NRF_REGISTERED.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);

                if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
                    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
                        nms->nms_flags |= NSS_CREATE_NEEDED;
                        DTRACE_PROBE2(netstack__create__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        apply_all_modules(ns, netstack_apply_create);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ns->netstack_flags &= ~NSF_UNINIT;
        ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
        ns->netstack_flags &= ~NSF_ZONE_CREATE;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);

        return (ns);
}

/* ARGSUSED */
static void
netstack_zone_shutdown(zoneid_t zoneid, void *arg)
{
        netstack_t *ns = (netstack_t *)arg;
        int i;

        ASSERT(arg != NULL);

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_numzones > 0);
        if (ns->netstack_numzones != 1) {
                /* Stack instance being used by other zone */
                mutex_exit(&ns->netstack_lock);
                ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
                return;
        }
        mutex_exit(&ns->netstack_lock);

        mutex_enter(&netstack_g_lock);
        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a SHUTDOWN running so
         * any netstack_register/netstack_unregister waits for
         * the existing shutdown callbacks to complete in moduleid order
         */
        ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
        ns->netstack_flags |= NSF_ZONE_SHUTDOWN;

        /*
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_SHUTDOWN_NEEDED for each of those.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_shutdown != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        /*
         * Call the shutdown function for all registered modules for this
         * netstack.
         */
        apply_all_modules_reverse(ns, netstack_apply_shutdown);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
        ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);
}

/*
 * Common routine to release a zone.
 * If this was the last zone using the stack instance then prepare for
 * the refcnt dropping to zero to free the stack instance.
 */
/* ARGSUSED */
static void
netstack_zone_destroy(zoneid_t zoneid, void *arg)
{
        netstack_t *ns = (netstack_t *)arg;

        ASSERT(arg != NULL);

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_numzones > 0);
        ns->netstack_numzones--;
        if (ns->netstack_numzones != 0) {
                /* Stack instance being used by other zone */
                mutex_exit(&ns->netstack_lock);
                ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
                /* Record that a shared stack zone has gone away */
                netstack_shared_zone_remove(zoneid);
                return;
        }
        /*
         * Set CLOSING so that netstack_find_by will not find it.
         */
        ns->netstack_flags |= NSF_CLOSING;
        mutex_exit(&ns->netstack_lock);
        DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
        /* No other thread can call zone_destroy for this stack */

        /*
         * Decrease refcnt to account for the one in netstack_zone_create()
         */
        netstack_rele(ns);
}

/*
 * Called when the reference count drops to zero.
 * Call the destroy functions for each registered module.
 */
static void
netstack_stack_inactive(netstack_t *ns)
{
        int i;

        mutex_enter(&netstack_g_lock);
        mutex_enter(&ns->netstack_lock);
        /*
         * Mark this netstack as having a DESTROY running so
         * any netstack_register/netstack_unregister waits for
         * the existing destroy callbacks to complete in reverse moduleid order
         */
        ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
        ns->netstack_flags |= NSF_ZONE_DESTROY;
        /*
         * If the shutdown callback wasn't called earlier (e.g., if this is
         * a netstack shared between multiple zones), then we schedule it now.
         *
         * Determine the set of stacks that exist before we drop the lock.
         * Set NSS_DESTROY_NEEDED for each of those. That
         * ensures that when we return all the callbacks for existing
         * instances have completed.
         */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_shutdown != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
                        nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(netstack__shutdown__needed,
                            netstack_t *, ns, int, i);
                }

                if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
                    ns_reg[i].nr_destroy != NULL &&
                    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
                    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
                        nms->nms_flags |= NSS_DESTROY_NEEDED;
                        DTRACE_PROBE2(netstack__destroy__needed,
                            netstack_t *, ns, int, i);
                }
        }
        mutex_exit(&ns->netstack_lock);
        mutex_exit(&netstack_g_lock);

        /*
         * Call the shutdown and destroy functions for all registered modules
         * for this netstack.
         *
         * Since there are some ordering dependencies between the modules we
         * tear them down in the reverse order of what was used to create them.
         *
         * Since a netstack_t is never reused (when a zone is rebooted it gets
         * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
         * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
         * That is different than in the netstack_unregister() case.
         */
        apply_all_modules_reverse(ns, netstack_apply_shutdown);
        apply_all_modules_reverse(ns, netstack_apply_destroy);

        /* Tell any waiting netstack_register/netstack_unregister to proceed */
        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
        ns->netstack_flags &= ~NSF_ZONE_DESTROY;
        cv_broadcast(&ns->netstack_cv);
        mutex_exit(&ns->netstack_lock);
}

/*
 * Apply a function to all netstacks for a particular moduleid.
 *
 * If there is any zone activity (due to a zone being created, shutdown,
 * or destroyed) we wait for that to complete before we proceed. This ensures
 * that the moduleids are processed in order when a zone is created or
 * destroyed.
 *
 * The applyfn has to drop netstack_g_lock if it does some work.
 * In that case we don't follow netstack_next,
 * even if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of netstacks threaded
 * by netstack_next to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at netstack_head since the applyfn
 * changes netstack_m_state as it processes things, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will
 * terminate in at worst O(N^2) iterations.
 */
static void
apply_all_netstacks(int moduleid, applyfn_t *applyfn)
{
        netstack_t *ns;

        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        while (ns != NULL) {
                if (wait_for_zone_creator(ns, &netstack_g_lock)) {
                        /* Lock dropped - restart at head */
                        ns = netstack_head;
                } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
                        /* Lock dropped - restart at head */
                        ns = netstack_head;
                } else {
                        ns = ns->netstack_next;
                }
        }
        mutex_exit(&netstack_g_lock);
}

/*
 * Apply a function to all moduleids for a particular netstack.
 *
 * Since the netstack linkage doesn't matter in this case we can
 * ignore whether the function drops the lock.
 */
static void
apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
{
        int i;

        mutex_enter(&netstack_g_lock);
        for (i = 0; i < NS_MAX; i++) {
                /*
                 * We don't care whether the lock was dropped
                 * since we are not iterating over netstack_head.
                 */
                (void) (applyfn)(&netstack_g_lock, ns, i);
        }
        mutex_exit(&netstack_g_lock);
}

/* Like the above but in reverse moduleid order */
static void
apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
{
        int i;

        mutex_enter(&netstack_g_lock);
        for (i = NS_MAX-1; i >= 0; i--) {
                /*
                 * We don't care whether the lock was dropped
                 * since we are not iterating over netstack_head.
                 */
                (void) (applyfn)(&netstack_g_lock, ns, i);
        }
        mutex_exit(&netstack_g_lock);
}

/*
 * Call the create function for the ns and moduleid if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the create function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        void *result;
        netstackid_t stackid;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_CREATE_NEEDED) {
                nms->nms_flags &= ~NSS_CREATE_NEEDED;
                nms->nms_flags |= NSS_CREATE_INPROGRESS;
                DTRACE_PROBE2(netstack__create__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_create != NULL);
                stackid = ns->netstack_stackid;
                DTRACE_PROBE2(netstack__create__start,
                    netstackid_t, stackid,
                    netstack_t *, ns);
                result = (ns_reg[moduleid].nr_create)(stackid, ns);
                DTRACE_PROBE2(netstack__create__end,
                    void *, result, netstack_t *, ns);

                ASSERT(result != NULL);
                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                ns->netstack_modules[moduleid] = result;
                nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
                nms->nms_flags |= NSS_CREATE_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__create__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the shutdown function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void *netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
                nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
                nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(netstack__shutdown__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__shutdown__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__shutdown__end,
                    netstack_t *, ns);

                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
                nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__shutdown__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all netstacks/moduleids.
 *
 * When we call the destroy function, we temporarily drop the netstack_lock
 * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void *netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_DESTROY_NEEDED) {
                nms->nms_flags &= ~NSS_DESTROY_NEEDED;
                nms->nms_flags |= NSS_DESTROY_INPROGRESS;
                DTRACE_PROBE2(netstack__destroy__inprogress,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_destroy != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__destroy__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__destroy__end,
                    netstack_t *, ns);

                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                ns->netstack_modules[moduleid] = NULL;
                nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
                nms->nms_flags |= NSS_DESTROY_COMPLETED;
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__destroy__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}

/*
 * If somebody is creating the netstack (due to a new zone being created)
 * then we wait for them to complete. This ensures that any additional
 * netstack_register() doesn't cause the create functions to run out of
 * order.
 * Note that we do not need such a global wait in the case of the shutdown
 * and destroy callbacks, since in that case it is sufficient for both
 * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        mutex_enter(&ns->netstack_lock);
        while (ns->netstack_flags & NSF_ZONE_CREATE) {
                DTRACE_PROBE1(netstack__wait__zone__inprogress,
                    netstack_t *, ns);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&ns->netstack_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        mutex_exit(&ns->netstack_lock);
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
 * combination.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (nms->nms_flags & NSS_ALL_INPROGRESS) {
                DTRACE_PROBE2(netstack__wait__nms__inprogress,
                    netstack_t *, ns, nm_state_t *, nms);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&nms->nms_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        return (dropped);
}

/*
 * Get the stack instance used in caller's zone.
 * Increases the reference count; the caller must do a netstack_rele().
 * It can't be called after zone_destroy() has started.
 */
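/*
 * A minimal usage sketch (illustrative, not a caller from this file);
 * the hold returned by netstack_get_current() must be balanced by a
 * netstack_rele():
 *
 *      netstack_t *ns = netstack_get_current();
 *
 *      if (ns != NULL) {
 *              do something;
 *              netstack_rele(ns);
 *      }
 */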
netstack_t *
netstack_get_current(void)
{
        netstack_t *ns;

        ns = curproc->p_zone->zone_netstack;
        ASSERT(ns != NULL);
        if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
                return (NULL);

        netstack_hold(ns);

        return (ns);
}

/*
 * Find a stack instance given the cred.
 * This is used by the modules to potentially allow for a future when
 * something other than the zoneid is used to determine the stack.
 */
netstack_t *
netstack_find_by_cred(const cred_t *cr)
{
        zoneid_t zoneid = crgetzoneid(cr);

        /* Handle the case when cr_zone is NULL */
        if (zoneid == (zoneid_t)-1)
                zoneid = GLOBAL_ZONEID;

        /* For performance ... */
        if (curproc->p_zone->zone_id == zoneid)
                return (netstack_get_current());
        else
                return (netstack_find_by_zoneid(zoneid));
}

/*
 * Find a stack instance given the zoneid.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized ones.
 */
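/*
 * For example (a sketch; the NS_FOO moduleid and foo_stack_t are
 * hypothetical), a module holding a zoneid can reach its per-stack data
 * like this:
 *
 *      netstack_t *ns = netstack_find_by_zoneid(zoneid);
 *
 *      if (ns != NULL) {
 *              foo_stack_t *fs = ns->netstack_modules[NS_FOO];
 *
 *              do something with fs;
 *              netstack_rele(ns);
 *      }
 */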
netstack_t *
netstack_find_by_zoneid(zoneid_t zoneid)
{
        netstack_t *ns;
        zone_t *zone;

        zone = zone_find_by_id(zoneid);

        if (zone == NULL)
                return (NULL);

        ns = zone->zone_netstack;
        ASSERT(ns != NULL);
        if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
                ns = NULL;
        else
                netstack_hold(ns);

        zone_rele(zone);
        return (ns);
}

/*
 * Find a stack instance given the zoneid. Can only be called from
 * the create callback. See the comments in zone_find_by_id_nolock why
 * that limitation exists.
 *
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized ones.
 */
netstack_t *
netstack_find_by_zoneid_nolock(zoneid_t zoneid)
{
        netstack_t *ns;
        zone_t *zone;

        zone = zone_find_by_id_nolock(zoneid);

        if (zone == NULL)
                return (NULL);

        ns = zone->zone_netstack;
        ASSERT(ns != NULL);

        if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
                ns = NULL;
        else
                netstack_hold(ns);

        /* zone_find_by_id_nolock does not have a hold on the zone */
        return (ns);
}

/*
 * Find a stack instance given the stackid; requires an exact match.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * Skip the uninitialized ones.
 */
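/*
 * Sketch of a common pattern (illustrative; foo_timer and the stackid
 * argument are assumptions): asynchronous callbacks often record the
 * netstackid_t instead of holding a netstack_t reference, and then
 * revalidate the stack on entry since it may have gone away.
 *
 *      void
 *      foo_timer(void *arg)
 *      {
 *              netstackid_t stackid = (netstackid_t)(uintptr_t)arg;
 *              netstack_t *ns = netstack_find_by_stackid(stackid);
 *
 *              if (ns == NULL)
 *                      return;
 *              do the timer work;
 *              netstack_rele(ns);
 *      }
 */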
netstack_t *
netstack_find_by_stackid(netstackid_t stackid)
{
        netstack_t *ns;

        mutex_enter(&netstack_g_lock);
        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                mutex_enter(&ns->netstack_lock);
                if (ns->netstack_stackid == stackid &&
                    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
                        mutex_exit(&ns->netstack_lock);
                        netstack_hold(ns);
                        mutex_exit(&netstack_g_lock);
                        return (ns);
                }
                mutex_exit(&ns->netstack_lock);
        }
        mutex_exit(&netstack_g_lock);
        return (NULL);
}

boolean_t
netstack_inuse_by_stackid(netstackid_t stackid)
{
        netstack_t *ns;
        boolean_t rval = B_FALSE;

        mutex_enter(&netstack_g_lock);

        for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                if (ns->netstack_stackid == stackid) {
                        rval = B_TRUE;
                        break;
                }
        }

        mutex_exit(&netstack_g_lock);

        return (rval);
}

static void
netstack_reap(void *arg)
{
        /* Indicate we took a semaphore to get here. */
        netstack_reap_work((netstack_t *)arg, B_TRUE);
}

static void
netstack_reap_intr(void *arg)
{
        /* Indicate we did NOT TAKE a semaphore to get here. */
        netstack_reap_work((netstack_t *)arg, B_FALSE);
}

static void
netstack_reap_work(netstack_t *ns, boolean_t semaphore_signal)
{
        netstack_t **nsp;
        boolean_t found;
        int i;

        /*
         * Time to call the destroy functions and free up
         * the structure.
         */
        netstack_stack_inactive(ns);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        /* Finally remove from list of netstacks */
        mutex_enter(&netstack_g_lock);
        found = B_FALSE;
        for (nsp = &netstack_head; *nsp != NULL;
            nsp = &(*nsp)->netstack_next) {
                if (*nsp == ns) {
                        *nsp = ns->netstack_next;
                        ns->netstack_next = NULL;
                        found = B_TRUE;
                        break;
                }
        }
        ASSERT(found);
        mutex_exit(&netstack_g_lock);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        ASSERT(ns->netstack_flags & NSF_CLOSING);

        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_destroy(&nms->nms_cv);
        }
        mutex_destroy(&ns->netstack_lock);
        cv_destroy(&ns->netstack_cv);
        kmem_free(ns, sizeof (*ns));
        /* Allow another reap to be scheduled. */
        if (semaphore_signal)
                sema_v(&netstack_reap_limiter);
}

void
netstack_rele(netstack_t *ns)
{
        int refcnt, numzones;

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_refcnt > 0);
        ns->netstack_refcnt--;
        /*
         * As we drop the lock additional netstack_rele()s can come in
         * and decrement the refcnt to zero and free the netstack_t.
         * Store the values in local variables and if we were not the last
         * then don't reference the netstack_t after that.
         */
        refcnt = ns->netstack_refcnt;
        numzones = ns->netstack_numzones;
        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
        mutex_exit(&ns->netstack_lock);

        if (refcnt == 0 && numzones == 0) {
                boolean_t is_not_intr = !servicing_interrupt();

                /*
                 * Because there are possibilities of kstats being held by
                 * callers, which would then be immediately freed, but held up
                 * due to kstat's odd reference model recording the thread, we
                 * choose to schedule the actual deletion of this netstack as
                 * a deferred task on the system taskq.  This way, any
                 * store-the-thread-pointer semantics won't trip over
                 * themselves.
                 *
                 * On the off chance this is called in interrupt context, we
                 * cannot use the semaphore to enforce rate-limiting.
                 */
                if (is_not_intr && sema_tryp(&netstack_reap_limiter) == 0) {
                        /*
                         * XXX KEBE SAYS indicate we're slamming against
                         * a limit.
                         */
                        hrtime_t measurement = gethrtime();

                        sema_p(&netstack_reap_limiter);
                        /* Capture delay in ns. */
                        DTRACE_PROBE1(netstack__reap__rate__limited,
                            hrtime_t *, gethrtime() - measurement);
                }

                if (taskq_dispatch(system_taskq,
                    is_not_intr ? netstack_reap : netstack_reap_intr, ns,
                    TQ_NOSLEEP) == NULL) {
                        /*
                         * Well shoot, why can't we taskq_dispatch?
                         * Take our chances with a direct call.
                         */
                        DTRACE_PROBE1(netstack__reap__taskq__fail,
                            netstack_t *, ns);
                        netstack_reap_work(ns, is_not_intr);
                }
        }
}

void
netstack_hold(netstack_t *ns)
{
        mutex_enter(&ns->netstack_lock);
        ns->netstack_refcnt++;
        ASSERT(ns->netstack_refcnt > 0);
        mutex_exit(&ns->netstack_lock);
        DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
}

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
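 *
 * For illustration (a sketch; the "foo" names and the stackid variable
 * are assumptions): a module creates its per-stack kstat against the
 * stackid, so that shared-stack zones all see it, and deletes it the
 * same way.
 *
 *      kstat_t *ks = kstat_create_netstack("foo", 0, "foostat", "net",
 *          KSTAT_TYPE_NAMED, 1, 0, stackid);
 *      if (ks != NULL)
 *              kstat_install(ks);
 *      ...
 *      kstat_delete_netstack(ks, stackid);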
 */
kstat_t *
kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
    char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
    netstackid_t ks_netstackid)
{
        kstat_t *ks;

        if (ks_netstackid == GLOBAL_NETSTACKID) {
                ks = kstat_create_zone(ks_module, ks_instance, ks_name,
                    ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
                if (ks != NULL)
                        netstack_shared_kstat_add(ks);
                return (ks);
        } else {
                zoneid_t zoneid = ks_netstackid;

                return (kstat_create_zone(ks_module, ks_instance, ks_name,
                    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
        }
}

void
kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
{
        if (ks_netstackid == GLOBAL_NETSTACKID) {
                netstack_shared_kstat_remove(ks);
        }
        kstat_delete(ks);
}

static void
netstack_shared_zone_add(zoneid_t zoneid)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list *sk;

        sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
        sz->sz_zoneid = zoneid;

        /* Insert in list */
        mutex_enter(&netstack_shared_lock);
        sz->sz_next = netstack_shared_zones;
        netstack_shared_zones = sz;

        /*
         * Perform kstat_zone_add for each existing shared stack kstat.
         * Note: Holds netstack_shared_lock lock across kstat_zone_add.
         */
        for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
                kstat_zone_add(sk->sk_kstat, zoneid);
        }
        mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_zone_remove(zoneid_t zoneid)
{
        struct shared_zone_list **szp, *sz;
        struct shared_kstat_list *sk;

        /* Find in list */
        mutex_enter(&netstack_shared_lock);
        sz = NULL;
        for (szp = &netstack_shared_zones; *szp != NULL;
            szp = &((*szp)->sz_next)) {
                if ((*szp)->sz_zoneid == zoneid) {
                        sz = *szp;
                        break;
                }
        }
        /* We must find it */
        ASSERT(sz != NULL);
        *szp = sz->sz_next;
        sz->sz_next = NULL;

        /*
         * Perform kstat_zone_remove for each existing shared stack kstat.
         * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
         */
        for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
                kstat_zone_remove(sk->sk_kstat, zoneid);
        }
        mutex_exit(&netstack_shared_lock);

        kmem_free(sz, sizeof (*sz));
}

static void
netstack_shared_kstat_add(kstat_t *ks)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list *sk;

        sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
        sk->sk_kstat = ks;

        /* Insert in list */
        mutex_enter(&netstack_shared_lock);
        sk->sk_next = netstack_shared_kstats;
        netstack_shared_kstats = sk;

        /*
         * Perform kstat_zone_add for each existing shared stack zone.
         * Note: Holds netstack_shared_lock lock across kstat_zone_add.
         */
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                kstat_zone_add(ks, sz->sz_zoneid);
        }
        mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_kstat_remove(kstat_t *ks)
{
        struct shared_zone_list *sz;
        struct shared_kstat_list **skp, *sk;

        /* Find in list */
        mutex_enter(&netstack_shared_lock);
        sk = NULL;
        for (skp = &netstack_shared_kstats; *skp != NULL;
            skp = &((*skp)->sk_next)) {
                if ((*skp)->sk_kstat == ks) {
                        sk = *skp;
                        break;
                }
        }
        /* Must find it */
        ASSERT(sk != NULL);
        *skp = sk->sk_next;
        sk->sk_next = NULL;

        /*
         * Perform kstat_zone_remove for each existing shared stack zone.
         * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
         */
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                kstat_zone_remove(ks, sz->sz_zoneid);
        }
        mutex_exit(&netstack_shared_lock);
        kmem_free(sk, sizeof (*sk));
}

/*
 * If a zoneid uses the global/shared stack, return true.
 */
static boolean_t
netstack_find_shared_zoneid(zoneid_t zoneid)
{
        struct shared_zone_list *sz;

        mutex_enter(&netstack_shared_lock);
        for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
                if (sz->sz_zoneid == zoneid) {
                        mutex_exit(&netstack_shared_lock);
                        return (B_TRUE);
                }
        }
        mutex_exit(&netstack_shared_lock);
        return (B_FALSE);
}

/*
 * Hide the fact that zoneids and netstackids are allocated from
 * the same space in the current implementation.
 * We currently do not check that the stackid/zoneids are valid, since
 * there is no need for that; callers should only pass ids that are
 * valid.
 */
zoneid_t
netstackid_to_zoneid(netstackid_t stackid)
{
        return (stackid);
}

netstackid_t
zoneid_to_netstackid(zoneid_t zoneid)
{
        if (netstack_find_shared_zoneid(zoneid))
                return (GLOBAL_NETSTACKID);
        else
                return (zoneid);
}

zoneid_t
netstack_get_zoneid(netstack_t *ns)
{
        return (netstackid_to_zoneid(ns->netstack_stackid));
}

/*
 * Simplistic support for walking all the handles.
 * Example usage:
 *      netstack_handle_t nh;
 *      netstack_t *ns;
 *
 *      netstack_next_init(&nh);
 *      while ((ns = netstack_next(&nh)) != NULL) {
 *              do something;
 *              netstack_rele(ns);
 *      }
 *      netstack_next_fini(&nh);
 */
void
netstack_next_init(netstack_handle_t *handle)
{
        *handle = 0;
}

/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
}

netstack_t *
netstack_next(netstack_handle_t *handle)
{
        netstack_t *ns;
        int i, end;

        end = *handle;
        /* Walk skipping *handle number of instances */

        /* Look if there is a matching stack instance */
        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        for (i = 0; i < end; i++) {
                if (ns == NULL)
                        break;
                ns = ns->netstack_next;
        }
        /* skip those that aren't really here */
        while (ns != NULL) {
                mutex_enter(&ns->netstack_lock);
                if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
                        mutex_exit(&ns->netstack_lock);
                        break;
                }
                mutex_exit(&ns->netstack_lock);
                end++;
                ns = ns->netstack_next;
        }
        if (ns != NULL) {
                *handle = end + 1;
                netstack_hold(ns);
        }
        mutex_exit(&netstack_g_lock);
        return (ns);
}