1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/vm.h>
  31 #include <sys/proc.h>
  32 #include <sys/tuneable.h>
  33 #include <sys/systm.h>
  34 #include <sys/cmn_err.h>
  35 #include <sys/debug.h>
  36 #include <sys/sdt.h>
  37 #include <sys/mutex.h>
  38 #include <sys/bitmap.h>
  39 #include <sys/atomic.h>
  40 #include <sys/sunddi.h>
  41 #include <sys/kobj.h>
  42 #include <sys/disp.h>
  43 #include <vm/seg_kmem.h>
  44 #include <sys/zone.h>
  45 #include <sys/netstack.h>
  46 
  47 /*
  48  * What we use so that the zones framework can tell us about new zones,
  49  * which we use to create new stacks.
  50  */
  51 static zone_key_t netstack_zone_key;
  52 
  53 static int      netstack_initialized = 0;
  54 
  55 /*
  56  * Track the registered netstacks.
  57  * The global lock protects
  58  * - ns_reg
  59  * - the list starting at netstack_head and following the netstack_next
  60  *   pointers.
  61  */
  62 static kmutex_t netstack_g_lock;
  63 
  64 /*
 * Registry of netstacks with their create/shutdown/destroy functions.
  66  */
  67 static struct netstack_registry ns_reg[NS_MAX];
  68 
  69 /*
  70  * Global list of existing stacks.  We use this when a new zone with
  71  * an exclusive IP instance is created.
  72  *
  73  * Note that in some cases a netstack_t needs to stay around after the zone
  74  * has gone away. This is because there might be outstanding references
  75  * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
  76  * structure and all the foo_stack_t's hanging off of it will be cleaned up
  77  * when the last reference to it is dropped.
  78  * However, the same zone might be rebooted. That is handled using the
  79  * assumption that the zones framework picks a new zoneid each time a zone
  80  * is (re)booted. We assert for that condition in netstack_zone_create().
  81  * Thus the old netstack_t can take its time for things to time out.
  82  */
  83 static netstack_t *netstack_head;
  84 
  85 /*
  86  * To support kstat_create_netstack() using kstat_zone_add we need
  87  * to track both
  88  *  - all zoneids that use the global/shared stack
  89  *  - all kstats that have been added for the shared stack
  90  */
/* One entry per zoneid currently sharing the global stack. */
struct shared_zone_list {
	struct shared_zone_list *sz_next;	/* singly-linked list */
	zoneid_t		sz_zoneid;	/* zone using the shared stack */
};

/* One entry per kstat created against the shared stack. */
struct shared_kstat_list {
	struct shared_kstat_list *sk_next;	/* singly-linked list */
	kstat_t			 *sk_kstat;	/* kstat added for shared stack */
};
 100 
 101 static kmutex_t netstack_shared_lock;   /* protects the following two */
 102 static struct shared_zone_list  *netstack_shared_zones;
 103 static struct shared_kstat_list *netstack_shared_kstats;
 104 
 105 static void     *netstack_zone_create(zoneid_t zoneid);
 106 static void     netstack_zone_shutdown(zoneid_t zoneid, void *arg);
 107 static void     netstack_zone_destroy(zoneid_t zoneid, void *arg);
 108 
 109 static void     netstack_shared_zone_add(zoneid_t zoneid);
 110 static void     netstack_shared_zone_remove(zoneid_t zoneid);
 111 static void     netstack_shared_kstat_add(kstat_t *ks);
 112 static void     netstack_shared_kstat_remove(kstat_t *ks);
 113 
 114 typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
 115 
 116 static void     apply_all_netstacks(int, applyfn_t *);
 117 static void     apply_all_modules(netstack_t *, applyfn_t *);
 118 static void     apply_all_modules_reverse(netstack_t *, applyfn_t *);
 119 static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
 120 static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
 121 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
 122 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
 123 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
 124     kmutex_t *);
 125 
 126 static ksema_t netstack_reap_limiter;
 127 /*
 128  * Hard-coded constant, but since this is not tunable in real-time, it seems
 129  * making it an /etc/system tunable is better than nothing.
 130  */
 131 uint_t netstack_outstanding_reaps = 1024;
 132 
/*
 * One-time framework initialization, called during boot before any
 * netstack_register() or zone creation can occur.
 */
void
netstack_init(void)
{
	mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Bound the number of simultaneous outstanding netstack reaps;
	 * the initial count comes from the netstack_outstanding_reaps
	 * /etc/system tunable.
	 */
	sema_init(&netstack_reap_limiter, netstack_outstanding_reaps, NULL,
	    SEMA_DRIVER, NULL);

	/*
	 * Must be set before zone_key_create(), since that can invoke
	 * netstack_zone_create() which ASSERTs netstack_initialized.
	 */
	netstack_initialized = 1;

	/*
	 * We want to be informed each time a zone is created or
	 * destroyed in the kernel, so we can maintain the
	 * stack instance information.
	 */
	zone_key_create(&netstack_zone_key, netstack_zone_create,
	    netstack_zone_shutdown, netstack_zone_destroy);
}
 152 
 153 /*
 154  * Register a new module with the framework.
 155  * This registers interest in changes to the set of netstacks.
 156  * The createfn and destroyfn are required, but the shutdownfn can be
 157  * NULL.
 158  * Note that due to the current zsd implementation, when the create
 159  * function is called the zone isn't fully present, thus functions
 160  * like zone_find_by_* will fail, hence the create function can not
 161  * use many zones kernel functions including zcmn_err().
 162  */
void
netstack_register(int moduleid,
    void *(*module_create)(netstackid_t, netstack_t *),
    void (*module_shutdown)(netstackid_t, void *),
    void (*module_destroy)(netstackid_t, void *))
{
	netstack_t *ns;

	ASSERT(netstack_initialized);
	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
	ASSERT(module_create != NULL);

	/*
	 * Make instances created after this point in time run the create
	 * callback.
	 */
	mutex_enter(&netstack_g_lock);
	/* Slot must be free: double registration is a caller bug. */
	ASSERT(ns_reg[moduleid].nr_create == NULL);
	ASSERT(ns_reg[moduleid].nr_flags == 0);
	ns_reg[moduleid].nr_create = module_create;
	ns_reg[moduleid].nr_shutdown = module_shutdown;
	ns_reg[moduleid].nr_destroy = module_destroy;
	ns_reg[moduleid].nr_flags = NRF_REGISTERED;

	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_CREATE_NEEDED for each of those.
	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
	 * set, but check NSF_CLOSING to be sure.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);
		/*
		 * Skip stacks that are going away, and stacks where a
		 * create is already needed/in progress/completed
		 * (NSS_CREATE_ALL covers all three states).
		 */
		if (!(ns->netstack_flags & NSF_CLOSING) &&
		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
			nms->nms_flags |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	/*
	 * At this point in time a new instance can be created or an instance
	 * can be destroyed, or some other module can register or unregister.
	 * Make sure we either run all the create functions for this moduleid
	 * or we wait for any other creators for this moduleid.
	 */
	apply_all_netstacks(moduleid, netstack_apply_create);
}
 215 
/*
 * Unregister a module from the framework.
 * Runs the shutdown and destroy callbacks for every existing netstack
 * instance, then clears the registry slot so the module can later be
 * registered again (e.g. after a module reload).
 */
void
netstack_unregister(int moduleid)
{
	netstack_t *ns;

	ASSERT(moduleid >= 0 && moduleid < NS_MAX);

	ASSERT(ns_reg[moduleid].nr_create != NULL);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);

	mutex_enter(&netstack_g_lock);
	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
	 * That ensures that when we return all the callbacks for existing
	 * instances have completed. And since we set NRF_DYING no new
	 * instances can use this module.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		boolean_t created = B_FALSE;
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);

		/*
		 * We need to be careful here. We could actually have a netstack
		 * being created as we speak waiting for us to let go of this
		 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
		 * have gotten to the point of completing it yet. If
		 * NSS_CREATE_NEEDED, we can safely just remove it here and
		 * never create the module. However, if NSS_CREATE_INPROGRESS is
		 * set, we need to still flag this module for shutdown and
		 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
		 *
		 * It is safe to do that because of two different guarantees
		 * that exist in the system. The first is that before we do a
		 * create, shutdown, or destroy, we ensure that nothing else is
		 * in progress in the system for this netstack and wait for it
		 * to complete. Secondly, because the zone is being created, we
		 * know that the following call to apply_all_netstack will block
		 * on the zone finishing its initialization.
		 */
		if (nms->nms_flags & NSS_CREATE_NEEDED)
			nms->nms_flags &= ~NSS_CREATE_NEEDED;

		if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
		    nms->nms_flags & NSS_CREATE_COMPLETED)
			created = B_TRUE;

		/* Only shut down instances whose create ran (or will run). */
		if (ns_reg[moduleid].nr_shutdown != NULL && created &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, moduleid);
		}
		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
		    ns_reg[moduleid].nr_destroy != NULL && created &&
		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
			nms->nms_flags |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	/*
	 * Prevent any new netstack from calling the registered create
	 * function, while keeping the function pointers in place until the
	 * shutdown and destroy callbacks are complete.
	 */
	ns_reg[moduleid].nr_flags |= NRF_DYING;
	mutex_exit(&netstack_g_lock);

	/* Shutdown before destroy, across all existing stacks. */
	apply_all_netstacks(moduleid, netstack_apply_shutdown);
	apply_all_netstacks(moduleid, netstack_apply_destroy);

	/*
	 * Clear the nms_flags so that we can handle this module
	 * being loaded again.
	 * Also remove the registered functions.
	 */
	mutex_enter(&netstack_g_lock);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);
		if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
			nms->nms_flags = 0;
			DTRACE_PROBE2(netstack__destroy__done,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}

	/* Free the registry slot for a future netstack_register(). */
	ns_reg[moduleid].nr_create = NULL;
	ns_reg[moduleid].nr_shutdown = NULL;
	ns_reg[moduleid].nr_destroy = NULL;
	ns_reg[moduleid].nr_flags = 0;
	mutex_exit(&netstack_g_lock);
}
 318 
 319 /*
 320  * Lookup and/or allocate a netstack for this zone.
 321  */
static void *
netstack_zone_create(zoneid_t zoneid)
{
	netstackid_t stackid;
	netstack_t *ns;
	netstack_t **nsp;
	zone_t	*zone;
	int i;

	ASSERT(netstack_initialized);

	zone = zone_find_by_id_nolock(zoneid);
	ASSERT(zone != NULL);

	if (zone->zone_flags & ZF_NET_EXCL) {
		/* Exclusive IP instance: stackid is the zoneid itself. */
		stackid = zoneid;
	} else {
		/* Look for the stack instance for the global */
		stackid = GLOBAL_NETSTACKID;
	}

	/* Allocate even if it isn't needed; simplifies locking */
	ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);

	/* Look if there is a matching stack instance */
	mutex_enter(&netstack_g_lock);
	for (nsp = &netstack_head; *nsp != NULL;
	    nsp = &((*nsp)->netstack_next)) {
		if ((*nsp)->netstack_stackid == stackid) {
			/*
			 * Should never find a pre-existing exclusive stack
			 */
			VERIFY(stackid == GLOBAL_NETSTACKID);
			kmem_free(ns, sizeof (netstack_t));
			ns = *nsp;
			mutex_enter(&ns->netstack_lock);
			ns->netstack_numzones++;
			mutex_exit(&ns->netstack_lock);
			mutex_exit(&netstack_g_lock);
			DTRACE_PROBE1(netstack__inc__numzones,
			    netstack_t *, ns);
			/* Record that we have a new shared stack zone */
			netstack_shared_zone_add(zoneid);
			zone->zone_netstack = ns;
			return (ns);
		}
	}
	/* Not found: initialize the freshly allocated netstack and link it. */
	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
	ns->netstack_stackid = zoneid;
	ns->netstack_numzones = 1;
	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
	ns->netstack_flags = NSF_UNINIT;
	*nsp = ns;	/* append at the tail of the global list */
	zone->zone_netstack = ns;

	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a CREATE running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing create callbacks to complete in moduleid order
	 */
	ns->netstack_flags |= NSF_ZONE_CREATE;

	/*
	 * Determine the set of module create functions that need to be
	 * called before we drop the lock.
	 * Set NSS_CREATE_NEEDED for each of those.
	 * Skip any with NRF_DYING set, since those are in the process of
	 * going away, by checking for flags being exactly NRF_REGISTERED.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);

		if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
			nms->nms_flags |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/* Run every flagged create callback, in moduleid order. */
	apply_all_modules(ns, netstack_apply_create);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ns->netstack_flags &= ~NSF_UNINIT;
	ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
	ns->netstack_flags &= ~NSF_ZONE_CREATE;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);

	return (ns);
}
 421 
 422 /* ARGSUSED */
/*
 * Zone shutdown callback (ZSD). Runs the shutdown callbacks for all
 * registered modules, but only if this is the last zone using the stack
 * instance.
 */
static void
netstack_zone_shutdown(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;
	int i;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	if (ns->netstack_numzones != 1) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		/* Only the shared stack can be used by more than one zone. */
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		return;
	}
	mutex_exit(&ns->netstack_lock);

	mutex_enter(&netstack_g_lock);
	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a SHUTDOWN running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing shutdown callbacks to complete in moduleid order
	 */
	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
	ns->netstack_flags |= NSF_ZONE_SHUTDOWN;

	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_SHUTDOWN_NEEDED for each of those.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/*
	 * Call the shutdown function for all registered modules for this
	 * netstack.
	 */
	apply_all_modules_reverse(ns, netstack_apply_shutdown);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
	ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);
}
 483 
 484 /*
 485  * Common routine to release a zone.
 486  * If this was the last zone using the stack instance then prepare to
 487  * have the refcnt dropping to zero free the zone.
 488  */
 489 /* ARGSUSED */
/* ARGSUSED */
static void
netstack_zone_destroy(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	ns->netstack_numzones--;
	if (ns->netstack_numzones != 0) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		/* Only the shared stack can be used by more than one zone. */
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		/* Record that a shared stack zone has gone away */
		netstack_shared_zone_remove(zoneid);
		return;
	}
	/*
	 * Set CLOSING so that netstack_find_by will not find it.
	 */
	ns->netstack_flags |= NSF_CLOSING;
	mutex_exit(&ns->netstack_lock);
	DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
	/* No other thread can call zone_destroy for this stack */

	/*
	 * Decrease refcnt to account for the one taken in
	 * netstack_zone_create(). When it reaches zero the stack is freed.
	 */
	netstack_rele(ns);
}
 521 
 522 /*
 523  * Called when the reference count drops to zero.
 524  * Call the destroy functions for each registered module.
 525  */
static void
netstack_stack_inactive(netstack_t *ns)
{
	int i;

	mutex_enter(&netstack_g_lock);
	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a DESTROY running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing destroy callbacks to complete in reverse moduleid order
	 */
	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
	ns->netstack_flags |= NSF_ZONE_DESTROY;
	/*
	 * If the shutdown callback wasn't called earlier (e.g., if this is
	 * a netstack shared between multiple zones), then we schedule it now.
	 *
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_DESTROY_NEEDED for each of those. That
	 * ensures that when we return all the callbacks for existing
	 * instances have completed.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		/* Flag any shutdown that hasn't run/been scheduled yet. */
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}

		/* Flag the destroy for every module whose create completed. */
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_destroy != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
			nms->nms_flags |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/*
	 * Call the shutdown and destroy functions for all registered modules
	 * for this netstack.
	 *
	 * Since there are some ordering dependencies between the modules we
	 * tear them down in the reverse order of what was used to create them.
	 *
	 * Since a netstack_t is never reused (when a zone is rebooted it gets
	 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
	 * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
	 * That is different than in the netstack_unregister() case.
	 */
	apply_all_modules_reverse(ns, netstack_apply_shutdown);
	apply_all_modules_reverse(ns, netstack_apply_destroy);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
	ns->netstack_flags &= ~NSF_ZONE_DESTROY;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);
}
 595 
 596 /*
 597  * Apply a function to all netstacks for a particular moduleid.
 598  *
 599  * If there is any zone activity (due to a zone being created, shutdown,
 600  * or destroyed) we wait for that to complete before we proceed. This ensures
 601  * that the moduleids are processed in order when a zone is created or
 602  * destroyed.
 603  *
 604  * The applyfn has to drop netstack_g_lock if it does some work.
 605  * In that case we don't follow netstack_next,
 606  * even if it is possible to do so without any hazards. This is
 607  * because we want the design to allow for the list of netstacks threaded
 608  * by netstack_next to change in any arbitrary way during the time the
 609  * lock was dropped.
 610  *
 611  * It is safe to restart the loop at netstack_head since the applyfn
 612  * changes netstack_m_state as it processes things, so a subsequent
 613  * pass through will have no effect in applyfn, hence the loop will terminate
 614  * in at worst O(N^2).
 615  */
 616 static void
 617 apply_all_netstacks(int moduleid, applyfn_t *applyfn)
 618 {
 619         netstack_t *ns;
 620 
 621         mutex_enter(&netstack_g_lock);
 622         ns = netstack_head;
 623         while (ns != NULL) {
 624                 if (wait_for_zone_creator(ns, &netstack_g_lock)) {
 625                         /* Lock dropped - restart at head */
 626                         ns = netstack_head;
 627                 } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
 628                         /* Lock dropped - restart at head */
 629                         ns = netstack_head;
 630                 } else {
 631                         ns = ns->netstack_next;
 632                 }
 633         }
 634         mutex_exit(&netstack_g_lock);
 635 }
 636 
 637 /*
 638  * Apply a function to all moduleids for a particular netstack.
 639  *
 640  * Since the netstack linkage doesn't matter in this case we can
 641  * ignore whether the function drops the lock.
 642  */
 643 static void
 644 apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
 645 {
 646         int i;
 647 
 648         mutex_enter(&netstack_g_lock);
 649         for (i = 0; i < NS_MAX; i++) {
 650                 /*
 651                  * We don't care whether the lock was dropped
 652                  * since we are not iterating over netstack_head.
 653                  */
 654                 (void) (applyfn)(&netstack_g_lock, ns, i);
 655         }
 656         mutex_exit(&netstack_g_lock);
 657 }
 658 
 659 /* Like the above but in reverse moduleid order */
 660 static void
 661 apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
 662 {
 663         int i;
 664 
 665         mutex_enter(&netstack_g_lock);
 666         for (i = NS_MAX-1; i >= 0; i--) {
 667                 /*
 668                  * We don't care whether the lock was dropped
 669                  * since we are not iterating over netstack_head.
 670                  */
 671                 (void) (applyfn)(&netstack_g_lock, ns, i);
 672         }
 673         mutex_exit(&netstack_g_lock);
 674 }
 675 
 676 /*
 677  * Call the create function for the ns and moduleid if CREATE_NEEDED
 678  * is set.
 679  * If some other thread gets here first and sets *_INPROGRESS, then
 680  * we wait for that thread to complete so that we can ensure that
 681  * all the callbacks are done when we've looped over all netstacks/moduleids.
 682  *
 683  * When we call the create function, we temporarily drop the netstack_lock
 684  * held by the caller, and return true to tell the caller it needs to
 685  * re-evalute the state.
 686  */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	void *result;
	netstackid_t stackid;
	nm_state_t *nms = &ns->netstack_m_state[moduleid];
	/* Tracks whether we ever dropped lockp; returned to the caller. */
	boolean_t dropped = B_FALSE;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);

	/* If another thread is mid-callback, wait for it (may drop lockp). */
	if (wait_for_nms_inprogress(ns, nms, lockp))
		dropped = B_TRUE;

	if (nms->nms_flags & NSS_CREATE_NEEDED) {
		/* Claim the work: NEEDED -> INPROGRESS under netstack_lock. */
		nms->nms_flags &= ~NSS_CREATE_NEEDED;
		nms->nms_flags |= NSS_CREATE_INPROGRESS;
		DTRACE_PROBE2(netstack__create__inprogress,
		    netstack_t *, ns, int, moduleid);
		/* Drop both locks so the callback can block/allocate. */
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(ns_reg[moduleid].nr_create != NULL);
		stackid = ns->netstack_stackid;
		DTRACE_PROBE2(netstack__create__start,
		    netstackid_t, stackid,
		    netstack_t *, ns);
		result = (ns_reg[moduleid].nr_create)(stackid, ns);
		DTRACE_PROBE2(netstack__create__end,
		    void *, result, netstack_t *, ns);

		/* Create callbacks must not fail. */
		ASSERT(result != NULL);
		mutex_enter(lockp);
		mutex_enter(&ns->netstack_lock);
		ns->netstack_modules[moduleid] = result;
		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
		nms->nms_flags |= NSS_CREATE_COMPLETED;
		/* Wake threads in wait_for_nms_inprogress(). */
		cv_broadcast(&nms->nms_cv);
		DTRACE_PROBE2(netstack__create__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	}
}
 735 
 736 /*
 737  * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 738  * is set.
 739  * If some other thread gets here first and sets *_INPROGRESS, then
 740  * we wait for that thread to complete so that we can ensure that
 741  * all the callbacks are done when we've looped over all netstacks/moduleids.
 742  *
 743  * When we call the shutdown function, we temporarily drop the netstack_lock
 744  * held by the caller, and return true to tell the caller it needs to
 745  * re-evalute the state.
 746  */
 747 static boolean_t
 748 netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
 749 {
 750         netstackid_t stackid;
 751         void * netstack_module;
 752         nm_state_t *nms = &ns->netstack_m_state[moduleid];
 753         boolean_t dropped = B_FALSE;
 754 
 755         ASSERT(MUTEX_HELD(lockp));
 756         mutex_enter(&ns->netstack_lock);
 757 
 758         if (wait_for_nms_inprogress(ns, nms, lockp))
 759                 dropped = B_TRUE;
 760 
 761         if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
 762                 nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
 763                 nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
 764                 DTRACE_PROBE2(netstack__shutdown__inprogress,
 765                     netstack_t *, ns, int, moduleid);
 766                 mutex_exit(&ns->netstack_lock);
 767                 mutex_exit(lockp);
 768                 dropped = B_TRUE;
 769 
 770                 ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
 771                 stackid = ns->netstack_stackid;
 772                 netstack_module = ns->netstack_modules[moduleid];
 773                 DTRACE_PROBE2(netstack__shutdown__start,
 774                     netstackid_t, stackid,
 775                     void *, netstack_module);
 776                 (ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
 777                 DTRACE_PROBE1(netstack__shutdown__end,
 778                     netstack_t *, ns);
 779 
 780                 mutex_enter(lockp);
 781                 mutex_enter(&ns->netstack_lock);
 782                 nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
 783                 nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
 784                 cv_broadcast(&nms->nms_cv);
 785                 DTRACE_PROBE2(netstack__shutdown__completed,
 786                     netstack_t *, ns, int, moduleid);
 787                 mutex_exit(&ns->netstack_lock);
 788                 return (dropped);
 789         } else {
 790                 mutex_exit(&ns->netstack_lock);
 791                 return (dropped);
 792         }
 793 }
 794 
 795 /*
 796  * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 797  * is set.
 798  * If some other thread gets here first and sets *_INPROGRESS, then
 799  * we wait for that thread to complete so that we can ensure that
 800  * all the callbacks are done when we've looped over all netstacks/moduleids.
 801  *
 802  * When we call the destroy function, we temporarily drop the netstack_lock
 803  * held by the caller, and return true to tell the caller it needs to
 804  * re-evalute the state.
 805  */
 806 static boolean_t
 807 netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
 808 {
 809         netstackid_t stackid;
 810         void * netstack_module;
 811         nm_state_t *nms = &ns->netstack_m_state[moduleid];
 812         boolean_t dropped = B_FALSE;
 813 
 814         ASSERT(MUTEX_HELD(lockp));
 815         mutex_enter(&ns->netstack_lock);
 816 
 817         if (wait_for_nms_inprogress(ns, nms, lockp))
 818                 dropped = B_TRUE;
 819 
 820         if (nms->nms_flags & NSS_DESTROY_NEEDED) {
 821                 nms->nms_flags &= ~NSS_DESTROY_NEEDED;
 822                 nms->nms_flags |= NSS_DESTROY_INPROGRESS;
 823                 DTRACE_PROBE2(netstack__destroy__inprogress,
 824                     netstack_t *, ns, int, moduleid);
 825                 mutex_exit(&ns->netstack_lock);
 826                 mutex_exit(lockp);
 827                 dropped = B_TRUE;
 828 
 829                 ASSERT(ns_reg[moduleid].nr_destroy != NULL);
 830                 stackid = ns->netstack_stackid;
 831                 netstack_module = ns->netstack_modules[moduleid];
 832                 DTRACE_PROBE2(netstack__destroy__start,
 833                     netstackid_t, stackid,
 834                     void *, netstack_module);
 835                 (ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
 836                 DTRACE_PROBE1(netstack__destroy__end,
 837                     netstack_t *, ns);
 838 
 839                 mutex_enter(lockp);
 840                 mutex_enter(&ns->netstack_lock);
 841                 ns->netstack_modules[moduleid] = NULL;
 842                 nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
 843                 nms->nms_flags |= NSS_DESTROY_COMPLETED;
 844                 cv_broadcast(&nms->nms_cv);
 845                 DTRACE_PROBE2(netstack__destroy__completed,
 846                     netstack_t *, ns, int, moduleid);
 847                 mutex_exit(&ns->netstack_lock);
 848                 return (dropped);
 849         } else {
 850                 mutex_exit(&ns->netstack_lock);
 851                 return (dropped);
 852         }
 853 }
 854 
 855 /*
 856  * If somebody  is creating the netstack (due to a new zone being created)
 857  * then we wait for them to complete. This ensures that any additional
 858  * netstack_register() doesn't cause the create functions to run out of
 859  * order.
 860  * Note that we do not need such a global wait in the case of the shutdown
 861  * and destroy callbacks, since in that case it is sufficient for both
 862  * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
 863  * Returns true if lockp was temporarily dropped while waiting.
 864  */
 865 static boolean_t
 866 wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
 867 {
 868         boolean_t dropped = B_FALSE;
 869 
 870         mutex_enter(&ns->netstack_lock);
 871         while (ns->netstack_flags & NSF_ZONE_CREATE) {
 872                 DTRACE_PROBE1(netstack__wait__zone__inprogress,
 873                     netstack_t *, ns);
 874                 if (lockp != NULL) {
 875                         dropped = B_TRUE;
 876                         mutex_exit(lockp);
 877                 }
 878                 cv_wait(&ns->netstack_cv, &ns->netstack_lock);
 879                 if (lockp != NULL) {
 880                         /* First drop netstack_lock to preserve order */
 881                         mutex_exit(&ns->netstack_lock);
 882                         mutex_enter(lockp);
 883                         mutex_enter(&ns->netstack_lock);
 884                 }
 885         }
 886         mutex_exit(&ns->netstack_lock);
 887         return (dropped);
 888 }
 889 
 890 /*
 891  * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
 892  * combination.
 893  * Returns true if lockp was temporarily dropped while waiting.
 894  */
 895 static boolean_t
 896 wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
 897 {
 898         boolean_t dropped = B_FALSE;
 899 
 900         while (nms->nms_flags & NSS_ALL_INPROGRESS) {
 901                 DTRACE_PROBE2(netstack__wait__nms__inprogress,
 902                     netstack_t *, ns, nm_state_t *, nms);
 903                 if (lockp != NULL) {
 904                         dropped = B_TRUE;
 905                         mutex_exit(lockp);
 906                 }
 907                 cv_wait(&nms->nms_cv, &ns->netstack_lock);
 908                 if (lockp != NULL) {
 909                         /* First drop netstack_lock to preserve order */
 910                         mutex_exit(&ns->netstack_lock);
 911                         mutex_enter(lockp);
 912                         mutex_enter(&ns->netstack_lock);
 913                 }
 914         }
 915         return (dropped);
 916 }
 917 
 918 /*
 919  * Get the stack instance used in caller's zone.
 920  * Increases the reference count, caller must do a netstack_rele.
 921  * It can't be called after zone_destroy() has started.
 922  */
 923 netstack_t *
 924 netstack_get_current(void)
 925 {
 926         netstack_t *ns;
 927 
 928         ns = curproc->p_zone->zone_netstack;
 929         ASSERT(ns != NULL);
 930         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 931                 return (NULL);
 932 
 933         netstack_hold(ns);
 934 
 935         return (ns);
 936 }
 937 
 938 /*
 939  * Find a stack instance given the cred.
 940  * This is used by the modules to potentially allow for a future when
 941  * something other than the zoneid is used to determine the stack.
 942  */
 943 netstack_t *
 944 netstack_find_by_cred(const cred_t *cr)
 945 {
 946         zoneid_t zoneid = crgetzoneid(cr);
 947 
 948         /* Handle the case when cr_zone is NULL */
 949         if (zoneid == (zoneid_t)-1)
 950                 zoneid = GLOBAL_ZONEID;
 951 
 952         /* For performance ... */
 953         if (curproc->p_zone->zone_id == zoneid)
 954                 return (netstack_get_current());
 955         else
 956                 return (netstack_find_by_zoneid(zoneid));
 957 }
 958 
 959 /*
 960  * Find a stack instance given the zoneid.
 961  * Increases the reference count if found; caller must do a
 962  * netstack_rele().
 963  *
 964  * If there is no exact match then assume the shared stack instance
 965  * matches.
 966  *
 967  * Skip the unitialized ones.
 968  */
 969 netstack_t *
 970 netstack_find_by_zoneid(zoneid_t zoneid)
 971 {
 972         netstack_t *ns;
 973         zone_t *zone;
 974 
 975         zone = zone_find_by_id(zoneid);
 976 
 977         if (zone == NULL)
 978                 return (NULL);
 979 
 980         ns = zone->zone_netstack;
 981         ASSERT(ns != NULL);
 982         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 983                 ns = NULL;
 984         else
 985                 netstack_hold(ns);
 986 
 987         zone_rele(zone);
 988         return (ns);
 989 }
 990 
 991 /*
 992  * Find a stack instance given the zoneid. Can only be called from
 993  * the create callback. See the comments in zone_find_by_id_nolock why
 994  * that limitation exists.
 995  *
 996  * Increases the reference count if found; caller must do a
 997  * netstack_rele().
 998  *
 999  * If there is no exact match then assume the shared stack instance
1000  * matches.
1001  *
1002  * Skip the unitialized ones.
1003  */
1004 netstack_t *
1005 netstack_find_by_zoneid_nolock(zoneid_t zoneid)
1006 {
1007         netstack_t *ns;
1008         zone_t *zone;
1009 
1010         zone = zone_find_by_id_nolock(zoneid);
1011 
1012         if (zone == NULL)
1013                 return (NULL);
1014 
1015         ns = zone->zone_netstack;
1016         ASSERT(ns != NULL);
1017 
1018         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
1019                 ns = NULL;
1020         else
1021                 netstack_hold(ns);
1022 
1023         /* zone_find_by_id_nolock does not have a hold on the zone */
1024         return (ns);
1025 }
1026 
1027 /*
1028  * Find a stack instance given the stackid with exact match?
1029  * Increases the reference count if found; caller must do a
1030  * netstack_rele().
1031  *
1032  * Skip the unitialized ones.
1033  */
1034 netstack_t *
1035 netstack_find_by_stackid(netstackid_t stackid)
1036 {
1037         netstack_t *ns;
1038 
1039         mutex_enter(&netstack_g_lock);
1040         for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1041                 mutex_enter(&ns->netstack_lock);
1042                 if (ns->netstack_stackid == stackid &&
1043                     !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
1044                         mutex_exit(&ns->netstack_lock);
1045                         netstack_hold(ns);
1046                         mutex_exit(&netstack_g_lock);
1047                         return (ns);
1048                 }
1049                 mutex_exit(&ns->netstack_lock);
1050         }
1051         mutex_exit(&netstack_g_lock);
1052         return (NULL);
1053 }
1054 
1055 boolean_t
1056 netstack_inuse_by_stackid(netstackid_t stackid)
1057 {
1058         netstack_t *ns;
1059         boolean_t rval = B_FALSE;
1060 
1061         mutex_enter(&netstack_g_lock);
1062 
1063         for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1064                 if (ns->netstack_stackid == stackid) {
1065                         rval = B_TRUE;
1066                         break;
1067                 }
1068         }
1069 
1070         mutex_exit(&netstack_g_lock);
1071 
1072         return (rval);
1073 }
1074 
1075 
/*
 * Taskq callback that tears down a netstack once the last reference has
 * been dropped: runs the destroy callbacks, unlinks the netstack from
 * the global list, and frees it.
 */
static void
netstack_reap(void *arg)
{
        netstack_t **nsp, *ns = (netstack_t *)arg;
        boolean_t found;
        int i;

        /*
         * Time to call the destroy functions and free up
         * the structure
         */
        netstack_stack_inactive(ns);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        /* Finally remove from list of netstacks */
        mutex_enter(&netstack_g_lock);
        found = B_FALSE;
        for (nsp = &netstack_head; *nsp != NULL;
            nsp = &(*nsp)->netstack_next) {
                if (*nsp == ns) {
                        *nsp = ns->netstack_next;
                        ns->netstack_next = NULL;
                        found = B_TRUE;
                        break;
                }
        }
        ASSERT(found);
        mutex_exit(&netstack_g_lock);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        ASSERT(ns->netstack_flags & NSF_CLOSING);

        /* Tear down the per-module synchronization state */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_destroy(&nms->nms_cv);
        }
        mutex_destroy(&ns->netstack_lock);
        cv_destroy(&ns->netstack_cv);
        kmem_free(ns, sizeof (*ns));
        /* Allow another reap to be scheduled. */
        sema_v(&netstack_reap_limiter);
}
1125 
/*
 * Release a reference on a netstack. When both the reference count and
 * the zone count reach zero, the actual teardown is deferred to
 * netstack_reap() on the system taskq.
 */
void
netstack_rele(netstack_t *ns)
{
        int refcnt, numzones;

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_refcnt > 0);
        ns->netstack_refcnt--;
        /*
         * As we drop the lock additional netstack_rele()s can come in
         * and decrement the refcnt to zero and free the netstack_t.
         * Store pointers in local variables and if we were not the last
         * then don't reference the netstack_t after that.
         */
        refcnt = ns->netstack_refcnt;
        numzones = ns->netstack_numzones;
        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
        mutex_exit(&ns->netstack_lock);

        if (refcnt == 0 && numzones == 0) {
                /*
                 * Because there are possibilities of re-entrancy in various
                 * netstack structures by callers, which might cause a lock up
                 * due to odd reference models, or other factors, we choose to
                 * schedule the actual deletion of this netstack as a deferred
                 * task on the system taskq.  This way, any such reference
                 * models won't trip over themselves.
                 *
                 * Assume we aren't in a high-priority interrupt context, so
                 * we can use KM_SLEEP and semaphores.
                 */
                if (sema_tryp(&netstack_reap_limiter) == 0) {
                        /*
                         * Indicate we're slamming against a limit.
                         */
                        hrtime_t measurement = gethrtime();

                        /* Block until an earlier reap finishes and sema_v()s */
                        sema_p(&netstack_reap_limiter);
                        /* Capture delay in ns. */
                        DTRACE_PROBE1(netstack__reap__rate__limited,
                            hrtime_t, gethrtime() - measurement);
                }

                /* TQ_SLEEP should prevent taskq_dispatch() from failing. */
                (void) taskq_dispatch(system_taskq, netstack_reap, ns,
                    TQ_SLEEP);
        }
}
1174 
/*
 * Increase the reference count on a netstack the caller already holds
 * a valid reference to (or otherwise knows cannot go away).
 */
void
netstack_hold(netstack_t *ns)
{
        mutex_enter(&ns->netstack_lock);
        ns->netstack_refcnt++;
        ASSERT(ns->netstack_refcnt > 0);
        mutex_exit(&ns->netstack_lock);
        DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
}
1184 
1185 /*
1186  * To support kstat_create_netstack() using kstat_zone_add we need
1187  * to track both
1188  *  - all zoneids that use the global/shared stack
1189  *  - all kstats that have been added for the shared stack
1190  */
1191 kstat_t *
1192 kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
1193     char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
1194     netstackid_t ks_netstackid)
1195 {
1196         kstat_t *ks;
1197 
1198         if (ks_netstackid == GLOBAL_NETSTACKID) {
1199                 ks = kstat_create_zone(ks_module, ks_instance, ks_name,
1200                     ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
1201                 if (ks != NULL)
1202                         netstack_shared_kstat_add(ks);
1203                 return (ks);
1204         } else {
1205                 zoneid_t zoneid = ks_netstackid;
1206 
1207                 return (kstat_create_zone(ks_module, ks_instance, ks_name,
1208                     ks_class, ks_type, ks_ndata, ks_flags, zoneid));
1209         }
1210 }
1211 
1212 void
1213 kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
1214 {
1215         if (ks_netstackid == GLOBAL_NETSTACKID) {
1216                 netstack_shared_kstat_remove(ks);
1217         }
1218         kstat_delete(ks);
1219 }
1220 
1221 static void
1222 netstack_shared_zone_add(zoneid_t zoneid)
1223 {
1224         struct shared_zone_list *sz;
1225         struct shared_kstat_list *sk;
1226 
1227         sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
1228         sz->sz_zoneid = zoneid;
1229 
1230         /* Insert in list */
1231         mutex_enter(&netstack_shared_lock);
1232         sz->sz_next = netstack_shared_zones;
1233         netstack_shared_zones = sz;
1234 
1235         /*
1236          * Perform kstat_zone_add for each existing shared stack kstat.
1237          * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1238          */
1239         for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1240                 kstat_zone_add(sk->sk_kstat, zoneid);
1241         }
1242         mutex_exit(&netstack_shared_lock);
1243 }
1244 
1245 static void
1246 netstack_shared_zone_remove(zoneid_t zoneid)
1247 {
1248         struct shared_zone_list **szp, *sz;
1249         struct shared_kstat_list *sk;
1250 
1251         /* Find in list */
1252         mutex_enter(&netstack_shared_lock);
1253         sz = NULL;
1254         for (szp = &netstack_shared_zones; *szp != NULL;
1255             szp = &((*szp)->sz_next)) {
1256                 if ((*szp)->sz_zoneid == zoneid) {
1257                         sz = *szp;
1258                         break;
1259                 }
1260         }
1261         /* We must find it */
1262         ASSERT(sz != NULL);
1263         *szp = sz->sz_next;
1264         sz->sz_next = NULL;
1265 
1266         /*
1267          * Perform kstat_zone_remove for each existing shared stack kstat.
1268          * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1269          */
1270         for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1271                 kstat_zone_remove(sk->sk_kstat, zoneid);
1272         }
1273         mutex_exit(&netstack_shared_lock);
1274 
1275         kmem_free(sz, sizeof (*sz));
1276 }
1277 
1278 static void
1279 netstack_shared_kstat_add(kstat_t *ks)
1280 {
1281         struct shared_zone_list *sz;
1282         struct shared_kstat_list *sk;
1283 
1284         sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
1285         sk->sk_kstat = ks;
1286 
1287         /* Insert in list */
1288         mutex_enter(&netstack_shared_lock);
1289         sk->sk_next = netstack_shared_kstats;
1290         netstack_shared_kstats = sk;
1291 
1292         /*
1293          * Perform kstat_zone_add for each existing shared stack zone.
1294          * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1295          */
1296         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1297                 kstat_zone_add(ks, sz->sz_zoneid);
1298         }
1299         mutex_exit(&netstack_shared_lock);
1300 }
1301 
1302 static void
1303 netstack_shared_kstat_remove(kstat_t *ks)
1304 {
1305         struct shared_zone_list *sz;
1306         struct shared_kstat_list **skp, *sk;
1307 
1308         /* Find in list */
1309         mutex_enter(&netstack_shared_lock);
1310         sk = NULL;
1311         for (skp = &netstack_shared_kstats; *skp != NULL;
1312             skp = &((*skp)->sk_next)) {
1313                 if ((*skp)->sk_kstat == ks) {
1314                         sk = *skp;
1315                         break;
1316                 }
1317         }
1318         /* Must find it */
1319         ASSERT(sk != NULL);
1320         *skp = sk->sk_next;
1321         sk->sk_next = NULL;
1322 
1323         /*
1324          * Perform kstat_zone_remove for each existing shared stack kstat.
1325          * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1326          */
1327         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1328                 kstat_zone_remove(ks, sz->sz_zoneid);
1329         }
1330         mutex_exit(&netstack_shared_lock);
1331         kmem_free(sk, sizeof (*sk));
1332 }
1333 
1334 /*
1335  * If a zoneid is part of the shared zone, return true
1336  */
1337 static boolean_t
1338 netstack_find_shared_zoneid(zoneid_t zoneid)
1339 {
1340         struct shared_zone_list *sz;
1341 
1342         mutex_enter(&netstack_shared_lock);
1343         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1344                 if (sz->sz_zoneid == zoneid) {
1345                         mutex_exit(&netstack_shared_lock);
1346                         return (B_TRUE);
1347                 }
1348         }
1349         mutex_exit(&netstack_shared_lock);
1350         return (B_FALSE);
1351 }
1352 
1353 /*
1354  * Hide the fact that zoneids and netstackids are allocated from
1355  * the same space in the current implementation.
1356  * We currently do not check that the stackid/zoneids are valid, since there
1357  * is no need for that. But this should only be done for ids that are
1358  * valid.
1359  */
1360 zoneid_t
1361 netstackid_to_zoneid(netstackid_t stackid)
1362 {
1363         return (stackid);
1364 }
1365 
1366 netstackid_t
1367 zoneid_to_netstackid(zoneid_t zoneid)
1368 {
1369         if (netstack_find_shared_zoneid(zoneid))
1370                 return (GLOBAL_ZONEID);
1371         else
1372                 return (zoneid);
1373 }
1374 
/*
 * Return the zoneid associated with a netstack (via the identity
 * stackid-to-zoneid mapping).
 */
zoneid_t
netstack_get_zoneid(netstack_t *ns)
{
        return (netstackid_to_zoneid(ns->netstack_stackid));
}
1380 
1381 /*
1382  * Simplistic support for walking all the handles.
1383  * Example usage:
1384  *      netstack_handle_t nh;
1385  *      netstack_t *ns;
1386  *
1387  *      netstack_next_init(&nh);
1388  *      while ((ns = netstack_next(&nh)) != NULL) {
1389  *              do something;
1390  *              netstack_rele(ns);
1391  *      }
1392  *      netstack_next_fini(&nh);
1393  */
1394 void
1395 netstack_next_init(netstack_handle_t *handle)
1396 {
1397         *handle = 0;
1398 }
1399 
/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
        /* Nothing to clean up; the handle holds no resources. */
}
1405 
/*
 * Return the next netstack in the walk, with a reference held; the
 * caller must netstack_rele() it. Returns NULL when the walk is done.
 */
netstack_t *
netstack_next(netstack_handle_t *handle)
{
        netstack_t *ns;
        int i, end;

        end = *handle;
        /* Walk skipping *handle number of instances */

        /* Look if there is a matching stack instance */
        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        for (i = 0; i < end; i++) {
                if (ns == NULL)
                        break;
                ns = ns->netstack_next;
        }
        /* Skip instances that are uninitialized or closing */
        while (ns != NULL) {
                mutex_enter(&ns->netstack_lock);
                if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
                        mutex_exit(&ns->netstack_lock);
                        break;
                }
                mutex_exit(&ns->netstack_lock);
                /* Count the skipped entry so the next call resumes past it */
                end++;
                ns = ns->netstack_next;
        }
        if (ns != NULL) {
                *handle = end + 1;
                netstack_hold(ns);
        }
        mutex_exit(&netstack_g_lock);
        return (ns);
}