1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright (c) 2016, Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/vm.h>
  31 #include <sys/proc.h>
  32 #include <sys/tuneable.h>
  33 #include <sys/systm.h>
  34 #include <sys/cmn_err.h>
  35 #include <sys/debug.h>
  36 #include <sys/sdt.h>
  37 #include <sys/mutex.h>
  38 #include <sys/bitmap.h>
  39 #include <sys/atomic.h>
  40 #include <sys/sunddi.h>
  41 #include <sys/kobj.h>
  42 #include <sys/disp.h>
  43 #include <vm/seg_kmem.h>
  44 #include <sys/zone.h>
  45 #include <sys/netstack.h>
  46 
  47 /*
  48  * What we use so that the zones framework can tell us about new zones,
  49  * which we use to create new stacks.
  50  */
  51 static zone_key_t netstack_zone_key;
  52 
  53 static int      netstack_initialized = 0;
  54 
  55 /*
  56  * Track the registered netstacks.
  57  * The global lock protects
  58  * - ns_reg
  59  * - the list starting at netstack_head and following the netstack_next
  60  *   pointers.
  61  */
  62 static kmutex_t netstack_g_lock;
  63 
  64 /*
 * Registry of netstacks with their create/shutdown/destroy functions.
  66  */
  67 static struct netstack_registry ns_reg[NS_MAX];
  68 
  69 /*
  70  * Global list of existing stacks.  We use this when a new zone with
  71  * an exclusive IP instance is created.
  72  *
  73  * Note that in some cases a netstack_t needs to stay around after the zone
  74  * has gone away. This is because there might be outstanding references
  75  * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
  76  * structure and all the foo_stack_t's hanging off of it will be cleaned up
  77  * when the last reference to it is dropped.
  78  * However, the same zone might be rebooted. That is handled using the
  79  * assumption that the zones framework picks a new zoneid each time a zone
  80  * is (re)booted. We assert for that condition in netstack_zone_create().
  81  * Thus the old netstack_t can take its time for things to time out.
  82  */
  83 static netstack_t *netstack_head;
  84 
  85 /*
  86  * To support kstat_create_netstack() using kstat_zone_add we need
  87  * to track both
  88  *  - all zoneids that use the global/shared stack
  89  *  - all kstats that have been added for the shared stack
  90  */
  91 struct shared_zone_list {
  92         struct shared_zone_list *sz_next;
  93         zoneid_t                sz_zoneid;
  94 };
  95 
  96 struct shared_kstat_list {
  97         struct shared_kstat_list *sk_next;
  98         kstat_t                  *sk_kstat;
  99 };
 100 
 101 static kmutex_t netstack_shared_lock;   /* protects the following two */
 102 static struct shared_zone_list  *netstack_shared_zones;
 103 static struct shared_kstat_list *netstack_shared_kstats;
 104 
 105 static void     *netstack_zone_create(zoneid_t zoneid);
 106 static void     netstack_zone_shutdown(zoneid_t zoneid, void *arg);
 107 static void     netstack_zone_destroy(zoneid_t zoneid, void *arg);
 108 
 109 static void     netstack_shared_zone_add(zoneid_t zoneid);
 110 static void     netstack_shared_zone_remove(zoneid_t zoneid);
 111 static void     netstack_shared_kstat_add(kstat_t *ks);
 112 static void     netstack_shared_kstat_remove(kstat_t *ks);
 113 
 114 typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
 115 
 116 static void     apply_all_netstacks(int, applyfn_t *);
 117 static void     apply_all_modules(netstack_t *, applyfn_t *);
 118 static void     apply_all_modules_reverse(netstack_t *, applyfn_t *);
 119 static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
 120 static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
 121 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
 122 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
 123 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
 124     kmutex_t *);
 125 
 126 static void netstack_hold_locked(netstack_t *);
 127 static void netstack_reap_work(netstack_t *, boolean_t);
 128 ksema_t netstack_reap_limiter;
 129 
/*
 * Initialize the netstack framework: set up the global locks and the
 * reap limiter, and register with the zones framework so we are called
 * back on zone create/shutdown/destroy.  Must run before any
 * netstack_register() call (asserted via netstack_initialized).
 */
void
netstack_init(void)
{
	mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);

	/* XXX KEBE SAYS hard-coded constant needs to be fixed. */
	/* Bounds the number of in-flight netstack reap operations. */
	sema_init(&netstack_reap_limiter, 1024, NULL, SEMA_DRIVER, NULL);

	netstack_initialized = 1;

	/*
	 * We want to be informed each time a zone is created or
	 * destroyed in the kernel, so we can maintain the
	 * stack instance information.
	 */
	zone_key_create(&netstack_zone_key, netstack_zone_create,
	    netstack_zone_shutdown, netstack_zone_destroy);
}
 149 
/*
 * Register a new module with the framework.
 * This registers interest in changes to the set of netstacks.
 * The createfn and destroyfn are required, but the shutdownfn can be
 * NULL.
 * Note that due to the current zsd implementation, when the create
 * function is called the zone isn't fully present, thus functions
 * like zone_find_by_* will fail, hence the create function can not
 * use many zones kernel functions including zcmn_err().
 *
 * moduleid must be a unique slot in [0, NS_MAX); registering an
 * already-registered slot trips the ASSERTs below.  On return, the
 * create callback has been run (or waited for) on every existing
 * netstack.
 */
void
netstack_register(int moduleid,
    void *(*module_create)(netstackid_t, netstack_t *),
    void (*module_shutdown)(netstackid_t, void *),
    void (*module_destroy)(netstackid_t, void *))
{
	netstack_t *ns;

	ASSERT(netstack_initialized);
	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
	ASSERT(module_create != NULL);

	/*
	 * Make instances created after this point in time run the create
	 * callback.
	 */
	mutex_enter(&netstack_g_lock);
	ASSERT(ns_reg[moduleid].nr_create == NULL);
	ASSERT(ns_reg[moduleid].nr_flags == 0);
	ns_reg[moduleid].nr_create = module_create;
	ns_reg[moduleid].nr_shutdown = module_shutdown;
	ns_reg[moduleid].nr_destroy = module_destroy;
	ns_reg[moduleid].nr_flags = NRF_REGISTERED;

	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_CREATE_NEEDED for each of those.
	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
	 * set, but check NSF_CLOSING to be sure.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);
		if (!(ns->netstack_flags & NSF_CLOSING) &&
		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
			nms->nms_flags |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	/*
	 * At this point in time a new instance can be created or an instance
	 * can be destroyed, or some other module can register or unregister.
	 * Make sure we either run all the create functions for this moduleid
	 * or we wait for any other creators for this moduleid.
	 */
	apply_all_netstacks(moduleid, netstack_apply_create);
}
 212 
/*
 * Undo netstack_register() for moduleid: run the shutdown and destroy
 * callbacks for this module on every existing netstack, then clear the
 * registered function pointers so the module can be loaded again later.
 * NRF_DYING is set while the teardown runs so that no new netstack
 * starts using this module in the meantime.
 */
void
netstack_unregister(int moduleid)
{
	netstack_t *ns;

	ASSERT(moduleid >= 0 && moduleid < NS_MAX);

	ASSERT(ns_reg[moduleid].nr_create != NULL);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);

	mutex_enter(&netstack_g_lock);
	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
	 * That ensures that when we return all the callbacks for existing
	 * instances have completed. And since we set NRF_DYING no new
	 * instances can use this module.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		boolean_t created = B_FALSE;
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);

		/*
		 * We need to be careful here. We could actually have a netstack
		 * being created as we speak waiting for us to let go of this
		 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
		 * have gotten to the point of completing it yet. If
		 * NSS_CREATE_NEEDED, we can safely just remove it here and
		 * never create the module. However, if NSS_CREATE_INPROGRESS is
		 * set, we need to still flag this module for shutdown and
		 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
		 *
		 * It is safe to do that because of two different guarantees
		 * that exist in the system. The first is that before we do a
		 * create, shutdown, or destroy, we ensure that nothing else is
		 * in progress in the system for this netstack and wait for it
		 * to complete. Secondly, because the zone is being created, we
		 * know that the following call to apply_all_netstack will block
		 * on the zone finishing its initialization.
		 */
		if (nms->nms_flags & NSS_CREATE_NEEDED)
			nms->nms_flags &= ~NSS_CREATE_NEEDED;

		if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
		    nms->nms_flags & NSS_CREATE_COMPLETED)
			created = B_TRUE;

		if (ns_reg[moduleid].nr_shutdown != NULL && created &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, moduleid);
		}
		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
		    ns_reg[moduleid].nr_destroy != NULL && created &&
		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
			nms->nms_flags |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	/*
	 * Prevent any new netstack from calling the registered create
	 * function, while keeping the function pointers in place until the
	 * shutdown and destroy callbacks are complete.
	 */
	ns_reg[moduleid].nr_flags |= NRF_DYING;
	mutex_exit(&netstack_g_lock);

	apply_all_netstacks(moduleid, netstack_apply_shutdown);
	apply_all_netstacks(moduleid, netstack_apply_destroy);

	/*
	 * Clear the nms_flags so that we can handle this module
	 * being loaded again.
	 * Also remove the registered functions.
	 */
	mutex_enter(&netstack_g_lock);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		nm_state_t *nms = &ns->netstack_m_state[moduleid];

		mutex_enter(&ns->netstack_lock);
		/* Only fully-destroyed instances are reset for reuse. */
		if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
			nms->nms_flags = 0;
			DTRACE_PROBE2(netstack__destroy__done,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}

	ns_reg[moduleid].nr_create = NULL;
	ns_reg[moduleid].nr_shutdown = NULL;
	ns_reg[moduleid].nr_destroy = NULL;
	ns_reg[moduleid].nr_flags = 0;
	mutex_exit(&netstack_g_lock);
}
 315 
/*
 * Lookup and/or allocate a netstack for this zone.
 *
 * ZSD create callback.  Zones with an exclusive IP instance get their
 * own netstack (stackid == zoneid); all other zones share the global
 * stack (GLOBAL_NETSTACKID), in which case we just bump its zone count.
 * The return value is stored by the zones framework and handed back to
 * netstack_zone_shutdown()/netstack_zone_destroy() as the ZSD arg.
 */
static void *
netstack_zone_create(zoneid_t zoneid)
{
	netstackid_t stackid;
	netstack_t *ns;
	netstack_t **nsp;
	zone_t	*zone;
	int i;

	ASSERT(netstack_initialized);

	zone = zone_find_by_id_nolock(zoneid);
	ASSERT(zone != NULL);

	if (zone->zone_flags & ZF_NET_EXCL) {
		stackid = zoneid;
	} else {
		/* Look for the stack instance for the global */
		stackid = GLOBAL_NETSTACKID;
	}

	/* Allocate even if it isn't needed; simplifies locking */
	ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);

	/* Look if there is a matching stack instance */
	mutex_enter(&netstack_g_lock);
	for (nsp = &netstack_head; *nsp != NULL;
	    nsp = &((*nsp)->netstack_next)) {
		if ((*nsp)->netstack_stackid == stackid) {
			/*
			 * Should never find a pre-existing exclusive stack
			 */
			VERIFY(stackid == GLOBAL_NETSTACKID);
			kmem_free(ns, sizeof (netstack_t));
			ns = *nsp;
			mutex_enter(&ns->netstack_lock);
			ns->netstack_numzones++;
			mutex_exit(&ns->netstack_lock);
			mutex_exit(&netstack_g_lock);
			DTRACE_PROBE1(netstack__inc__numzones,
			    netstack_t *, ns);
			/* Record that we have a new shared stack zone */
			netstack_shared_zone_add(zoneid);
			zone->zone_netstack = ns;
			return (ns);
		}
	}
	/* Not found; initialize the pre-allocated netstack_t */
	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * NOTE(review): assumes stackid == zoneid here.  For an exclusive
	 * stack that holds by construction; for the shared stack this path
	 * is only reached when the global zone first creates it -- confirm
	 * GLOBAL_NETSTACKID equals the global zone's zoneid.
	 */
	ns->netstack_stackid = zoneid;
	ns->netstack_numzones = 1;
	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
	ns->netstack_flags = NSF_UNINIT;
	*nsp = ns;
	zone->zone_netstack = ns;

	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a CREATE running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing create callbacks to complete in moduleid order
	 */
	ns->netstack_flags |= NSF_ZONE_CREATE;

	/*
	 * Determine the set of module create functions that need to be
	 * called before we drop the lock.
	 * Set NSS_CREATE_NEEDED for each of those.
	 * Skip any with NRF_DYING set, since those are in the process of
	 * going away, by checking for flags being exactly NRF_REGISTERED.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);

		if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
			nms->nms_flags |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/* Run the create callback for every flagged module, in order. */
	apply_all_modules(ns, netstack_apply_create);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ns->netstack_flags &= ~NSF_UNINIT;
	ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
	ns->netstack_flags &= ~NSF_ZONE_CREATE;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);

	return (ns);
}
 418 
/*
 * ZSD shutdown callback for a zone.  If this is the last zone using the
 * stack instance, run the registered shutdown callbacks for every module
 * (in reverse moduleid order).  For a shared stack still in use by other
 * zones this is a no-op; the shutdowns are deferred until
 * netstack_stack_inactive().
 */
/* ARGSUSED */
static void
netstack_zone_shutdown(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;
	int i;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	if (ns->netstack_numzones != 1) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		return;
	}
	mutex_exit(&ns->netstack_lock);

	mutex_enter(&netstack_g_lock);
	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a SHUTDOWN running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing shutdown callbacks to complete in moduleid order
	 */
	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
	ns->netstack_flags |= NSF_ZONE_SHUTDOWN;

	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_SHUTDOWN_NEEDED for each of those.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/*
	 * Call the shutdown function for all registered modules for this
	 * netstack.
	 */
	apply_all_modules_reverse(ns, netstack_apply_shutdown);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
	ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);
}
 480 
/*
 * Common routine to release a zone.
 * If this was the last zone using the stack instance then prepare to
 * have the refcnt dropping to zero free the zone.
 *
 * ZSD destroy callback: drop this zone's hold on the netstack.  The
 * actual teardown (netstack_stack_inactive) happens when the last
 * reference is released via netstack_rele().
 */
/* ARGSUSED */
static void
netstack_zone_destroy(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	ns->netstack_numzones--;
	if (ns->netstack_numzones != 0) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		/* Record that a shared stack zone has gone away */
		netstack_shared_zone_remove(zoneid);
		return;
	}
	/*
	 * Set CLOSING so that netstack_find_by will not find it.
	 */
	ns->netstack_flags |= NSF_CLOSING;
	mutex_exit(&ns->netstack_lock);
	DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
	/* No other thread can call zone_destroy for this stack */

	/*
	 * Decrease refcnt to account for the one in netstack_zone_create()
	 */
	netstack_rele(ns);
}
 518 
/*
 * Called when the reference count drops to zero.
 * Call the destroy functions for each registered module.
 *
 * Also schedules any shutdown callbacks that were skipped earlier
 * (possible for a netstack shared between multiple zones), so every
 * module sees shutdown before destroy.
 */
static void
netstack_stack_inactive(netstack_t *ns)
{
	int i;

	mutex_enter(&netstack_g_lock);
	mutex_enter(&ns->netstack_lock);
	/*
	 * Mark this netstack as having a DESTROY running so
	 * any netstack_register/netstack_unregister waits for
	 * the existing destroy callbacks to complete in reverse moduleid order
	 */
	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
	ns->netstack_flags |= NSF_ZONE_DESTROY;
	/*
	 * If the shutdown callback wasn't called earlier (e.g., if this is
	 * a netstack shared between multiple zones), then we schedule it now.
	 *
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set NSS_DESTROY_NEEDED for each of those. That
	 * ensures that when we return all the callbacks for existing
	 * instances have completed.
	 */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}

		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_destroy != NULL &&
		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
			nms->nms_flags |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, i);
		}
	}
	mutex_exit(&ns->netstack_lock);
	mutex_exit(&netstack_g_lock);

	/*
	 * Call the shutdown and destroy functions for all registered modules
	 * for this netstack.
	 *
	 * Since there are some ordering dependencies between the modules we
	 * tear them down in the reverse order of what was used to create them.
	 *
	 * Since a netstack_t is never reused (when a zone is rebooted it gets
	 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
	 * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
	 * That is different than in the netstack_unregister() case.
	 */
	apply_all_modules_reverse(ns, netstack_apply_shutdown);
	apply_all_modules_reverse(ns, netstack_apply_destroy);

	/* Tell any waiting netstack_register/netstack_unregister to proceed */
	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
	ns->netstack_flags &= ~NSF_ZONE_DESTROY;
	cv_broadcast(&ns->netstack_cv);
	mutex_exit(&ns->netstack_lock);
}
 592 
 593 /*
 594  * Apply a function to all netstacks for a particular moduleid.
 595  *
 596  * If there is any zone activity (due to a zone being created, shutdown,
 597  * or destroyed) we wait for that to complete before we proceed. This ensures
 598  * that the moduleids are processed in order when a zone is created or
 599  * destroyed.
 600  *
 601  * The applyfn has to drop netstack_g_lock if it does some work.
 602  * In that case we don't follow netstack_next,
 603  * even if it is possible to do so without any hazards. This is
 604  * because we want the design to allow for the list of netstacks threaded
 605  * by netstack_next to change in any arbitrary way during the time the
 606  * lock was dropped.
 607  *
 608  * It is safe to restart the loop at netstack_head since the applyfn
 609  * changes netstack_m_state as it processes things, so a subsequent
 610  * pass through will have no effect in applyfn, hence the loop will terminate
 611  * in at worst O(N^2).
 612  */
 613 static void
 614 apply_all_netstacks(int moduleid, applyfn_t *applyfn)
 615 {
 616         netstack_t *ns;
 617 
 618         mutex_enter(&netstack_g_lock);
 619         ns = netstack_head;
 620         while (ns != NULL) {
 621                 if (wait_for_zone_creator(ns, &netstack_g_lock)) {
 622                         /* Lock dropped - restart at head */
 623                         ns = netstack_head;
 624                 } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
 625                         /* Lock dropped - restart at head */
 626                         ns = netstack_head;
 627                 } else {
 628                         ns = ns->netstack_next;
 629                 }
 630         }
 631         mutex_exit(&netstack_g_lock);
 632 }
 633 
 634 /*
 635  * Apply a function to all moduleids for a particular netstack.
 636  *
 637  * Since the netstack linkage doesn't matter in this case we can
 638  * ignore whether the function drops the lock.
 639  */
 640 static void
 641 apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
 642 {
 643         int i;
 644 
 645         mutex_enter(&netstack_g_lock);
 646         for (i = 0; i < NS_MAX; i++) {
 647                 /*
 648                  * We don't care whether the lock was dropped
 649                  * since we are not iterating over netstack_head.
 650                  */
 651                 (void) (applyfn)(&netstack_g_lock, ns, i);
 652         }
 653         mutex_exit(&netstack_g_lock);
 654 }
 655 
 656 /* Like the above but in reverse moduleid order */
 657 static void
 658 apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
 659 {
 660         int i;
 661 
 662         mutex_enter(&netstack_g_lock);
 663         for (i = NS_MAX-1; i >= 0; i--) {
 664                 /*
 665                  * We don't care whether the lock was dropped
 666                  * since we are not iterating over netstack_head.
 667                  */
 668                 (void) (applyfn)(&netstack_g_lock, ns, i);
 669         }
 670         mutex_exit(&netstack_g_lock);
 671 }
 672 
 673 /*
 674  * Call the create function for the ns and moduleid if CREATE_NEEDED
 675  * is set.
 676  * If some other thread gets here first and sets *_INPROGRESS, then
 677  * we wait for that thread to complete so that we can ensure that
 678  * all the callbacks are done when we've looped over all netstacks/moduleids.
 679  *
 680  * When we call the create function, we temporarily drop the netstack_lock
 681  * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 683  */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	void *result;
	netstackid_t stackid;
	nm_state_t *nms = &ns->netstack_m_state[moduleid];
	boolean_t dropped = B_FALSE;	/* did we drop lockp at any point? */

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);

	/* If another thread is mid-callback on this module, wait it out. */
	if (wait_for_nms_inprogress(ns, nms, lockp))
		dropped = B_TRUE;

	if (nms->nms_flags & NSS_CREATE_NEEDED) {
		/* Claim the work, then drop both locks around the callback. */
		nms->nms_flags &= ~NSS_CREATE_NEEDED;
		nms->nms_flags |= NSS_CREATE_INPROGRESS;
		DTRACE_PROBE2(netstack__create__inprogress,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(ns_reg[moduleid].nr_create != NULL);
		stackid = ns->netstack_stackid;
		DTRACE_PROBE2(netstack__create__start,
		    netstackid_t, stackid,
		    netstack_t *, ns);
		result = (ns_reg[moduleid].nr_create)(stackid, ns);
		DTRACE_PROBE2(netstack__create__end,
		    void *, result, netstack_t *, ns);

		/* create callbacks must return their per-stack state */
		ASSERT(result != NULL);
		mutex_enter(lockp);
		mutex_enter(&ns->netstack_lock);
		ns->netstack_modules[moduleid] = result;
		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
		nms->nms_flags |= NSS_CREATE_COMPLETED;
		/* Wake anyone in wait_for_nms_inprogress() */
		cv_broadcast(&nms->nms_cv);
		DTRACE_PROBE2(netstack__create__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	}
}
 732 
 733 /*
 734  * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 735  * is set.
 736  * If some other thread gets here first and sets *_INPROGRESS, then
 737  * we wait for that thread to complete so that we can ensure that
 738  * all the callbacks are done when we've looped over all netstacks/moduleids.
 739  *
 740  * When we call the shutdown function, we temporarily drop the netstack_lock
 741  * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 743  */
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void * netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        /*
         * If another thread is already running a callback for this
         * netstack/module, wait for it.  Waiting may drop lockp, which
         * the caller must learn about via our return value.
         */
        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
                /* Claim the work by moving NEEDED -> INPROGRESS. */
                nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
                nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(netstack__shutdown__inprogress,
                    netstack_t *, ns, int, moduleid);
                /* Drop both locks while calling the module's callback. */
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__shutdown__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__shutdown__end,
                    netstack_t *, ns);

                /* Reacquire in the caller's lock order: lockp first. */
                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
                nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
                /* Wake any waiters in wait_for_nms_inprogress(). */
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__shutdown__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}
 791 
 792 /*
 793  * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 794  * is set.
 795  * If some other thread gets here first and sets *_INPROGRESS, then
 796  * we wait for that thread to complete so that we can ensure that
 797  * all the callbacks are done when we've looped over all netstacks/moduleids.
 798  *
 799  * When we call the destroy function, we temporarily drop the netstack_lock
 800  * held by the caller, and return true to tell the caller it needs to
 * re-evaluate the state.
 802  */
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
        netstackid_t stackid;
        void * netstack_module;
        nm_state_t *nms = &ns->netstack_m_state[moduleid];
        boolean_t dropped = B_FALSE;

        ASSERT(MUTEX_HELD(lockp));
        mutex_enter(&ns->netstack_lock);

        /*
         * If another thread is already running a callback for this
         * netstack/module, wait for it.  Waiting may drop lockp, which
         * the caller must learn about via our return value.
         */
        if (wait_for_nms_inprogress(ns, nms, lockp))
                dropped = B_TRUE;

        if (nms->nms_flags & NSS_DESTROY_NEEDED) {
                /* Claim the work by moving NEEDED -> INPROGRESS. */
                nms->nms_flags &= ~NSS_DESTROY_NEEDED;
                nms->nms_flags |= NSS_DESTROY_INPROGRESS;
                DTRACE_PROBE2(netstack__destroy__inprogress,
                    netstack_t *, ns, int, moduleid);
                /* Drop both locks while calling the module's callback. */
                mutex_exit(&ns->netstack_lock);
                mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(ns_reg[moduleid].nr_destroy != NULL);
                stackid = ns->netstack_stackid;
                netstack_module = ns->netstack_modules[moduleid];
                DTRACE_PROBE2(netstack__destroy__start,
                    netstackid_t, stackid,
                    void *, netstack_module);
                (ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
                DTRACE_PROBE1(netstack__destroy__end,
                    netstack_t *, ns);

                /* Reacquire in the caller's lock order: lockp first. */
                mutex_enter(lockp);
                mutex_enter(&ns->netstack_lock);
                /* The module's per-stack state is gone; clear the slot. */
                ns->netstack_modules[moduleid] = NULL;
                nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
                nms->nms_flags |= NSS_DESTROY_COMPLETED;
                /* Wake any waiters in wait_for_nms_inprogress(). */
                cv_broadcast(&nms->nms_cv);
                DTRACE_PROBE2(netstack__destroy__completed,
                    netstack_t *, ns, int, moduleid);
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        } else {
                mutex_exit(&ns->netstack_lock);
                return (dropped);
        }
}
 851 
 852 /*
 * If somebody is creating the netstack (due to a new zone being created)
 854  * then we wait for them to complete. This ensures that any additional
 855  * netstack_register() doesn't cause the create functions to run out of
 856  * order.
 857  * Note that we do not need such a global wait in the case of the shutdown
 858  * and destroy callbacks, since in that case it is sufficient for both
 859  * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
 860  * Returns true if lockp was temporarily dropped while waiting.
 861  */
static boolean_t
wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        mutex_enter(&ns->netstack_lock);
        while (ns->netstack_flags & NSF_ZONE_CREATE) {
                DTRACE_PROBE1(netstack__wait__zone__inprogress,
                    netstack_t *, ns);
                /* lockp may be NULL when the caller holds no outer lock. */
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&ns->netstack_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve lock order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        mutex_exit(&ns->netstack_lock);
        return (dropped);
}
 886 
 887 /*
 888  * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
 889  * combination.
 890  * Returns true if lockp was temporarily dropped while waiting.
 891  */
static boolean_t
wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        /* Caller holds ns->netstack_lock; we return with it still held. */
        while (nms->nms_flags & NSS_ALL_INPROGRESS) {
                DTRACE_PROBE2(netstack__wait__nms__inprogress,
                    netstack_t *, ns, nm_state_t *, nms);
                /* lockp may be NULL when the caller holds no outer lock. */
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&nms->nms_cv, &ns->netstack_lock);
                if (lockp != NULL) {
                        /* First drop netstack_lock to preserve lock order */
                        mutex_exit(&ns->netstack_lock);
                        mutex_enter(lockp);
                        mutex_enter(&ns->netstack_lock);
                }
        }
        return (dropped);
}
 914 
 915 /*
 916  * Get the stack instance used in caller's zone.
 917  * Increases the reference count, caller must do a netstack_rele.
 918  * It can't be called after zone_destroy() has started.
 919  */
 920 netstack_t *
 921 netstack_get_current(void)
 922 {
 923         netstack_t *ns;
 924 
 925         ns = curproc->p_zone->zone_netstack;
 926         ASSERT(ns != NULL);
 927         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 928                 return (NULL);
 929 
 930         netstack_hold(ns);
 931 
 932         return (ns);
 933 }
 934 
 935 /*
 936  * Find a stack instance given the cred.
 937  * This is used by the modules to potentially allow for a future when
 938  * something other than the zoneid is used to determine the stack.
 939  */
 940 netstack_t *
 941 netstack_find_by_cred(const cred_t *cr)
 942 {
 943         zoneid_t zoneid = crgetzoneid(cr);
 944 
 945         /* Handle the case when cr_zone is NULL */
 946         if (zoneid == (zoneid_t)-1)
 947                 zoneid = GLOBAL_ZONEID;
 948 
 949         /* For performance ... */
 950         if (curproc->p_zone->zone_id == zoneid)
 951                 return (netstack_get_current());
 952         else
 953                 return (netstack_find_by_zoneid(zoneid));
 954 }
 955 
 956 /*
 957  * Find a stack instance given the zoneid.
 958  * Increases the reference count if found; caller must do a
 959  * netstack_rele().
 960  *
 961  * If there is no exact match then assume the shared stack instance
 962  * matches.
 963  *
 * Skip the uninitialized ones.
 965  */
 966 netstack_t *
 967 netstack_find_by_zoneid(zoneid_t zoneid)
 968 {
 969         netstack_t *ns;
 970         zone_t *zone;
 971 
 972         zone = zone_find_by_id(zoneid);
 973 
 974         if (zone == NULL)
 975                 return (NULL);
 976 
 977         ns = zone->zone_netstack;
 978         ASSERT(ns != NULL);
 979         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 980                 ns = NULL;
 981         else
 982                 netstack_hold(ns);
 983 
 984         zone_rele(zone);
 985         return (ns);
 986 }
 987 
 988 /*
 989  * Find a stack instance given the zoneid. Can only be called from
 990  * the create callback. See the comments in zone_find_by_id_nolock why
 991  * that limitation exists.
 992  *
 993  * Increases the reference count if found; caller must do a
 994  * netstack_rele().
 995  *
 996  * If there is no exact match then assume the shared stack instance
 997  * matches.
 998  *
 * Skip the uninitialized ones.
1000  */
1001 netstack_t *
1002 netstack_find_by_zoneid_nolock(zoneid_t zoneid)
1003 {
1004         netstack_t *ns;
1005         zone_t *zone;
1006 
1007         zone = zone_find_by_id_nolock(zoneid);
1008 
1009         if (zone == NULL)
1010                 return (NULL);
1011 
1012         ns = zone->zone_netstack;
1013         ASSERT(ns != NULL);
1014 
1015         if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
1016                 ns = NULL;
1017         else
1018                 netstack_hold(ns);
1019 
1020         /* zone_find_by_id_nolock does not have a hold on the zone */
1021         return (ns);
1022 }
1023 
1024 /*
1025  * Find a stack instance given the stackid with exact match?
1026  * Increases the reference count if found; caller must do a
1027  * netstack_rele().
1028  *
1029  * Skip the unitialized ones.
1030  */
1031 netstack_t *
1032 netstack_find_by_stackid(netstackid_t stackid)
1033 {
1034         netstack_t *ns;
1035 
1036         mutex_enter(&netstack_g_lock);
1037         for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1038                 mutex_enter(&ns->netstack_lock);
1039                 if (ns->netstack_stackid == stackid &&
1040                     !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
1041                         netstack_hold_locked(ns);
1042                         mutex_exit(&ns->netstack_lock);
1043                         mutex_exit(&netstack_g_lock);
1044                         return (ns);
1045                 }
1046                 mutex_exit(&ns->netstack_lock);
1047         }
1048         mutex_exit(&netstack_g_lock);
1049         return (NULL);
1050 }
1051 
1052 boolean_t
1053 netstack_inuse_by_stackid(netstackid_t stackid)
1054 {
1055         netstack_t *ns;
1056         boolean_t rval = B_FALSE;
1057 
1058         mutex_enter(&netstack_g_lock);
1059 
1060         for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1061                 if (ns->netstack_stackid == stackid) {
1062                         rval = B_TRUE;
1063                         break;
1064                 }
1065         }
1066 
1067         mutex_exit(&netstack_g_lock);
1068 
1069         return (rval);
1070 }
1071 
1072 
1073 static void
1074 netstack_reap(void *arg)
1075 {
1076         /* Indicate we took a semaphore to get here. */
1077         netstack_reap_work((netstack_t *)arg, B_TRUE);
1078 }
1079 
1080 static void
1081 netstack_reap_intr(void *arg)
1082 {
1083         /* Indicate we did NOT TAKE a semaphore to get here. */
1084         netstack_reap_work((netstack_t *)arg, B_FALSE);
1085 }
1086 
/*
 * Actually tear down a netstack whose last reference and last zone have
 * gone away: run the destroy callbacks, unlink it from the global list,
 * and free the structure.  `semaphore_signal' is B_TRUE when the caller
 * holds a ticket from netstack_reap_limiter that must be returned.
 */
static void
netstack_reap_work(netstack_t *ns, boolean_t semaphore_signal)
{
        netstack_t **nsp;
        boolean_t found;
        int i;

        /*
         * Time to call the destroy functions and free up
         * the structure
         */
        netstack_stack_inactive(ns);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        /* Finally remove from list of netstacks */
        mutex_enter(&netstack_g_lock);
        found = B_FALSE;
        for (nsp = &netstack_head; *nsp != NULL;
             nsp = &(*nsp)->netstack_next) {
                if (*nsp == ns) {
                        *nsp = ns->netstack_next;
                        ns->netstack_next = NULL;
                        found = B_TRUE;
                        break;
                }
        }
        /* ns must still have been on the list */
        ASSERT(found);
        mutex_exit(&netstack_g_lock);

        /* Make sure nothing increased the references */
        ASSERT(ns->netstack_refcnt == 0);
        ASSERT(ns->netstack_numzones == 0);

        ASSERT(ns->netstack_flags & NSF_CLOSING);

        /* Destroy per-module condition variables before freeing. */
        for (i = 0; i < NS_MAX; i++) {
                nm_state_t *nms = &ns->netstack_m_state[i];

                cv_destroy(&nms->nms_cv);
        }
        mutex_destroy(&ns->netstack_lock);
        cv_destroy(&ns->netstack_cv);
        kmem_free(ns, sizeof (*ns));
        /* Allow another reap to be scheduled. */
        if (semaphore_signal)
                sema_v(&netstack_reap_limiter);
}
1137 
/*
 * Drop a reference on a netstack.  When both the reference count and
 * the zone count reach zero, schedule teardown of the stack on the
 * system taskq, falling back to a direct call if dispatch fails.
 */
void
netstack_rele(netstack_t *ns)
{
        int refcnt, numzones;

        mutex_enter(&ns->netstack_lock);
        ASSERT(ns->netstack_refcnt > 0);
        ns->netstack_refcnt--;
        /*
         * As we drop the lock additional netstack_rele()s can come in
         * and decrement the refcnt to zero and free the netstack_t.
         * Store pointers in local variables and if we were not the last
         * then don't reference the netstack_t after that.
         */
        refcnt = ns->netstack_refcnt;
        numzones = ns->netstack_numzones;
        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
        mutex_exit(&ns->netstack_lock);

        if (refcnt == 0 && numzones == 0) {
                boolean_t is_not_intr = !servicing_interrupt();

                /*
                 * Because there are possibilities of kstats being held by
                 * callers, which would then be immediately freed, but held up
                 * due to kstat's odd reference model recording the thread, we
                 * choose to schedule the actual deletion of this netstack as
                 * a deferred task on the system taskq.  This way, any
                 * store-the-thread-pointer semantics won't trip over
                 * themselves.
                 *
                 * On the off chance this is called in interrupt context, we
                 * cannot use the semaphore to enforce rate-limiting.
                 */
                if (is_not_intr && sema_tryp(&netstack_reap_limiter) == 0) {
                        /*
                         * We are slamming against the reap rate limit;
                         * block until a slot frees up and record how
                         * long that took.
                         */
                        hrtime_t measurement = gethrtime();

                        sema_p(&netstack_reap_limiter);
                        /* Capture the delay in nanoseconds. */
                        DTRACE_PROBE1(netstack__reap__rate__limited,
                            hrtime_t *, gethrtime() - measurement);
                }

                if (taskq_dispatch(system_taskq,
                    is_not_intr ? netstack_reap : netstack_reap_intr, ns,
                    TQ_NOSLEEP) == NULL) {
                        /*
                         * taskq_dispatch() failed (TQ_NOSLEEP); fall
                         * back to reaping synchronously in this context.
                         */
                        DTRACE_PROBE1(netstack__reap__taskq__fail,
                            netstack_t *, ns);
                        netstack_reap_work(ns, is_not_intr);
                }
        }
}
1198 
/*
 * Bump the reference count; the caller must already hold
 * ns->netstack_lock.  Pairs with netstack_rele().
 */
static void
netstack_hold_locked(netstack_t *ns)
{
        ASSERT(MUTEX_HELD(&ns->netstack_lock));
        ns->netstack_refcnt++;
        /* Catch counter wraparound */
        ASSERT(ns->netstack_refcnt > 0);
        DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
}
1207 
/*
 * Bump the reference count, taking and dropping ns->netstack_lock.
 * Pairs with netstack_rele().
 */
void
netstack_hold(netstack_t *ns)
{
        mutex_enter(&ns->netstack_lock);
        netstack_hold_locked(ns);
        mutex_exit(&ns->netstack_lock);
}
1215 
1216 /*
1217  * To support kstat_create_netstack() using kstat_zone_add we need
1218  * to track both
1219  *  - all zoneids that use the global/shared stack
1220  *  - all kstats that have been added for the shared stack
1221  */
1222 kstat_t *
1223 kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
1224     char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
1225     netstackid_t ks_netstackid)
1226 {
1227         kstat_t *ks;
1228 
1229         if (ks_netstackid == GLOBAL_NETSTACKID) {
1230                 ks = kstat_create_zone(ks_module, ks_instance, ks_name,
1231                     ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
1232                 if (ks != NULL)
1233                         netstack_shared_kstat_add(ks);
1234                 return (ks);
1235         } else {
1236                 zoneid_t zoneid = ks_netstackid;
1237 
1238                 return (kstat_create_zone(ks_module, ks_instance, ks_name,
1239                     ks_class, ks_type, ks_ndata, ks_flags, zoneid));
1240         }
1241 }
1242 
1243 void
1244 kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
1245 {
1246         if (ks_netstackid == GLOBAL_NETSTACKID) {
1247                 netstack_shared_kstat_remove(ks);
1248         }
1249         kstat_delete(ks);
1250 }
1251 
1252 static void
1253 netstack_shared_zone_add(zoneid_t zoneid)
1254 {
1255         struct shared_zone_list *sz;
1256         struct shared_kstat_list *sk;
1257 
1258         sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
1259         sz->sz_zoneid = zoneid;
1260 
1261         /* Insert in list */
1262         mutex_enter(&netstack_shared_lock);
1263         sz->sz_next = netstack_shared_zones;
1264         netstack_shared_zones = sz;
1265 
1266         /*
1267          * Perform kstat_zone_add for each existing shared stack kstat.
1268          * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1269          */
1270         for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1271                 kstat_zone_add(sk->sk_kstat, zoneid);
1272         }
1273         mutex_exit(&netstack_shared_lock);
1274 }
1275 
1276 static void
1277 netstack_shared_zone_remove(zoneid_t zoneid)
1278 {
1279         struct shared_zone_list **szp, *sz;
1280         struct shared_kstat_list *sk;
1281 
1282         /* Find in list */
1283         mutex_enter(&netstack_shared_lock);
1284         sz = NULL;
1285         for (szp = &netstack_shared_zones; *szp != NULL;
1286             szp = &((*szp)->sz_next)) {
1287                 if ((*szp)->sz_zoneid == zoneid) {
1288                         sz = *szp;
1289                         break;
1290                 }
1291         }
1292         /* We must find it */
1293         ASSERT(sz != NULL);
1294         *szp = sz->sz_next;
1295         sz->sz_next = NULL;
1296 
1297         /*
1298          * Perform kstat_zone_remove for each existing shared stack kstat.
1299          * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1300          */
1301         for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1302                 kstat_zone_remove(sk->sk_kstat, zoneid);
1303         }
1304         mutex_exit(&netstack_shared_lock);
1305 
1306         kmem_free(sz, sizeof (*sz));
1307 }
1308 
1309 static void
1310 netstack_shared_kstat_add(kstat_t *ks)
1311 {
1312         struct shared_zone_list *sz;
1313         struct shared_kstat_list *sk;
1314 
1315         sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
1316         sk->sk_kstat = ks;
1317 
1318         /* Insert in list */
1319         mutex_enter(&netstack_shared_lock);
1320         sk->sk_next = netstack_shared_kstats;
1321         netstack_shared_kstats = sk;
1322 
1323         /*
1324          * Perform kstat_zone_add for each existing shared stack zone.
1325          * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1326          */
1327         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1328                 kstat_zone_add(ks, sz->sz_zoneid);
1329         }
1330         mutex_exit(&netstack_shared_lock);
1331 }
1332 
1333 static void
1334 netstack_shared_kstat_remove(kstat_t *ks)
1335 {
1336         struct shared_zone_list *sz;
1337         struct shared_kstat_list **skp, *sk;
1338 
1339         /* Find in list */
1340         mutex_enter(&netstack_shared_lock);
1341         sk = NULL;
1342         for (skp = &netstack_shared_kstats; *skp != NULL;
1343             skp = &((*skp)->sk_next)) {
1344                 if ((*skp)->sk_kstat == ks) {
1345                         sk = *skp;
1346                         break;
1347                 }
1348         }
1349         /* Must find it */
1350         ASSERT(sk != NULL);
1351         *skp = sk->sk_next;
1352         sk->sk_next = NULL;
1353 
1354         /*
1355          * Perform kstat_zone_remove for each existing shared stack kstat.
1356          * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1357          */
1358         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1359                 kstat_zone_remove(ks, sz->sz_zoneid);
1360         }
1361         mutex_exit(&netstack_shared_lock);
1362         kmem_free(sk, sizeof (*sk));
1363 }
1364 
1365 /*
1366  * If a zoneid is part of the shared zone, return true
1367  */
1368 static boolean_t
1369 netstack_find_shared_zoneid(zoneid_t zoneid)
1370 {
1371         struct shared_zone_list *sz;
1372 
1373         mutex_enter(&netstack_shared_lock);
1374         for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1375                 if (sz->sz_zoneid == zoneid) {
1376                         mutex_exit(&netstack_shared_lock);
1377                         return (B_TRUE);
1378                 }
1379         }
1380         mutex_exit(&netstack_shared_lock);
1381         return (B_FALSE);
1382 }
1383 
1384 /*
1385  * Hide the fact that zoneids and netstackids are allocated from
1386  * the same space in the current implementation.
1387  * We currently do not check that the stackid/zoneids are valid, since there
1388  * is no need for that. But this should only be done for ids that are
1389  * valid.
1390  */
zoneid_t
netstack_get_zoneid_stub_comment_guard(netstackid_t stackid);	/* see below */
1396 
1397 netstackid_t
1398 zoneid_to_netstackid(zoneid_t zoneid)
1399 {
1400         if (netstack_find_shared_zoneid(zoneid))
1401                 return (GLOBAL_ZONEID);
1402         else
1403                 return (zoneid);
1404 }
1405 
/*
 * Return the zoneid associated with a netstack instance.
 */
zoneid_t
netstack_get_zoneid(netstack_t *ns)
{
        return (netstackid_to_zoneid(ns->netstack_stackid));
}
1411 
1412 /*
1413  * Simplistic support for walking all the handles.
1414  * Example usage:
1415  *      netstack_handle_t nh;
1416  *      netstack_t *ns;
1417  *
1418  *      netstack_next_init(&nh);
1419  *      while ((ns = netstack_next(&nh)) != NULL) {
1420  *              do something;
1421  *              netstack_rele(ns);
1422  *      }
1423  *      netstack_next_fini(&nh);
1424  */
/*
 * Initialize a walker handle; the walk starts at the head of the
 * netstack list.
 */
void
netstack_next_init(netstack_handle_t *handle)
{
        *handle = 0;
}
1430 
/*
 * Release a walker handle.  Nothing to clean up; the handle holds no
 * resources.
 */
/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
}
1436 
/*
 * Return the next usable netstack in the walk, taking a hold on it;
 * NULL when the walk is exhausted.  The handle records how many list
 * entries to skip on the next call.  Note that the list can change
 * between calls, so the walk is only approximate.
 */
netstack_t *
netstack_next(netstack_handle_t *handle)
{
        netstack_t *ns;
        int i, end;

        end = *handle;
        /* Walk skipping *handle number of instances */

        /* Look if there is a matching stack instance */
        mutex_enter(&netstack_g_lock);
        ns = netstack_head;
        for (i = 0; i < end; i++) {
                if (ns == NULL)
                        break;
                ns = ns->netstack_next;
        }
        /* Skip those that aren't really here (uninitialized or closing) */
        while (ns != NULL) {
                mutex_enter(&ns->netstack_lock);
                if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
                        mutex_exit(&ns->netstack_lock);
                        break;
                }
                mutex_exit(&ns->netstack_lock);
                end++;
                ns = ns->netstack_next;
        }
        if (ns != NULL) {
                /* Resume after this entry on the next call. */
                *handle = end + 1;
                netstack_hold(ns);
        }
        mutex_exit(&netstack_g_lock);
        return (ns);
}