Print this page
    
8900 deadlock between netstack teardown and kstat read
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/os/netstack.c
          +++ new/usr/src/uts/common/os/netstack.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25      - * Copyright (c) 2016, Joyent, Inc.  All rights reserved.
       25 + * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
  26   26   */
  27   27  
  28   28  #include <sys/param.h>
  29   29  #include <sys/sysmacros.h>
  30   30  #include <sys/vm.h>
  31   31  #include <sys/proc.h>
  32   32  #include <sys/tuneable.h>
  33   33  #include <sys/systm.h>
  34   34  #include <sys/cmn_err.h>
  35   35  #include <sys/debug.h>
  36   36  #include <sys/sdt.h>
  37   37  #include <sys/mutex.h>
  38   38  #include <sys/bitmap.h>
  39   39  #include <sys/atomic.h>
       40 +#include <sys/sunddi.h>
  40   41  #include <sys/kobj.h>
  41   42  #include <sys/disp.h>
  42   43  #include <vm/seg_kmem.h>
  43   44  #include <sys/zone.h>
  44   45  #include <sys/netstack.h>
  45   46  
  46   47  /*
  47   48   * What we use so that the zones framework can tell us about new zones,
  48   49   * which we use to create new stacks.
  49   50   */
  50   51  static zone_key_t netstack_zone_key;
  51   52  
  52   53  static int      netstack_initialized = 0;
  53   54  
  54   55  /*
  55   56   * Track the registered netstacks.
  56   57   * The global lock protects
  57   58   * - ns_reg
  58   59   * - the list starting at netstack_head and following the netstack_next
  59   60   *   pointers.
  60   61   */
  61   62  static kmutex_t netstack_g_lock;
  62   63  
  63   64  /*
  64   65   * Registry of netstacks with their create/shutdown/destroy functions.
  65   66   */
  66   67  static struct netstack_registry ns_reg[NS_MAX];
  67   68  
  68   69  /*
  69   70   * Global list of existing stacks.  We use this when a new zone with
  70   71   * an exclusive IP instance is created.
  71   72   *
  72   73   * Note that in some cases a netstack_t needs to stay around after the zone
  73   74   * has gone away. This is because there might be outstanding references
  74   75   * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
  75   76   * structure and all the foo_stack_t's hanging off of it will be cleaned up
  76   77   * when the last reference to it is dropped.
  77   78   * However, the same zone might be rebooted. That is handled using the
  78   79   * assumption that the zones framework picks a new zoneid each time a zone
  79   80   * is (re)booted. We assert for that condition in netstack_zone_create().
  80   81   * Thus the old netstack_t can take its time for things to time out.
  81   82   */
  82   83  static netstack_t *netstack_head;
  83   84  
  84   85  /*
  85   86   * To support kstat_create_netstack() using kstat_zone_add we need
  86   87   * to track both
  87   88   *  - all zoneids that use the global/shared stack
  88   89   *  - all kstats that have been added for the shared stack
  89   90   */
  90   91  struct shared_zone_list {
  91   92          struct shared_zone_list *sz_next;
  92   93          zoneid_t                sz_zoneid;
  93   94  };
  94   95  
  95   96  struct shared_kstat_list {
  96   97          struct shared_kstat_list *sk_next;
  97   98          kstat_t                  *sk_kstat;
  98   99  };
  99  100  
 100  101  static kmutex_t netstack_shared_lock;   /* protects the following two */
 101  102  static struct shared_zone_list  *netstack_shared_zones;
 102  103  static struct shared_kstat_list *netstack_shared_kstats;
 103  104  
 104  105  static void     *netstack_zone_create(zoneid_t zoneid);
 105  106  static void     netstack_zone_shutdown(zoneid_t zoneid, void *arg);
 106  107  static void     netstack_zone_destroy(zoneid_t zoneid, void *arg);
 107  108  
 108  109  static void     netstack_shared_zone_add(zoneid_t zoneid);
 109  110  static void     netstack_shared_zone_remove(zoneid_t zoneid);
 110  111  static void     netstack_shared_kstat_add(kstat_t *ks);
 111  112  static void     netstack_shared_kstat_remove(kstat_t *ks);
 112  113  
 113  114  typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
 114  115  
  
    | 
      ↓ open down ↓ | 
    65 lines elided | 
    
      ↑ open up ↑ | 
  
 115  116  static void     apply_all_netstacks(int, applyfn_t *);
 116  117  static void     apply_all_modules(netstack_t *, applyfn_t *);
 117  118  static void     apply_all_modules_reverse(netstack_t *, applyfn_t *);
 118  119  static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
 119  120  static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
 120  121  static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
 121  122  static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
 122  123  static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
 123  124      kmutex_t *);
 124  125  
      126 +static ksema_t netstack_reap_limiter;
      127 +/*
      128 + * Hard-coded constant, but since this is not tunable in real-time, it seems
      129 + * making it an /etc/system tunable is better than nothing.
      130 + */
      131 +uint_t netstack_outstanding_reaps = 1024;
       132 +
 125  133  void
 126  134  netstack_init(void)
 127  135  {
 128  136          mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
 129  137          mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);
 130  138  
                   /*
                    * NOTE(review): counting semaphore initialized with
                    * netstack_outstanding_reaps permits; presumably bounds
                    * concurrent netstack reap work (the reap path that does
                    * sema_p/sema_v is not visible in this chunk — confirm).
                    */
      139 +        sema_init(&netstack_reap_limiter, netstack_outstanding_reaps, NULL,
      140 +            SEMA_DRIVER, NULL);
      141 +
 131  142          netstack_initialized = 1;
 132  143  
 133  144          /*
 134  145           * We want to be informed each time a zone is created or
 135  146           * destroyed in the kernel, so we can maintain the
 136  147           * stack instance information.
 137  148           */
 138  149          zone_key_create(&netstack_zone_key, netstack_zone_create,
 139  150              netstack_zone_shutdown, netstack_zone_destroy);
 140  151  }
 141  152  
 142  153  /*
 143  154   * Register a new module with the framework.
 144  155   * This registers interest in changes to the set of netstacks.
 145  156   * The createfn and destroyfn are required, but the shutdownfn can be
 146  157   * NULL.
 147  158   * Note that due to the current zsd implementation, when the create
 148  159   * function is called the zone isn't fully present, thus functions
 149  160   * like zone_find_by_* will fail, hence the create function can not
 150  161   * use many zones kernel functions including zcmn_err().
            * Locking: takes netstack_g_lock and each netstack's netstack_lock
            * while flagging existing stacks NSS_CREATE_NEEDED.
 151  162   */
 152  163  void
 153  164  netstack_register(int moduleid,
 154  165      void *(*module_create)(netstackid_t, netstack_t *),
 155  166      void (*module_shutdown)(netstackid_t, void *),
 156  167      void (*module_destroy)(netstackid_t, void *))
 157  168  {
 158  169          netstack_t *ns;
 159  170  
 160  171          ASSERT(netstack_initialized);
 161  172          ASSERT(moduleid >= 0 && moduleid < NS_MAX);
 162  173          ASSERT(module_create != NULL);
 163  174  
 164  175          /*
 165  176           * Make instances created after this point in time run the create
 166  177           * callback.
 167  178           */
 168  179          mutex_enter(&netstack_g_lock);
 169  180          ASSERT(ns_reg[moduleid].nr_create == NULL);
 170  181          ASSERT(ns_reg[moduleid].nr_flags == 0);
 171  182          ns_reg[moduleid].nr_create = module_create;
 172  183          ns_reg[moduleid].nr_shutdown = module_shutdown;
 173  184          ns_reg[moduleid].nr_destroy = module_destroy;
 174  185          ns_reg[moduleid].nr_flags = NRF_REGISTERED;
 175  186  
 176  187          /*
 177  188           * Determine the set of stacks that exist before we drop the lock.
 178  189           * Set NSS_CREATE_NEEDED for each of those.
 179  190           * netstacks which have been deleted will have NSS_CREATE_COMPLETED
 180  191           * set, but check NSF_CLOSING to be sure.
 181  192           */
 182  193          for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
 183  194                  nm_state_t *nms = &ns->netstack_m_state[moduleid];
 184  195  
 185  196                  mutex_enter(&ns->netstack_lock);
 186  197                  if (!(ns->netstack_flags & NSF_CLOSING) &&
 187  198                      (nms->nms_flags & NSS_CREATE_ALL) == 0) {
 188  199                          nms->nms_flags |= NSS_CREATE_NEEDED;
 189  200                          DTRACE_PROBE2(netstack__create__needed,
 190  201                              netstack_t *, ns, int, moduleid);
 191  202                  }
 192  203                  mutex_exit(&ns->netstack_lock);
 193  204          }
 194  205          mutex_exit(&netstack_g_lock);
 195  206  
 196  207          /*
 197  208           * At this point in time a new instance can be created or an instance
 198  209           * can be destroyed, or some other module can register or unregister.
 199  210           * Make sure we either run all the create functions for this moduleid
 200  211           * or we wait for any other creators for this moduleid.
 201  212           */
 202  213          apply_all_netstacks(moduleid, netstack_apply_create);
 203  214  }
 204  215  
 205  216  void
 206  217  netstack_unregister(int moduleid)
 207  218  {
 208  219          netstack_t *ns;
 209  220  
 210  221          ASSERT(moduleid >= 0 && moduleid < NS_MAX);
 211  222  
 212  223          ASSERT(ns_reg[moduleid].nr_create != NULL);
 213  224          ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
 214  225  
 215  226          mutex_enter(&netstack_g_lock);
 216  227          /*
 217  228           * Determine the set of stacks that exist before we drop the lock.
 218  229           * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
 219  230           * That ensures that when we return all the callbacks for existing
 220  231           * instances have completed. And since we set NRF_DYING no new
 221  232           * instances can use this module.
 222  233           */
 223  234          for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
 224  235                  boolean_t created = B_FALSE;
 225  236                  nm_state_t *nms = &ns->netstack_m_state[moduleid];
 226  237  
 227  238                  mutex_enter(&ns->netstack_lock);
 228  239  
 229  240                  /*
 230  241                   * We need to be careful here. We could actually have a netstack
 231  242                   * being created as we speak waiting for us to let go of this
 232  243                   * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
 233  244                   * have gotten to the point of completing it yet. If
 234  245                   * NSS_CREATE_NEEDED, we can safely just remove it here and
 235  246                   * never create the module. However, if NSS_CREATE_INPROGRESS is
 236  247                   * set, we need to still flag this module for shutdown and
 237  248                   * deletion, just as though it had reached NSS_CREATE_COMPLETED.
 238  249                   *
 239  250                   * It is safe to do that because of two different guarantees
 240  251                   * that exist in the system. The first is that before we do a
 241  252                   * create, shutdown, or destroy, we ensure that nothing else is
 242  253                   * in progress in the system for this netstack and wait for it
 243  254                   * to complete. Secondly, because the zone is being created, we
 244  255                   * know that the following call to apply_all_netstacks will block
 245  256                   * on the zone finishing its initialization.
 246  257                   */
 247  258                  if (nms->nms_flags & NSS_CREATE_NEEDED)
 248  259                          nms->nms_flags &= ~NSS_CREATE_NEEDED;
 249  260  
 250  261                  if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
 251  262                      nms->nms_flags & NSS_CREATE_COMPLETED)
 252  263                          created = B_TRUE;
 253  264  
 254  265                  if (ns_reg[moduleid].nr_shutdown != NULL && created &&
 255  266                      (nms->nms_flags & NSS_CREATE_COMPLETED) &&
 256  267                      (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
 257  268                          nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 258  269                          DTRACE_PROBE2(netstack__shutdown__needed,
 259  270                              netstack_t *, ns, int, moduleid);
 260  271                  }
 261  272                  if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
 262  273                      ns_reg[moduleid].nr_destroy != NULL && created &&
 263  274                      (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
 264  275                          nms->nms_flags |= NSS_DESTROY_NEEDED;
 265  276                          DTRACE_PROBE2(netstack__destroy__needed,
 266  277                              netstack_t *, ns, int, moduleid);
 267  278                  }
 268  279                  mutex_exit(&ns->netstack_lock);
 269  280          }
 270  281          /*
 271  282           * Prevent any new netstack from calling the registered create
 272  283           * function, while keeping the function pointers in place until the
 273  284           * shutdown and destroy callbacks are complete.
 274  285           */
 275  286          ns_reg[moduleid].nr_flags |= NRF_DYING;
 276  287          mutex_exit(&netstack_g_lock);
 277  288  
 278  289          apply_all_netstacks(moduleid, netstack_apply_shutdown);
 279  290          apply_all_netstacks(moduleid, netstack_apply_destroy);
 280  291  
 281  292          /*
 282  293           * Clear the nms_flags so that we can handle this module
 283  294           * being loaded again.
 284  295           * Also remove the registered functions.
 285  296           */
 286  297          mutex_enter(&netstack_g_lock);
 287  298          ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
 288  299          ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
 289  300          for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
 290  301                  nm_state_t *nms = &ns->netstack_m_state[moduleid];
 291  302  
 292  303                  mutex_enter(&ns->netstack_lock);
 293  304                  if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
 294  305                          nms->nms_flags = 0;
 295  306                          DTRACE_PROBE2(netstack__destroy__done,
 296  307                              netstack_t *, ns, int, moduleid);
 297  308                  }
 298  309                  mutex_exit(&ns->netstack_lock);
 299  310          }
 300  311  
 301  312          ns_reg[moduleid].nr_create = NULL;
 302  313          ns_reg[moduleid].nr_shutdown = NULL;
 303  314          ns_reg[moduleid].nr_destroy = NULL;
 304  315          ns_reg[moduleid].nr_flags = 0;
 305  316          mutex_exit(&netstack_g_lock);
 306  317  }
 307  318  
 308  319  /*
 309  320   * Lookup and/or allocate a netstack for this zone.
            * Called as the ZSD create callback registered in netstack_init().
 310  321   */
 311  322  static void *
 312  323  netstack_zone_create(zoneid_t zoneid)
 313  324  {
 314  325          netstackid_t stackid;
 315  326          netstack_t *ns;
 316  327          netstack_t **nsp;
 317  328          zone_t  *zone;
 318  329          int i;
 319  330  
 320  331          ASSERT(netstack_initialized);
 321  332  
 322  333          zone = zone_find_by_id_nolock(zoneid);
 323  334          ASSERT(zone != NULL);
 324  335  
 325  336          if (zone->zone_flags & ZF_NET_EXCL) {
 326  337                  stackid = zoneid;
 327  338          } else {
 328  339                  /* Look for the stack instance for the global */
 329  340                  stackid = GLOBAL_NETSTACKID;
 330  341          }
 331  342  
 332  343          /* Allocate even if it isn't needed; simplifies locking */
 333  344          ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);
 334  345  
 335  346          /* Look if there is a matching stack instance */
 336  347          mutex_enter(&netstack_g_lock);
 337  348          for (nsp = &netstack_head; *nsp != NULL;
 338  349              nsp = &((*nsp)->netstack_next)) {
 339  350                  if ((*nsp)->netstack_stackid == stackid) {
 340  351                          /*
 341  352                           * Should never find a pre-existing exclusive stack
 342  353                           */
 343  354                          VERIFY(stackid == GLOBAL_NETSTACKID);
 344  355                          kmem_free(ns, sizeof (netstack_t));
 345  356                          ns = *nsp;
 346  357                          mutex_enter(&ns->netstack_lock);
 347  358                          ns->netstack_numzones++;
 348  359                          mutex_exit(&ns->netstack_lock);
 349  360                          mutex_exit(&netstack_g_lock);
 350  361                          DTRACE_PROBE1(netstack__inc__numzones,
 351  362                              netstack_t *, ns);
 352  363                          /* Record that we have a new shared stack zone */
 353  364                          netstack_shared_zone_add(zoneid);
 354  365                          zone->zone_netstack = ns;
 355  366                          return (ns);
 356  367                  }
 357  368          }
 358  369          /* Not found */
 359  370          mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
 360  371          cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
 361  372          ns->netstack_stackid = zoneid;
 362  373          ns->netstack_numzones = 1;
 363  374          ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
 364  375          ns->netstack_flags = NSF_UNINIT;
 365  376          *nsp = ns;
 366  377          zone->zone_netstack = ns;
 367  378  
 368  379          mutex_enter(&ns->netstack_lock);
 369  380          /*
 370  381           * Mark this netstack as having a CREATE running so
 371  382           * any netstack_register/netstack_unregister waits for
 372  383           * the existing create callbacks to complete in moduleid order
 373  384           */
 374  385          ns->netstack_flags |= NSF_ZONE_CREATE;
 375  386  
 376  387          /*
 377  388           * Determine the set of module create functions that need to be
 378  389           * called before we drop the lock.
 379  390           * Set NSS_CREATE_NEEDED for each of those.
 380  391           * Skip any with NRF_DYING set, since those are in the process of
 381  392           * going away, by checking for flags being exactly NRF_REGISTERED.
 382  393           */
 383  394          for (i = 0; i < NS_MAX; i++) {
 384  395                  nm_state_t *nms = &ns->netstack_m_state[i];
 385  396  
 386  397                  cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);
 387  398  
 388  399                  if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
 389  400                      (nms->nms_flags & NSS_CREATE_ALL) == 0) {
 390  401                          nms->nms_flags |= NSS_CREATE_NEEDED;
 391  402                          DTRACE_PROBE2(netstack__create__needed,
 392  403                              netstack_t *, ns, int, i);
 393  404                  }
 394  405          }
 395  406          mutex_exit(&ns->netstack_lock);
 396  407          mutex_exit(&netstack_g_lock);
 397  408  
 398  409          apply_all_modules(ns, netstack_apply_create);
 399  410  
 400  411          /* Tell any waiting netstack_register/netstack_unregister to proceed */
 401  412          mutex_enter(&ns->netstack_lock);
 402  413          ns->netstack_flags &= ~NSF_UNINIT;
 403  414          ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
 404  415          ns->netstack_flags &= ~NSF_ZONE_CREATE;
 405  416          cv_broadcast(&ns->netstack_cv);
 406  417          mutex_exit(&ns->netstack_lock);
 407  418  
 408  419          return (ns);
 409  420  }
 410  421  
 411  422  /* ARGSUSED */
 412  423  static void
 413  424  netstack_zone_shutdown(zoneid_t zoneid, void *arg)
 414  425  {
 415  426          netstack_t *ns = (netstack_t *)arg;
 416  427          int i;
 417  428  
 418  429          ASSERT(arg != NULL);
 419  430  
 420  431          mutex_enter(&ns->netstack_lock);
 421  432          ASSERT(ns->netstack_numzones > 0);
 422  433          if (ns->netstack_numzones != 1) {
 423  434                  /* Stack instance being used by other zone */
 424  435                  mutex_exit(&ns->netstack_lock);
 425  436                  ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
 426  437                  return;
 427  438          }
 428  439          mutex_exit(&ns->netstack_lock);
 429  440  
 430  441          mutex_enter(&netstack_g_lock);
 431  442          mutex_enter(&ns->netstack_lock);
 432  443          /*
 433  444           * Mark this netstack as having a SHUTDOWN running so
 434  445           * any netstack_register/netstack_unregister waits for
 435  446           * the existing shutdown callbacks to complete in moduleid order
 436  447           */
 437  448          ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
 438  449          ns->netstack_flags |= NSF_ZONE_SHUTDOWN;
 439  450  
 440  451          /*
 441  452           * Determine the set of stacks that exist before we drop the lock.
 442  453           * Set NSS_SHUTDOWN_NEEDED for each of those.
 443  454           */
 444  455          for (i = 0; i < NS_MAX; i++) {
 445  456                  nm_state_t *nms = &ns->netstack_m_state[i];
 446  457  
 447  458                  if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 448  459                      ns_reg[i].nr_shutdown != NULL &&
 449  460                      (nms->nms_flags & NSS_CREATE_COMPLETED) &&
 450  461                      (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
 451  462                          nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 452  463                          DTRACE_PROBE2(netstack__shutdown__needed,
 453  464                              netstack_t *, ns, int, i);
 454  465                  }
 455  466          }
 456  467          mutex_exit(&ns->netstack_lock);
 457  468          mutex_exit(&netstack_g_lock);
 458  469  
 459  470          /*
 460  471           * Call the shutdown function for all registered modules for this
 461  472           * netstack.
 462  473           */
 463  474          apply_all_modules_reverse(ns, netstack_apply_shutdown);
 464  475  
 465  476          /* Tell any waiting netstack_register/netstack_unregister to proceed */
 466  477          mutex_enter(&ns->netstack_lock);
 467  478          ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
 468  479          ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
 469  480          cv_broadcast(&ns->netstack_cv);
 470  481          mutex_exit(&ns->netstack_lock);
 471  482  }
 472  483  
 473  484  /*
 474  485   * Common routine to release a zone.
 475  486   * If this was the last zone using the stack instance then prepare to
 476  487   * have the refcnt dropping to zero free the zone.
 477  488   */
 478  489  /* ARGSUSED */
 479  490  static void
 480  491  netstack_zone_destroy(zoneid_t zoneid, void *arg)
 481  492  {
 482  493          netstack_t *ns = (netstack_t *)arg;
 483  494  
 484  495          ASSERT(arg != NULL);
 485  496  
 486  497          mutex_enter(&ns->netstack_lock);
 487  498          ASSERT(ns->netstack_numzones > 0);
 488  499          ns->netstack_numzones--;
 489  500          if (ns->netstack_numzones != 0) {
 490  501                  /* Stack instance being used by other zone */
 491  502                  mutex_exit(&ns->netstack_lock);
 492  503                  ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
 493  504                  /* Record that a shared stack zone has gone away */
 494  505                  netstack_shared_zone_remove(zoneid);
 495  506                  return;
 496  507          }
 497  508          /*
 498  509           * Set CLOSING so that netstack_find_by will not find it.
 499  510           */
 500  511          ns->netstack_flags |= NSF_CLOSING;
 501  512          mutex_exit(&ns->netstack_lock);
 502  513          DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
 503  514          /* No other thread can call zone_destroy for this stack */
 504  515  
 505  516          /*
 506  517           * Decrease refcnt to account for the one in netstack_zone_create()
 507  518           */
 508  519          netstack_rele(ns);
 509  520  }
 510  521  
 511  522  /*
 512  523   * Called when the reference count drops to zero.
 513  524   * Call the shutdown and destroy functions for each registered module.
 514  525   */
 515  526  static void
 516  527  netstack_stack_inactive(netstack_t *ns)
 517  528  {
 518  529          int i;
 519  530  
 520  531          mutex_enter(&netstack_g_lock);
 521  532          mutex_enter(&ns->netstack_lock);
 522  533          /*
 523  534           * Mark this netstack as having a DESTROY running so
 524  535           * any netstack_register/netstack_unregister waits for
 525  536           * the existing destroy callbacks to complete in reverse moduleid order
 526  537           */
 527  538          ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
 528  539          ns->netstack_flags |= NSF_ZONE_DESTROY;
 529  540          /*
 530  541           * If the shutdown callback wasn't called earlier (e.g., if this is
 531  542           * a netstack shared between multiple zones), then we schedule it now.
 532  543           *
 533  544           * Determine the set of stacks that exist before we drop the lock.
 534  545           * Set NSS_DESTROY_NEEDED for each of those. That
 535  546           * ensures that when we return all the callbacks for existing
 536  547           * instances have completed.
 537  548           */
 538  549          for (i = 0; i < NS_MAX; i++) {
 539  550                  nm_state_t *nms = &ns->netstack_m_state[i];
 540  551  
 541  552                  if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 542  553                      ns_reg[i].nr_shutdown != NULL &&
 543  554                      (nms->nms_flags & NSS_CREATE_COMPLETED) &&
 544  555                      (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
 545  556                          nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 546  557                          DTRACE_PROBE2(netstack__shutdown__needed,
 547  558                              netstack_t *, ns, int, i);
 548  559                  }
 549  560  
 550  561                  if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 551  562                      ns_reg[i].nr_destroy != NULL &&
 552  563                      (nms->nms_flags & NSS_CREATE_COMPLETED) &&
 553  564                      (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
 554  565                          nms->nms_flags |= NSS_DESTROY_NEEDED;
 555  566                          DTRACE_PROBE2(netstack__destroy__needed,
 556  567                              netstack_t *, ns, int, i);
 557  568                  }
 558  569          }
 559  570          mutex_exit(&ns->netstack_lock);
 560  571          mutex_exit(&netstack_g_lock);
 561  572  
 562  573          /*
 563  574           * Call the shutdown and destroy functions for all registered modules
 564  575           * for this netstack.
 565  576           *
 566  577           * Since there are some ordering dependencies between the modules we
 567  578           * tear them down in the reverse order of what was used to create them.
 568  579           *
 569  580           * Since a netstack_t is never reused (when a zone is rebooted it gets
 570  581           * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
 571  582           * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
 572  583           * That is different than in the netstack_unregister() case.
 573  584           */
 574  585          apply_all_modules_reverse(ns, netstack_apply_shutdown);
 575  586          apply_all_modules_reverse(ns, netstack_apply_destroy);
 576  587  
 577  588          /* Tell any waiting netstack_register/netstack_unregister to proceed */
 578  589          mutex_enter(&ns->netstack_lock);
 579  590          ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
 580  591          ns->netstack_flags &= ~NSF_ZONE_DESTROY;
 581  592          cv_broadcast(&ns->netstack_cv);
 582  593          mutex_exit(&ns->netstack_lock);
 583  594  }
 584  595  
 585  596  /*
 586  597   * Apply a function to all netstacks for a particular moduleid.
 587  598   *
 588  599   * If there is any zone activity (due to a zone being created, shutdown,
 589  600   * or destroyed) we wait for that to complete before we proceed. This ensures
 590  601   * that the moduleids are processed in order when a zone is created or
 591  602   * destroyed.
 592  603   *
 593  604   * The applyfn has to drop netstack_g_lock if it does some work.
 594  605   * In that case we don't follow netstack_next,
 595  606   * even if it is possible to do so without any hazards. This is
 596  607   * because we want the design to allow for the list of netstacks threaded
 597  608   * by netstack_next to change in any arbitrary way during the time the
 598  609   * lock was dropped.
 599  610   *
 600  611   * It is safe to restart the loop at netstack_head since the applyfn
 601  612   * changes netstack_m_state as it processes things, so a subsequent
 602  613   * pass through will have no effect in applyfn, hence the loop will terminate
 603  614   * in at worst O(N^2).
            *
            * In this file applyfn is one of netstack_apply_create/shutdown/destroy
            * (see netstack_register() and netstack_unregister()).
 604  615   */
 605  616  static void
 606  617  apply_all_netstacks(int moduleid, applyfn_t *applyfn)
 607  618  {
 608  619          netstack_t *ns;
 609  620  
 610  621          mutex_enter(&netstack_g_lock);
 611  622          ns = netstack_head;
 612  623          while (ns != NULL) {
 613  624                  if (wait_for_zone_creator(ns, &netstack_g_lock)) {
 614  625                          /* Lock dropped - restart at head */
 615  626                          ns = netstack_head;
 616  627                  } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
 617  628                          /* Lock dropped - restart at head */
 618  629                          ns = netstack_head;
 619  630                  } else {
 620  631                          ns = ns->netstack_next;
 621  632                  }
 622  633          }
 623  634          mutex_exit(&netstack_g_lock);
 624  635  }
 625  636  
 626  637  /*
 627  638   * Apply a function to all moduleids for a particular netstack.
 628  639   *
 629  640   * Since the netstack linkage doesn't matter in this case we can
 630  641   * ignore whether the function drops the lock.
 631  642   */
 632  643  static void
 633  644  apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
 634  645  {
 635  646          int i;
 636  647  
 637  648          mutex_enter(&netstack_g_lock);
 638  649          for (i = 0; i < NS_MAX; i++) {
 639  650                  /*
 640  651                   * We don't care whether the lock was dropped
 641  652                   * since we are not iterating over netstack_head.
 642  653                   */
 643  654                  (void) (applyfn)(&netstack_g_lock, ns, i);
 644  655          }
 645  656          mutex_exit(&netstack_g_lock);
 646  657  }
 647  658  
 648  659  /* Like the above but in reverse moduleid order */
 649  660  static void
 650  661  apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
 651  662  {
 652  663          int i;
 653  664  
 654  665          mutex_enter(&netstack_g_lock);
 655  666          for (i = NS_MAX-1; i >= 0; i--) {
 656  667                  /*
 657  668                   * We don't care whether the lock was dropped
 658  669                   * since we are not iterating over netstack_head.
 659  670                   */
 660  671                  (void) (applyfn)(&netstack_g_lock, ns, i);
 661  672          }
 662  673          mutex_exit(&netstack_g_lock);
 663  674  }
 664  675  
 665  676  /*
 666  677   * Call the create function for the ns and moduleid if CREATE_NEEDED
 667  678   * is set.
 668  679   * If some other thread gets here first and sets *_INPROGRESS, then
 669  680   * we wait for that thread to complete so that we can ensure that
 670  681   * all the callbacks are done when we've looped over all netstacks/moduleids.
 671  682   *
 672  683   * When we call the create function, we temporarily drop the netstack_lock
 673  684   * held by the caller, and return true to tell the caller it needs to
 674  685   * re-evaluate the state.
 675  686   */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	void *result;
	netstackid_t stackid;
	nm_state_t *nms = &ns->netstack_m_state[moduleid];
	boolean_t dropped = B_FALSE;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);

	/* Waiting may drop lockp; remember so the caller re-evaluates. */
	if (wait_for_nms_inprogress(ns, nms, lockp))
		dropped = B_TRUE;

	if (nms->nms_flags & NSS_CREATE_NEEDED) {
		/* Claim the work: NEEDED -> INPROGRESS under netstack_lock. */
		nms->nms_flags &= ~NSS_CREATE_NEEDED;
		nms->nms_flags |= NSS_CREATE_INPROGRESS;
		DTRACE_PROBE2(netstack__create__inprogress,
		    netstack_t *, ns, int, moduleid);
		/* Drop both locks before calling out to the module. */
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(ns_reg[moduleid].nr_create != NULL);
		stackid = ns->netstack_stackid;
		DTRACE_PROBE2(netstack__create__start,
		    netstackid_t, stackid,
		    netstack_t *, ns);
		result = (ns_reg[moduleid].nr_create)(stackid, ns);
		DTRACE_PROBE2(netstack__create__end,
		    void *, result, netstack_t *, ns);

		ASSERT(result != NULL);
		/* Reacquire in lockp -> netstack_lock order. */
		mutex_enter(lockp);
		mutex_enter(&ns->netstack_lock);
		ns->netstack_modules[moduleid] = result;
		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
		nms->nms_flags |= NSS_CREATE_COMPLETED;
		/* Wake any thread blocked in wait_for_nms_inprogress(). */
		cv_broadcast(&nms->nms_cv);
		DTRACE_PROBE2(netstack__create__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	}
}
 724  735  
 725  736  /*
 726  737   * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 727  738   * is set.
 728  739   * If some other thread gets here first and sets *_INPROGRESS, then
 729  740   * we wait for that thread to complete so that we can ensure that
 730  741   * all the callbacks are done when we've looped over all netstacks/moduleids.
 731  742   *
 732  743   * When we call the shutdown function, we temporarily drop the netstack_lock
 733  744   * held by the caller, and return true to tell the caller it needs to
 734  745   * re-evaluate the state.
 735  746   */
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	netstackid_t stackid;
	void * netstack_module;
	nm_state_t *nms = &ns->netstack_m_state[moduleid];
	boolean_t dropped = B_FALSE;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);

	/* Waiting may drop lockp; remember so the caller re-evaluates. */
	if (wait_for_nms_inprogress(ns, nms, lockp))
		dropped = B_TRUE;

	if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
		/* Claim the work: NEEDED -> INPROGRESS under netstack_lock. */
		nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
		nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(netstack__shutdown__inprogress,
		    netstack_t *, ns, int, moduleid);
		/* Drop both locks before calling out to the module. */
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
		stackid = ns->netstack_stackid;
		netstack_module = ns->netstack_modules[moduleid];
		DTRACE_PROBE2(netstack__shutdown__start,
		    netstackid_t, stackid,
		    void *, netstack_module);
		(ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
		DTRACE_PROBE1(netstack__shutdown__end,
		    netstack_t *, ns);

		/* Reacquire in lockp -> netstack_lock order. */
		mutex_enter(lockp);
		mutex_enter(&ns->netstack_lock);
		nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
		nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
		/* Wake any thread blocked in wait_for_nms_inprogress(). */
		cv_broadcast(&nms->nms_cv);
		DTRACE_PROBE2(netstack__shutdown__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	}
}
 783  794  
 784  795  /*
 785  796   * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 786  797   * is set.
 787  798   * If some other thread gets here first and sets *_INPROGRESS, then
 788  799   * we wait for that thread to complete so that we can ensure that
 789  800   * all the callbacks are done when we've looped over all netstacks/moduleids.
 790  801   *
 791  802   * When we call the destroy function, we temporarily drop the netstack_lock
 792  803   * held by the caller, and return true to tell the caller it needs to
 793  804   * re-evaluate the state.
 794  805   */
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	netstackid_t stackid;
	void * netstack_module;
	nm_state_t *nms = &ns->netstack_m_state[moduleid];
	boolean_t dropped = B_FALSE;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);

	/* Waiting may drop lockp; remember so the caller re-evaluates. */
	if (wait_for_nms_inprogress(ns, nms, lockp))
		dropped = B_TRUE;

	if (nms->nms_flags & NSS_DESTROY_NEEDED) {
		/* Claim the work: NEEDED -> INPROGRESS under netstack_lock. */
		nms->nms_flags &= ~NSS_DESTROY_NEEDED;
		nms->nms_flags |= NSS_DESTROY_INPROGRESS;
		DTRACE_PROBE2(netstack__destroy__inprogress,
		    netstack_t *, ns, int, moduleid);
		/* Drop both locks before calling out to the module. */
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(ns_reg[moduleid].nr_destroy != NULL);
		stackid = ns->netstack_stackid;
		netstack_module = ns->netstack_modules[moduleid];
		DTRACE_PROBE2(netstack__destroy__start,
		    netstackid_t, stackid,
		    void *, netstack_module);
		(ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
		DTRACE_PROBE1(netstack__destroy__end,
		    netstack_t *, ns);

		/* Reacquire in lockp -> netstack_lock order. */
		mutex_enter(lockp);
		mutex_enter(&ns->netstack_lock);
		/* The module instance is gone; clear the stale pointer. */
		ns->netstack_modules[moduleid] = NULL;
		nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
		nms->nms_flags |= NSS_DESTROY_COMPLETED;
		/* Wake any thread blocked in wait_for_nms_inprogress(). */
		cv_broadcast(&nms->nms_cv);
		DTRACE_PROBE2(netstack__destroy__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (dropped);
	}
}
 843  854  
 844  855  /*
 845  856   * If somebody is creating the netstack (due to a new zone being created)
 846  857   * then we wait for them to complete. This ensures that any additional
 847  858   * netstack_register() doesn't cause the create functions to run out of
 848  859   * order.
 849  860   * Note that we do not need such a global wait in the case of the shutdown
 850  861   * and destroy callbacks, since in that case it is sufficient for both
 851  862   * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
 852  863   * Returns true if lockp was temporarily dropped while waiting.
 853  864   */
static boolean_t
wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	mutex_enter(&ns->netstack_lock);
	while (ns->netstack_flags & NSF_ZONE_CREATE) {
		DTRACE_PROBE1(netstack__wait__zone__inprogress,
		    netstack_t *, ns);
		/*
		 * The outer lock (lockp, netstack_g_lock when non-NULL)
		 * can't be held across cv_wait; drop it and report that
		 * we did so.
		 */
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&ns->netstack_cv, &ns->netstack_lock);
		if (lockp != NULL) {
			/* First drop netstack_lock to preserve order */
			mutex_exit(&ns->netstack_lock);
			mutex_enter(lockp);
			mutex_enter(&ns->netstack_lock);
		}
	}
	mutex_exit(&ns->netstack_lock);
	return (dropped);
}
 878  889  
 879  890  /*
 880  891   * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
 881  892   * combination.
 882  893   * Returns true if lockp was temporarily dropped while waiting.
 883  894   */
/* Called, and returns, with ns->netstack_lock held. */
static boolean_t
wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (nms->nms_flags & NSS_ALL_INPROGRESS) {
		DTRACE_PROBE2(netstack__wait__nms__inprogress,
		    netstack_t *, ns, nm_state_t *, nms);
		/* Can't hold the outer lock (lockp) across cv_wait. */
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&nms->nms_cv, &ns->netstack_lock);
		if (lockp != NULL) {
			/* First drop netstack_lock to preserve order */
			mutex_exit(&ns->netstack_lock);
			mutex_enter(lockp);
			mutex_enter(&ns->netstack_lock);
		}
	}
	return (dropped);
}
 906  917  
 907  918  /*
 908  919   * Get the stack instance used in caller's zone.
 909  920   * Increases the reference count, caller must do a netstack_rele.
 910  921   * It can't be called after zone_destroy() has started.
 911  922   */
 912  923  netstack_t *
 913  924  netstack_get_current(void)
 914  925  {
 915  926          netstack_t *ns;
 916  927  
 917  928          ns = curproc->p_zone->zone_netstack;
 918  929          ASSERT(ns != NULL);
 919  930          if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 920  931                  return (NULL);
 921  932  
 922  933          netstack_hold(ns);
 923  934  
 924  935          return (ns);
 925  936  }
 926  937  
 927  938  /*
 928  939   * Find a stack instance given the cred.
 929  940   * This is used by the modules to potentially allow for a future when
 930  941   * something other than the zoneid is used to determine the stack.
 931  942   */
 932  943  netstack_t *
 933  944  netstack_find_by_cred(const cred_t *cr)
 934  945  {
 935  946          zoneid_t zoneid = crgetzoneid(cr);
 936  947  
 937  948          /* Handle the case when cr_zone is NULL */
 938  949          if (zoneid == (zoneid_t)-1)
 939  950                  zoneid = GLOBAL_ZONEID;
 940  951  
 941  952          /* For performance ... */
 942  953          if (curproc->p_zone->zone_id == zoneid)
 943  954                  return (netstack_get_current());
 944  955          else
 945  956                  return (netstack_find_by_zoneid(zoneid));
 946  957  }
 947  958  
 948  959  /*
 949  960   * Find a stack instance given the zoneid.
 950  961   * Increases the reference count if found; caller must do a
 951  962   * netstack_rele().
 952  963   *
 953  964   * If there is no exact match then assume the shared stack instance
 954  965   * matches.
 955  966   *
 956  967   * Skip the uninitialized ones.
 957  968   */
 958  969  netstack_t *
 959  970  netstack_find_by_zoneid(zoneid_t zoneid)
 960  971  {
 961  972          netstack_t *ns;
 962  973          zone_t *zone;
 963  974  
 964  975          zone = zone_find_by_id(zoneid);
 965  976  
 966  977          if (zone == NULL)
 967  978                  return (NULL);
 968  979  
 969  980          ns = zone->zone_netstack;
 970  981          ASSERT(ns != NULL);
 971  982          if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
 972  983                  ns = NULL;
 973  984          else
 974  985                  netstack_hold(ns);
 975  986  
 976  987          zone_rele(zone);
 977  988          return (ns);
 978  989  }
 979  990  
 980  991  /*
 981  992   * Find a stack instance given the zoneid. Can only be called from
 982  993   * the create callback. See the comments in zone_find_by_id_nolock why
 983  994   * that limitation exists.
 984  995   *
 985  996   * Increases the reference count if found; caller must do a
 986  997   * netstack_rele().
 987  998   *
 988  999   * If there is no exact match then assume the shared stack instance
 989 1000   * matches.
 990 1001   *
 991 1002   * Skip the uninitialized ones.
 992 1003   */
 993 1004  netstack_t *
 994 1005  netstack_find_by_zoneid_nolock(zoneid_t zoneid)
 995 1006  {
 996 1007          netstack_t *ns;
 997 1008          zone_t *zone;
 998 1009  
 999 1010          zone = zone_find_by_id_nolock(zoneid);
1000 1011  
1001 1012          if (zone == NULL)
1002 1013                  return (NULL);
1003 1014  
1004 1015          ns = zone->zone_netstack;
1005 1016          ASSERT(ns != NULL);
1006 1017  
1007 1018          if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
1008 1019                  ns = NULL;
1009 1020          else
1010 1021                  netstack_hold(ns);
1011 1022  
1012 1023          /* zone_find_by_id_nolock does not have a hold on the zone */
1013 1024          return (ns);
1014 1025  }
1015 1026  
1016 1027  /*
1017 1028   * Find a stack instance given the stackid with exact match?
1018 1029   * Increases the reference count if found; caller must do a
1019 1030   * netstack_rele().
1020 1031   *
1021 1032   * Skip the uninitialized ones.
1022 1033   */
netstack_t *
netstack_find_by_stackid(netstackid_t stackid)
{
	netstack_t *ns;

	mutex_enter(&netstack_g_lock);
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		mutex_enter(&ns->netstack_lock);
		if (ns->netstack_stackid == stackid &&
		    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
			/*
			 * Drop the per-stack lock before netstack_hold()
			 * (which acquires it itself); holding
			 * netstack_g_lock keeps the entry on the list in
			 * the meantime.
			 */
			mutex_exit(&ns->netstack_lock);
			netstack_hold(ns);
			mutex_exit(&netstack_g_lock);
			return (ns);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);
	return (NULL);
}
1043 1054  
1044 1055  boolean_t
1045 1056  netstack_inuse_by_stackid(netstackid_t stackid)
1046 1057  {
1047 1058          netstack_t *ns;
1048 1059          boolean_t rval = B_FALSE;
1049 1060  
1050 1061          mutex_enter(&netstack_g_lock);
1051 1062  
1052 1063          for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1053 1064                  if (ns->netstack_stackid == stackid) {
  
    | 
      ↓ open down ↓ | 
    913 lines elided | 
    
      ↑ open up ↑ | 
  
1054 1065                          rval = B_TRUE;
1055 1066                          break;
1056 1067                  }
1057 1068          }
1058 1069  
1059 1070          mutex_exit(&netstack_g_lock);
1060 1071  
1061 1072          return (rval);
1062 1073  }
1063 1074  
     1075 +
/*
 * Taskq callback that performs the actual teardown of a netstack whose
 * last reference and last zone went away (dispatched from
 * netstack_rele()): run the destroy callbacks, unlink the netstack from
 * the global list, and free it.
 */
static void
netstack_reap(void *arg)
{
	netstack_t **nsp, *ns = (netstack_t *)arg;
	boolean_t found;
	int i;

	/*
	 * Time to call the destroy functions and free up
	 * the structure
	 */
	netstack_stack_inactive(ns);

	/* Make sure nothing increased the references */
	ASSERT(ns->netstack_refcnt == 0);
	ASSERT(ns->netstack_numzones == 0);

	/* Finally remove from list of netstacks */
	mutex_enter(&netstack_g_lock);
	found = B_FALSE;
	for (nsp = &netstack_head; *nsp != NULL;
	    nsp = &(*nsp)->netstack_next) {
		if (*nsp == ns) {
			*nsp = ns->netstack_next;
			ns->netstack_next = NULL;
			found = B_TRUE;
			break;
		}
	}
	ASSERT(found);
	mutex_exit(&netstack_g_lock);

	/* Make sure nothing increased the references */
	ASSERT(ns->netstack_refcnt == 0);
	ASSERT(ns->netstack_numzones == 0);

	ASSERT(ns->netstack_flags & NSF_CLOSING);

	/* Destroy the per-module synchronization state before freeing. */
	for (i = 0; i < NS_MAX; i++) {
		nm_state_t *nms = &ns->netstack_m_state[i];

		cv_destroy(&nms->nms_cv);
	}
	mutex_destroy(&ns->netstack_lock);
	cv_destroy(&ns->netstack_cv);
	kmem_free(ns, sizeof (*ns));
	/* Allow another reap to be scheduled. */
	sema_v(&netstack_reap_limiter);
}
     1125 +
/*
 * Drop a reference on a netstack.  If this was the last reference and
 * the last zone, the actual teardown is deferred to netstack_reap() on
 * the system taskq (see the comment below), rate-limited by the
 * netstack_reap_limiter semaphore.
 */
void
netstack_rele(netstack_t *ns)
{
	int refcnt, numzones;

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_refcnt > 0);
	ns->netstack_refcnt--;
	/*
	 * As we drop the lock additional netstack_rele()s can come in
	 * and decrement the refcnt to zero and free the netstack_t.
	 * Store pointers in local variables and if we were not the last
	 * then don't reference the netstack_t after that.
	 */
	refcnt = ns->netstack_refcnt;
	numzones = ns->netstack_numzones;
	DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
	mutex_exit(&ns->netstack_lock);

	if (refcnt == 0 && numzones == 0) {
		/*
		 * Because there are possibilities of re-entrancy in various
		 * netstack structures by callers, which might cause a lock up
		 * due to odd reference models, or other factors, we choose to
		 * schedule the actual deletion of this netstack as a deferred
		 * task on the system taskq.  This way, any such reference
		 * models won't trip over themselves.
		 *
		 * Assume we aren't in a high-priority interrupt context, so
		 * we can use KM_SLEEP and semaphores.
		 */
		if (sema_tryp(&netstack_reap_limiter) == 0) {
			/*
			 * Indicate we're slamming against a limit.
			 */
			hrtime_t measurement = gethrtime();

			/* Block until an earlier reap finishes (sema_v). */
			sema_p(&netstack_reap_limiter);
			/* Capture delay in ns. */
			DTRACE_PROBE1(netstack__reap__rate__limited,
			    hrtime_t, gethrtime() - measurement);
		}

		/* TQ_SLEEP should prevent taskq_dispatch() from failing. */
		(void) taskq_dispatch(system_taskq, netstack_reap, ns,
		    TQ_SLEEP);
	}
}
1128 1174  
1129 1175  void
1130 1176  netstack_hold(netstack_t *ns)
1131 1177  {
1132 1178          mutex_enter(&ns->netstack_lock);
1133 1179          ns->netstack_refcnt++;
1134 1180          ASSERT(ns->netstack_refcnt > 0);
1135 1181          mutex_exit(&ns->netstack_lock);
1136 1182          DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
1137 1183  }
1138 1184  
1139 1185  /*
1140 1186   * To support kstat_create_netstack() using kstat_zone_add we need
1141 1187   * to track both
1142 1188   *  - all zoneids that use the global/shared stack
1143 1189   *  - all kstats that have been added for the shared stack
1144 1190   */
1145 1191  kstat_t *
1146 1192  kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
1147 1193      char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
1148 1194      netstackid_t ks_netstackid)
1149 1195  {
1150 1196          kstat_t *ks;
1151 1197  
1152 1198          if (ks_netstackid == GLOBAL_NETSTACKID) {
1153 1199                  ks = kstat_create_zone(ks_module, ks_instance, ks_name,
1154 1200                      ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
1155 1201                  if (ks != NULL)
1156 1202                          netstack_shared_kstat_add(ks);
1157 1203                  return (ks);
1158 1204          } else {
1159 1205                  zoneid_t zoneid = ks_netstackid;
1160 1206  
1161 1207                  return (kstat_create_zone(ks_module, ks_instance, ks_name,
1162 1208                      ks_class, ks_type, ks_ndata, ks_flags, zoneid));
1163 1209          }
1164 1210  }
1165 1211  
1166 1212  void
1167 1213  kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
1168 1214  {
1169 1215          if (ks_netstackid == GLOBAL_NETSTACKID) {
1170 1216                  netstack_shared_kstat_remove(ks);
1171 1217          }
1172 1218          kstat_delete(ks);
1173 1219  }
1174 1220  
1175 1221  static void
1176 1222  netstack_shared_zone_add(zoneid_t zoneid)
1177 1223  {
1178 1224          struct shared_zone_list *sz;
1179 1225          struct shared_kstat_list *sk;
1180 1226  
1181 1227          sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
1182 1228          sz->sz_zoneid = zoneid;
1183 1229  
1184 1230          /* Insert in list */
1185 1231          mutex_enter(&netstack_shared_lock);
1186 1232          sz->sz_next = netstack_shared_zones;
1187 1233          netstack_shared_zones = sz;
1188 1234  
1189 1235          /*
1190 1236           * Perform kstat_zone_add for each existing shared stack kstat.
1191 1237           * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1192 1238           */
1193 1239          for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1194 1240                  kstat_zone_add(sk->sk_kstat, zoneid);
1195 1241          }
1196 1242          mutex_exit(&netstack_shared_lock);
1197 1243  }
1198 1244  
1199 1245  static void
1200 1246  netstack_shared_zone_remove(zoneid_t zoneid)
1201 1247  {
1202 1248          struct shared_zone_list **szp, *sz;
1203 1249          struct shared_kstat_list *sk;
1204 1250  
1205 1251          /* Find in list */
1206 1252          mutex_enter(&netstack_shared_lock);
1207 1253          sz = NULL;
1208 1254          for (szp = &netstack_shared_zones; *szp != NULL;
1209 1255              szp = &((*szp)->sz_next)) {
1210 1256                  if ((*szp)->sz_zoneid == zoneid) {
1211 1257                          sz = *szp;
1212 1258                          break;
1213 1259                  }
1214 1260          }
1215 1261          /* We must find it */
1216 1262          ASSERT(sz != NULL);
1217 1263          *szp = sz->sz_next;
1218 1264          sz->sz_next = NULL;
1219 1265  
1220 1266          /*
1221 1267           * Perform kstat_zone_remove for each existing shared stack kstat.
1222 1268           * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1223 1269           */
1224 1270          for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1225 1271                  kstat_zone_remove(sk->sk_kstat, zoneid);
1226 1272          }
1227 1273          mutex_exit(&netstack_shared_lock);
1228 1274  
1229 1275          kmem_free(sz, sizeof (*sz));
1230 1276  }
1231 1277  
1232 1278  static void
1233 1279  netstack_shared_kstat_add(kstat_t *ks)
1234 1280  {
1235 1281          struct shared_zone_list *sz;
1236 1282          struct shared_kstat_list *sk;
1237 1283  
1238 1284          sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
1239 1285          sk->sk_kstat = ks;
1240 1286  
1241 1287          /* Insert in list */
1242 1288          mutex_enter(&netstack_shared_lock);
1243 1289          sk->sk_next = netstack_shared_kstats;
1244 1290          netstack_shared_kstats = sk;
1245 1291  
1246 1292          /*
1247 1293           * Perform kstat_zone_add for each existing shared stack zone.
1248 1294           * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1249 1295           */
1250 1296          for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1251 1297                  kstat_zone_add(ks, sz->sz_zoneid);
1252 1298          }
1253 1299          mutex_exit(&netstack_shared_lock);
1254 1300  }
1255 1301  
static void
netstack_shared_kstat_remove(kstat_t *ks)
{
	struct shared_zone_list *sz;
	struct shared_kstat_list **skp, *sk;

	/* Find in list */
	mutex_enter(&netstack_shared_lock);
	sk = NULL;
	for (skp = &netstack_shared_kstats; *skp != NULL;
	    skp = &((*skp)->sk_next)) {
		if ((*skp)->sk_kstat == ks) {
			sk = *skp;
			break;
		}
	}
	/* Must find it */
	ASSERT(sk != NULL);
	*skp = sk->sk_next;
	sk->sk_next = NULL;

	/*
	 * Perform kstat_zone_remove for each existing shared stack zone.
	 * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
	 */
	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
		kstat_zone_remove(ks, sz->sz_zoneid);
	}
	mutex_exit(&netstack_shared_lock);
	kmem_free(sk, sizeof (*sk));
}
1287 1333  
1288 1334  /*
1289 1335   * If a zoneid is part of the shared zone, return true
1290 1336   */
1291 1337  static boolean_t
1292 1338  netstack_find_shared_zoneid(zoneid_t zoneid)
1293 1339  {
1294 1340          struct shared_zone_list *sz;
1295 1341  
1296 1342          mutex_enter(&netstack_shared_lock);
1297 1343          for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1298 1344                  if (sz->sz_zoneid == zoneid) {
1299 1345                          mutex_exit(&netstack_shared_lock);
1300 1346                          return (B_TRUE);
1301 1347                  }
1302 1348          }
1303 1349          mutex_exit(&netstack_shared_lock);
1304 1350          return (B_FALSE);
1305 1351  }
1306 1352  
1307 1353  /*
1308 1354   * Hide the fact that zoneids and netstackids are allocated from
1309 1355   * the same space in the current implementation.
1310 1356   * We currently do not check that the stackid/zoneids are valid, since there
1311 1357   * is no need for that. But this should only be done for ids that are
1312 1358   * valid.
1313 1359   */
1314 1360  zoneid_t
1315 1361  netstackid_to_zoneid(netstackid_t stackid)
1316 1362  {
1317 1363          return (stackid);
1318 1364  }
1319 1365  
1320 1366  netstackid_t
1321 1367  zoneid_to_netstackid(zoneid_t zoneid)
1322 1368  {
1323 1369          if (netstack_find_shared_zoneid(zoneid))
1324 1370                  return (GLOBAL_ZONEID);
1325 1371          else
1326 1372                  return (zoneid);
1327 1373  }
1328 1374  
1329 1375  zoneid_t
1330 1376  netstack_get_zoneid(netstack_t *ns)
1331 1377  {
1332 1378          return (netstackid_to_zoneid(ns->netstack_stackid));
1333 1379  }
1334 1380  
1335 1381  /*
1336 1382   * Simplistic support for walking all the handles.
1337 1383   * Example usage:
1338 1384   *      netstack_handle_t nh;
1339 1385   *      netstack_t *ns;
1340 1386   *
1341 1387   *      netstack_next_init(&nh);
1342 1388   *      while ((ns = netstack_next(&nh)) != NULL) {
1343 1389   *              do something;
1344 1390   *              netstack_rele(ns);
1345 1391   *      }
1346 1392   *      netstack_next_fini(&nh);
1347 1393   */
1348 1394  void
1349 1395  netstack_next_init(netstack_handle_t *handle)
1350 1396  {
1351 1397          *handle = 0;
1352 1398  }
1353 1399  
/*
 * End a netstack walk.  The handle holds no resources, so there is
 * nothing to release; this exists for symmetry with netstack_next_init.
 */
/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
}
1359 1405  
/*
 * Return the next live netstack in a walk, or NULL when the walk is
 * complete.  Each call re-walks the list from netstack_head under
 * netstack_g_lock, skipping the *handle entries already consumed, then
 * skipping any stacks that are not fully created or are being torn down.
 * A hold is placed on the returned netstack; the caller must drop it
 * with netstack_rele().
 */
netstack_t *
netstack_next(netstack_handle_t *handle)
{
	netstack_t *ns;
	int i, end;

	end = *handle;
	/* Walk skipping *handle number of instances */

	/* Look if there is a matching stack instance */
	mutex_enter(&netstack_g_lock);
	ns = netstack_head;
	for (i = 0; i < end; i++) {
		if (ns == NULL)
			break;
		ns = ns->netstack_next;
	}
	/*
	 * Skip stacks that aren't really here: still initializing
	 * (NSF_UNINIT) or already being destroyed (NSF_CLOSING).  `end'
	 * is bumped for each one so the next call resumes past them.
	 */
	while (ns != NULL) {
		mutex_enter(&ns->netstack_lock);
		if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
			mutex_exit(&ns->netstack_lock);
			break;
		}
		mutex_exit(&ns->netstack_lock);
		end++;
		ns = ns->netstack_next;
	}
	if (ns != NULL) {
		/* Remember position and hold the stack for the caller */
		*handle = end + 1;
		netstack_hold(ns);
	}
	mutex_exit(&netstack_g_lock);
	return (ns);
}
  
    | 
      ↓ open down ↓ | 
    259 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX