Print this page
    
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/nfs/nfs4_db.c
          +++ new/usr/src/uts/common/fs/nfs/nfs4_db.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  
    | 
      ↓ open down ↓ | 
    10 lines elided | 
    
      ↑ open up ↑ | 
  
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23   24   */
  24   25  
       26 +/*
       27 + * Copyright 2018 Nexenta Systems, Inc.
       28 + */
       29 +
  25   30  #include <sys/systm.h>
  26   31  #include <sys/cmn_err.h>
  27   32  #include <sys/kmem.h>
  28   33  #include <sys/disp.h>
  29   34  #include <sys/id_space.h>
  30   35  #include <sys/atomic.h>
  31   36  #include <rpc/rpc.h>
  32   37  #include <nfs/nfs4.h>
  33   38  #include <nfs/nfs4_db_impl.h>
  34   39  #include <sys/sdt.h>
  35   40  
  36   41  static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
  37   42  
  38   43  static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
  39   44  static void rfs4_dbe_destroy(rfs4_dbe_t *);
  40   45  static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
  41   46  static void rfs4_start_reaper(rfs4_table_t *);
  42   47  
  43   48  /*
  44   49   * t_lowat - integer percentage of table entries        /etc/system only
  45   50   * t_hiwat - integer percentage of table entries        /etc/system only
  46   51   * t_lreap - integer percentage of table reap time      mdb or /etc/system
  47   52   * t_hreap - integer percentage of table reap time      mdb or /etc/system
  48   53   */
  49   54  uint32_t        t_lowat = 50;   /* reap at t_lreap when id's in use hit 50% */
  50   55  uint32_t        t_hiwat = 75;   /* reap at t_hreap when id's in use hit 75% */
  51   56  time_t          t_lreap = 50;   /* default to 50% of table's reap interval */
  52   57  time_t          t_hreap = 10;   /* default to 10% of table's reap interval */
  53   58  
  54   59  id_t
  55   60  rfs4_dbe_getid(rfs4_dbe_t *entry)
  56   61  {
  57   62          return (entry->dbe_id);
  58   63  }
  59   64  
  60   65  void
  61   66  rfs4_dbe_hold(rfs4_dbe_t *entry)
  62   67  {
  63   68          atomic_inc_32(&entry->dbe_refcnt);
  64   69  }
  65   70  
  66   71  /*
  67   72   * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
  68   73   */
  69   74  void
  70   75  rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
  71   76  {
  72   77          atomic_dec_32(&entry->dbe_refcnt);
  73   78  }
  74   79  
  75   80  
  76   81  uint32_t
  77   82  rfs4_dbe_refcnt(rfs4_dbe_t *entry)
  78   83  {
  79   84          return (entry->dbe_refcnt);
  80   85  }
  81   86  
  82   87  /*
  83   88   * Mark an entry such that the dbsearch will skip it.
  84   89   * Caller does not want this entry to be found any longer
  85   90   */
  86   91  void
  87   92  rfs4_dbe_invalidate(rfs4_dbe_t *entry)
  88   93  {
  89   94          entry->dbe_invalid = TRUE;
  90   95          entry->dbe_skipsearch = TRUE;
  91   96  }
  92   97  
  93   98  /*
  94   99   * Is this entry invalid?
  95  100   */
  96  101  bool_t
  97  102  rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
  98  103  {
  99  104          return (entry->dbe_invalid);
 100  105  }
 101  106  
 102  107  time_t
 103  108  rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
 104  109  {
 105  110          return (entry->dbe_time_rele);
 106  111  }
 107  112  
 108  113  /*
 109  114   * Use these to temporarily hide/unhide a db entry.
 110  115   */
 111  116  void
 112  117  rfs4_dbe_hide(rfs4_dbe_t *entry)
 113  118  {
 114  119          rfs4_dbe_lock(entry);
 115  120          entry->dbe_skipsearch = TRUE;
 116  121          rfs4_dbe_unlock(entry);
 117  122  }
 118  123  
 119  124  void
 120  125  rfs4_dbe_unhide(rfs4_dbe_t *entry)
 121  126  {
 122  127          rfs4_dbe_lock(entry);
 123  128          entry->dbe_skipsearch = FALSE;
 124  129          rfs4_dbe_unlock(entry);
 125  130  }
 126  131  
 127  132  void
 128  133  rfs4_dbe_rele(rfs4_dbe_t *entry)
 129  134  {
 130  135          mutex_enter(entry->dbe_lock);
 131  136          ASSERT(entry->dbe_refcnt > 1);
 132  137          atomic_dec_32(&entry->dbe_refcnt);
 133  138          entry->dbe_time_rele = gethrestime_sec();
 134  139          mutex_exit(entry->dbe_lock);
 135  140  }
 136  141  
 137  142  void
 138  143  rfs4_dbe_lock(rfs4_dbe_t *entry)
 139  144  {
 140  145          mutex_enter(entry->dbe_lock);
 141  146  }
 142  147  
 143  148  void
 144  149  rfs4_dbe_unlock(rfs4_dbe_t *entry)
 145  150  {
 146  151          mutex_exit(entry->dbe_lock);
 147  152  }
 148  153  
 149  154  bool_t
 150  155  rfs4_dbe_islocked(rfs4_dbe_t *entry)
 151  156  {
 152  157          return (mutex_owned(entry->dbe_lock));
 153  158  }
 154  159  
 155  160  clock_t
 156  161  rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
 157  162  {
 158  163          return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
 159  164  }
 160  165  
 161  166  void
 162  167  rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
 163  168  {
 164  169          cv_broadcast(entry->dbe_cv);
 165  170  }
 166  171  
 167  172  /* ARGSUSED */
 168  173  static int
 169  174  rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
 170  175  {
 171  176          rfs4_dbe_t *entry = obj;
 172  177  
 173  178          mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
 174  179          cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
 175  180  
 176  181          return (0);
 177  182  }
 178  183  
 179  184  static void
 180  185  rfs4_dbe_kmem_destructor(void *obj, void *private)
 181  186  {
 182  187          rfs4_dbe_t *entry = obj;
 183  188          /*LINTED*/
 184  189          rfs4_table_t *table = private;
 185  190  
 186  191          mutex_destroy(entry->dbe_lock);
 187  192          cv_destroy(entry->dbe_cv);
 188  193  }
 189  194  
 190  195  rfs4_database_t *
 191  196  rfs4_database_create(uint32_t flags)
 192  197  {
 193  198          rfs4_database_t *db;
 194  199  
 195  200          db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
 196  201          mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
 197  202          db->db_tables = NULL;
 198  203          db->db_debug_flags = flags;
 199  204          db->db_shutdown_count = 0;
 200  205          cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
 201  206          return (db);
 202  207  }
 203  208  
 204  209  
 205  210  /*
 206  211   * The reaper threads that have been created for the tables in this
 207  212   * database must be stopped and the entries in the tables released.
 208  213   * Each table will be marked as "shutdown" and the reaper threads
 209  214   * poked and they will see that a shutdown is in progress and cleanup
 210  215   * and exit.  This function waits for all reaper threads to stop
 211  216   * before returning to the caller.
 212  217   */
 213  218  void
 214  219  rfs4_database_shutdown(rfs4_database_t *db)
 215  220  {
 216  221          rfs4_table_t *table;
 217  222  
 218  223          mutex_enter(db->db_lock);
 219  224          for (table = db->db_tables; table; table = table->dbt_tnext) {
 220  225                  mutex_enter(&table->dbt_reaper_cv_lock);
 221  226                  table->dbt_reaper_shutdown = TRUE;
 222  227                  cv_broadcast(&table->dbt_reaper_wait);
 223  228                  db->db_shutdown_count++;
 224  229                  mutex_exit(&table->dbt_reaper_cv_lock);
 225  230          }
 226  231          while (db->db_shutdown_count > 0) {
 227  232                  cv_wait(&db->db_shutdown_wait, db->db_lock);
 228  233          }
 229  234          mutex_exit(db->db_lock);
 230  235  }
 231  236  
 232  237  /*
 233  238   * Given a database that has been "shutdown" by the function above all
 234  239   * of the table tables are destroyed and then the database itself
 235  240   * freed.
 236  241   */
 237  242  void
 238  243  rfs4_database_destroy(rfs4_database_t *db)
 239  244  {
 240  245          rfs4_table_t *next, *tmp;
 241  246  
  
    | 
      ↓ open down ↓ | 
    207 lines elided | 
    
      ↑ open up ↑ | 
  
 242  247          for (next = db->db_tables; next; ) {
 243  248                  tmp = next;
 244  249                  next = tmp->dbt_tnext;
 245  250                  rfs4_table_destroy(db, tmp);
 246  251          }
 247  252  
 248  253          mutex_destroy(db->db_lock);
 249  254          kmem_free(db, sizeof (rfs4_database_t));
 250  255  }
 251  256  
      257 +/*
      258 + * Used to get the correct kmem_cache database for the state table being
      259 + * created.
      260 + * Helper function for rfs4_table_create
      261 + */
      262 +static kmem_cache_t *
      263 +get_db_mem_cache(char *name)
      264 +{
      265 +        int i;
      266 +
      267 +        for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
      268 +                if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
      269 +                        return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
      270 +        }
      271 +        /*
      272 +         * There is no associated kmem cache for this NFS4 server state
      273 +         * table name
      274 +         */
      275 +        return (NULL);
      276 +}
      277 +
      278 +/*
      279 + * Used to initialize the global NFSv4 server state database.
      280 + * Helper funtion for rfs4_state_g_init and called when module is loaded.
      281 + */
      282 +kmem_cache_t *
      283 +/* CSTYLED */
      284 +nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
      285 +{
      286 +        kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
      287 +            sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
      288 +            0,
      289 +            rfs4_dbe_kmem_constructor,
      290 +            rfs4_dbe_kmem_destructor,
      291 +            NULL,
      292 +            NULL,
      293 +            NULL,
      294 +            0);
      295 +        (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
      296 +            strlen(cache_name) + 1);
      297 +        rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
      298 +        return (mem_cache);
      299 +}
      300 +
 252  301  rfs4_table_t *
 253  302  rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
 254  303      uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
 255  304      void (*destroy)(rfs4_entry_t),
 256  305      bool_t (*expiry)(rfs4_entry_t),
 257  306      uint32_t size, uint32_t hashsize,
 258  307      uint32_t maxentries, id_t start)
 259  308  {
 260  309          rfs4_table_t    *table;
 261  310          int              len;
 262  311          char            *cache_name;
 263  312          char            *id_name;
 264  313  
 265  314          table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
 266  315          table->dbt_db = db;
 267  316          rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
 268  317          mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
 269  318          mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
 270  319          cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
 271  320  
 272  321          len = strlen(tabname);
 273  322          table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
 274  323          cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
 275  324          (void) strcpy(table->dbt_name, tabname);
 276  325          (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
 277  326          table->dbt_max_cache_time = max_cache_time;
 278  327          table->dbt_usize = size;
 279  328          table->dbt_len = hashsize;
 280  329          table->dbt_count = 0;
 281  330          table->dbt_idxcnt = 0;
 282  331          table->dbt_ccnt = 0;
 283  332          table->dbt_maxcnt = idxcnt;
 284  333          table->dbt_indices = NULL;
 285  334          table->dbt_id_space = NULL;
 286  335          table->dbt_reaper_shutdown = FALSE;
 287  336  
 288  337          if (start >= 0) {
 289  338                  if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
 290  339                          maxentries = INT32_MAX - start;
 291  340                  id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
 292  341                  (void) sprintf(id_name, "%s_id_space", table->dbt_name);
 293  342                  table->dbt_id_space = id_space_create(id_name, start,
 294  343                      maxentries + start);
 295  344                  kmem_free(id_name, len + 10);
 296  345          }
  
    | 
      ↓ open down ↓ | 
    35 lines elided | 
    
      ↑ open up ↑ | 
  
 297  346          ASSERT(t_lowat != 0);
 298  347          table->dbt_id_lwat = (maxentries * t_lowat) / 100;
 299  348          ASSERT(t_hiwat != 0);
 300  349          table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
 301  350          table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
 302  351          table->dbt_maxentries = maxentries;
 303  352          table->dbt_create = create;
 304  353          table->dbt_destroy = destroy;
 305  354          table->dbt_expiry = expiry;
 306  355  
 307      -        table->dbt_mem_cache = kmem_cache_create(cache_name,
 308      -            sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
 309      -            0,
 310      -            rfs4_dbe_kmem_constructor,
 311      -            rfs4_dbe_kmem_destructor,
 312      -            NULL,
 313      -            table,
 314      -            NULL,
 315      -            0);
      356 +        /*
      357 +         * get the correct kmem_cache for this table type based on the name.
      358 +         */
      359 +        table->dbt_mem_cache = get_db_mem_cache(cache_name);
      360 +
 316  361          kmem_free(cache_name, len+13);
 317  362  
 318  363          table->dbt_debug = db->db_debug_flags;
 319  364  
 320  365          mutex_enter(db->db_lock);
 321  366          table->dbt_tnext = db->db_tables;
 322  367          db->db_tables = table;
 323  368          mutex_exit(db->db_lock);
 324  369  
 325  370          rfs4_start_reaper(table);
 326  371  
 327  372          return (table);
 328  373  }
 329  374  
 330  375  void
 331  376  rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
 332  377  {
 333  378          rfs4_table_t *p;
 334  379          rfs4_index_t *idx;
 335  380  
 336  381          ASSERT(table->dbt_count == 0);
 337  382  
 338  383          mutex_enter(db->db_lock);
 339  384          if (table == db->db_tables)
 340  385                  db->db_tables = table->dbt_tnext;
 341  386          else {
 342  387                  for (p = db->db_tables; p; p = p->dbt_tnext)
 343  388                          if (p->dbt_tnext == table) {
 344  389                                  p->dbt_tnext = table->dbt_tnext;
 345  390                                  table->dbt_tnext = NULL;
 346  391                                  break;
 347  392                          }
 348  393                  ASSERT(p != NULL);
 349  394          }
 350  395          mutex_exit(db->db_lock);
 351  396  
 352  397          /* Destroy indices */
 353  398          while (table->dbt_indices) {
 354  399                  idx = table->dbt_indices;
 355  400                  table->dbt_indices = idx->dbi_inext;
 356  401                  rfs4_index_destroy(idx);
  
    | 
      ↓ open down ↓ | 
    31 lines elided | 
    
      ↑ open up ↑ | 
  
 357  402          }
 358  403  
 359  404          rw_destroy(table->dbt_t_lock);
 360  405          mutex_destroy(table->dbt_lock);
 361  406          mutex_destroy(&table->dbt_reaper_cv_lock);
 362  407          cv_destroy(&table->dbt_reaper_wait);
 363  408  
 364  409          kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
 365  410          if (table->dbt_id_space)
 366  411                  id_space_destroy(table->dbt_id_space);
 367      -        kmem_cache_destroy(table->dbt_mem_cache);
      412 +        table->dbt_mem_cache = NULL;
 368  413          kmem_free(table, sizeof (rfs4_table_t));
 369  414  }
 370  415  
 371  416  rfs4_index_t *
 372  417  rfs4_index_create(rfs4_table_t *table, char *keyname,
 373  418      uint32_t (*hash)(void *),
 374  419      bool_t (compare)(rfs4_entry_t, void *),
 375  420      void *(*mkkey)(rfs4_entry_t),
 376  421      bool_t createable)
 377  422  {
 378  423          rfs4_index_t *idx;
 379  424  
 380  425          ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
 381  426  
 382  427          idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
 383  428  
 384  429          idx->dbi_table = table;
 385  430          idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
 386  431          (void) strcpy(idx->dbi_keyname, keyname);
 387  432          idx->dbi_hash = hash;
 388  433          idx->dbi_compare = compare;
 389  434          idx->dbi_mkkey = mkkey;
 390  435          idx->dbi_tblidx = table->dbt_idxcnt;
 391  436          table->dbt_idxcnt++;
 392  437          if (createable) {
 393  438                  table->dbt_ccnt++;
 394  439                  if (table->dbt_ccnt > 1)
 395  440                          panic("Table %s currently can have only have one "
 396  441                              "index that will allow creation of entries",
 397  442                              table->dbt_name);
 398  443                  idx->dbi_createable = TRUE;
 399  444          } else {
 400  445                  idx->dbi_createable = FALSE;
 401  446          }
 402  447  
 403  448          idx->dbi_inext = table->dbt_indices;
 404  449          table->dbt_indices = idx;
 405  450          idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
 406  451              KM_SLEEP);
 407  452  
 408  453          return (idx);
 409  454  }
 410  455  
 411  456  void
 412  457  rfs4_index_destroy(rfs4_index_t *idx)
 413  458  {
 414  459          kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
 415  460          kmem_free(idx->dbi_buckets,
 416  461              sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
 417  462          kmem_free(idx, sizeof (rfs4_index_t));
 418  463  }
 419  464  
 420  465  static void
 421  466  rfs4_dbe_destroy(rfs4_dbe_t *entry)
 422  467  {
 423  468          rfs4_index_t *idx;
 424  469          void *key;
 425  470          int i;
 426  471          rfs4_bucket_t *bp;
 427  472          rfs4_table_t *table = entry->dbe_table;
 428  473          rfs4_link_t *l;
 429  474  
 430  475          NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
 431  476              (CE_NOTE, "Destroying entry %p from %s",
 432  477              (void*)entry, table->dbt_name));
 433  478  
 434  479          mutex_enter(entry->dbe_lock);
 435  480          ASSERT(entry->dbe_refcnt == 0);
 436  481          mutex_exit(entry->dbe_lock);
 437  482  
 438  483          /* Unlink from all indices */
 439  484          for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
 440  485                  l = &entry->dbe_indices[idx->dbi_tblidx];
 441  486                  /* check and see if we were ever linked in to the index */
 442  487                  if (INVALID_LINK(l)) {
 443  488                          ASSERT(l->next == NULL && l->prev == NULL);
 444  489                          continue;
 445  490                  }
 446  491                  key = idx->dbi_mkkey(entry->dbe_data);
 447  492                  i = HASH(idx, key);
 448  493                  bp = &idx->dbi_buckets[i];
 449  494                  ASSERT(bp->dbk_head != NULL);
 450  495                  DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
 451  496          }
 452  497  
 453  498          /* Destroy user data */
 454  499          if (table->dbt_destroy)
 455  500                  (*table->dbt_destroy)(entry->dbe_data);
 456  501  
 457  502          if (table->dbt_id_space)
 458  503                  id_free(table->dbt_id_space, entry->dbe_id);
 459  504  
 460  505          mutex_enter(table->dbt_lock);
 461  506          table->dbt_count--;
 462  507          mutex_exit(table->dbt_lock);
 463  508  
 464  509          /* Destroy the entry itself */
 465  510          kmem_cache_free(table->dbt_mem_cache, entry);
 466  511  }
 467  512  
 468  513  
 469  514  static rfs4_dbe_t *
 470  515  rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
 471  516  {
 472  517          rfs4_dbe_t *entry;
 473  518          int i;
 474  519  
 475  520          NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 476  521              (CE_NOTE, "Creating entry in table %s", table->dbt_name));
 477  522  
 478  523          entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
 479  524  
 480  525          entry->dbe_refcnt = 1;
 481  526          entry->dbe_invalid = FALSE;
 482  527          entry->dbe_skipsearch = FALSE;
 483  528          entry->dbe_time_rele = 0;
 484  529          entry->dbe_id = 0;
 485  530  
 486  531          if (table->dbt_id_space)
 487  532                  entry->dbe_id = id;
 488  533          entry->dbe_table = table;
 489  534  
 490  535          for (i = 0; i < table->dbt_maxcnt; i++) {
 491  536                  entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
 492  537                  entry->dbe_indices[i].entry = entry;
 493  538                  /*
 494  539                   * We mark the entry as not indexed by setting the low
 495  540                   * order bit, since address are word aligned. This has
 496  541                   * the advantage of causeing a trap if the address is
 497  542                   * used. After the entry is linked in to the
 498  543                   * corresponding index the bit will be cleared.
 499  544                   */
 500  545                  INVALIDATE_ADDR(entry->dbe_indices[i].entry);
 501  546          }
 502  547  
 503  548          entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
 504  549          bzero(entry->dbe_data, table->dbt_usize);
 505  550          entry->dbe_data->dbe = entry;
 506  551  
 507  552          if (!(*table->dbt_create)(entry->dbe_data, data)) {
 508  553                  kmem_cache_free(table->dbt_mem_cache, entry);
 509  554                  return (NULL);
 510  555          }
 511  556  
 512  557          mutex_enter(table->dbt_lock);
 513  558          table->dbt_count++;
 514  559          mutex_exit(table->dbt_lock);
 515  560  
 516  561          return (entry);
 517  562  }
 518  563  
 519  564  static void
 520  565  rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
 521  566  {
 522  567          clock_t         tabreap;
 523  568          clock_t         reap_int;
 524  569          uint32_t        in_use;
 525  570  
 526  571          /*
 527  572           * Adjust the table's reap interval based on the
 528  573           * number of id's currently in use. Each table's
 529  574           * default remains the same if id usage subsides.
 530  575           */
 531  576          ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
 532  577          tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
 533  578  
 534  579          in_use = table->dbt_count + 1;  /* see rfs4_dbe_create */
 535  580          if (in_use >= table->dbt_id_hwat) {
 536  581                  ASSERT(t_hreap != 0);
 537  582                  reap_int = (tabreap * t_hreap) / 100;
 538  583          } else if (in_use >= table->dbt_id_lwat) {
 539  584                  ASSERT(t_lreap != 0);
 540  585                  reap_int = (tabreap * t_lreap) / 100;
 541  586          } else {
 542  587                  reap_int = tabreap;
 543  588          }
 544  589          table->dbt_id_reap = reap_int;
 545  590          DTRACE_PROBE2(table__reap__interval, char *,
 546  591              table->dbt_name, time_t, table->dbt_id_reap);
 547  592  }
 548  593  
 549  594  rfs4_entry_t
 550  595  rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
 551  596      rfs4_dbsearch_type_t dbsearch_type)
 552  597  {
 553  598          int              already_done;
 554  599          uint32_t         i;
 555  600          rfs4_table_t    *table = idx->dbi_table;
 556  601          rfs4_index_t    *ip;
 557  602          rfs4_bucket_t   *bp;
 558  603          rfs4_link_t     *l;
 559  604          rfs4_dbe_t      *entry;
 560  605          id_t             id = -1;
 561  606  
 562  607          i = HASH(idx, key);
 563  608          bp = &idx->dbi_buckets[i];
 564  609  
 565  610          NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
 566  611              (CE_NOTE, "Searching for key %p in table %s by %s",
 567  612              key, table->dbt_name, idx->dbi_keyname));
 568  613  
 569  614          rw_enter(bp->dbk_lock, RW_READER);
 570  615  retry:
 571  616          for (l = bp->dbk_head; l; l = l->next) {
 572  617                  if (l->entry->dbe_refcnt > 0 &&
 573  618                      (l->entry->dbe_skipsearch == FALSE ||
 574  619                      (l->entry->dbe_skipsearch == TRUE &&
 575  620                      dbsearch_type == RFS4_DBS_INVALID)) &&
 576  621                      (*idx->dbi_compare)(l->entry->dbe_data, key)) {
 577  622                          mutex_enter(l->entry->dbe_lock);
 578  623                          if (l->entry->dbe_refcnt == 0) {
 579  624                                  mutex_exit(l->entry->dbe_lock);
 580  625                                  continue;
 581  626                          }
 582  627  
 583  628                          /* place an additional hold since we are returning */
 584  629                          rfs4_dbe_hold(l->entry);
 585  630  
 586  631                          mutex_exit(l->entry->dbe_lock);
 587  632                          rw_exit(bp->dbk_lock);
 588  633  
 589  634                          *create = FALSE;
 590  635  
 591  636                          NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
 592  637                              (CE_NOTE, "Found entry %p for %p in table %s",
 593  638                              (void *)l->entry, key, table->dbt_name));
 594  639  
 595  640                          if (id != -1)
 596  641                                  id_free(table->dbt_id_space, id);
 597  642                          return (l->entry->dbe_data);
 598  643                  }
 599  644          }
 600  645  
 601  646          if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
 602  647              table->dbt_maxentries == table->dbt_count) {
 603  648                  NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
 604  649                      (CE_NOTE, "Entry for %p in %s not found",
 605  650                      key, table->dbt_name));
 606  651  
 607  652                  rw_exit(bp->dbk_lock);
 608  653                  if (id != -1)
 609  654                          id_free(table->dbt_id_space, id);
 610  655                  return (NULL);
 611  656          }
 612  657  
 613  658          if (table->dbt_id_space && id == -1) {
 614  659                  rw_exit(bp->dbk_lock);
 615  660  
 616  661                  /* get an id, ok to sleep for it here */
 617  662                  id = id_alloc(table->dbt_id_space);
 618  663                  ASSERT(id != -1);
 619  664  
 620  665                  mutex_enter(&table->dbt_reaper_cv_lock);
 621  666                  rfs4_dbe_tabreap_adjust(table);
 622  667                  mutex_exit(&table->dbt_reaper_cv_lock);
 623  668  
 624  669                  rw_enter(bp->dbk_lock, RW_WRITER);
 625  670                  goto retry;
 626  671          }
 627  672  
 628  673          /* get an exclusive lock on the bucket */
 629  674          if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
 630  675                  NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
 631  676                      (CE_NOTE, "Trying to upgrade lock on "
 632  677                      "hash chain %d (%p) for  %s by %s",
 633  678                      i, (void*)bp, table->dbt_name, idx->dbi_keyname));
 634  679  
 635  680                  rw_exit(bp->dbk_lock);
 636  681                  rw_enter(bp->dbk_lock, RW_WRITER);
 637  682                  goto retry;
 638  683          }
 639  684  
 640  685          /* create entry */
 641  686          entry = rfs4_dbe_create(table, id, arg);
 642  687          if (entry == NULL) {
 643  688                  rw_exit(bp->dbk_lock);
 644  689                  if (id != -1)
 645  690                          id_free(table->dbt_id_space, id);
 646  691  
 647  692                  NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 648  693                      (CE_NOTE, "Constructor for table %s failed",
 649  694                      table->dbt_name));
 650  695                  return (NULL);
 651  696          }
 652  697  
 653  698          /*
 654  699           * Add one ref for entry into table's hash - only one
 655  700           * reference added even though there may be multiple indices
 656  701           */
 657  702          rfs4_dbe_hold(entry);
 658  703          ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
 659  704          VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
 660  705  
 661  706          already_done = idx->dbi_tblidx;
 662  707          rw_exit(bp->dbk_lock);
 663  708  
 664  709          for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
 665  710                  if (ip->dbi_tblidx == already_done)
 666  711                          continue;
 667  712                  l = &entry->dbe_indices[ip->dbi_tblidx];
 668  713                  i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
 669  714                  ASSERT(i < ip->dbi_table->dbt_len);
 670  715                  bp = &ip->dbi_buckets[i];
 671  716                  ENQUEUE_IDX(bp, l);
 672  717          }
 673  718  
 674  719          NFS4_DEBUG(
 675  720              table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
  
    | 
      ↓ open down ↓ | 
    298 lines elided | 
    
      ↑ open up ↑ | 
  
 676  721              (CE_NOTE, "Entry %p created for %s = %p in table %s",
 677  722              (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
 678  723  
 679  724          return (entry->dbe_data);
 680  725  }
 681  726  
           /*
            * CPR (suspend/resume) callback for the NFSv4 server client table.
            *
            *   arg  - unused.
            *   code - callback event code; CB_CODE_CPR_CHKPT is ignored.
            *
            * For any other code, every client found on the bucket chains of the
            * first index of the client table has its rc_last_access set to the
            * current time.  Always returns B_TRUE.
            *
            * NOTE(review): the chains are walked here without taking dbk_lock
            * or dbe_lock, unlike rfs4_dbe_walk() -- confirm that is safe on
            * the resume path.
            */
 682  727  /*ARGSUSED*/
 683  728  boolean_t
 684  729  rfs4_cpr_callb(void *arg, int code)
 685  730  {
 686      -        rfs4_table_t *table = rfs4_client_tab;
 687  731          rfs4_bucket_t *buckets, *bp;
 688  732          rfs4_link_t *l;
 689  733          rfs4_client_t *cp;
 690  734          int i;
 691  735  
                   /*
                    * The client table is taken from the nfs4_srv_t returned by
                    * nfs4_get_srv().  NOTE(review): nsrv4 is dereferenced without
                    * a NULL check; presumably nfs4_get_srv() cannot return NULL
                    * in this context -- confirm.
                    */
      736 +        nfs4_srv_t *nsrv4 = nfs4_get_srv();
      737 +        rfs4_table_t *table = nsrv4->rfs4_client_tab;
      738 +
 692  739          /*
 693  740           * We get called for Suspend and Resume events.
 694  741           * For the suspend case we simply don't care!  Nor do we care if
 695  742           * there are no clients.
 696  743           */
 697  744          if (code == CB_CODE_CPR_CHKPT || table == NULL) {
 698  745                  return (B_TRUE);
 699  746          }
 700  747  
                   /* Only the buckets of the first index on dbt_indices are used. */
 701  748          buckets = table->dbt_indices->dbi_buckets;
 702  749  
 703  750          /*
 704  751           * When we get this far we are in the process of
 705  752           * resuming the system from a previous suspend.
 706  753           *
 707  754           * We are going to blast through and update the
 708  755           * last_access time for all the clients and in
 709  756           * doing so extend them by one lease period.
 710  757           */
 711  758          for (i = 0; i < table->dbt_len; i++) {
 712  759                  bp = &buckets[i];
 713  760                  for (l = bp->dbk_head; l; l = l->next) {
 714  761                          cp = (rfs4_client_t *)l->entry->dbe_data;
 715  762                          cp->rc_last_access = gethrestime_sec();
 716  763                  }
 717  764          }
 718  765  
 719  766          return (B_TRUE);
 720  767  }
 721  768  
 722  769  /*
 723  770   * Given a table, take each bucket's lock as reader and walk all of its
 724  771   * entries, locking each entry in turn and calling the provided
 725  772   * "callout" function with the entry's data and the caller's "data"
 726  773   * parameter.  Used to iterate across all entries in a particular
 727  774   * table via the database locking hierarchy.  The caller must not
 728  775   * hold locks on any of the entries in the specified table.
 729  776   */
 730  777  void
 731  778  rfs4_dbe_walk(rfs4_table_t *table,
 732  779      void (*callout)(rfs4_entry_t, void *),
 733  780      void *data)
 734  781  {
                   /* The walk uses the buckets of the first index on dbt_indices. */
 735  782          rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
 736  783          rfs4_link_t *l;
 737  784          rfs4_dbe_t *entry;
 738  785          int i;
 739  786  
 740  787          NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 741  788              (CE_NOTE, "Walking entries in %s", table->dbt_name));
 742  789  
 743  790          /* Walk the buckets looking for entries to release/destroy */
 744  791          for (i = 0; i < table->dbt_len; i++) {
 745  792                  bp = &buckets[i];
                           /* Bucket lock (reader) first, then each entry's dbe_lock. */
 746  793                  rw_enter(bp->dbk_lock, RW_READER);
 747  794                  for (l = bp->dbk_head; l; l = l->next) {
 748  795                          entry = l->entry;
                                   /* The callout runs with entry->dbe_lock held. */
 749  796                          mutex_enter(entry->dbe_lock);
 750  797                          (*callout)(entry->dbe_data, data);
 751  798                          mutex_exit(entry->dbe_lock);
 752  799                  }
 753  800                  rw_exit(bp->dbk_lock);
 754  801          }
 755  802  
 756  803          NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 757  804              (CE_NOTE, "Walking entries complete %s", table->dbt_name));
 758  805  }
 759  806  
 760  807  
           /*
            * Reap entries from "table" that are referenced only by the hash table.
            *
            *   table      - table whose first index's buckets are scanned.
            *   cache_time - used only in the debug messages below.
            *   desired    - when non-zero and the table is not shutting down,
            *                stop scanning further buckets once at least this
            *                many entries have been reaped.
            *
            * Each bucket is handled in two passes.  Under the bucket lock held as
            * reader, entries whose dbe_refcnt is 1 and that are eligible (table is
            * shutting down, no dbt_expiry callback is set, or the callback returns
            * non-zero) have their reference count dropped to 0.  If any were found,
            * the bucket lock is taken as writer and every entry with a zero
            * reference count is unlinked and destroyed.
            *
            * When dbt_reaper_shutdown is set, a bucket is rescanned (with a short
            * delay between scans) until it is empty.
            */
 761  808  static void
 762  809  rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
 763  810  {
 764  811          rfs4_index_t *idx = table->dbt_indices;
 765  812          rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
 766  813          rfs4_link_t *l, *t;
 767  814          rfs4_dbe_t *entry;
 768  815          bool_t found;
 769  816          int i;
 770  817          int count = 0;
 771  818  
 772  819          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 773  820              (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
 774  821              desired, cache_time, table->dbt_name));
 775  822  
 776  823          /* Walk the buckets looking for entries to release/destroy */
 777  824          for (i = 0; i < table->dbt_len; i++) {
 778  825                  bp = &buckets[i];
 779  826                  do {
 780  827                          found = FALSE;
                                   /* Pass 1: mark reapable entries under the reader lock. */
 781  828                          rw_enter(bp->dbk_lock, RW_READER);
 782  829                          for (l = bp->dbk_head; l; l = l->next) {
 783  830                                  entry = l->entry;
 784  831                                  /*
 785  832                                   * Examine an entry.  Ref count of 1 means
 786  833                                   * that the only reference is for the hash
 787  834                                   * table reference.
 788  835                                   */
                                           /* Unlocked pre-check; repeated under dbe_lock. */
 789  836                                  if (entry->dbe_refcnt != 1)
 790  837                                          continue;
 791  838                                  mutex_enter(entry->dbe_lock);
 792  839                                  if ((entry->dbe_refcnt == 1) &&
 793  840                                      (table->dbt_reaper_shutdown ||
 794  841                                      table->dbt_expiry == NULL ||
 795  842                                      (*table->dbt_expiry)(entry->dbe_data))) {
                                                   /* Drop the hash table's reference. */
 796  843                                          entry->dbe_refcnt--;
 797  844                                          count++;
 798  845                                          found = TRUE;
 799  846                                  }
 800  847                                  mutex_exit(entry->dbe_lock);
 801  848                          }
 802  849                          if (found) {
                                           /*
                                            * Pass 2: become writer on the bucket.  If the
                                            * upgrade fails the lock is dropped and retaken
                                            * as writer; the chain is then re-read from
                                            * dbk_head and dbe_refcnt is tested again.
                                            */
 803  850                                  if (!rw_tryupgrade(bp->dbk_lock)) {
 804  851                                          rw_exit(bp->dbk_lock);
 805  852                                          rw_enter(bp->dbk_lock, RW_WRITER);
 806  853                                  }
 807  854  
 808  855                                  l = bp->dbk_head;
 809  856                                  while (l) {
 810  857                                          t = l;
 811  858                                          entry = t->entry;
                                                   /* Advance before t may be unlinked. */
 812  859                                          l = l->next;
 813  860                                          if (entry->dbe_refcnt == 0) {
 814  861                                                  DEQUEUE(bp->dbk_head, t);
 815  862                                                  t->next = NULL;
 816  863                                                  t->prev = NULL;
 817  864                                                  INVALIDATE_ADDR(t->entry);
 818  865                                                  rfs4_dbe_destroy(entry);
 819  866                                          }
 820  867                                  }
 821  868                          }
 822  869                          rw_exit(bp->dbk_lock);
 823  870                          /*
 824  871                           * delay slightly if there is more work to do
 825  872                           * with the expectation that other reaper
 826  873                           * threads are freeing data structures as well
 827  874                           * and in turn will reduce ref counts on
 828  875                           * entries in this table allowing them to be
 829  876                           * released.  This is only done in the
 830  877                           * instance that the tables are being shut down.
 831  878                           */
 832  879                          if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
 833  880                                  delay(hz/100);
 834  881                  /*
 835  882                   * If this is a table shutdown, keep going until
 836  883                   * everything is gone
 837  884                   */
 838  885                  } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
 839  886  
                           /* Outside of shutdown, stop once the requested count is met. */
 840  887                  if (!table->dbt_reaper_shutdown && desired && count >= desired)
 841  888                          break;
 842  889          }
 843  890  
 844  891          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 845  892              (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
 846  893              count, cache_time, table->dbt_name));
 847  894  }
 848  895  
           /*
            * Body of the per-table reaper thread.
            *
            *   arg - the rfs4_table_t to reap, passed through the thread start
            *         argument.
            *
            * The thread repeatedly waits on dbt_reaper_wait for up to dbt_id_reap
            * seconds and then calls rfs4_dbe_reap() on the table.  The loop ends
            * when the wait returns 0 or dbt_reaper_shutdown is set.  On the way
            * out it decrements the database's db_shutdown_count, signals
            * db_shutdown_wait and exits via zthread_exit().
            */
 849  896  static void
 850  897  reaper_thread(caddr_t *arg)
 851  898  {
 852  899          rfs4_table_t    *table = (rfs4_table_t *)arg;
 853  900          clock_t          rc;
 854  901  
 855  902          NFS4_DEBUG(table->dbt_debug,
 856  903              (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
 857  904  
 858  905          CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
 859  906              callb_generic_cpr, "nfsv4Reaper");
 860  907  
                   /*
                    * There is no mutex_exit() of dbt_reaper_cv_lock inside the loop,
                    * so rfs4_dbe_reap() is called with it held.
                    */
 861  908          mutex_enter(&table->dbt_reaper_cv_lock);
 862  909          do {
 863  910                  CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
 864  911                  rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
 865  912                      &table->dbt_reaper_cv_lock,
 866  913                      SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
 867  914                  CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
 868  915                      &table->dbt_reaper_cv_lock);
                           /* A reap pass runs after every wakeup, including the last. */
 869  916                  rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
                   /*
                    * NOTE(review): rc == 0 presumably means the wait was interrupted
                    * by a signal -- confirm against condvar(9F).
                    */
 870  917          } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
 871  918  
 
    | 
      ↓ open down ↓ | 
    170 lines elided | 
    
      ↑ open up ↑ | 
 
                   /*
                    * NOTE(review): no explicit mutex_exit() of dbt_reaper_cv_lock
                    * follows; presumably CALLB_CPR_EXIT releases it -- confirm
                    * against sys/callb.h.
                    */
 872  919          CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
 873  920  
 874  921          NFS4_DEBUG(table->dbt_debug,
 875  922              (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
 876  923  
 877  924          /* Notify the database shutdown processing that the table is shutdown */
 878  925          mutex_enter(table->dbt_db->db_lock);
 879  926          table->dbt_db->db_shutdown_count--;
 880  927          cv_signal(&table->dbt_db->db_shutdown_wait);
 881  928          mutex_exit(table->dbt_db->db_lock);
      929 +        zthread_exit();
 882  930  }
 883  931  
           /*
            * Start a reaper thread for "table".
            *
            * Nothing is started when dbt_max_cache_time is 0.  Otherwise a thread
            * running reaper_thread() is created with the table as its argument at
            * priority minclsyspri; the creation call's return value is ignored.
            */
 884  932  static void
 885  933  rfs4_start_reaper(rfs4_table_t *table)
 886  934  {
 887  935          if (table->dbt_max_cache_time == 0)
 888  936                  return;
 889  937  
 890      -        (void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
      938 +        (void) zthread_create(NULL, 0, reaper_thread, table, 0,
 891  939              minclsyspri);
 892  940  }
 893  941  
 894  942  #ifdef DEBUG
           /*
            * DEBUG-only helper: report a database entry through cmn_err().
            * Prints the entry's address and owning table name, followed by its
            * reference count and id.
            */
 895  943  void
 896  944  rfs4_dbe_debug(rfs4_dbe_t *entry)
 897  945  {
 898  946          cmn_err(CE_NOTE, "Entry %p from table %s",
 899  947              (void *)entry, entry->dbe_table->dbt_name);
 900  948          cmn_err(CE_CONT, "\trefcnt = %d id = %d",
 901  949              entry->dbe_refcnt, entry->dbe_id);
 902  950  }
 903  951  #endif
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX