Print this page
    
Backport fix from 
Make NFS4.x dbe related ops lockless with atomic
Evan's review
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/nfs/nfs4_db.c
          +++ new/usr/src/uts/common/fs/nfs/nfs4_db.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright 2018 Nexenta Systems, Inc.
  28   28   */
  29   29  
  30   30  #include <sys/systm.h>
  31   31  #include <sys/cmn_err.h>
  32   32  #include <sys/kmem.h>
  33   33  #include <sys/disp.h>
  34   34  #include <sys/id_space.h>
  35   35  #include <sys/atomic.h>
  36   36  #include <rpc/rpc.h>
  37   37  #include <nfs/nfs4.h>
  38   38  #include <nfs/nfs4_db_impl.h>
  39   39  #include <sys/sdt.h>
  40   40  
            /*
             * Base reap interval.  A table's dbt_id_reap is derived from
             * MIN(rfs4_reap_interval, max_cache_time); see rfs4_table_create()
             * and rfs4_dbe_tabreap_adjust().
             * NOTE(review): units are presumably seconds -- confirm against
             * RFS4_REAP_INTERVAL.
             */
   41   41  static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
   42   42  
            /* Forward declarations for file-local helpers defined further down. */
   43   43  static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
   44   44  static void rfs4_dbe_destroy(rfs4_dbe_t *);
   45   45  static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
   46   46  static void rfs4_start_reaper(rfs4_table_t *);
  47   47  
            /*
             * Reaper tunables.  rfs4_table_create() turns t_lowat and t_hiwat into
             * per-table entry counts (maxentries * percentage / 100), and
             * rfs4_dbe_tabreap_adjust() scales a table's reap interval by t_lreap
             * or t_hreap percent once usage reaches the matching watermark.  All
             * four are ASSERTed to be non-zero where they are used.
             */
   48   48  /*
   49   49   * t_lowat - integer percentage of table entries        /etc/system only
   50   50   * t_hiwat - integer percentage of table entries        /etc/system only
   51   51   * t_lreap - integer percentage of table reap time      mdb or /etc/system
   52   52   * t_hreap - integer percentage of table reap time      mdb or /etc/system
   53   53   */
   54   54  uint32_t        t_lowat = 50;   /* reap at t_lreap when id's in use hit 50% */
   55   55  uint32_t        t_hiwat = 75;   /* reap at t_hreap when id's in use hit 75% */
   56   56  time_t          t_lreap = 50;   /* default to 50% of table's reap interval */
   57   57  time_t          t_hreap = 10;   /* default to 10% of table's reap interval */
  58   58  
  59   59  id_t
  60   60  rfs4_dbe_getid(rfs4_dbe_t *entry)
  61   61  {
  62   62          return (entry->dbe_id);
  63   63  }
  64   64  
  65   65  void
  66   66  rfs4_dbe_hold(rfs4_dbe_t *entry)
  67   67  {
  68   68          atomic_inc_32(&entry->dbe_refcnt);
  69   69  }
  70   70  
  71   71  /*
  72   72   * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
  73   73   */
  74   74  void
  75   75  rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
  76   76  {
  77   77          atomic_dec_32(&entry->dbe_refcnt);
  78   78  }
  79   79  
  80   80  
  81   81  uint32_t
  82   82  rfs4_dbe_refcnt(rfs4_dbe_t *entry)
  83   83  {
  84   84          return (entry->dbe_refcnt);
  85   85  }
  86   86  
  87   87  /*
  88   88   * Mark an entry such that the dbsearch will skip it.
  89   89   * Caller does not want this entry to be found any longer
  90   90   */
  91   91  void
  92   92  rfs4_dbe_invalidate(rfs4_dbe_t *entry)
  93   93  {
  94   94          entry->dbe_invalid = TRUE;
  95   95          entry->dbe_skipsearch = TRUE;
  96   96  }
  97   97  
  98   98  /*
  99   99   * Is this entry invalid?
 100  100   */
 101  101  bool_t
 102  102  rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
 103  103  {
 104  104          return (entry->dbe_invalid);
 105  105  }
 106  106  
 107  107  time_t
 108  108  rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
 109  109  {
 110  110          return (entry->dbe_time_rele);
 111  111  }
 112  112  
 113  113  /*
 114  114   * Use these to temporarily hide/unhide a db entry.
 115  115   */
 116  116  void
 117  117  rfs4_dbe_hide(rfs4_dbe_t *entry)
 118  118  {
 119  119          rfs4_dbe_lock(entry);
 120  120          entry->dbe_skipsearch = TRUE;
 121  121          rfs4_dbe_unlock(entry);
 122  122  }
 123  123  
 124  124  void
 125  125  rfs4_dbe_unhide(rfs4_dbe_t *entry)
 126  126  {
 127  127          rfs4_dbe_lock(entry);
 128  128          entry->dbe_skipsearch = FALSE;
 129  129          rfs4_dbe_unlock(entry);
 130  130  }
 131  131  
 132  132  void
 133  133  rfs4_dbe_rele(rfs4_dbe_t *entry)
 134  134  {
 135  135          mutex_enter(entry->dbe_lock);
 136  136          ASSERT(entry->dbe_refcnt > 1);
 137  137          atomic_dec_32(&entry->dbe_refcnt);
 138  138          entry->dbe_time_rele = gethrestime_sec();
 139  139          mutex_exit(entry->dbe_lock);
 140  140  }
 141  141  
 142  142  void
 143  143  rfs4_dbe_lock(rfs4_dbe_t *entry)
 144  144  {
 145  145          mutex_enter(entry->dbe_lock);
 146  146  }
 147  147  
 148  148  void
 149  149  rfs4_dbe_unlock(rfs4_dbe_t *entry)
 150  150  {
 151  151          mutex_exit(entry->dbe_lock);
 152  152  }
 153  153  
 154  154  bool_t
 155  155  rfs4_dbe_islocked(rfs4_dbe_t *entry)
 156  156  {
 157  157          return (mutex_owned(entry->dbe_lock));
 158  158  }
 159  159  
 160  160  clock_t
 161  161  rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
 162  162  {
 163  163          return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
 164  164  }
 165  165  
 166  166  void
 167  167  rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
 168  168  {
 169  169          cv_broadcast(entry->dbe_cv);
 170  170  }
 171  171  
 172  172  /* ARGSUSED */
 173  173  static int
 174  174  rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
 175  175  {
 176  176          rfs4_dbe_t *entry = obj;
 177  177  
 178  178          mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
 179  179          cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
 180  180  
 181  181          return (0);
 182  182  }
 183  183  
 184  184  static void
 185  185  rfs4_dbe_kmem_destructor(void *obj, void *private)
 186  186  {
 187  187          rfs4_dbe_t *entry = obj;
 188  188          /*LINTED*/
 189  189          rfs4_table_t *table = private;
 190  190  
 191  191          mutex_destroy(entry->dbe_lock);
 192  192          cv_destroy(entry->dbe_cv);
 193  193  }
 194  194  
 195  195  rfs4_database_t *
 196  196  rfs4_database_create(uint32_t flags)
 197  197  {
 198  198          rfs4_database_t *db;
 199  199  
 200  200          db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
 201  201          mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
 202  202          db->db_tables = NULL;
 203  203          db->db_debug_flags = flags;
 204  204          db->db_shutdown_count = 0;
 205  205          cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
 206  206          return (db);
 207  207  }
 208  208  
 209  209  
 210  210  /*
 211  211   * The reaper threads that have been created for the tables in this
 212  212   * database must be stopped and the entries in the tables released.
 213  213   * Each table will be marked as "shutdown" and the reaper threads
 214  214   * poked and they will see that a shutdown is in progress and cleanup
 215  215   * and exit.  This function waits for all reaper threads to stop
 216  216   * before returning to the caller.
 217  217   */
 218  218  void
 219  219  rfs4_database_shutdown(rfs4_database_t *db)
 220  220  {
 221  221          rfs4_table_t *table;
 222  222  
 223  223          mutex_enter(db->db_lock);
 224  224          for (table = db->db_tables; table; table = table->dbt_tnext) {
 225  225                  mutex_enter(&table->dbt_reaper_cv_lock);
 226  226                  table->dbt_reaper_shutdown = TRUE;
 227  227                  cv_broadcast(&table->dbt_reaper_wait);
 228  228                  db->db_shutdown_count++;
 229  229                  mutex_exit(&table->dbt_reaper_cv_lock);
 230  230          }
 231  231          while (db->db_shutdown_count > 0) {
 232  232                  cv_wait(&db->db_shutdown_wait, db->db_lock);
 233  233          }
 234  234          mutex_exit(db->db_lock);
 235  235  }
 236  236  
 237  237  /*
 238  238   * Given a database that has been "shutdown" by the function above all
 239  239   * of the table tables are destroyed and then the database itself
 240  240   * freed.
 241  241   */
 242  242  void
 243  243  rfs4_database_destroy(rfs4_database_t *db)
 244  244  {
 245  245          rfs4_table_t *next, *tmp;
 246  246  
 247  247          for (next = db->db_tables; next; ) {
 248  248                  tmp = next;
 249  249                  next = tmp->dbt_tnext;
 250  250                  rfs4_table_destroy(db, tmp);
 251  251          }
 252  252  
 253  253          mutex_destroy(db->db_lock);
 254  254          kmem_free(db, sizeof (rfs4_database_t));
 255  255  }
 256  256  
 257  257  /*
 258  258   * Used to get the correct kmem_cache database for the state table being
 259  259   * created.
 260  260   * Helper function for rfs4_table_create
 261  261   */
 262  262  static kmem_cache_t *
 263  263  get_db_mem_cache(char *name)
 264  264  {
 265  265          int i;
 266  266  
 267  267          for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
 268  268                  if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
 269  269                          return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
 270  270          }
 271  271          /*
 272  272           * There is no associated kmem cache for this NFS4 server state
 273  273           * table name
 274  274           */
 275  275          return (NULL);
 276  276  }
 277  277  
 278  278  /*
 279  279   * Used to initialize the global NFSv4 server state database.
 280  280   * Helper funtion for rfs4_state_g_init and called when module is loaded.
 281  281   */
 282  282  kmem_cache_t *
 283  283  /* CSTYLED */
 284  284  nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
 285  285  {
 286  286          kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
 287  287              sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
 288  288              0,
 289  289              rfs4_dbe_kmem_constructor,
 290  290              rfs4_dbe_kmem_destructor,
 291  291              NULL,
 292  292              NULL,
 293  293              NULL,
 294  294              0);
 295  295          (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
 296  296              strlen(cache_name) + 1);
 297  297          rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
 298  298          return (mem_cache);
 299  299  }
 300  300  
 301  301  rfs4_table_t *
 302  302  rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
 303  303      uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
 304  304      void (*destroy)(rfs4_entry_t),
 305  305      bool_t (*expiry)(rfs4_entry_t),
 306  306      uint32_t size, uint32_t hashsize,
 307  307      uint32_t maxentries, id_t start)
 308  308  {
 309  309          rfs4_table_t    *table;
 310  310          int              len;
 311  311          char            *cache_name;
 312  312          char            *id_name;
 313  313  
 314  314          table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
 315  315          table->dbt_db = db;
 316  316          rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
 317  317          mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
 318  318          mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
 319  319          cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
 320  320  
 321  321          len = strlen(tabname);
 322  322          table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
 323  323          cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
 324  324          (void) strcpy(table->dbt_name, tabname);
 325  325          (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
 326  326          table->dbt_max_cache_time = max_cache_time;
 327  327          table->dbt_usize = size;
 328  328          table->dbt_len = hashsize;
 329  329          table->dbt_count = 0;
 330  330          table->dbt_idxcnt = 0;
 331  331          table->dbt_ccnt = 0;
 332  332          table->dbt_maxcnt = idxcnt;
 333  333          table->dbt_indices = NULL;
 334  334          table->dbt_id_space = NULL;
 335  335          table->dbt_reaper_shutdown = FALSE;
 336  336  
 337  337          if (start >= 0) {
 338  338                  if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
 339  339                          maxentries = INT32_MAX - start;
 340  340                  id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
 341  341                  (void) sprintf(id_name, "%s_id_space", table->dbt_name);
 342  342                  table->dbt_id_space = id_space_create(id_name, start,
 343  343                      maxentries + start);
 344  344                  kmem_free(id_name, len + 10);
 345  345          }
 346  346          ASSERT(t_lowat != 0);
 347  347          table->dbt_id_lwat = (maxentries * t_lowat) / 100;
 348  348          ASSERT(t_hiwat != 0);
 349  349          table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
 350  350          table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
 351  351          table->dbt_maxentries = maxentries;
 352  352          table->dbt_create = create;
 353  353          table->dbt_destroy = destroy;
 354  354          table->dbt_expiry = expiry;
 355  355  
 356  356          /*
 357  357           * get the correct kmem_cache for this table type based on the name.
 358  358           */
 359  359          table->dbt_mem_cache = get_db_mem_cache(cache_name);
 360  360  
 361  361          kmem_free(cache_name, len+13);
 362  362  
 363  363          table->dbt_debug = db->db_debug_flags;
 364  364  
 365  365          mutex_enter(db->db_lock);
 366  366          table->dbt_tnext = db->db_tables;
 367  367          db->db_tables = table;
 368  368          mutex_exit(db->db_lock);
 369  369  
 370  370          rfs4_start_reaper(table);
 371  371  
 372  372          return (table);
 373  373  }
 374  374  
 375  375  void
 376  376  rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
 377  377  {
 378  378          rfs4_table_t *p;
 379  379          rfs4_index_t *idx;
 380  380  
 381  381          ASSERT(table->dbt_count == 0);
 382  382  
 383  383          mutex_enter(db->db_lock);
 384  384          if (table == db->db_tables)
 385  385                  db->db_tables = table->dbt_tnext;
 386  386          else {
 387  387                  for (p = db->db_tables; p; p = p->dbt_tnext)
 388  388                          if (p->dbt_tnext == table) {
 389  389                                  p->dbt_tnext = table->dbt_tnext;
 390  390                                  table->dbt_tnext = NULL;
 391  391                                  break;
 392  392                          }
 393  393                  ASSERT(p != NULL);
 394  394          }
 395  395          mutex_exit(db->db_lock);
 396  396  
 397  397          /* Destroy indices */
 398  398          while (table->dbt_indices) {
 399  399                  idx = table->dbt_indices;
 400  400                  table->dbt_indices = idx->dbi_inext;
 401  401                  rfs4_index_destroy(idx);
 402  402          }
 403  403  
 404  404          rw_destroy(table->dbt_t_lock);
 405  405          mutex_destroy(table->dbt_lock);
 406  406          mutex_destroy(&table->dbt_reaper_cv_lock);
 407  407          cv_destroy(&table->dbt_reaper_wait);
 408  408  
 409  409          kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
 410  410          if (table->dbt_id_space)
 411  411                  id_space_destroy(table->dbt_id_space);
 412  412          table->dbt_mem_cache = NULL;
 413  413          kmem_free(table, sizeof (rfs4_table_t));
 414  414  }
 415  415  
 416  416  rfs4_index_t *
 417  417  rfs4_index_create(rfs4_table_t *table, char *keyname,
 418  418      uint32_t (*hash)(void *),
 419  419      bool_t (compare)(rfs4_entry_t, void *),
 420  420      void *(*mkkey)(rfs4_entry_t),
 421  421      bool_t createable)
 422  422  {
 423  423          rfs4_index_t *idx;
 424  424  
 425  425          ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
 426  426  
 427  427          idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
 428  428  
 429  429          idx->dbi_table = table;
 430  430          idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
 431  431          (void) strcpy(idx->dbi_keyname, keyname);
 432  432          idx->dbi_hash = hash;
 433  433          idx->dbi_compare = compare;
 434  434          idx->dbi_mkkey = mkkey;
 435  435          idx->dbi_tblidx = table->dbt_idxcnt;
 436  436          table->dbt_idxcnt++;
 437  437          if (createable) {
 438  438                  table->dbt_ccnt++;
 439  439                  if (table->dbt_ccnt > 1)
 440  440                          panic("Table %s currently can have only have one "
 441  441                              "index that will allow creation of entries",
 442  442                              table->dbt_name);
 443  443                  idx->dbi_createable = TRUE;
 444  444          } else {
 445  445                  idx->dbi_createable = FALSE;
 446  446          }
 447  447  
 448  448          idx->dbi_inext = table->dbt_indices;
 449  449          table->dbt_indices = idx;
 450  450          idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
 451  451              KM_SLEEP);
 452  452  
 453  453          return (idx);
 454  454  }
 455  455  
 456  456  void
 457  457  rfs4_index_destroy(rfs4_index_t *idx)
 458  458  {
 459  459          kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
 460  460          kmem_free(idx->dbi_buckets,
 461  461              sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
 462  462          kmem_free(idx, sizeof (rfs4_index_t));
 463  463  }
 464  464  
 465  465  static void
 466  466  rfs4_dbe_destroy(rfs4_dbe_t *entry)
 467  467  {
 468  468          rfs4_index_t *idx;
 469  469          void *key;
 470  470          int i;
 471  471          rfs4_bucket_t *bp;
 472  472          rfs4_table_t *table = entry->dbe_table;
 473  473          rfs4_link_t *l;
 474  474  
 475  475          NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
 476  476              (CE_NOTE, "Destroying entry %p from %s",
 477  477              (void*)entry, table->dbt_name));
 478  478  
 479  479          mutex_enter(entry->dbe_lock);
 480  480          ASSERT(entry->dbe_refcnt == 0);
 481  481          mutex_exit(entry->dbe_lock);
 482  482  
 483  483          /* Unlink from all indices */
 484  484          for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
 485  485                  l = &entry->dbe_indices[idx->dbi_tblidx];
 486  486                  /* check and see if we were ever linked in to the index */
 487  487                  if (INVALID_LINK(l)) {
 488  488                          ASSERT(l->next == NULL && l->prev == NULL);
 489  489                          continue;
 490  490                  }
 491  491                  key = idx->dbi_mkkey(entry->dbe_data);
 492  492                  i = HASH(idx, key);
 493  493                  bp = &idx->dbi_buckets[i];
 494  494                  ASSERT(bp->dbk_head != NULL);
 495  495                  DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
 496  496          }
 497  497  
 498  498          /* Destroy user data */
 499  499          if (table->dbt_destroy)
 500  500                  (*table->dbt_destroy)(entry->dbe_data);
 501  501  
 502  502          if (table->dbt_id_space)
 503  503                  id_free(table->dbt_id_space, entry->dbe_id);
 504  504  
 505  505          mutex_enter(table->dbt_lock);
 506  506          table->dbt_count--;
 507  507          mutex_exit(table->dbt_lock);
 508  508  
 509  509          /* Destroy the entry itself */
 510  510          kmem_cache_free(table->dbt_mem_cache, entry);
 511  511  }
 512  512  
 513  513  
 514  514  static rfs4_dbe_t *
 515  515  rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
 516  516  {
 517  517          rfs4_dbe_t *entry;
 518  518          int i;
 519  519  
 520  520          NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 521  521              (CE_NOTE, "Creating entry in table %s", table->dbt_name));
 522  522  
 523  523          entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
 524  524  
 525  525          entry->dbe_refcnt = 1;
 526  526          entry->dbe_invalid = FALSE;
 527  527          entry->dbe_skipsearch = FALSE;
 528  528          entry->dbe_time_rele = 0;
 529  529          entry->dbe_id = 0;
 530  530  
 531  531          if (table->dbt_id_space)
 532  532                  entry->dbe_id = id;
 533  533          entry->dbe_table = table;
 534  534  
 535  535          for (i = 0; i < table->dbt_maxcnt; i++) {
 536  536                  entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
 537  537                  entry->dbe_indices[i].entry = entry;
 538  538                  /*
 539  539                   * We mark the entry as not indexed by setting the low
 540  540                   * order bit, since address are word aligned. This has
 541  541                   * the advantage of causeing a trap if the address is
 542  542                   * used. After the entry is linked in to the
 543  543                   * corresponding index the bit will be cleared.
 544  544                   */
 545  545                  INVALIDATE_ADDR(entry->dbe_indices[i].entry);
 546  546          }
 547  547  
 548  548          entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
 549  549          bzero(entry->dbe_data, table->dbt_usize);
 550  550          entry->dbe_data->dbe = entry;
 551  551  
 552  552          if (!(*table->dbt_create)(entry->dbe_data, data)) {
 553  553                  kmem_cache_free(table->dbt_mem_cache, entry);
 554  554                  return (NULL);
 555  555          }
 556  556  
 557  557          mutex_enter(table->dbt_lock);
 558  558          table->dbt_count++;
 559  559          mutex_exit(table->dbt_lock);
 560  560  
 561  561          return (entry);
 562  562  }
 563  563  
 564  564  static void
 565  565  rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
 566  566  {
 567  567          clock_t         tabreap;
 568  568          clock_t         reap_int;
 569  569          uint32_t        in_use;
 570  570  
 571  571          /*
 572  572           * Adjust the table's reap interval based on the
 573  573           * number of id's currently in use. Each table's
 574  574           * default remains the same if id usage subsides.
 575  575           */
 576  576          ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
 577  577          tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
 578  578  
 579  579          in_use = table->dbt_count + 1;  /* see rfs4_dbe_create */
 580  580          if (in_use >= table->dbt_id_hwat) {
 581  581                  ASSERT(t_hreap != 0);
 582  582                  reap_int = (tabreap * t_hreap) / 100;
 583  583          } else if (in_use >= table->dbt_id_lwat) {
 584  584                  ASSERT(t_lreap != 0);
 585  585                  reap_int = (tabreap * t_lreap) / 100;
 586  586          } else {
 587  587                  reap_int = tabreap;
 588  588          }
 589  589          table->dbt_id_reap = reap_int;
 590  590          DTRACE_PROBE2(table__reap__interval, char *,
 591  591              table->dbt_name, time_t, table->dbt_id_reap);
 592  592  }
 593  593  
            /*
             * Look up "key" through index "idx" and, if permitted, create the
             * entry when it does not exist.
             *
             *   idx            index to search; the key is hashed with this index
             *   key            key to look for
             *   create         in: whether the caller wants a missing entry
             *                  created; out: set to FALSE when an existing entry
             *                  is returned, left untouched otherwise
             *   arg            passed through to rfs4_dbe_create() as creation data
             *   dbsearch_type  RFS4_DBS_INVALID also matches entries that have
             *                  dbe_skipsearch set; any other value skips them
             *
             * Returns the entry's user data with one additional hold placed on
             * the entry, or NULL when the key is not found and either creation
             * was not requested/allowed, the table is full, or the table's create
             * callback failed.
             *
             * A newly created entry ends up with two references: the one set by
             * rfs4_dbe_create() and the one added here for the hash table.
             */
  594  594  rfs4_entry_t
  595  595  rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
  596  596      rfs4_dbsearch_type_t dbsearch_type)
  597  597  {
  598  598          int              already_done;
  599  599          uint32_t         i;
  600  600          rfs4_table_t    *table = idx->dbi_table;
  601  601          rfs4_index_t    *ip;
  602  602          rfs4_bucket_t   *bp;
  603  603          rfs4_link_t     *l;
  604  604          rfs4_dbe_t      *entry;
                    /* -1 means "no id allocated yet"; checked on every exit path */
  605  605          id_t             id = -1;
  606  606  
  607  607          i = HASH(idx, key);
  608  608          bp = &idx->dbi_buckets[i];
  609  609  
  610  610          NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
  611  611              (CE_NOTE, "Searching for key %p in table %s by %s",
  612  612              key, table->dbt_name, idx->dbi_keyname));
  613  613  
                    /*
                     * First pass runs under the reader lock.  Later passes arrive
                     * at "retry" holding the bucket lock as writer.
                     */
  614  614          rw_enter(bp->dbk_lock, RW_READER);
  615  615  retry:
  616  616          for (l = bp->dbk_head; l; l = l->next) {
                            /*
                             * Cheap unlocked filter first: skip entries with no
                             * references and hidden entries (unless the caller
                             * asked for RFS4_DBS_INVALID), then compare keys.
                             */
  617  617                  if (l->entry->dbe_refcnt > 0 &&
  618  618                      (l->entry->dbe_skipsearch == FALSE ||
  619  619                      (l->entry->dbe_skipsearch == TRUE &&
  620  620                      dbsearch_type == RFS4_DBS_INVALID)) &&
  621  621                      (*idx->dbi_compare)(l->entry->dbe_data, key)) {
                                    /*
                                     * Re-check the reference count under the entry
                                     * lock; it may have dropped to zero since the
                                     * unlocked test above.
                                     */
  622  622                          mutex_enter(l->entry->dbe_lock);
  623  623                          if (l->entry->dbe_refcnt == 0) {
  624  624                                  mutex_exit(l->entry->dbe_lock);
  625  625                                  continue;
  626  626                          }
  627  627  
  628  628                          /* place an additional hold since we are returning */
  629  629                          rfs4_dbe_hold(l->entry);
  630  630  
  631  631                          mutex_exit(l->entry->dbe_lock);
  632  632                          rw_exit(bp->dbk_lock);
  633  633  
  634  634                          *create = FALSE;
  635  635  
  636  636                          NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
  637  637                              (CE_NOTE, "Found entry %p for %p in table %s",
  638  638                              (void *)l->entry, key, table->dbt_name));
  639  639  
                                    /*
                                     * An id may have been allocated on an earlier
                                     * pass; it is not needed for an existing entry.
                                     */
  640  640                          if (id != -1)
  641  641                                  id_free(table->dbt_id_space, id);
  642  642                          return (l->entry->dbe_data);
  643  643                  }
  644  644          }
  645  645  
                    /*
                     * Not found.  Give up unless the caller asked for creation, the
                     * table has a create callback, this index allows creation and
                     * the table is not already at dbt_maxentries.
                     */
  646  646          if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
  647  647              table->dbt_maxentries == table->dbt_count) {
  648  648                  NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
  649  649                      (CE_NOTE, "Entry for %p in %s not found",
  650  650                      key, table->dbt_name));
  651  651  
  652  652                  rw_exit(bp->dbk_lock);
  653  653                  if (id != -1)
  654  654                          id_free(table->dbt_id_space, id);
  655  655                  return (NULL);
  656  656          }
  657  657  
                    /*
                     * The table hands out ids and we do not have one yet.  The
                     * bucket lock is dropped around id_alloc() because it may
                     * sleep, so the chain has to be searched again afterwards,
                     * this time under the writer lock.
                     */
  658  658          if (table->dbt_id_space && id == -1) {
  659  659                  rw_exit(bp->dbk_lock);
  660  660  
  661  661                  /* get an id, ok to sleep for it here */
  662  662                  id = id_alloc(table->dbt_id_space);
  663  663                  ASSERT(id != -1);
  664  664  
  665  665                  mutex_enter(&table->dbt_reaper_cv_lock);
  666  666                  rfs4_dbe_tabreap_adjust(table);
  667  667                  mutex_exit(&table->dbt_reaper_cv_lock);
  668  668  
  669  669                  rw_enter(bp->dbk_lock, RW_WRITER);
  670  670                  goto retry;
  671  671          }
  672  672  
                    /*
                     * If we still only hold the lock as reader, try to upgrade in
                     * place.  When that fails the lock is dropped and re-taken as
                     * writer, which again requires a fresh search of the chain.
                     */
  673  673          /* get an exclusive lock on the bucket */
  674  674          if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
  675  675                  NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
  676  676                      (CE_NOTE, "Trying to upgrade lock on "
  677  677                      "hash chain %d (%p) for  %s by %s",
  678  678                      i, (void*)bp, table->dbt_name, idx->dbi_keyname));
  679  679  
  680  680                  rw_exit(bp->dbk_lock);
  681  681                  rw_enter(bp->dbk_lock, RW_WRITER);
  682  682                  goto retry;
  683  683          }
  684  684  
  685  685          /* create entry */
  686  686          entry = rfs4_dbe_create(table, id, arg);
  687  687          if (entry == NULL) {
  688  688                  rw_exit(bp->dbk_lock);
  689  689                  if (id != -1)
  690  690                          id_free(table->dbt_id_space, id);
  691  691  
  692  692                  NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
  693  693                      (CE_NOTE, "Constructor for table %s failed",
  694  694                      table->dbt_name));
  695  695                  return (NULL);
  696  696          }
  697  697  
  698  698          /*
  699  699           * Add one ref for entry into table's hash - only one
  700  700           * reference added even though there may be multiple indices
  701  701           */
  702  702          rfs4_dbe_hold(entry);
                    /* link into the searched index while its bucket is write-locked */
  703  703          ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
  704  704          VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
  705  705  
  706  706          already_done = idx->dbi_tblidx;
  707  707          rw_exit(bp->dbk_lock);
  708  708  
                    /*
                     * Link the entry into every other index of the table, hashing
                     * the key that each index derives from the entry's data.
                     */
  709  709          for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
  710  710                  if (ip->dbi_tblidx == already_done)
  711  711                          continue;
  712  712                  l = &entry->dbe_indices[ip->dbi_tblidx];
  713  713                  i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
  714  714                  ASSERT(i < ip->dbi_table->dbt_len);
  715  715                  bp = &ip->dbi_buckets[i];
  716  716                  ENQUEUE_IDX(bp, l);
  717  717          }
  718  718  
  719  719          NFS4_DEBUG(
  720  720              table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
  721  721              (CE_NOTE, "Entry %p created for %s = %p in table %s",
  722  722              (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
  723  723  
  724  724          return (entry->dbe_data);
  725  725  }
 726  726  
 727  727  /*ARGSUSED*/
 728  728  boolean_t
 729  729  rfs4_cpr_callb(void *arg, int code)
 730  730  {
 731  731          rfs4_bucket_t *buckets, *bp;
 732  732          rfs4_link_t *l;
 733  733          rfs4_client_t *cp;
 734  734          int i;
 735  735  
 736  736          nfs4_srv_t *nsrv4 = nfs4_get_srv();
 737  737          rfs4_table_t *table = nsrv4->rfs4_client_tab;
 738  738  
 739  739          /*
 740  740           * We get called for Suspend and Resume events.
 741  741           * For the suspend case we simply don't care!  Nor do we care if
 742  742           * there are no clients.
 743  743           */
 744  744          if (code == CB_CODE_CPR_CHKPT || table == NULL) {
 745  745                  return (B_TRUE);
 746  746          }
 747  747  
 748  748          buckets = table->dbt_indices->dbi_buckets;
 749  749  
 750  750          /*
 751  751           * When we get this far we are in the process of
 752  752           * resuming the system from a previous suspend.
 753  753           *
 754  754           * We are going to blast through and update the
 755  755           * last_access time for all the clients and in
 756  756           * doing so extend them by one lease period.
 757  757           */
 758  758          for (i = 0; i < table->dbt_len; i++) {
 759  759                  bp = &buckets[i];
 760  760                  for (l = bp->dbk_head; l; l = l->next) {
 761  761                          cp = (rfs4_client_t *)l->entry->dbe_data;
 762  762                          cp->rc_last_access = gethrestime_sec();
 763  763                  }
 764  764          }
 765  765  
 766  766          return (B_TRUE);
 767  767  }
 768  768  
 769  769  /*
 770  770   * Given a table, lock each of the buckets and walk all entries (in
 771  771   * turn locking those) and calling the provided "callout" function
 772  772   * with the provided parameter.  Obviously used to iterate across all
 773  773   * entries in a particular table via the database locking hierarchy.
 774  774   * Obviously the caller must not hold locks on any of the entries in
 775  775   * the specified table.
 776  776   */
 777  777  void
 778  778  rfs4_dbe_walk(rfs4_table_t *table,
 779  779      void (*callout)(rfs4_entry_t, void *),
 780  780      void *data)
 781  781  {
 782  782          rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
 783  783          rfs4_link_t *l;
 784  784          rfs4_dbe_t *entry;
 785  785          int i;
 786  786  
 787  787          NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 788  788              (CE_NOTE, "Walking entries in %s", table->dbt_name));
 789  789  
 790  790          /* Walk the buckets looking for entries to release/destroy */
 791  791          for (i = 0; i < table->dbt_len; i++) {
 792  792                  bp = &buckets[i];
 793  793                  rw_enter(bp->dbk_lock, RW_READER);
 794  794                  for (l = bp->dbk_head; l; l = l->next) {
 795  795                          entry = l->entry;
 796  796                          mutex_enter(entry->dbe_lock);
 797  797                          (*callout)(entry->dbe_data, data);
 798  798                          mutex_exit(entry->dbe_lock);
 799  799                  }
 800  800                  rw_exit(bp->dbk_lock);
 801  801          }
 802  802  
 803  803          NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 804  804              (CE_NOTE, "Walking entries complete %s", table->dbt_name));
 805  805  }
 806  806  
 807  807  
 808  808  static void
 809  809  rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
 810  810  {
 811  811          rfs4_index_t *idx = table->dbt_indices;
 812  812          rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
 813  813          rfs4_link_t *l, *t;
 814  814          rfs4_dbe_t *entry;
 815  815          bool_t found;
 816  816          int i;
 817  817          int count = 0;
 818  818  
 819  819          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
  
    | 
      ↓ open down ↓ | 
    819 lines elided | 
    
      ↑ open up ↑ | 
  
 820  820              (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
 821  821              desired, cache_time, table->dbt_name));
 822  822  
 823  823          /* Walk the buckets looking for entries to release/destroy */
 824  824          for (i = 0; i < table->dbt_len; i++) {
 825  825                  bp = &buckets[i];
 826  826                  do {
 827  827                          found = FALSE;
 828  828                          rw_enter(bp->dbk_lock, RW_READER);
 829  829                          for (l = bp->dbk_head; l; l = l->next) {
      830 +                                uint32_t refcnt;
      831 +
 830  832                                  entry = l->entry;
 831  833                                  /*
 832  834                                   * Examine an entry.  Ref count of 1 means
 833  835                                   * that the only reference is for the hash
 834  836                                   * table reference.
 835  837                                   */
 836  838                                  if (entry->dbe_refcnt != 1)
 837  839                                          continue;
 838  840                                  mutex_enter(entry->dbe_lock);
      841 +                                /*
      842 +                                 * Recheck the ref. count with the lock,
      843 +                                 * and if non-zero, leave things alone.
      844 +                                 */
 839  845                                  if ((entry->dbe_refcnt == 1) &&
 840  846                                      (table->dbt_reaper_shutdown ||
 841  847                                      table->dbt_expiry == NULL ||
 842  848                                      (*table->dbt_expiry)(entry->dbe_data))) {
 843      -                                        entry->dbe_refcnt--;
 844      -                                        count++;
 845      -                                        found = TRUE;
      849 +                                        refcnt = atomic_dec_32_nv(&entry->dbe_refcnt);
      850 +                                        if (refcnt == 0) {
      851 +                                                count++;
      852 +                                                found = TRUE;
      853 +                                        } else {
      854 +                                                /*
      855 +                                                 * Lost race w/ incr.
      856 +                                                 * Leave it as it was
      857 +                                                 */
      858 +                                                atomic_inc_32(&entry->dbe_refcnt);
      859 +                                        }
 846  860                                  }
 847  861                                  mutex_exit(entry->dbe_lock);
 848  862                          }
 849  863                          if (found) {
 850  864                                  if (!rw_tryupgrade(bp->dbk_lock)) {
 851  865                                          rw_exit(bp->dbk_lock);
 852  866                                          rw_enter(bp->dbk_lock, RW_WRITER);
 853  867                                  }
 854  868  
 855  869                                  l = bp->dbk_head;
 856  870                                  while (l) {
 857  871                                          t = l;
 858  872                                          entry = t->entry;
 859  873                                          l = l->next;
 860  874                                          if (entry->dbe_refcnt == 0) {
 861  875                                                  DEQUEUE(bp->dbk_head, t);
 862  876                                                  t->next = NULL;
 863  877                                                  t->prev = NULL;
 864  878                                                  INVALIDATE_ADDR(t->entry);
 865  879                                                  rfs4_dbe_destroy(entry);
 866  880                                          }
 867  881                                  }
 868  882                          }
 869  883                          rw_exit(bp->dbk_lock);
 870  884                          /*
 871  885                           * delay slightly if there is more work to do
 872  886                           * with the expectation that other reaper
 873  887                           * threads are freeing data structures as well
 874  888                           * and in turn will reduce ref counts on
 875  889                           * entries in this table allowing them to be
 876  890                           * released.  This is only done in the
 877  891                           * instance that the tables are being shut down.
 878  892                           */
 879  893                          if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
 880  894                                  delay(hz/100);
 881  895                  /*
 882  896                   * If this is a table shutdown, keep going until
 883  897                   * everything is gone
 884  898                   */
 885  899                  } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
 886  900  
 887  901                  if (!table->dbt_reaper_shutdown && desired && count >= desired)
 888  902                          break;
 889  903          }
 890  904  
 891  905          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 892  906              (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
 893  907              count, cache_time, table->dbt_name));
 894  908  }
 895  909  
 896  910  static void
 897  911  reaper_thread(caddr_t *arg)
 898  912  {
 899  913          rfs4_table_t    *table = (rfs4_table_t *)arg;
 900  914          clock_t          rc;
 901  915  
 902  916          NFS4_DEBUG(table->dbt_debug,
 903  917              (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
 904  918  
 905  919          CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
 906  920              callb_generic_cpr, "nfsv4Reaper");
 907  921  
 908  922          mutex_enter(&table->dbt_reaper_cv_lock);
 909  923          do {
 910  924                  CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
 911  925                  rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
 912  926                      &table->dbt_reaper_cv_lock,
 913  927                      SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
 914  928                  CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
 915  929                      &table->dbt_reaper_cv_lock);
 916  930                  rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
 917  931          } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
 918  932  
 919  933          CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
 920  934  
 921  935          NFS4_DEBUG(table->dbt_debug,
 922  936              (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
 923  937  
 924  938          /* Notify the database shutdown processing that the table is shutdown */
 925  939          mutex_enter(table->dbt_db->db_lock);
 926  940          table->dbt_db->db_shutdown_count--;
 927  941          cv_signal(&table->dbt_db->db_shutdown_wait);
 928  942          mutex_exit(table->dbt_db->db_lock);
 929  943          zthread_exit();
 930  944  }
 931  945  
 932  946  static void
 933  947  rfs4_start_reaper(rfs4_table_t *table)
 934  948  {
 935  949          if (table->dbt_max_cache_time == 0)
 936  950                  return;
 937  951  
 938  952          (void) zthread_create(NULL, 0, reaper_thread, table, 0,
 939  953              minclsyspri);
 940  954  }
 941  955  
 942  956  #ifdef DEBUG
 943  957  void
 944  958  rfs4_dbe_debug(rfs4_dbe_t *entry)
 945  959  {
 946  960          cmn_err(CE_NOTE, "Entry %p from table %s",
 947  961              (void *)entry, entry->dbe_table->dbt_name);
 948  962          cmn_err(CE_CONT, "\trefcnt = %d id = %d",
 949  963              entry->dbe_refcnt, entry->dbe_id);
 950  964  }
 951  965  #endif
  
    | 
      ↓ open down ↓ | 
    96 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX