1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright 2018 Nexenta Systems, Inc.
  28  */
  29 
  30 #include <sys/systm.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/kmem.h>
  33 #include <sys/disp.h>
  34 #include <sys/id_space.h>
  35 #include <sys/atomic.h>
  36 #include <rpc/rpc.h>
  37 #include <nfs/nfs4.h>
  38 #include <nfs/nfs4_db_impl.h>
  39 #include <sys/sdt.h>
  40 
/*
 * Default reap interval.  It is combined (via MIN) with a table's maximum
 * cache time to produce that table's reap interval.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations of static helpers used in this file. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);

/*
 * Tunables controlling how aggressively tables are reaped.
 *
 * t_lowat - integer percentage of table entries        /etc/system only
 * t_hiwat - integer percentage of table entries        /etc/system only
 * t_lreap - integer percentage of table reap time      mdb or /etc/system
 * t_hreap - integer percentage of table reap time      mdb or /etc/system
 *
 * The watermarks are turned into per-table entry counts (percentage of the
 * table's maxentries) when a table is created.  The reap percentages are
 * applied to the table's base reap interval whenever the number of ids in
 * use reaches the corresponding watermark.  All four are asserted to be
 * non-zero where they are used.
 */
uint32_t        t_lowat = 50;   /* reap at t_lreap when id's in use hit 50% */
uint32_t        t_hiwat = 75;   /* reap at t_hreap when id's in use hit 75% */
time_t          t_lreap = 50;   /* default to 50% of table's reap interval */
time_t          t_hreap = 10;   /* default to 10% of table's reap interval */
  58 
  59 id_t
  60 rfs4_dbe_getid(rfs4_dbe_t *entry)
  61 {
  62         return (entry->dbe_id);
  63 }
  64 
  65 void
  66 rfs4_dbe_hold(rfs4_dbe_t *entry)
  67 {
  68         atomic_inc_32(&entry->dbe_refcnt);
  69 }
  70 
  71 /*
  72  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
  73  */
  74 void
  75 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
  76 {
  77         atomic_dec_32(&entry->dbe_refcnt);
  78 }
  79 
  80 
  81 uint32_t
  82 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
  83 {
  84         return (entry->dbe_refcnt);
  85 }
  86 
  87 /*
  88  * Mark an entry such that the dbsearch will skip it.
  89  * Caller does not want this entry to be found any longer
  90  */
  91 void
  92 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
  93 {
  94         entry->dbe_invalid = TRUE;
  95         entry->dbe_skipsearch = TRUE;
  96 }
  97 
  98 /*
  99  * Is this entry invalid?
 100  */
 101 bool_t
 102 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
 103 {
 104         return (entry->dbe_invalid);
 105 }
 106 
 107 time_t
 108 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
 109 {
 110         return (entry->dbe_time_rele);
 111 }
 112 
 113 /*
 114  * Use these to temporarily hide/unhide a db entry.
 115  */
 116 void
 117 rfs4_dbe_hide(rfs4_dbe_t *entry)
 118 {
 119         rfs4_dbe_lock(entry);
 120         entry->dbe_skipsearch = TRUE;
 121         rfs4_dbe_unlock(entry);
 122 }
 123 
 124 void
 125 rfs4_dbe_unhide(rfs4_dbe_t *entry)
 126 {
 127         rfs4_dbe_lock(entry);
 128         entry->dbe_skipsearch = FALSE;
 129         rfs4_dbe_unlock(entry);
 130 }
 131 
 132 void
 133 rfs4_dbe_rele(rfs4_dbe_t *entry)
 134 {
 135         mutex_enter(entry->dbe_lock);
 136         ASSERT(entry->dbe_refcnt > 1);
 137         atomic_dec_32(&entry->dbe_refcnt);
 138         entry->dbe_time_rele = gethrestime_sec();
 139         mutex_exit(entry->dbe_lock);
 140 }
 141 
 142 void
 143 rfs4_dbe_lock(rfs4_dbe_t *entry)
 144 {
 145         mutex_enter(entry->dbe_lock);
 146 }
 147 
 148 void
 149 rfs4_dbe_unlock(rfs4_dbe_t *entry)
 150 {
 151         mutex_exit(entry->dbe_lock);
 152 }
 153 
 154 bool_t
 155 rfs4_dbe_islocked(rfs4_dbe_t *entry)
 156 {
 157         return (mutex_owned(entry->dbe_lock));
 158 }
 159 
 160 clock_t
 161 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
 162 {
 163         return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
 164 }
 165 
 166 void
 167 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
 168 {
 169         cv_broadcast(entry->dbe_cv);
 170 }
 171 
 172 /* ARGSUSED */
 173 static int
 174 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
 175 {
 176         rfs4_dbe_t *entry = obj;
 177 
 178         mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
 179         cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
 180 
 181         return (0);
 182 }
 183 
 184 static void
 185 rfs4_dbe_kmem_destructor(void *obj, void *private)
 186 {
 187         rfs4_dbe_t *entry = obj;
 188         /*LINTED*/
 189         rfs4_table_t *table = private;
 190 
 191         mutex_destroy(entry->dbe_lock);
 192         cv_destroy(entry->dbe_cv);
 193 }
 194 
 195 rfs4_database_t *
 196 rfs4_database_create(uint32_t flags)
 197 {
 198         rfs4_database_t *db;
 199 
 200         db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
 201         mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
 202         db->db_tables = NULL;
 203         db->db_debug_flags = flags;
 204         db->db_shutdown_count = 0;
 205         cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
 206         return (db);
 207 }
 208 
 209 
 210 /*
 211  * The reaper threads that have been created for the tables in this
 212  * database must be stopped and the entries in the tables released.
 213  * Each table will be marked as "shutdown" and the reaper threads
 214  * poked and they will see that a shutdown is in progress and cleanup
 215  * and exit.  This function waits for all reaper threads to stop
 216  * before returning to the caller.
 217  */
 218 void
 219 rfs4_database_shutdown(rfs4_database_t *db)
 220 {
 221         rfs4_table_t *table;
 222 
 223         mutex_enter(db->db_lock);
 224         for (table = db->db_tables; table; table = table->dbt_tnext) {
 225                 mutex_enter(&table->dbt_reaper_cv_lock);
 226                 table->dbt_reaper_shutdown = TRUE;
 227                 cv_broadcast(&table->dbt_reaper_wait);
 228                 db->db_shutdown_count++;
 229                 mutex_exit(&table->dbt_reaper_cv_lock);
 230         }
 231         while (db->db_shutdown_count > 0) {
 232                 cv_wait(&db->db_shutdown_wait, db->db_lock);
 233         }
 234         mutex_exit(db->db_lock);
 235 }
 236 
 237 /*
 238  * Given a database that has been "shutdown" by the function above all
 239  * of the table tables are destroyed and then the database itself
 240  * freed.
 241  */
 242 void
 243 rfs4_database_destroy(rfs4_database_t *db)
 244 {
 245         rfs4_table_t *next, *tmp;
 246 
 247         for (next = db->db_tables; next; ) {
 248                 tmp = next;
 249                 next = tmp->dbt_tnext;
 250                 rfs4_table_destroy(db, tmp);
 251         }
 252 
 253         mutex_destroy(db->db_lock);
 254         kmem_free(db, sizeof (rfs4_database_t));
 255 }
 256 
 257 /*
 258  * Used to get the correct kmem_cache database for the state table being
 259  * created.
 260  * Helper function for rfs4_table_create
 261  */
 262 static kmem_cache_t *
 263 get_db_mem_cache(char *name)
 264 {
 265         int i;
 266 
 267         for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
 268                 if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
 269                         return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
 270         }
 271         /*
 272          * There is no associated kmem cache for this NFS4 server state
 273          * table name
 274          */
 275         return (NULL);
 276 }
 277 
 278 /*
 279  * Used to initialize the global NFSv4 server state database.
 280  * Helper funtion for rfs4_state_g_init and called when module is loaded.
 281  */
 282 kmem_cache_t *
 283 /* CSTYLED */
 284 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
 285 {
 286         kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
 287             sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
 288             0,
 289             rfs4_dbe_kmem_constructor,
 290             rfs4_dbe_kmem_destructor,
 291             NULL,
 292             NULL,
 293             NULL,
 294             0);
 295         (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
 296             strlen(cache_name) + 1);
 297         rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
 298         return (mem_cache);
 299 }
 300 
 301 rfs4_table_t *
 302 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
 303     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
 304     void (*destroy)(rfs4_entry_t),
 305     bool_t (*expiry)(rfs4_entry_t),
 306     uint32_t size, uint32_t hashsize,
 307     uint32_t maxentries, id_t start)
 308 {
 309         rfs4_table_t    *table;
 310         int              len;
 311         char            *cache_name;
 312         char            *id_name;
 313 
 314         table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
 315         table->dbt_db = db;
 316         rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
 317         mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
 318         mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
 319         cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
 320 
 321         len = strlen(tabname);
 322         table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
 323         cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
 324         (void) strcpy(table->dbt_name, tabname);
 325         (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
 326         table->dbt_max_cache_time = max_cache_time;
 327         table->dbt_usize = size;
 328         table->dbt_len = hashsize;
 329         table->dbt_count = 0;
 330         table->dbt_idxcnt = 0;
 331         table->dbt_ccnt = 0;
 332         table->dbt_maxcnt = idxcnt;
 333         table->dbt_indices = NULL;
 334         table->dbt_id_space = NULL;
 335         table->dbt_reaper_shutdown = FALSE;
 336 
 337         if (start >= 0) {
 338                 if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
 339                         maxentries = INT32_MAX - start;
 340                 id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
 341                 (void) sprintf(id_name, "%s_id_space", table->dbt_name);
 342                 table->dbt_id_space = id_space_create(id_name, start,
 343                     maxentries + start);
 344                 kmem_free(id_name, len + 10);
 345         }
 346         ASSERT(t_lowat != 0);
 347         table->dbt_id_lwat = (maxentries * t_lowat) / 100;
 348         ASSERT(t_hiwat != 0);
 349         table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
 350         table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
 351         table->dbt_maxentries = maxentries;
 352         table->dbt_create = create;
 353         table->dbt_destroy = destroy;
 354         table->dbt_expiry = expiry;
 355 
 356         /*
 357          * get the correct kmem_cache for this table type based on the name.
 358          */
 359         table->dbt_mem_cache = get_db_mem_cache(cache_name);
 360 
 361         kmem_free(cache_name, len+13);
 362 
 363         table->dbt_debug = db->db_debug_flags;
 364 
 365         mutex_enter(db->db_lock);
 366         table->dbt_tnext = db->db_tables;
 367         db->db_tables = table;
 368         mutex_exit(db->db_lock);
 369 
 370         rfs4_start_reaper(table);
 371 
 372         return (table);
 373 }
 374 
 375 void
 376 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
 377 {
 378         rfs4_table_t *p;
 379         rfs4_index_t *idx;
 380 
 381         ASSERT(table->dbt_count == 0);
 382 
 383         mutex_enter(db->db_lock);
 384         if (table == db->db_tables)
 385                 db->db_tables = table->dbt_tnext;
 386         else {
 387                 for (p = db->db_tables; p; p = p->dbt_tnext)
 388                         if (p->dbt_tnext == table) {
 389                                 p->dbt_tnext = table->dbt_tnext;
 390                                 table->dbt_tnext = NULL;
 391                                 break;
 392                         }
 393                 ASSERT(p != NULL);
 394         }
 395         mutex_exit(db->db_lock);
 396 
 397         /* Destroy indices */
 398         while (table->dbt_indices) {
 399                 idx = table->dbt_indices;
 400                 table->dbt_indices = idx->dbi_inext;
 401                 rfs4_index_destroy(idx);
 402         }
 403 
 404         rw_destroy(table->dbt_t_lock);
 405         mutex_destroy(table->dbt_lock);
 406         mutex_destroy(&table->dbt_reaper_cv_lock);
 407         cv_destroy(&table->dbt_reaper_wait);
 408 
 409         kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
 410         if (table->dbt_id_space)
 411                 id_space_destroy(table->dbt_id_space);
 412         table->dbt_mem_cache = NULL;
 413         kmem_free(table, sizeof (rfs4_table_t));
 414 }
 415 
 416 rfs4_index_t *
 417 rfs4_index_create(rfs4_table_t *table, char *keyname,
 418     uint32_t (*hash)(void *),
 419     bool_t (compare)(rfs4_entry_t, void *),
 420     void *(*mkkey)(rfs4_entry_t),
 421     bool_t createable)
 422 {
 423         rfs4_index_t *idx;
 424 
 425         ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
 426 
 427         idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
 428 
 429         idx->dbi_table = table;
 430         idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
 431         (void) strcpy(idx->dbi_keyname, keyname);
 432         idx->dbi_hash = hash;
 433         idx->dbi_compare = compare;
 434         idx->dbi_mkkey = mkkey;
 435         idx->dbi_tblidx = table->dbt_idxcnt;
 436         table->dbt_idxcnt++;
 437         if (createable) {
 438                 table->dbt_ccnt++;
 439                 if (table->dbt_ccnt > 1)
 440                         panic("Table %s currently can have only have one "
 441                             "index that will allow creation of entries",
 442                             table->dbt_name);
 443                 idx->dbi_createable = TRUE;
 444         } else {
 445                 idx->dbi_createable = FALSE;
 446         }
 447 
 448         idx->dbi_inext = table->dbt_indices;
 449         table->dbt_indices = idx;
 450         idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
 451             KM_SLEEP);
 452 
 453         return (idx);
 454 }
 455 
 456 void
 457 rfs4_index_destroy(rfs4_index_t *idx)
 458 {
 459         kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
 460         kmem_free(idx->dbi_buckets,
 461             sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
 462         kmem_free(idx, sizeof (rfs4_index_t));
 463 }
 464 
 465 static void
 466 rfs4_dbe_destroy(rfs4_dbe_t *entry)
 467 {
 468         rfs4_index_t *idx;
 469         void *key;
 470         int i;
 471         rfs4_bucket_t *bp;
 472         rfs4_table_t *table = entry->dbe_table;
 473         rfs4_link_t *l;
 474 
 475         NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
 476             (CE_NOTE, "Destroying entry %p from %s",
 477             (void*)entry, table->dbt_name));
 478 
 479         mutex_enter(entry->dbe_lock);
 480         ASSERT(entry->dbe_refcnt == 0);
 481         mutex_exit(entry->dbe_lock);
 482 
 483         /* Unlink from all indices */
 484         for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
 485                 l = &entry->dbe_indices[idx->dbi_tblidx];
 486                 /* check and see if we were ever linked in to the index */
 487                 if (INVALID_LINK(l)) {
 488                         ASSERT(l->next == NULL && l->prev == NULL);
 489                         continue;
 490                 }
 491                 key = idx->dbi_mkkey(entry->dbe_data);
 492                 i = HASH(idx, key);
 493                 bp = &idx->dbi_buckets[i];
 494                 ASSERT(bp->dbk_head != NULL);
 495                 DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
 496         }
 497 
 498         /* Destroy user data */
 499         if (table->dbt_destroy)
 500                 (*table->dbt_destroy)(entry->dbe_data);
 501 
 502         if (table->dbt_id_space)
 503                 id_free(table->dbt_id_space, entry->dbe_id);
 504 
 505         mutex_enter(table->dbt_lock);
 506         table->dbt_count--;
 507         mutex_exit(table->dbt_lock);
 508 
 509         /* Destroy the entry itself */
 510         kmem_cache_free(table->dbt_mem_cache, entry);
 511 }
 512 
 513 
 514 static rfs4_dbe_t *
 515 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
 516 {
 517         rfs4_dbe_t *entry;
 518         int i;
 519 
 520         NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 521             (CE_NOTE, "Creating entry in table %s", table->dbt_name));
 522 
 523         entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
 524 
 525         entry->dbe_refcnt = 1;
 526         entry->dbe_invalid = FALSE;
 527         entry->dbe_skipsearch = FALSE;
 528         entry->dbe_time_rele = 0;
 529         entry->dbe_id = 0;
 530 
 531         if (table->dbt_id_space)
 532                 entry->dbe_id = id;
 533         entry->dbe_table = table;
 534 
 535         for (i = 0; i < table->dbt_maxcnt; i++) {
 536                 entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
 537                 entry->dbe_indices[i].entry = entry;
 538                 /*
 539                  * We mark the entry as not indexed by setting the low
 540                  * order bit, since address are word aligned. This has
 541                  * the advantage of causeing a trap if the address is
 542                  * used. After the entry is linked in to the
 543                  * corresponding index the bit will be cleared.
 544                  */
 545                 INVALIDATE_ADDR(entry->dbe_indices[i].entry);
 546         }
 547 
 548         entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
 549         bzero(entry->dbe_data, table->dbt_usize);
 550         entry->dbe_data->dbe = entry;
 551 
 552         if (!(*table->dbt_create)(entry->dbe_data, data)) {
 553                 kmem_cache_free(table->dbt_mem_cache, entry);
 554                 return (NULL);
 555         }
 556 
 557         mutex_enter(table->dbt_lock);
 558         table->dbt_count++;
 559         mutex_exit(table->dbt_lock);
 560 
 561         return (entry);
 562 }
 563 
 564 static void
 565 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
 566 {
 567         clock_t         tabreap;
 568         clock_t         reap_int;
 569         uint32_t        in_use;
 570 
 571         /*
 572          * Adjust the table's reap interval based on the
 573          * number of id's currently in use. Each table's
 574          * default remains the same if id usage subsides.
 575          */
 576         ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
 577         tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
 578 
 579         in_use = table->dbt_count + 1;       /* see rfs4_dbe_create */
 580         if (in_use >= table->dbt_id_hwat) {
 581                 ASSERT(t_hreap != 0);
 582                 reap_int = (tabreap * t_hreap) / 100;
 583         } else if (in_use >= table->dbt_id_lwat) {
 584                 ASSERT(t_lreap != 0);
 585                 reap_int = (tabreap * t_lreap) / 100;
 586         } else {
 587                 reap_int = tabreap;
 588         }
 589         table->dbt_id_reap = reap_int;
 590         DTRACE_PROBE2(table__reap__interval, char *,
 591             table->dbt_name, time_t, table->dbt_id_reap);
 592 }
 593 
/*
 * Search index idx for an entry matching key and, if permitted, create
 * one when none is found.
 *
 *   idx           - index to search; its table is the one searched
 *   key           - key handed to the index's hash and compare callbacks
 *   create        - in: whether a missing entry may be created;
 *                   out: set to FALSE when an existing entry is returned
 *                   (left untouched on every other path)
 *   arg           - opaque argument passed to the table's create callback
 *   dbsearch_type - RFS4_DBS_INVALID additionally allows entries that are
 *                   flagged skip-search to match
 *
 * Returns the matching (or newly created) entry's data with an extra hold
 * placed on the entry for the caller, or NULL if nothing matched and no
 * entry was created (creation not requested or not allowed on this index,
 * table full, or the create callback failed).
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
        int              already_done;
        uint32_t         i;
        rfs4_table_t    *table = idx->dbi_table;
        rfs4_index_t    *ip;
        rfs4_bucket_t   *bp;
        rfs4_link_t     *l;
        rfs4_dbe_t      *entry;
        id_t             id = -1;       /* -1: no id allocated (yet) */

        i = HASH(idx, key);
        bp = &idx->dbi_buckets[i];

        NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
            (CE_NOTE, "Searching for key %p in table %s by %s",
            key, table->dbt_name, idx->dbi_keyname));

        rw_enter(bp->dbk_lock, RW_READER);
retry:
        /*
         * Walk the bucket's chain.  The first, unlocked, test is only a
         * filter; the reference count is rechecked under the entry lock
         * before a hold is taken.
         */
        for (l = bp->dbk_head; l; l = l->next) {
                if (l->entry->dbe_refcnt > 0 &&
                    (l->entry->dbe_skipsearch == FALSE ||
                    (l->entry->dbe_skipsearch == TRUE &&
                    dbsearch_type == RFS4_DBS_INVALID)) &&
                    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
                        mutex_enter(l->entry->dbe_lock);
                        if (l->entry->dbe_refcnt == 0) {
                                /* entry is on its way out; keep looking */
                                mutex_exit(l->entry->dbe_lock);
                                continue;
                        }

                        /* place an additional hold since we are returning */
                        rfs4_dbe_hold(l->entry);

                        mutex_exit(l->entry->dbe_lock);
                        rw_exit(bp->dbk_lock);

                        *create = FALSE;

                        NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
                            (CE_NOTE, "Found entry %p for %p in table %s",
                            (void *)l->entry, key, table->dbt_name));

                        /* give back an id allocated on an earlier pass */
                        if (id != -1)
                                id_free(table->dbt_id_space, id);
                        return (l->entry->dbe_data);
                }
        }

        /*
         * Not found.  Give up unless the caller asked for creation, the
         * table and this index support it, and the table is not full.
         */
        if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
            table->dbt_maxentries == table->dbt_count) {
                NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
                    (CE_NOTE, "Entry for %p in %s not found",
                    key, table->dbt_name));

                rw_exit(bp->dbk_lock);
                if (id != -1)
                        id_free(table->dbt_id_space, id);
                return (NULL);
        }

        /*
         * The table hands out ids and we do not have one yet.  The bucket
         * lock is dropped around the allocation, so the chain may have
         * changed: retake the lock (as writer this time) and search again.
         */
        if (table->dbt_id_space && id == -1) {
                rw_exit(bp->dbk_lock);

                /* get an id, ok to sleep for it here */
                id = id_alloc(table->dbt_id_space);
                ASSERT(id != -1);

                mutex_enter(&table->dbt_reaper_cv_lock);
                rfs4_dbe_tabreap_adjust(table);
                mutex_exit(&table->dbt_reaper_cv_lock);

                rw_enter(bp->dbk_lock, RW_WRITER);
                goto retry;
        }

        /*
         * get an exclusive lock on the bucket; if the in-place upgrade
         * fails the lock has to be dropped and retaken, so search again.
         */
        if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
                NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
                    (CE_NOTE, "Trying to upgrade lock on "
                    "hash chain %d (%p) for  %s by %s",
                    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

                rw_exit(bp->dbk_lock);
                rw_enter(bp->dbk_lock, RW_WRITER);
                goto retry;
        }

        /* create entry */
        entry = rfs4_dbe_create(table, id, arg);
        if (entry == NULL) {
                rw_exit(bp->dbk_lock);
                if (id != -1)
                        id_free(table->dbt_id_space, id);

                NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
                    (CE_NOTE, "Constructor for table %s failed",
                    table->dbt_name));
                return (NULL);
        }

        /*
         * Add one ref for entry into table's hash - only one
         * reference added even though there may be multiple indices
         */
        rfs4_dbe_hold(entry);
        ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
        VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

        already_done = idx->dbi_tblidx;
        rw_exit(bp->dbk_lock);

        /* Link the new entry into each of the table's other indices. */
        for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
                if (ip->dbi_tblidx == already_done)
                        continue;
                l = &entry->dbe_indices[ip->dbi_tblidx];
                i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
                ASSERT(i < ip->dbi_table->dbt_len);
                bp = &ip->dbi_buckets[i];
                ENQUEUE_IDX(bp, l);
        }

        NFS4_DEBUG(
            table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
            (CE_NOTE, "Entry %p created for %s = %p in table %s",
            (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

        return (entry->dbe_data);
}
 726 
 727 /*ARGSUSED*/
 728 boolean_t
 729 rfs4_cpr_callb(void *arg, int code)
 730 {
 731         rfs4_bucket_t *buckets, *bp;
 732         rfs4_link_t *l;
 733         rfs4_client_t *cp;
 734         int i;
 735 
 736         nfs4_srv_t *nsrv4 = nfs4_get_srv();
 737         rfs4_table_t *table = nsrv4->rfs4_client_tab;
 738 
 739         /*
 740          * We get called for Suspend and Resume events.
 741          * For the suspend case we simply don't care!  Nor do we care if
 742          * there are no clients.
 743          */
 744         if (code == CB_CODE_CPR_CHKPT || table == NULL) {
 745                 return (B_TRUE);
 746         }
 747 
 748         buckets = table->dbt_indices->dbi_buckets;
 749 
 750         /*
 751          * When we get this far we are in the process of
 752          * resuming the system from a previous suspend.
 753          *
 754          * We are going to blast through and update the
 755          * last_access time for all the clients and in
 756          * doing so extend them by one lease period.
 757          */
 758         for (i = 0; i < table->dbt_len; i++) {
 759                 bp = &buckets[i];
 760                 for (l = bp->dbk_head; l; l = l->next) {
 761                         cp = (rfs4_client_t *)l->entry->dbe_data;
 762                         cp->rc_last_access = gethrestime_sec();
 763                 }
 764         }
 765 
 766         return (B_TRUE);
 767 }
 768 
 769 /*
 770  * Given a table, lock each of the buckets and walk all entries (in
 771  * turn locking those) and calling the provided "callout" function
 772  * with the provided parameter.  Obviously used to iterate across all
 773  * entries in a particular table via the database locking hierarchy.
 774  * Obviously the caller must not hold locks on any of the entries in
 775  * the specified table.
 776  */
 777 void
 778 rfs4_dbe_walk(rfs4_table_t *table,
 779     void (*callout)(rfs4_entry_t, void *),
 780     void *data)
 781 {
 782         rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
 783         rfs4_link_t *l;
 784         rfs4_dbe_t *entry;
 785         int i;
 786 
 787         NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 788             (CE_NOTE, "Walking entries in %s", table->dbt_name));
 789 
 790         /* Walk the buckets looking for entries to release/destroy */
 791         for (i = 0; i < table->dbt_len; i++) {
 792                 bp = &buckets[i];
 793                 rw_enter(bp->dbk_lock, RW_READER);
 794                 for (l = bp->dbk_head; l; l = l->next) {
 795                         entry = l->entry;
 796                         mutex_enter(entry->dbe_lock);
 797                         (*callout)(entry->dbe_data, data);
 798                         mutex_exit(entry->dbe_lock);
 799                 }
 800                 rw_exit(bp->dbk_lock);
 801         }
 802 
 803         NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 804             (CE_NOTE, "Walking entries complete %s", table->dbt_name));
 805 }
 806 
 807 
 808 static void
 809 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
 810 {
 811         rfs4_index_t *idx = table->dbt_indices;
 812         rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
 813         rfs4_link_t *l, *t;
 814         rfs4_dbe_t *entry;
 815         bool_t found;
 816         int i;
 817         int count = 0;
 818 
 819         NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 820             (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
 821             desired, cache_time, table->dbt_name));
 822 
 823         /* Walk the buckets looking for entries to release/destroy */
 824         for (i = 0; i < table->dbt_len; i++) {
 825                 bp = &buckets[i];
 826                 do {
 827                         found = FALSE;
 828                         rw_enter(bp->dbk_lock, RW_READER);
 829                         for (l = bp->dbk_head; l; l = l->next) {
 830                                 uint32_t refcnt;
 831 
 832                                 entry = l->entry;
 833                                 /*
 834                                  * Examine an entry.  Ref count of 1 means
 835                                  * that the only reference is for the hash
 836                                  * table reference.
 837                                  */
 838                                 if (entry->dbe_refcnt != 1)
 839                                         continue;
 840                                 mutex_enter(entry->dbe_lock);
 841                                 /*
 842                                  * Recheck the ref. count with the lock,
 843                                  * and if non-zero, leave things alone.
 844                                  */
 845                                 if ((entry->dbe_refcnt == 1) &&
 846                                     (table->dbt_reaper_shutdown ||
 847                                     table->dbt_expiry == NULL ||
 848                                     (*table->dbt_expiry)(entry->dbe_data))) {
 849                                         refcnt = atomic_dec_32_nv(&entry->dbe_refcnt);
 850                                         if (refcnt == 0) {
 851                                                 count++;
 852                                                 found = TRUE;
 853                                         } else {
 854                                                 /*
 855                                                  * Lost race w/ incr.
 856                                                  * Leave it as it was
 857                                                  */
 858                                                 atomic_inc_32(&entry->dbe_refcnt);
 859                                         }
 860                                 }
 861                                 mutex_exit(entry->dbe_lock);
 862                         }
 863                         if (found) {
 864                                 if (!rw_tryupgrade(bp->dbk_lock)) {
 865                                         rw_exit(bp->dbk_lock);
 866                                         rw_enter(bp->dbk_lock, RW_WRITER);
 867                                 }
 868 
 869                                 l = bp->dbk_head;
 870                                 while (l) {
 871                                         t = l;
 872                                         entry = t->entry;
 873                                         l = l->next;
 874                                         if (entry->dbe_refcnt == 0) {
 875                                                 DEQUEUE(bp->dbk_head, t);
 876                                                 t->next = NULL;
 877                                                 t->prev = NULL;
 878                                                 INVALIDATE_ADDR(t->entry);
 879                                                 rfs4_dbe_destroy(entry);
 880                                         }
 881                                 }
 882                         }
 883                         rw_exit(bp->dbk_lock);
 884                         /*
 885                          * delay slightly if there is more work to do
 886                          * with the expectation that other reaper
 887                          * threads are freeing data structures as well
 888                          * and in turn will reduce ref counts on
 889                          * entries in this table allowing them to be
 890                          * released.  This is only done in the
 891                          * instance that the tables are being shut down.
 892                          */
 893                         if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
 894                                 delay(hz/100);
 895                 /*
 896                  * If this is a table shutdown, keep going until
 897                  * everything is gone
 898                  */
 899                 } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
 900 
 901                 if (!table->dbt_reaper_shutdown && desired && count >= desired)
 902                         break;
 903         }
 904 
 905         NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 906             (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
 907             count, cache_time, table->dbt_name));
 908 }
 909 
 910 static void
 911 reaper_thread(caddr_t *arg)
 912 {
 913         rfs4_table_t    *table = (rfs4_table_t *)arg;
 914         clock_t          rc;
 915 
 916         NFS4_DEBUG(table->dbt_debug,
 917             (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
 918 
 919         CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
 920             callb_generic_cpr, "nfsv4Reaper");
 921 
 922         mutex_enter(&table->dbt_reaper_cv_lock);
 923         do {
 924                 CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
 925                 rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
 926                     &table->dbt_reaper_cv_lock,
 927                     SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
 928                 CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
 929                     &table->dbt_reaper_cv_lock);
 930                 rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
 931         } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
 932 
 933         CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
 934 
 935         NFS4_DEBUG(table->dbt_debug,
 936             (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
 937 
 938         /* Notify the database shutdown processing that the table is shutdown */
 939         mutex_enter(table->dbt_db->db_lock);
 940         table->dbt_db->db_shutdown_count--;
 941         cv_signal(&table->dbt_db->db_shutdown_wait);
 942         mutex_exit(table->dbt_db->db_lock);
 943         zthread_exit();
 944 }
 945 
 946 static void
 947 rfs4_start_reaper(rfs4_table_t *table)
 948 {
 949         if (table->dbt_max_cache_time == 0)
 950                 return;
 951 
 952         (void) zthread_create(NULL, 0, reaper_thread, table, 0,
 953             minclsyspri);
 954 }
 955 
 956 #ifdef DEBUG
 957 void
 958 rfs4_dbe_debug(rfs4_dbe_t *entry)
 959 {
 960         cmn_err(CE_NOTE, "Entry %p from table %s",
 961             (void *)entry, entry->dbe_table->dbt_name);
 962         cmn_err(CE_CONT, "\trefcnt = %d id = %d",
 963             entry->dbe_refcnt, entry->dbe_id);
 964 }
 965 #endif