1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright 2019 Nexenta Systems, Inc.
  28  */
  29 
  30 #include <sys/systm.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/kmem.h>
  33 #include <sys/disp.h>
  34 #include <sys/id_space.h>
  35 #include <rpc/rpc.h>
  36 #include <nfs/nfs4.h>
  37 #include <nfs/nfs4_db_impl.h>
  38 #include <sys/sdt.h>
  39 
/*
 * Default reap interval.  Each table's reap time is the smaller of this
 * value and the table's own maximum cache time.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations of file-local helpers defined later in this file. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);

/*
 * Tunables controlling how aggressively tables are reaped.
 *
 * t_lowat - integer percentage of table entries        /etc/system only
 * t_hiwat - integer percentage of table entries        /etc/system only
 * t_lreap - integer percentage of table reap time      mdb or /etc/system
 * t_hreap - integer percentage of table reap time      mdb or /etc/system
 *
 * The watermarks are converted into per-table entry counts when a table
 * is created; the reap percentages are applied to the table's reap
 * interval once the number of entries in use reaches the corresponding
 * watermark.
 */
uint32_t        t_lowat = 50;   /* reap at t_lreap when id's in use hit 50% */
uint32_t        t_hiwat = 75;   /* reap at t_hreap when id's in use hit 75% */
time_t          t_lreap = 50;   /* default to 50% of table's reap interval */
time_t          t_hreap = 10;   /* default to 10% of table's reap interval */
  57 
  58 id_t
  59 rfs4_dbe_getid(rfs4_dbe_t *entry)
  60 {
  61         return (entry->dbe_id);
  62 }
  63 
  64 void
  65 rfs4_dbe_hold(rfs4_dbe_t *entry)
  66 {
  67         if (!MUTEX_HELD(entry->dbe_lock)) {
  68                 mutex_enter(entry->dbe_lock);
  69                 entry->dbe_refcnt++;
  70                 mutex_exit(entry->dbe_lock);
  71         } else {
  72                 entry->dbe_refcnt++;
  73         }
  74 }
  75 
  76 /*
  77  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
  78  */
  79 void
  80 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
  81 {
  82         if (!MUTEX_HELD(entry->dbe_lock)) {
  83                 ASSERT(entry->dbe_refcnt > 0);
  84                 mutex_enter(entry->dbe_lock);
  85                 entry->dbe_refcnt--;
  86                 mutex_exit(entry->dbe_lock);
  87         } else {
  88                 entry->dbe_refcnt--;
  89         }
  90 }
  91 
  92 
  93 uint32_t
  94 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
  95 {
  96         return (entry->dbe_refcnt);
  97 }
  98 
  99 /*
 100  * Mark an entry such that the dbsearch will skip it.
 101  * Caller does not want this entry to be found any longer
 102  */
 103 void
 104 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
 105 {
 106         if (!MUTEX_HELD(entry->dbe_lock)) {
 107                 mutex_enter(entry->dbe_lock);
 108                 entry->dbe_invalid = TRUE;
 109                 entry->dbe_skipsearch = TRUE;
 110                 mutex_exit(entry->dbe_lock);
 111         } else {
 112                 entry->dbe_invalid = TRUE;
 113                 entry->dbe_skipsearch = TRUE;
 114         }
 115 }
 116 
 117 /*
 118  * Is this entry invalid?
 119  */
 120 bool_t
 121 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
 122 {
 123         return (entry->dbe_invalid);
 124 }
 125 
 126 time_t
 127 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
 128 {
 129         return (entry->dbe_time_rele);
 130 }
 131 
 132 /*
 133  * Use these to temporarily hide/unhide a db entry.
 134  */
 135 void
 136 rfs4_dbe_hide(rfs4_dbe_t *entry)
 137 {
 138         rfs4_dbe_lock(entry);
 139         entry->dbe_skipsearch = TRUE;
 140         rfs4_dbe_unlock(entry);
 141 }
 142 
 143 void
 144 rfs4_dbe_unhide(rfs4_dbe_t *entry)
 145 {
 146         rfs4_dbe_lock(entry);
 147         entry->dbe_skipsearch = FALSE;
 148         rfs4_dbe_unlock(entry);
 149 }
 150 
 151 void
 152 rfs4_dbe_rele(rfs4_dbe_t *entry)
 153 {
 154         mutex_enter(entry->dbe_lock);
 155         ASSERT(entry->dbe_refcnt > 1);
 156         entry->dbe_refcnt--;
 157         entry->dbe_time_rele = gethrestime_sec();
 158         mutex_exit(entry->dbe_lock);
 159 }
 160 
/*
 * Acquire the entry's mutex.  Pair with rfs4_dbe_unlock().
 */
void
rfs4_dbe_lock(rfs4_dbe_t *entry)
{
        mutex_enter(entry->dbe_lock);
}
 166 
/*
 * Release the entry's mutex previously taken with rfs4_dbe_lock().
 */
void
rfs4_dbe_unlock(rfs4_dbe_t *entry)
{
        mutex_exit(entry->dbe_lock);
}
 172 
 173 bool_t
 174 rfs4_dbe_islocked(rfs4_dbe_t *entry)
 175 {
 176         return (mutex_owned(entry->dbe_lock));
 177 }
 178 
 179 clock_t
 180 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
 181 {
 182         return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
 183 }
 184 
/*
 * Wake every thread waiting on the entry's condition variable.
 */
void
rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
{
        cv_broadcast(entry->dbe_cv);
}
 190 
 191 /* ARGSUSED */
 192 static int
 193 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
 194 {
 195         rfs4_dbe_t *entry = obj;
 196 
 197         mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
 198         cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
 199 
 200         return (0);
 201 }
 202 
 203 static void
 204 rfs4_dbe_kmem_destructor(void *obj, void *private)
 205 {
 206         rfs4_dbe_t *entry = obj;
 207         /*LINTED*/
 208         rfs4_table_t *table = private;
 209 
 210         mutex_destroy(entry->dbe_lock);
 211         cv_destroy(entry->dbe_cv);
 212 }
 213 
 214 rfs4_database_t *
 215 rfs4_database_create(uint32_t flags)
 216 {
 217         rfs4_database_t *db;
 218 
 219         db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
 220         mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
 221         db->db_tables = NULL;
 222         db->db_debug_flags = flags;
 223         db->db_shutdown_count = 0;
 224         cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
 225         return (db);
 226 }
 227 
 228 
 229 /*
 230  * The reaper threads that have been created for the tables in this
 231  * database must be stopped and the entries in the tables released.
 232  * Each table will be marked as "shutdown" and the reaper threads
 233  * poked and they will see that a shutdown is in progress and cleanup
 234  * and exit.  This function waits for all reaper threads to stop
 235  * before returning to the caller.
 236  */
 237 void
 238 rfs4_database_shutdown(rfs4_database_t *db)
 239 {
 240         rfs4_table_t *table;
 241 
 242         mutex_enter(db->db_lock);
 243         for (table = db->db_tables; table; table = table->dbt_tnext) {
 244                 mutex_enter(&table->dbt_reaper_cv_lock);
 245                 table->dbt_reaper_shutdown = TRUE;
 246                 cv_broadcast(&table->dbt_reaper_wait);
 247                 db->db_shutdown_count++;
 248                 mutex_exit(&table->dbt_reaper_cv_lock);
 249         }
 250         while (db->db_shutdown_count > 0) {
 251                 cv_wait(&db->db_shutdown_wait, db->db_lock);
 252         }
 253         mutex_exit(db->db_lock);
 254 }
 255 
 256 /*
 257  * Given a database that has been "shutdown" by the function above all
 258  * of the table tables are destroyed and then the database itself
 259  * freed.
 260  */
 261 void
 262 rfs4_database_destroy(rfs4_database_t *db)
 263 {
 264         rfs4_table_t *next, *tmp;
 265 
 266         for (next = db->db_tables; next; ) {
 267                 tmp = next;
 268                 next = tmp->dbt_tnext;
 269                 rfs4_table_destroy(db, tmp);
 270         }
 271 
 272         mutex_destroy(db->db_lock);
 273         kmem_free(db, sizeof (rfs4_database_t));
 274 }
 275 
 276 /*
 277  * Used to get the correct kmem_cache database for the state table being
 278  * created.
 279  * Helper function for rfs4_table_create
 280  */
 281 static kmem_cache_t *
 282 get_db_mem_cache(char *name)
 283 {
 284         int i;
 285 
 286         for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
 287                 if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
 288                         return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
 289         }
 290         /*
 291          * There is no associated kmem cache for this NFS4 server state
 292          * table name
 293          */
 294         return (NULL);
 295 }
 296 
 297 /*
 298  * Used to initialize the global NFSv4 server state database.
 299  * Helper funtion for rfs4_state_g_init and called when module is loaded.
 300  */
 301 kmem_cache_t *
 302 /* CSTYLED */
 303 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
 304 {
 305         kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
 306             sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
 307             0,
 308             rfs4_dbe_kmem_constructor,
 309             rfs4_dbe_kmem_destructor,
 310             NULL,
 311             NULL,
 312             NULL,
 313             0);
 314         (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
 315             strlen(cache_name) + 1);
 316         rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
 317         return (mem_cache);
 318 }
 319 
 320 rfs4_table_t *
 321 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
 322     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
 323     void (*destroy)(rfs4_entry_t),
 324     bool_t (*expiry)(rfs4_entry_t),
 325     uint32_t size, uint32_t hashsize,
 326     uint32_t maxentries, id_t start)
 327 {
 328         rfs4_table_t    *table;
 329         int              len;
 330         char            *cache_name;
 331         char            *id_name;
 332 
 333         table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
 334         table->dbt_db = db;
 335         rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
 336         mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
 337         mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
 338         cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
 339 
 340         len = strlen(tabname);
 341         table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
 342         cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
 343         (void) strcpy(table->dbt_name, tabname);
 344         (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
 345         table->dbt_max_cache_time = max_cache_time;
 346         table->dbt_usize = size;
 347         table->dbt_len = hashsize;
 348         table->dbt_count = 0;
 349         table->dbt_idxcnt = 0;
 350         table->dbt_ccnt = 0;
 351         table->dbt_maxcnt = idxcnt;
 352         table->dbt_indices = NULL;
 353         table->dbt_id_space = NULL;
 354         table->dbt_reaper_shutdown = FALSE;
 355 
 356         if (start >= 0) {
 357                 if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
 358                         maxentries = INT32_MAX - start;
 359                 id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
 360                 (void) sprintf(id_name, "%s_id_space", table->dbt_name);
 361                 table->dbt_id_space = id_space_create(id_name, start,
 362                     maxentries + start);
 363                 kmem_free(id_name, len + 10);
 364         }
 365         ASSERT(t_lowat != 0);
 366         table->dbt_id_lwat = (maxentries * t_lowat) / 100;
 367         ASSERT(t_hiwat != 0);
 368         table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
 369         table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
 370         table->dbt_maxentries = maxentries;
 371         table->dbt_create = create;
 372         table->dbt_destroy = destroy;
 373         table->dbt_expiry = expiry;
 374 
 375         /*
 376          * get the correct kmem_cache for this table type based on the name.
 377          */
 378         table->dbt_mem_cache = get_db_mem_cache(cache_name);
 379 
 380         kmem_free(cache_name, len+13);
 381 
 382         table->dbt_debug = db->db_debug_flags;
 383 
 384         mutex_enter(db->db_lock);
 385         table->dbt_tnext = db->db_tables;
 386         db->db_tables = table;
 387         mutex_exit(db->db_lock);
 388 
 389         rfs4_start_reaper(table);
 390 
 391         return (table);
 392 }
 393 
 394 void
 395 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
 396 {
 397         rfs4_table_t *p;
 398         rfs4_index_t *idx;
 399 
 400         ASSERT(table->dbt_count == 0);
 401 
 402         mutex_enter(db->db_lock);
 403         if (table == db->db_tables)
 404                 db->db_tables = table->dbt_tnext;
 405         else {
 406                 for (p = db->db_tables; p; p = p->dbt_tnext)
 407                         if (p->dbt_tnext == table) {
 408                                 p->dbt_tnext = table->dbt_tnext;
 409                                 table->dbt_tnext = NULL;
 410                                 break;
 411                         }
 412                 ASSERT(p != NULL);
 413         }
 414         mutex_exit(db->db_lock);
 415 
 416         /* Destroy indices */
 417         while (table->dbt_indices) {
 418                 idx = table->dbt_indices;
 419                 table->dbt_indices = idx->dbi_inext;
 420                 rfs4_index_destroy(idx);
 421         }
 422 
 423         rw_destroy(table->dbt_t_lock);
 424         mutex_destroy(table->dbt_lock);
 425         mutex_destroy(&table->dbt_reaper_cv_lock);
 426         cv_destroy(&table->dbt_reaper_wait);
 427 
 428         kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
 429         if (table->dbt_id_space)
 430                 id_space_destroy(table->dbt_id_space);
 431         table->dbt_mem_cache = NULL;
 432         kmem_free(table, sizeof (rfs4_table_t));
 433 }
 434 
 435 rfs4_index_t *
 436 rfs4_index_create(rfs4_table_t *table, char *keyname,
 437     uint32_t (*hash)(void *),
 438     bool_t (compare)(rfs4_entry_t, void *),
 439     void *(*mkkey)(rfs4_entry_t),
 440     bool_t createable)
 441 {
 442         rfs4_index_t *idx;
 443 
 444         ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
 445 
 446         idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
 447 
 448         idx->dbi_table = table;
 449         idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
 450         (void) strcpy(idx->dbi_keyname, keyname);
 451         idx->dbi_hash = hash;
 452         idx->dbi_compare = compare;
 453         idx->dbi_mkkey = mkkey;
 454         idx->dbi_tblidx = table->dbt_idxcnt;
 455         table->dbt_idxcnt++;
 456         if (createable) {
 457                 table->dbt_ccnt++;
 458                 if (table->dbt_ccnt > 1)
 459                         panic("Table %s currently can have only have one "
 460                             "index that will allow creation of entries",
 461                             table->dbt_name);
 462                 idx->dbi_createable = TRUE;
 463         } else {
 464                 idx->dbi_createable = FALSE;
 465         }
 466 
 467         idx->dbi_inext = table->dbt_indices;
 468         table->dbt_indices = idx;
 469         idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
 470             KM_SLEEP);
 471 
 472         return (idx);
 473 }
 474 
 475 void
 476 rfs4_index_destroy(rfs4_index_t *idx)
 477 {
 478         kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
 479         kmem_free(idx->dbi_buckets,
 480             sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
 481         kmem_free(idx, sizeof (rfs4_index_t));
 482 }
 483 
 484 static void
 485 rfs4_dbe_destroy(rfs4_dbe_t *entry)
 486 {
 487         rfs4_index_t *idx;
 488         void *key;
 489         int i;
 490         rfs4_bucket_t *bp;
 491         rfs4_table_t *table = entry->dbe_table;
 492         rfs4_link_t *l;
 493 
 494         NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
 495             (CE_NOTE, "Destroying entry %p from %s",
 496             (void*)entry, table->dbt_name));
 497 
 498         mutex_enter(entry->dbe_lock);
 499         ASSERT(entry->dbe_refcnt == 0);
 500         mutex_exit(entry->dbe_lock);
 501 
 502         /* Unlink from all indices */
 503         for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
 504                 l = &entry->dbe_indices[idx->dbi_tblidx];
 505                 /* check and see if we were ever linked in to the index */
 506                 if (INVALID_LINK(l)) {
 507                         ASSERT(l->next == NULL && l->prev == NULL);
 508                         continue;
 509                 }
 510                 key = idx->dbi_mkkey(entry->dbe_data);
 511                 i = HASH(idx, key);
 512                 bp = &idx->dbi_buckets[i];
 513                 ASSERT(bp->dbk_head != NULL);
 514                 DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
 515         }
 516 
 517         /* Destroy user data */
 518         if (table->dbt_destroy)
 519                 (*table->dbt_destroy)(entry->dbe_data);
 520 
 521         if (table->dbt_id_space)
 522                 id_free(table->dbt_id_space, entry->dbe_id);
 523 
 524         mutex_enter(table->dbt_lock);
 525         table->dbt_count--;
 526         mutex_exit(table->dbt_lock);
 527 
 528         /* Destroy the entry itself */
 529         kmem_cache_free(table->dbt_mem_cache, entry);
 530 }
 531 
 532 
 533 static rfs4_dbe_t *
 534 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
 535 {
 536         rfs4_dbe_t *entry;
 537         int i;
 538 
 539         NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 540             (CE_NOTE, "Creating entry in table %s", table->dbt_name));
 541 
 542         entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
 543 
 544         entry->dbe_refcnt = 1;
 545         entry->dbe_invalid = FALSE;
 546         entry->dbe_skipsearch = FALSE;
 547         entry->dbe_time_rele = 0;
 548         entry->dbe_id = 0;
 549 
 550         if (table->dbt_id_space)
 551                 entry->dbe_id = id;
 552         entry->dbe_table = table;
 553 
 554         for (i = 0; i < table->dbt_maxcnt; i++) {
 555                 entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
 556                 entry->dbe_indices[i].entry = entry;
 557                 /*
 558                  * We mark the entry as not indexed by setting the low
 559                  * order bit, since address are word aligned. This has
 560                  * the advantage of causeing a trap if the address is
 561                  * used. After the entry is linked in to the
 562                  * corresponding index the bit will be cleared.
 563                  */
 564                 INVALIDATE_ADDR(entry->dbe_indices[i].entry);
 565         }
 566 
 567         entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
 568         bzero(entry->dbe_data, table->dbt_usize);
 569         entry->dbe_data->dbe = entry;
 570 
 571         if (!(*table->dbt_create)(entry->dbe_data, data)) {
 572                 kmem_cache_free(table->dbt_mem_cache, entry);
 573                 return (NULL);
 574         }
 575 
 576         mutex_enter(table->dbt_lock);
 577         table->dbt_count++;
 578         mutex_exit(table->dbt_lock);
 579 
 580         return (entry);
 581 }
 582 
 583 static void
 584 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
 585 {
 586         clock_t         tabreap;
 587         clock_t         reap_int;
 588         uint32_t        in_use;
 589 
 590         /*
 591          * Adjust the table's reap interval based on the
 592          * number of id's currently in use. Each table's
 593          * default remains the same if id usage subsides.
 594          */
 595         ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
 596         tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
 597 
 598         in_use = table->dbt_count + 1;       /* see rfs4_dbe_create */
 599         if (in_use >= table->dbt_id_hwat) {
 600                 ASSERT(t_hreap != 0);
 601                 reap_int = (tabreap * t_hreap) / 100;
 602         } else if (in_use >= table->dbt_id_lwat) {
 603                 ASSERT(t_lreap != 0);
 604                 reap_int = (tabreap * t_lreap) / 100;
 605         } else {
 606                 reap_int = tabreap;
 607         }
 608         table->dbt_id_reap = reap_int;
 609         DTRACE_PROBE2(table__reap__interval, char *,
 610             table->dbt_name, time_t, table->dbt_id_reap);
 611 }
 612 
 613 rfs4_entry_t
 614 rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
 615     rfs4_dbsearch_type_t dbsearch_type)
 616 {
 617         int              already_done;
 618         uint32_t         i;
 619         rfs4_table_t    *table = idx->dbi_table;
 620         rfs4_index_t    *ip;
 621         rfs4_bucket_t   *bp;
 622         rfs4_link_t     *l;
 623         rfs4_dbe_t      *entry;
 624         id_t             id = -1;
 625 
 626         i = HASH(idx, key);
 627         bp = &idx->dbi_buckets[i];
 628 
 629         NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
 630             (CE_NOTE, "Searching for key %p in table %s by %s",
 631             key, table->dbt_name, idx->dbi_keyname));
 632 
 633         rw_enter(bp->dbk_lock, RW_READER);
 634 retry:
 635         for (l = bp->dbk_head; l; l = l->next) {
 636                 if (l->entry->dbe_refcnt > 0 &&
 637                     (l->entry->dbe_skipsearch == FALSE ||
 638                     (l->entry->dbe_skipsearch == TRUE &&
 639                     dbsearch_type == RFS4_DBS_INVALID)) &&
 640                     (*idx->dbi_compare)(l->entry->dbe_data, key)) {
 641                         mutex_enter(l->entry->dbe_lock);
 642                         if (l->entry->dbe_refcnt == 0) {
 643                                 mutex_exit(l->entry->dbe_lock);
 644                                 continue;
 645                         }
 646 
 647                         /* place an additional hold since we are returning */
 648                         rfs4_dbe_hold(l->entry);
 649 
 650                         mutex_exit(l->entry->dbe_lock);
 651                         rw_exit(bp->dbk_lock);
 652 
 653                         *create = FALSE;
 654 
 655                         NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
 656                             (CE_NOTE, "Found entry %p for %p in table %s",
 657                             (void *)l->entry, key, table->dbt_name));
 658 
 659                         if (id != -1)
 660                                 id_free(table->dbt_id_space, id);
 661                         return (l->entry->dbe_data);
 662                 }
 663         }
 664 
 665         if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
 666             table->dbt_maxentries == table->dbt_count) {
 667                 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
 668                     (CE_NOTE, "Entry for %p in %s not found",
 669                     key, table->dbt_name));
 670 
 671                 rw_exit(bp->dbk_lock);
 672                 if (id != -1)
 673                         id_free(table->dbt_id_space, id);
 674                 return (NULL);
 675         }
 676 
 677         if (table->dbt_id_space && id == -1) {
 678                 rw_exit(bp->dbk_lock);
 679 
 680                 /* get an id, ok to sleep for it here */
 681                 id = id_alloc(table->dbt_id_space);
 682                 ASSERT(id != -1);
 683 
 684                 mutex_enter(&table->dbt_reaper_cv_lock);
 685                 rfs4_dbe_tabreap_adjust(table);
 686                 mutex_exit(&table->dbt_reaper_cv_lock);
 687 
 688                 rw_enter(bp->dbk_lock, RW_WRITER);
 689                 goto retry;
 690         }
 691 
 692         /* get an exclusive lock on the bucket */
 693         if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
 694                 NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
 695                     (CE_NOTE, "Trying to upgrade lock on "
 696                     "hash chain %d (%p) for  %s by %s",
 697                     i, (void*)bp, table->dbt_name, idx->dbi_keyname));
 698 
 699                 rw_exit(bp->dbk_lock);
 700                 rw_enter(bp->dbk_lock, RW_WRITER);
 701                 goto retry;
 702         }
 703 
 704         /* create entry */
 705         entry = rfs4_dbe_create(table, id, arg);
 706         if (entry == NULL) {
 707                 rw_exit(bp->dbk_lock);
 708                 if (id != -1)
 709                         id_free(table->dbt_id_space, id);
 710 
 711                 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
 712                     (CE_NOTE, "Constructor for table %s failed",
 713                     table->dbt_name));
 714                 return (NULL);
 715         }
 716 
 717         /*
 718          * Add one ref for entry into table's hash - only one
 719          * reference added even though there may be multiple indices
 720          */
 721         rfs4_dbe_hold(entry);
 722         ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
 723         VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
 724 
 725         already_done = idx->dbi_tblidx;
 726         rw_exit(bp->dbk_lock);
 727 
 728         for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
 729                 if (ip->dbi_tblidx == already_done)
 730                         continue;
 731                 l = &entry->dbe_indices[ip->dbi_tblidx];
 732                 i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
 733                 ASSERT(i < ip->dbi_table->dbt_len);
 734                 bp = &ip->dbi_buckets[i];
 735                 ENQUEUE_IDX(bp, l);
 736         }
 737 
 738         NFS4_DEBUG(
 739             table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
 740             (CE_NOTE, "Entry %p created for %s = %p in table %s",
 741             (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
 742 
 743         return (entry->dbe_data);
 744 }
 745 
 746 /*ARGSUSED*/
 747 boolean_t
 748 rfs4_cpr_callb(void *arg, int code)
 749 {
 750         rfs4_bucket_t *buckets, *bp;
 751         rfs4_link_t *l;
 752         rfs4_client_t *cp;
 753         int i;
 754 
 755         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 756         rfs4_table_t *table = nsrv4->rfs4_client_tab;
 757 
 758         /*
 759          * We get called for Suspend and Resume events.
 760          * For the suspend case we simply don't care!  Nor do we care if
 761          * there are no clients.
 762          */
 763         if (code == CB_CODE_CPR_CHKPT || table == NULL) {
 764                 return (B_TRUE);
 765         }
 766 
 767         buckets = table->dbt_indices->dbi_buckets;
 768 
 769         /*
 770          * When we get this far we are in the process of
 771          * resuming the system from a previous suspend.
 772          *
 773          * We are going to blast through and update the
 774          * last_access time for all the clients and in
 775          * doing so extend them by one lease period.
 776          */
 777         for (i = 0; i < table->dbt_len; i++) {
 778                 bp = &buckets[i];
 779                 for (l = bp->dbk_head; l; l = l->next) {
 780                         cp = (rfs4_client_t *)l->entry->dbe_data;
 781                         cp->rc_last_access = gethrestime_sec();
 782                 }
 783         }
 784 
 785         return (B_TRUE);
 786 }
 787 
 788 /*
 789  * Given a table, lock each of the buckets and walk all entries (in
 790  * turn locking those) and calling the provided "callout" function
 791  * with the provided parameter.  Obviously used to iterate across all
 792  * entries in a particular table via the database locking hierarchy.
 793  * Obviously the caller must not hold locks on any of the entries in
 794  * the specified table.
 795  */
 796 void
 797 rfs4_dbe_walk(rfs4_table_t *table,
 798     void (*callout)(rfs4_entry_t, void *),
 799     void *data)
 800 {
 801         rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
 802         rfs4_link_t *l;
 803         rfs4_dbe_t *entry;
 804         int i;
 805 
 806         NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 807             (CE_NOTE, "Walking entries in %s", table->dbt_name));
 808 
 809         /* Walk the buckets looking for entries to release/destroy */
 810         for (i = 0; i < table->dbt_len; i++) {
 811                 bp = &buckets[i];
 812                 rw_enter(bp->dbk_lock, RW_READER);
 813                 for (l = bp->dbk_head; l; l = l->next) {
 814                         entry = l->entry;
 815                         mutex_enter(entry->dbe_lock);
 816                         (*callout)(entry->dbe_data, data);
 817                         mutex_exit(entry->dbe_lock);
 818                 }
 819                 rw_exit(bp->dbk_lock);
 820         }
 821 
 822         NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
 823             (CE_NOTE, "Walking entries complete %s", table->dbt_name));
 824 }
 825 
 826 
 827 static void
 828 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
 829 {
 830         rfs4_index_t *idx = table->dbt_indices;
 831         rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
 832         rfs4_link_t *l, *t;
 833         rfs4_dbe_t *entry;
 834         bool_t found;
 835         int i;
 836         int count = 0;
 837 
 838         NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 839             (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
 840             desired, cache_time, table->dbt_name));
 841 
 842         /* Walk the buckets looking for entries to release/destroy */
 843         for (i = 0; i < table->dbt_len; i++) {
 844                 int retries = 0;
 845                 bp = &buckets[i];
 846                 do {
 847                         found = FALSE;
 848                         rw_enter(bp->dbk_lock, RW_READER);
 849                         for (l = bp->dbk_head; l; l = l->next) {
 850                                 entry = l->entry;
 851                                 mutex_enter(entry->dbe_lock);
 852                                 ASSERT(entry->dbe_refcnt != 0);
 853                                 /*
 854                                  * Examine an entry.  Ref count of 1 means
 855                                  * that the only reference is for the hash
 856                                  * table reference.
 857                                  */
 858                                 if (entry->dbe_refcnt != 1) {
 859 #ifdef DEBUG
 860                                         rfs4_dbe_debug(entry);
 861 #endif
 862                                         mutex_exit(entry->dbe_lock);
 863                                         continue;
 864                                 }
 865                                 if ((entry->dbe_refcnt == 1) &&
 866                                     (table->dbt_reaper_shutdown ||
 867                                     table->dbt_expiry == NULL ||
 868                                     (*table->dbt_expiry)(entry->dbe_data))) {
 869                                         rfs4_dbe_rele_nolock(entry);
 870                                         count++;
 871                                         found = TRUE;
 872                                 }
 873                                 mutex_exit(entry->dbe_lock);
 874                         }
 875                         if (found) {
 876                                 if (!rw_tryupgrade(bp->dbk_lock)) {
 877                                         rw_exit(bp->dbk_lock);
 878                                         rw_enter(bp->dbk_lock, RW_WRITER);
 879                                 }
 880 
 881                                 l = bp->dbk_head;
 882                                 while (l) {
 883                                         t = l;
 884                                         entry = t->entry;
 885                                         l = l->next;
 886                                         mutex_enter(entry->dbe_lock);
 887                                         if (entry->dbe_refcnt == 0) {
 888                                                 DEQUEUE(bp->dbk_head, t);
 889                                                 mutex_exit(entry->dbe_lock);
 890                                                 t->next = NULL;
 891                                                 t->prev = NULL;
 892                                                 INVALIDATE_ADDR(t->entry);
 893                                                 rfs4_dbe_destroy(entry);
 894                                         } else
 895                                                 mutex_exit(entry->dbe_lock);
 896                                 }
 897                         }
 898                         rw_exit(bp->dbk_lock);
 899                         /*
 900                          * delay slightly if there is more work to do
 901                          * with the expectation that other reaper
 902                          * threads are freeing data structures as well
 903                          * and in turn will reduce ref counts on
 904                          * entries in this table allowing them to be
 905                          * released.  This is only done in the
 906                          * instance that the tables are being shut down.
 907                          */
 908                         if (table->dbt_reaper_shutdown && bp->dbk_head != NULL) {
 909                                 delay(hz/100);
 910                                 retries++;
 911                         }
 912                 /*
 913                  * If this is a table shutdown, keep going until
 914                  * everything is gone
 915                  */
 916                 } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL && retries < 5);
 917 
 918                 if (!table->dbt_reaper_shutdown && desired && count >= desired)
 919                         break;
 920         }
 921 
 922         NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 923             (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
 924             count, cache_time, table->dbt_name));
 925 }
 926 
 927 static void
 928 reaper_thread(caddr_t *arg)
 929 {
 930         rfs4_table_t    *table = (rfs4_table_t *)arg;
 931         clock_t          rc;
 932 
 933         NFS4_DEBUG(table->dbt_debug,
 934             (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
 935 
 936         CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
 937             callb_generic_cpr, "nfsv4Reaper");
 938 
 939         mutex_enter(&table->dbt_reaper_cv_lock);
 940         do {
 941                 CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
 942                 rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
 943                     &table->dbt_reaper_cv_lock,
 944                     SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
 945                 CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
 946                     &table->dbt_reaper_cv_lock);
 947                 rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
 948         } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
 949 
 950         CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
 951 
 952         NFS4_DEBUG(table->dbt_debug,
 953             (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
 954 
 955         /* Notify the database shutdown processing that the table is shutdown */
 956         mutex_enter(table->dbt_db->db_lock);
 957         table->dbt_db->db_shutdown_count--;
 958         cv_signal(&table->dbt_db->db_shutdown_wait);
 959         mutex_exit(table->dbt_db->db_lock);
 960         zthread_exit();
 961 }
 962 
 963 static void
 964 rfs4_start_reaper(rfs4_table_t *table)
 965 {
 966         if (table->dbt_max_cache_time == 0)
 967                 return;
 968 
 969         (void) zthread_create(NULL, 0, reaper_thread, table, 0,
 970             minclsyspri);
 971 }
 972 
 973 #ifdef DEBUG
 974 void
 975 rfs4_dbe_debug(rfs4_dbe_t *entry)
 976 {
 977         cmn_err(CE_NOTE, "Entry %p from table %s",
 978             (void *)entry, entry->dbe_table->dbt_name);
 979         cmn_err(CE_CONT, "\trefcnt = %d id = %d",
 980             entry->dbe_refcnt, entry->dbe_id);
 981 }
 982 #endif