Print this page
Backport fix from
Make NFS4.x dbe related ops lockless with atomic
Evan's review
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs4_db.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_db.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2018 Nexenta Systems, Inc.
28 28 */
29 29
30 30 #include <sys/systm.h>
31 31 #include <sys/cmn_err.h>
32 32 #include <sys/kmem.h>
33 33 #include <sys/disp.h>
34 34 #include <sys/id_space.h>
35 35 #include <sys/atomic.h>
36 36 #include <rpc/rpc.h>
37 37 #include <nfs/nfs4.h>
38 38 #include <nfs/nfs4_db_impl.h>
39 39 #include <sys/sdt.h>
40 40
41 41 static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
42 42
43 43 static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
44 44 static void rfs4_dbe_destroy(rfs4_dbe_t *);
45 45 static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
46 46 static void rfs4_start_reaper(rfs4_table_t *);
47 47
48 48 /*
49 49 * t_lowat - integer percentage of table entries /etc/system only
50 50 * t_hiwat - integer percentage of table entries /etc/system only
51 51 * t_lreap - integer percentage of table reap time mdb or /etc/system
52 52 * t_hreap - integer percentage of table reap time mdb or /etc/system
53 53 */
54 54 uint32_t t_lowat = 50; /* reap at t_lreap when id's in use hit 50% */
55 55 uint32_t t_hiwat = 75; /* reap at t_hreap when id's in use hit 75% */
56 56 time_t t_lreap = 50; /* default to 50% of table's reap interval */
57 57 time_t t_hreap = 10; /* default to 10% of table's reap interval */
58 58
59 59 id_t
60 60 rfs4_dbe_getid(rfs4_dbe_t *entry)
61 61 {
62 62 return (entry->dbe_id);
63 63 }
64 64
65 65 void
66 66 rfs4_dbe_hold(rfs4_dbe_t *entry)
67 67 {
68 68 atomic_inc_32(&entry->dbe_refcnt);
69 69 }
70 70
71 71 /*
72 72 * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
73 73 */
74 74 void
75 75 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
76 76 {
77 77 atomic_dec_32(&entry->dbe_refcnt);
78 78 }
79 79
80 80
81 81 uint32_t
82 82 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
83 83 {
84 84 return (entry->dbe_refcnt);
85 85 }
86 86
87 87 /*
88 88 * Mark an entry such that the dbsearch will skip it.
89 89 * Caller does not want this entry to be found any longer
90 90 */
91 91 void
92 92 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
93 93 {
94 94 entry->dbe_invalid = TRUE;
95 95 entry->dbe_skipsearch = TRUE;
96 96 }
97 97
98 98 /*
99 99 * Is this entry invalid?
100 100 */
101 101 bool_t
102 102 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
103 103 {
104 104 return (entry->dbe_invalid);
105 105 }
106 106
107 107 time_t
108 108 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
109 109 {
110 110 return (entry->dbe_time_rele);
111 111 }
112 112
113 113 /*
114 114 * Use these to temporarily hide/unhide a db entry.
115 115 */
116 116 void
117 117 rfs4_dbe_hide(rfs4_dbe_t *entry)
118 118 {
119 119 rfs4_dbe_lock(entry);
120 120 entry->dbe_skipsearch = TRUE;
121 121 rfs4_dbe_unlock(entry);
122 122 }
123 123
124 124 void
125 125 rfs4_dbe_unhide(rfs4_dbe_t *entry)
126 126 {
127 127 rfs4_dbe_lock(entry);
128 128 entry->dbe_skipsearch = FALSE;
129 129 rfs4_dbe_unlock(entry);
130 130 }
131 131
132 132 void
133 133 rfs4_dbe_rele(rfs4_dbe_t *entry)
134 134 {
135 135 mutex_enter(entry->dbe_lock);
136 136 ASSERT(entry->dbe_refcnt > 1);
137 137 atomic_dec_32(&entry->dbe_refcnt);
138 138 entry->dbe_time_rele = gethrestime_sec();
139 139 mutex_exit(entry->dbe_lock);
140 140 }
141 141
142 142 void
143 143 rfs4_dbe_lock(rfs4_dbe_t *entry)
144 144 {
145 145 mutex_enter(entry->dbe_lock);
146 146 }
147 147
148 148 void
149 149 rfs4_dbe_unlock(rfs4_dbe_t *entry)
150 150 {
151 151 mutex_exit(entry->dbe_lock);
152 152 }
153 153
154 154 bool_t
155 155 rfs4_dbe_islocked(rfs4_dbe_t *entry)
156 156 {
157 157 return (mutex_owned(entry->dbe_lock));
158 158 }
159 159
160 160 clock_t
161 161 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
162 162 {
163 163 return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
164 164 }
165 165
166 166 void
167 167 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
168 168 {
169 169 cv_broadcast(entry->dbe_cv);
170 170 }
171 171
172 172 /* ARGSUSED */
173 173 static int
174 174 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
175 175 {
176 176 rfs4_dbe_t *entry = obj;
177 177
178 178 mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
179 179 cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
180 180
181 181 return (0);
182 182 }
183 183
184 184 static void
185 185 rfs4_dbe_kmem_destructor(void *obj, void *private)
186 186 {
187 187 rfs4_dbe_t *entry = obj;
188 188 /*LINTED*/
189 189 rfs4_table_t *table = private;
190 190
191 191 mutex_destroy(entry->dbe_lock);
192 192 cv_destroy(entry->dbe_cv);
193 193 }
194 194
195 195 rfs4_database_t *
196 196 rfs4_database_create(uint32_t flags)
197 197 {
198 198 rfs4_database_t *db;
199 199
200 200 db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
201 201 mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
202 202 db->db_tables = NULL;
203 203 db->db_debug_flags = flags;
204 204 db->db_shutdown_count = 0;
205 205 cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
206 206 return (db);
207 207 }
208 208
209 209
210 210 /*
211 211 * The reaper threads that have been created for the tables in this
212 212 * database must be stopped and the entries in the tables released.
213 213 * Each table will be marked as "shutdown" and the reaper threads
214 214 * poked and they will see that a shutdown is in progress and cleanup
215 215 * and exit. This function waits for all reaper threads to stop
216 216 * before returning to the caller.
217 217 */
218 218 void
219 219 rfs4_database_shutdown(rfs4_database_t *db)
220 220 {
221 221 rfs4_table_t *table;
222 222
223 223 mutex_enter(db->db_lock);
224 224 for (table = db->db_tables; table; table = table->dbt_tnext) {
225 225 mutex_enter(&table->dbt_reaper_cv_lock);
226 226 table->dbt_reaper_shutdown = TRUE;
227 227 cv_broadcast(&table->dbt_reaper_wait);
228 228 db->db_shutdown_count++;
229 229 mutex_exit(&table->dbt_reaper_cv_lock);
230 230 }
231 231 while (db->db_shutdown_count > 0) {
232 232 cv_wait(&db->db_shutdown_wait, db->db_lock);
233 233 }
234 234 mutex_exit(db->db_lock);
235 235 }
236 236
237 237 /*
238 238 * Given a database that has been "shutdown" by the function above, all
239 239 * of its tables are destroyed and then the database itself is
240 240 * freed.
241 241 */
242 242 void
243 243 rfs4_database_destroy(rfs4_database_t *db)
244 244 {
245 245 rfs4_table_t *next, *tmp;
246 246
247 247 for (next = db->db_tables; next; ) {
248 248 tmp = next;
249 249 next = tmp->dbt_tnext;
250 250 rfs4_table_destroy(db, tmp);
251 251 }
252 252
253 253 mutex_destroy(db->db_lock);
254 254 kmem_free(db, sizeof (rfs4_database_t));
255 255 }
256 256
257 257 /*
258 258 * Used to get the correct kmem_cache database for the state table being
259 259 * created.
260 260 * Helper function for rfs4_table_create
261 261 */
262 262 static kmem_cache_t *
263 263 get_db_mem_cache(char *name)
264 264 {
265 265 int i;
266 266
267 267 for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
268 268 if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
269 269 return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
270 270 }
271 271 /*
272 272 * There is no associated kmem cache for this NFS4 server state
273 273 * table name
274 274 */
275 275 return (NULL);
276 276 }
277 277
278 278 /*
279 279 * Used to initialize the global NFSv4 server state database.
280 280 * Helper function for rfs4_state_g_init and called when module is loaded.
281 281 */
282 282 kmem_cache_t *
283 283 /* CSTYLED */
284 284 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
285 285 {
286 286 kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
287 287 sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
288 288 0,
289 289 rfs4_dbe_kmem_constructor,
290 290 rfs4_dbe_kmem_destructor,
291 291 NULL,
292 292 NULL,
293 293 NULL,
294 294 0);
295 295 (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
296 296 strlen(cache_name) + 1);
297 297 rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
298 298 return (mem_cache);
299 299 }
300 300
301 301 rfs4_table_t *
302 302 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
303 303 uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
304 304 void (*destroy)(rfs4_entry_t),
305 305 bool_t (*expiry)(rfs4_entry_t),
306 306 uint32_t size, uint32_t hashsize,
307 307 uint32_t maxentries, id_t start)
308 308 {
309 309 rfs4_table_t *table;
310 310 int len;
311 311 char *cache_name;
312 312 char *id_name;
313 313
314 314 table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
315 315 table->dbt_db = db;
316 316 rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
317 317 mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
318 318 mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
319 319 cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
320 320
321 321 len = strlen(tabname);
322 322 table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
323 323 cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
324 324 (void) strcpy(table->dbt_name, tabname);
325 325 (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
326 326 table->dbt_max_cache_time = max_cache_time;
327 327 table->dbt_usize = size;
328 328 table->dbt_len = hashsize;
329 329 table->dbt_count = 0;
330 330 table->dbt_idxcnt = 0;
331 331 table->dbt_ccnt = 0;
332 332 table->dbt_maxcnt = idxcnt;
333 333 table->dbt_indices = NULL;
334 334 table->dbt_id_space = NULL;
335 335 table->dbt_reaper_shutdown = FALSE;
336 336
337 337 if (start >= 0) {
338 338 if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
339 339 maxentries = INT32_MAX - start;
340 340 id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
341 341 (void) sprintf(id_name, "%s_id_space", table->dbt_name);
342 342 table->dbt_id_space = id_space_create(id_name, start,
343 343 maxentries + start);
344 344 kmem_free(id_name, len + 10);
345 345 }
346 346 ASSERT(t_lowat != 0);
347 347 table->dbt_id_lwat = (maxentries * t_lowat) / 100;
348 348 ASSERT(t_hiwat != 0);
349 349 table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
350 350 table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
351 351 table->dbt_maxentries = maxentries;
352 352 table->dbt_create = create;
353 353 table->dbt_destroy = destroy;
354 354 table->dbt_expiry = expiry;
355 355
356 356 /*
357 357 * get the correct kmem_cache for this table type based on the name.
358 358 */
359 359 table->dbt_mem_cache = get_db_mem_cache(cache_name);
360 360
361 361 kmem_free(cache_name, len+13);
362 362
363 363 table->dbt_debug = db->db_debug_flags;
364 364
365 365 mutex_enter(db->db_lock);
366 366 table->dbt_tnext = db->db_tables;
367 367 db->db_tables = table;
368 368 mutex_exit(db->db_lock);
369 369
370 370 rfs4_start_reaper(table);
371 371
372 372 return (table);
373 373 }
374 374
375 375 void
376 376 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
377 377 {
378 378 rfs4_table_t *p;
379 379 rfs4_index_t *idx;
380 380
381 381 ASSERT(table->dbt_count == 0);
382 382
383 383 mutex_enter(db->db_lock);
384 384 if (table == db->db_tables)
385 385 db->db_tables = table->dbt_tnext;
386 386 else {
387 387 for (p = db->db_tables; p; p = p->dbt_tnext)
388 388 if (p->dbt_tnext == table) {
389 389 p->dbt_tnext = table->dbt_tnext;
390 390 table->dbt_tnext = NULL;
391 391 break;
392 392 }
393 393 ASSERT(p != NULL);
394 394 }
395 395 mutex_exit(db->db_lock);
396 396
397 397 /* Destroy indices */
398 398 while (table->dbt_indices) {
399 399 idx = table->dbt_indices;
400 400 table->dbt_indices = idx->dbi_inext;
401 401 rfs4_index_destroy(idx);
402 402 }
403 403
404 404 rw_destroy(table->dbt_t_lock);
405 405 mutex_destroy(table->dbt_lock);
406 406 mutex_destroy(&table->dbt_reaper_cv_lock);
407 407 cv_destroy(&table->dbt_reaper_wait);
408 408
409 409 kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
410 410 if (table->dbt_id_space)
411 411 id_space_destroy(table->dbt_id_space);
412 412 table->dbt_mem_cache = NULL;
413 413 kmem_free(table, sizeof (rfs4_table_t));
414 414 }
415 415
416 416 rfs4_index_t *
417 417 rfs4_index_create(rfs4_table_t *table, char *keyname,
418 418 uint32_t (*hash)(void *),
419 419 bool_t (compare)(rfs4_entry_t, void *),
420 420 void *(*mkkey)(rfs4_entry_t),
421 421 bool_t createable)
422 422 {
423 423 rfs4_index_t *idx;
424 424
425 425 ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
426 426
427 427 idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
428 428
429 429 idx->dbi_table = table;
430 430 idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
431 431 (void) strcpy(idx->dbi_keyname, keyname);
432 432 idx->dbi_hash = hash;
433 433 idx->dbi_compare = compare;
434 434 idx->dbi_mkkey = mkkey;
435 435 idx->dbi_tblidx = table->dbt_idxcnt;
436 436 table->dbt_idxcnt++;
437 437 if (createable) {
438 438 table->dbt_ccnt++;
439 439 if (table->dbt_ccnt > 1)
440 440 panic("Table %s currently can have only have one "
441 441 "index that will allow creation of entries",
442 442 table->dbt_name);
443 443 idx->dbi_createable = TRUE;
444 444 } else {
445 445 idx->dbi_createable = FALSE;
446 446 }
447 447
448 448 idx->dbi_inext = table->dbt_indices;
449 449 table->dbt_indices = idx;
450 450 idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
451 451 KM_SLEEP);
452 452
453 453 return (idx);
454 454 }
455 455
456 456 void
457 457 rfs4_index_destroy(rfs4_index_t *idx)
458 458 {
459 459 kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
460 460 kmem_free(idx->dbi_buckets,
461 461 sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
462 462 kmem_free(idx, sizeof (rfs4_index_t));
463 463 }
464 464
465 465 static void
466 466 rfs4_dbe_destroy(rfs4_dbe_t *entry)
467 467 {
468 468 rfs4_index_t *idx;
469 469 void *key;
470 470 int i;
471 471 rfs4_bucket_t *bp;
472 472 rfs4_table_t *table = entry->dbe_table;
473 473 rfs4_link_t *l;
474 474
475 475 NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
476 476 (CE_NOTE, "Destroying entry %p from %s",
477 477 (void*)entry, table->dbt_name));
478 478
479 479 mutex_enter(entry->dbe_lock);
480 480 ASSERT(entry->dbe_refcnt == 0);
481 481 mutex_exit(entry->dbe_lock);
482 482
483 483 /* Unlink from all indices */
484 484 for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
485 485 l = &entry->dbe_indices[idx->dbi_tblidx];
486 486 /* check and see if we were ever linked in to the index */
487 487 if (INVALID_LINK(l)) {
488 488 ASSERT(l->next == NULL && l->prev == NULL);
489 489 continue;
490 490 }
491 491 key = idx->dbi_mkkey(entry->dbe_data);
492 492 i = HASH(idx, key);
493 493 bp = &idx->dbi_buckets[i];
494 494 ASSERT(bp->dbk_head != NULL);
495 495 DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
496 496 }
497 497
498 498 /* Destroy user data */
499 499 if (table->dbt_destroy)
500 500 (*table->dbt_destroy)(entry->dbe_data);
501 501
502 502 if (table->dbt_id_space)
503 503 id_free(table->dbt_id_space, entry->dbe_id);
504 504
505 505 mutex_enter(table->dbt_lock);
506 506 table->dbt_count--;
507 507 mutex_exit(table->dbt_lock);
508 508
509 509 /* Destroy the entry itself */
510 510 kmem_cache_free(table->dbt_mem_cache, entry);
511 511 }
512 512
513 513
514 514 static rfs4_dbe_t *
515 515 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
516 516 {
517 517 rfs4_dbe_t *entry;
518 518 int i;
519 519
520 520 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
521 521 (CE_NOTE, "Creating entry in table %s", table->dbt_name));
522 522
523 523 entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
524 524
525 525 entry->dbe_refcnt = 1;
526 526 entry->dbe_invalid = FALSE;
527 527 entry->dbe_skipsearch = FALSE;
528 528 entry->dbe_time_rele = 0;
529 529 entry->dbe_id = 0;
530 530
531 531 if (table->dbt_id_space)
532 532 entry->dbe_id = id;
533 533 entry->dbe_table = table;
534 534
535 535 for (i = 0; i < table->dbt_maxcnt; i++) {
536 536 entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
537 537 entry->dbe_indices[i].entry = entry;
538 538 /*
539 539 * We mark the entry as not indexed by setting the low
540 540 * order bit, since addresses are word aligned. This has
541 541 * the advantage of causing a trap if the address is
542 542 * used. After the entry is linked in to the
543 543 * corresponding index the bit will be cleared.
544 544 */
545 545 INVALIDATE_ADDR(entry->dbe_indices[i].entry);
546 546 }
547 547
548 548 entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
549 549 bzero(entry->dbe_data, table->dbt_usize);
550 550 entry->dbe_data->dbe = entry;
551 551
552 552 if (!(*table->dbt_create)(entry->dbe_data, data)) {
553 553 kmem_cache_free(table->dbt_mem_cache, entry);
554 554 return (NULL);
555 555 }
556 556
557 557 mutex_enter(table->dbt_lock);
558 558 table->dbt_count++;
559 559 mutex_exit(table->dbt_lock);
560 560
561 561 return (entry);
562 562 }
563 563
564 564 static void
565 565 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
566 566 {
567 567 clock_t tabreap;
568 568 clock_t reap_int;
569 569 uint32_t in_use;
570 570
571 571 /*
572 572 * Adjust the table's reap interval based on the
573 573 * number of id's currently in use. Each table's
574 574 * default remains the same if id usage subsides.
575 575 */
576 576 ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
577 577 tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
578 578
579 579 in_use = table->dbt_count + 1; /* see rfs4_dbe_create */
580 580 if (in_use >= table->dbt_id_hwat) {
581 581 ASSERT(t_hreap != 0);
582 582 reap_int = (tabreap * t_hreap) / 100;
583 583 } else if (in_use >= table->dbt_id_lwat) {
584 584 ASSERT(t_lreap != 0);
585 585 reap_int = (tabreap * t_lreap) / 100;
586 586 } else {
587 587 reap_int = tabreap;
588 588 }
589 589 table->dbt_id_reap = reap_int;
590 590 DTRACE_PROBE2(table__reap__interval, char *,
591 591 table->dbt_name, time_t, table->dbt_id_reap);
592 592 }
593 593
594 594 rfs4_entry_t
595 595 rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
596 596 rfs4_dbsearch_type_t dbsearch_type)
597 597 {
598 598 int already_done;
599 599 uint32_t i;
600 600 rfs4_table_t *table = idx->dbi_table;
601 601 rfs4_index_t *ip;
602 602 rfs4_bucket_t *bp;
603 603 rfs4_link_t *l;
604 604 rfs4_dbe_t *entry;
605 605 id_t id = -1;
606 606
607 607 i = HASH(idx, key);
608 608 bp = &idx->dbi_buckets[i];
609 609
610 610 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
611 611 (CE_NOTE, "Searching for key %p in table %s by %s",
612 612 key, table->dbt_name, idx->dbi_keyname));
613 613
614 614 rw_enter(bp->dbk_lock, RW_READER);
615 615 retry:
616 616 for (l = bp->dbk_head; l; l = l->next) {
617 617 if (l->entry->dbe_refcnt > 0 &&
618 618 (l->entry->dbe_skipsearch == FALSE ||
619 619 (l->entry->dbe_skipsearch == TRUE &&
620 620 dbsearch_type == RFS4_DBS_INVALID)) &&
621 621 (*idx->dbi_compare)(l->entry->dbe_data, key)) {
622 622 mutex_enter(l->entry->dbe_lock);
623 623 if (l->entry->dbe_refcnt == 0) {
624 624 mutex_exit(l->entry->dbe_lock);
625 625 continue;
626 626 }
627 627
628 628 /* place an additional hold since we are returning */
629 629 rfs4_dbe_hold(l->entry);
630 630
631 631 mutex_exit(l->entry->dbe_lock);
632 632 rw_exit(bp->dbk_lock);
633 633
634 634 *create = FALSE;
635 635
636 636 NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
637 637 (CE_NOTE, "Found entry %p for %p in table %s",
638 638 (void *)l->entry, key, table->dbt_name));
639 639
640 640 if (id != -1)
641 641 id_free(table->dbt_id_space, id);
642 642 return (l->entry->dbe_data);
643 643 }
644 644 }
645 645
646 646 if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
647 647 table->dbt_maxentries == table->dbt_count) {
648 648 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
649 649 (CE_NOTE, "Entry for %p in %s not found",
650 650 key, table->dbt_name));
651 651
652 652 rw_exit(bp->dbk_lock);
653 653 if (id != -1)
654 654 id_free(table->dbt_id_space, id);
655 655 return (NULL);
656 656 }
657 657
658 658 if (table->dbt_id_space && id == -1) {
659 659 rw_exit(bp->dbk_lock);
660 660
661 661 /* get an id, ok to sleep for it here */
662 662 id = id_alloc(table->dbt_id_space);
663 663 ASSERT(id != -1);
664 664
665 665 mutex_enter(&table->dbt_reaper_cv_lock);
666 666 rfs4_dbe_tabreap_adjust(table);
667 667 mutex_exit(&table->dbt_reaper_cv_lock);
668 668
669 669 rw_enter(bp->dbk_lock, RW_WRITER);
670 670 goto retry;
671 671 }
672 672
673 673 /* get an exclusive lock on the bucket */
674 674 if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
675 675 NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
676 676 (CE_NOTE, "Trying to upgrade lock on "
677 677 "hash chain %d (%p) for %s by %s",
678 678 i, (void*)bp, table->dbt_name, idx->dbi_keyname));
679 679
680 680 rw_exit(bp->dbk_lock);
681 681 rw_enter(bp->dbk_lock, RW_WRITER);
682 682 goto retry;
683 683 }
684 684
685 685 /* create entry */
686 686 entry = rfs4_dbe_create(table, id, arg);
687 687 if (entry == NULL) {
688 688 rw_exit(bp->dbk_lock);
689 689 if (id != -1)
690 690 id_free(table->dbt_id_space, id);
691 691
692 692 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
693 693 (CE_NOTE, "Constructor for table %s failed",
694 694 table->dbt_name));
695 695 return (NULL);
696 696 }
697 697
698 698 /*
699 699 * Add one ref for entry into table's hash - only one
700 700 * reference added even though there may be multiple indices
701 701 */
702 702 rfs4_dbe_hold(entry);
703 703 ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
704 704 VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
705 705
706 706 already_done = idx->dbi_tblidx;
707 707 rw_exit(bp->dbk_lock);
708 708
709 709 for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
710 710 if (ip->dbi_tblidx == already_done)
711 711 continue;
712 712 l = &entry->dbe_indices[ip->dbi_tblidx];
713 713 i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
714 714 ASSERT(i < ip->dbi_table->dbt_len);
715 715 bp = &ip->dbi_buckets[i];
716 716 ENQUEUE_IDX(bp, l);
717 717 }
718 718
719 719 NFS4_DEBUG(
720 720 table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
721 721 (CE_NOTE, "Entry %p created for %s = %p in table %s",
722 722 (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
723 723
724 724 return (entry->dbe_data);
725 725 }
726 726
727 727 /*ARGSUSED*/
728 728 boolean_t
729 729 rfs4_cpr_callb(void *arg, int code)
730 730 {
731 731 rfs4_bucket_t *buckets, *bp;
732 732 rfs4_link_t *l;
733 733 rfs4_client_t *cp;
734 734 int i;
735 735
736 736 nfs4_srv_t *nsrv4 = nfs4_get_srv();
737 737 rfs4_table_t *table = nsrv4->rfs4_client_tab;
738 738
739 739 /*
740 740 * We get called for Suspend and Resume events.
741 741 * For the suspend case we simply don't care! Nor do we care if
742 742 * there are no clients.
743 743 */
744 744 if (code == CB_CODE_CPR_CHKPT || table == NULL) {
745 745 return (B_TRUE);
746 746 }
747 747
748 748 buckets = table->dbt_indices->dbi_buckets;
749 749
750 750 /*
751 751 * When we get this far we are in the process of
752 752 * resuming the system from a previous suspend.
753 753 *
754 754 * We are going to blast through and update the
755 755 * last_access time for all the clients and in
756 756 * doing so extend them by one lease period.
757 757 */
758 758 for (i = 0; i < table->dbt_len; i++) {
759 759 bp = &buckets[i];
760 760 for (l = bp->dbk_head; l; l = l->next) {
761 761 cp = (rfs4_client_t *)l->entry->dbe_data;
762 762 cp->rc_last_access = gethrestime_sec();
763 763 }
764 764 }
765 765
766 766 return (B_TRUE);
767 767 }
768 768
769 769 /*
770 770 * Given a table, lock each of the buckets and walk all entries (in
771 771 * turn locking those) and calling the provided "callout" function
772 772 * with the provided parameter. Obviously used to iterate across all
773 773 * entries in a particular table via the database locking hierarchy.
774 774 * Obviously the caller must not hold locks on any of the entries in
775 775 * the specified table.
776 776 */
777 777 void
778 778 rfs4_dbe_walk(rfs4_table_t *table,
779 779 void (*callout)(rfs4_entry_t, void *),
780 780 void *data)
781 781 {
782 782 rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
783 783 rfs4_link_t *l;
784 784 rfs4_dbe_t *entry;
785 785 int i;
786 786
787 787 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
788 788 (CE_NOTE, "Walking entries in %s", table->dbt_name));
789 789
790 790 /* Walk the buckets looking for entries to release/destroy */
791 791 for (i = 0; i < table->dbt_len; i++) {
792 792 bp = &buckets[i];
793 793 rw_enter(bp->dbk_lock, RW_READER);
794 794 for (l = bp->dbk_head; l; l = l->next) {
795 795 entry = l->entry;
796 796 mutex_enter(entry->dbe_lock);
797 797 (*callout)(entry->dbe_data, data);
798 798 mutex_exit(entry->dbe_lock);
799 799 }
800 800 rw_exit(bp->dbk_lock);
801 801 }
802 802
803 803 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
804 804 (CE_NOTE, "Walking entries complete %s", table->dbt_name));
805 805 }
806 806
807 807
808 808 static void
809 809 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
810 810 {
811 811 rfs4_index_t *idx = table->dbt_indices;
812 812 rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
813 813 rfs4_link_t *l, *t;
814 814 rfs4_dbe_t *entry;
815 815 bool_t found;
816 816 int i;
817 817 int count = 0;
818 818
819 819 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
|
↓ open down ↓ |
819 lines elided |
↑ open up ↑ |
820 820 (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
821 821 desired, cache_time, table->dbt_name));
822 822
823 823 /* Walk the buckets looking for entries to release/destroy */
824 824 for (i = 0; i < table->dbt_len; i++) {
825 825 bp = &buckets[i];
826 826 do {
827 827 found = FALSE;
828 828 rw_enter(bp->dbk_lock, RW_READER);
829 829 for (l = bp->dbk_head; l; l = l->next) {
830 + uint32_t refcnt;
831 +
830 832 entry = l->entry;
831 833 /*
832 834 * Examine an entry. Ref count of 1 means
833 835 * that the only reference is for the hash
834 836 * table reference.
835 837 */
836 838 if (entry->dbe_refcnt != 1)
837 839 continue;
838 840 mutex_enter(entry->dbe_lock);
841 + /*
842 + * Recheck the ref. count with the lock held,
843 + * and if it is no longer 1, leave things alone.
844 + */
839 845 if ((entry->dbe_refcnt == 1) &&
840 846 (table->dbt_reaper_shutdown ||
841 847 table->dbt_expiry == NULL ||
842 848 (*table->dbt_expiry)(entry->dbe_data))) {
843 - entry->dbe_refcnt--;
844 - count++;
845 - found = TRUE;
849 + refcnt = atomic_dec_32_nv(&entry->dbe_refcnt);
850 + if (refcnt == 0) {
851 + count++;
852 + found = TRUE;
853 + } else {
854 + /*
855 + * Lost race w/ incr.
856 + * Leave it as it was
857 + */
858 + atomic_inc_32(&entry->dbe_refcnt);
859 + }
846 860 }
847 861 mutex_exit(entry->dbe_lock);
848 862 }
849 863 if (found) {
850 864 if (!rw_tryupgrade(bp->dbk_lock)) {
851 865 rw_exit(bp->dbk_lock);
852 866 rw_enter(bp->dbk_lock, RW_WRITER);
853 867 }
854 868
855 869 l = bp->dbk_head;
856 870 while (l) {
857 871 t = l;
858 872 entry = t->entry;
859 873 l = l->next;
860 874 if (entry->dbe_refcnt == 0) {
861 875 DEQUEUE(bp->dbk_head, t);
862 876 t->next = NULL;
863 877 t->prev = NULL;
864 878 INVALIDATE_ADDR(t->entry);
865 879 rfs4_dbe_destroy(entry);
866 880 }
867 881 }
868 882 }
869 883 rw_exit(bp->dbk_lock);
870 884 /*
871 885 * delay slightly if there is more work to do
872 886 * with the expectation that other reaper
873 887 * threads are freeing data structures as well
874 888 * and in turn will reduce ref counts on
875 889 * entries in this table allowing them to be
876 890 * released. This is only done in the
877 891 * instance that the tables are being shut down.
878 892 */
879 893 if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
880 894 delay(hz/100);
881 895 /*
882 896 * If this is a table shutdown, keep going until
883 897 * everything is gone
884 898 */
885 899 } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
886 900
887 901 if (!table->dbt_reaper_shutdown && desired && count >= desired)
888 902 break;
889 903 }
890 904
891 905 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
892 906 (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
893 907 count, cache_time, table->dbt_name));
894 908 }
895 909
896 910 static void
897 911 reaper_thread(caddr_t *arg)
898 912 {
899 913 rfs4_table_t *table = (rfs4_table_t *)arg;
900 914 clock_t rc;
901 915
902 916 NFS4_DEBUG(table->dbt_debug,
903 917 (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
904 918
905 919 CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
906 920 callb_generic_cpr, "nfsv4Reaper");
907 921
908 922 mutex_enter(&table->dbt_reaper_cv_lock);
909 923 do {
910 924 CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
911 925 rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
912 926 &table->dbt_reaper_cv_lock,
913 927 SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
914 928 CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
915 929 &table->dbt_reaper_cv_lock);
916 930 rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
917 931 } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
918 932
919 933 CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
920 934
921 935 NFS4_DEBUG(table->dbt_debug,
922 936 (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
923 937
924 938 /* Notify the database shutdown processing that the table is shutdown */
925 939 mutex_enter(table->dbt_db->db_lock);
926 940 table->dbt_db->db_shutdown_count--;
927 941 cv_signal(&table->dbt_db->db_shutdown_wait);
928 942 mutex_exit(table->dbt_db->db_lock);
929 943 zthread_exit();
930 944 }
931 945
932 946 static void
933 947 rfs4_start_reaper(rfs4_table_t *table)
934 948 {
935 949 if (table->dbt_max_cache_time == 0)
936 950 return;
937 951
938 952 (void) zthread_create(NULL, 0, reaper_thread, table, 0,
939 953 minclsyspri);
940 954 }
941 955
942 956 #ifdef DEBUG
943 957 void
944 958 rfs4_dbe_debug(rfs4_dbe_t *entry)
945 959 {
946 960 cmn_err(CE_NOTE, "Entry %p from table %s",
947 961 (void *)entry, entry->dbe_table->dbt_name);
948 962 cmn_err(CE_CONT, "\trefcnt = %d id = %d",
949 963 entry->dbe_refcnt, entry->dbe_id);
950 964 }
951 965 #endif
|
↓ open down ↓ |
96 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX