Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/zap_micro.c
+++ new/usr/src/uts/common/fs/zfs/zap_micro.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
24 25 */
25 26
26 27 #include <sys/zio.h>
27 28 #include <sys/spa.h>
28 29 #include <sys/dmu.h>
29 30 #include <sys/zfs_context.h>
30 31 #include <sys/zap.h>
31 32 #include <sys/refcount.h>
32 33 #include <sys/zap_impl.h>
33 34 #include <sys/zap_leaf.h>
34 35 #include <sys/avl.h>
35 36 #include <sys/arc.h>
36 37 #include <sys/dmu_objset.h>
37 38
38 39 #ifdef _KERNEL
39 40 #include <sys/sunddi.h>
40 41 #endif
41 42
42 43 extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
43 44
44 45 static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
45 46
46 47 uint64_t
47 48 zap_getflags(zap_t *zap)
48 49 {
49 50 if (zap->zap_ismicro)
50 51 return (0);
51 52 return (zap_f_phys(zap)->zap_flags);
52 53 }
53 54
54 55 int
55 56 zap_hashbits(zap_t *zap)
56 57 {
57 58 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
58 59 return (48);
59 60 else
60 61 return (28);
61 62 }
62 63
63 64 uint32_t
64 65 zap_maxcd(zap_t *zap)
65 66 {
66 67 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
67 68 return ((1<<16)-1);
68 69 else
69 70 return (-1U);
70 71 }
71 72
72 73 static uint64_t
73 74 zap_hash(zap_name_t *zn)
74 75 {
75 76 zap_t *zap = zn->zn_zap;
76 77 uint64_t h = 0;
77 78
78 79 if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
79 80 ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
80 81 h = *(uint64_t *)zn->zn_key_orig;
81 82 } else {
82 83 h = zap->zap_salt;
83 84 ASSERT(h != 0);
84 85 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
85 86
86 87 if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
87 88 int i;
88 89 const uint64_t *wp = zn->zn_key_norm;
89 90
90 91 ASSERT(zn->zn_key_intlen == 8);
91 92 for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
92 93 int j;
93 94 uint64_t word = *wp;
94 95
95 96 for (j = 0; j < zn->zn_key_intlen; j++) {
96 97 h = (h >> 8) ^
97 98 zfs_crc64_table[(h ^ word) & 0xFF];
98 99 word >>= NBBY;
99 100 }
100 101 }
101 102 } else {
102 103 int i, len;
103 104 const uint8_t *cp = zn->zn_key_norm;
104 105
105 106 /*
106 107 * We previously stored the terminating null on
107 108 * disk, but didn't hash it, so we need to
108 109 * continue to not hash it. (The
109 110 * zn_key_*_numints includes the terminating
110 111 * null for non-binary keys.)
111 112 */
112 113 len = zn->zn_key_norm_numints - 1;
113 114
114 115 ASSERT(zn->zn_key_intlen == 1);
115 116 for (i = 0; i < len; cp++, i++) {
116 117 h = (h >> 8) ^
117 118 zfs_crc64_table[(h ^ *cp) & 0xFF];
118 119 }
119 120 }
120 121 }
121 122 /*
122 123 * Don't use all 64 bits, since we need some in the cookie for
123 124 * the collision differentiator. We MUST use the high bits,
124 125 * since those are the ones that we first pay attention to when
125 126 * chosing the bucket.
126 127 */
127 128 h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
128 129
129 130 return (h);
130 131 }
131 132
132 133 static int
133 134 zap_normalize(zap_t *zap, const char *name, char *namenorm)
134 135 {
135 136 size_t inlen, outlen;
136 137 int err;
137 138
138 139 ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
139 140
140 141 inlen = strlen(name) + 1;
141 142 outlen = ZAP_MAXNAMELEN;
142 143
143 144 err = 0;
144 145 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
145 146 zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
146 147 U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
147 148
148 149 return (err);
149 150 }
150 151
151 152 boolean_t
152 153 zap_match(zap_name_t *zn, const char *matchname)
153 154 {
154 155 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
155 156
156 157 if (zn->zn_matchtype == MT_FIRST) {
157 158 char norm[ZAP_MAXNAMELEN];
158 159
159 160 if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
160 161 return (B_FALSE);
161 162
162 163 return (strcmp(zn->zn_key_norm, norm) == 0);
163 164 } else {
164 165 /* MT_BEST or MT_EXACT */
165 166 return (strcmp(zn->zn_key_orig, matchname) == 0);
166 167 }
167 168 }
168 169
169 170 void
170 171 zap_name_free(zap_name_t *zn)
171 172 {
172 173 kmem_free(zn, sizeof (zap_name_t));
173 174 }
174 175
175 176 zap_name_t *
176 177 zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
177 178 {
178 179 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
179 180
180 181 zn->zn_zap = zap;
181 182 zn->zn_key_intlen = sizeof (*key);
182 183 zn->zn_key_orig = key;
183 184 zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
184 185 zn->zn_matchtype = mt;
185 186 if (zap->zap_normflags) {
186 187 if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
187 188 zap_name_free(zn);
188 189 return (NULL);
189 190 }
190 191 zn->zn_key_norm = zn->zn_normbuf;
191 192 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
192 193 } else {
193 194 if (mt != MT_EXACT) {
194 195 zap_name_free(zn);
195 196 return (NULL);
196 197 }
197 198 zn->zn_key_norm = zn->zn_key_orig;
198 199 zn->zn_key_norm_numints = zn->zn_key_orig_numints;
199 200 }
200 201
201 202 zn->zn_hash = zap_hash(zn);
202 203 return (zn);
203 204 }
204 205
205 206 zap_name_t *
206 207 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
207 208 {
208 209 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
209 210
210 211 ASSERT(zap->zap_normflags == 0);
211 212 zn->zn_zap = zap;
212 213 zn->zn_key_intlen = sizeof (*key);
213 214 zn->zn_key_orig = zn->zn_key_norm = key;
214 215 zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
215 216 zn->zn_matchtype = MT_EXACT;
216 217
217 218 zn->zn_hash = zap_hash(zn);
218 219 return (zn);
219 220 }
220 221
221 222 static void
222 223 mzap_byteswap(mzap_phys_t *buf, size_t size)
223 224 {
224 225 int i, max;
225 226 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
226 227 buf->mz_salt = BSWAP_64(buf->mz_salt);
227 228 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
228 229 max = (size / MZAP_ENT_LEN) - 1;
229 230 for (i = 0; i < max; i++) {
230 231 buf->mz_chunk[i].mze_value =
231 232 BSWAP_64(buf->mz_chunk[i].mze_value);
232 233 buf->mz_chunk[i].mze_cd =
233 234 BSWAP_32(buf->mz_chunk[i].mze_cd);
234 235 }
235 236 }
236 237
237 238 void
238 239 zap_byteswap(void *buf, size_t size)
239 240 {
240 241 uint64_t block_type;
241 242
242 243 block_type = *(uint64_t *)buf;
243 244
244 245 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
245 246 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
246 247 mzap_byteswap(buf, size);
247 248 } else {
248 249 fzap_byteswap(buf, size);
249 250 }
250 251 }
251 252
252 253 static int
253 254 mze_compare(const void *arg1, const void *arg2)
254 255 {
255 256 const mzap_ent_t *mze1 = arg1;
256 257 const mzap_ent_t *mze2 = arg2;
257 258
258 259 if (mze1->mze_hash > mze2->mze_hash)
259 260 return (+1);
260 261 if (mze1->mze_hash < mze2->mze_hash)
261 262 return (-1);
262 263 if (mze1->mze_cd > mze2->mze_cd)
263 264 return (+1);
264 265 if (mze1->mze_cd < mze2->mze_cd)
265 266 return (-1);
266 267 return (0);
267 268 }
268 269
269 270 static void
270 271 mze_insert(zap_t *zap, int chunkid, uint64_t hash)
271 272 {
272 273 mzap_ent_t *mze;
273 274
274 275 ASSERT(zap->zap_ismicro);
275 276 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
276 277
277 278 mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
278 279 mze->mze_chunkid = chunkid;
279 280 mze->mze_hash = hash;
280 281 mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
281 282 ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
282 283 avl_add(&zap->zap_m.zap_avl, mze);
283 284 }
284 285
285 286 static mzap_ent_t *
286 287 mze_find(zap_name_t *zn)
287 288 {
288 289 mzap_ent_t mze_tofind;
289 290 mzap_ent_t *mze;
290 291 avl_index_t idx;
291 292 avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
292 293
293 294 ASSERT(zn->zn_zap->zap_ismicro);
294 295 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
295 296
296 297 mze_tofind.mze_hash = zn->zn_hash;
297 298 mze_tofind.mze_cd = 0;
298 299
299 300 again:
300 301 mze = avl_find(avl, &mze_tofind, &idx);
301 302 if (mze == NULL)
302 303 mze = avl_nearest(avl, idx, AVL_AFTER);
303 304 for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
304 305 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
305 306 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
306 307 return (mze);
307 308 }
308 309 if (zn->zn_matchtype == MT_BEST) {
309 310 zn->zn_matchtype = MT_FIRST;
310 311 goto again;
311 312 }
312 313 return (NULL);
313 314 }
314 315
315 316 static uint32_t
316 317 mze_find_unused_cd(zap_t *zap, uint64_t hash)
317 318 {
318 319 mzap_ent_t mze_tofind;
319 320 mzap_ent_t *mze;
320 321 avl_index_t idx;
321 322 avl_tree_t *avl = &zap->zap_m.zap_avl;
322 323 uint32_t cd;
323 324
324 325 ASSERT(zap->zap_ismicro);
325 326 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
326 327
327 328 mze_tofind.mze_hash = hash;
328 329 mze_tofind.mze_cd = 0;
329 330
330 331 cd = 0;
331 332 for (mze = avl_find(avl, &mze_tofind, &idx);
332 333 mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
333 334 if (mze->mze_cd != cd)
334 335 break;
335 336 cd++;
336 337 }
337 338
338 339 return (cd);
339 340 }
340 341
341 342 static void
342 343 mze_remove(zap_t *zap, mzap_ent_t *mze)
343 344 {
344 345 ASSERT(zap->zap_ismicro);
345 346 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
346 347
347 348 avl_remove(&zap->zap_m.zap_avl, mze);
348 349 kmem_free(mze, sizeof (mzap_ent_t));
349 350 }
350 351
351 352 static void
352 353 mze_destroy(zap_t *zap)
353 354 {
354 355 mzap_ent_t *mze;
355 356 void *avlcookie = NULL;
356 357
357 358 while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
358 359 kmem_free(mze, sizeof (mzap_ent_t));
359 360 avl_destroy(&zap->zap_m.zap_avl);
360 361 }
361 362
362 363 static zap_t *
363 364 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
364 365 {
365 366 zap_t *winner;
366 367 zap_t *zap;
367 368 int i;
368 369
369 370 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
370 371
371 372 zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
372 373 rw_init(&zap->zap_rwlock, 0, 0, 0);
373 374 rw_enter(&zap->zap_rwlock, RW_WRITER);
374 375 zap->zap_objset = os;
375 376 zap->zap_object = obj;
376 377 zap->zap_dbuf = db;
377 378
378 379 if (*(uint64_t *)db->db_data != ZBT_MICRO) {
379 380 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
|
↓ open down ↓ |
346 lines elided |
↑ open up ↑ |
380 381 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
381 382 } else {
382 383 zap->zap_ismicro = TRUE;
383 384 }
384 385
385 386 /*
386 387 * Make sure that zap_ismicro is set before we let others see
387 388 * it, because zap_lockdir() checks zap_ismicro without the lock
388 389 * held.
389 390 */
390 - winner = dmu_buf_set_user(db, zap, zap_evict);
391 + dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
392 + winner = dmu_buf_set_user(db, &zap->zap_dbu);
391 393
392 394 if (winner != NULL) {
393 395 rw_exit(&zap->zap_rwlock);
394 396 rw_destroy(&zap->zap_rwlock);
395 397 if (!zap->zap_ismicro)
396 398 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
397 399 kmem_free(zap, sizeof (zap_t));
398 400 return (winner);
399 401 }
400 402
401 403 if (zap->zap_ismicro) {
402 404 zap->zap_salt = zap_m_phys(zap)->mz_salt;
403 405 zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
404 406 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
405 407 avl_create(&zap->zap_m.zap_avl, mze_compare,
406 408 sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
407 409
408 410 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
409 411 mzap_ent_phys_t *mze =
410 412 &zap_m_phys(zap)->mz_chunk[i];
411 413 if (mze->mze_name[0]) {
412 414 zap_name_t *zn;
413 415
414 416 zap->zap_m.zap_num_entries++;
415 417 zn = zap_name_alloc(zap, mze->mze_name,
416 418 MT_EXACT);
417 419 mze_insert(zap, i, zn->zn_hash);
418 420 zap_name_free(zn);
419 421 }
420 422 }
421 423 } else {
422 424 zap->zap_salt = zap_f_phys(zap)->zap_salt;
423 425 zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
424 426
425 427 ASSERT3U(sizeof (struct zap_leaf_header), ==,
426 428 2*ZAP_LEAF_CHUNKSIZE);
427 429
428 430 /*
429 431 * The embedded pointer table should not overlap the
430 432 * other members.
431 433 */
432 434 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
433 435 &zap_f_phys(zap)->zap_salt);
434 436
435 437 /*
436 438 * The embedded pointer table should end at the end of
437 439 * the block
438 440 */
439 441 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
440 442 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
441 443 (uintptr_t)zap_f_phys(zap), ==,
442 444 zap->zap_dbuf->db_size);
443 445 }
444 446 rw_exit(&zap->zap_rwlock);
445 447 return (zap);
446 448 }
447 449
448 450 int
449 451 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
450 452 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
451 453 {
452 454 zap_t *zap;
453 455 dmu_buf_t *db;
454 456 krw_t lt;
455 457 int err;
456 458
457 459 *zapp = NULL;
458 460
459 461 err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
460 462 if (err)
461 463 return (err);
462 464
463 465 #ifdef ZFS_DEBUG
464 466 {
465 467 dmu_object_info_t doi;
466 468 dmu_object_info_from_db(db, &doi);
467 469 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
468 470 }
469 471 #endif
470 472
471 473 zap = dmu_buf_get_user(db);
472 474 if (zap == NULL)
473 475 zap = mzap_open(os, obj, db);
474 476
475 477 /*
476 478 * We're checking zap_ismicro without the lock held, in order to
477 479 * tell what type of lock we want. Once we have some sort of
478 480 * lock, see if it really is the right type. In practice this
479 481 * can only be different if it was upgraded from micro to fat,
480 482 * and micro wanted WRITER but fat only needs READER.
481 483 */
482 484 lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
483 485 rw_enter(&zap->zap_rwlock, lt);
484 486 if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
485 487 /* it was upgraded, now we only need reader */
486 488 ASSERT(lt == RW_WRITER);
487 489 ASSERT(RW_READER ==
488 490 (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
489 491 rw_downgrade(&zap->zap_rwlock);
490 492 lt = RW_READER;
491 493 }
492 494
493 495 zap->zap_objset = os;
494 496
495 497 if (lt == RW_WRITER)
496 498 dmu_buf_will_dirty(db, tx);
497 499
498 500 ASSERT3P(zap->zap_dbuf, ==, db);
499 501
500 502 ASSERT(!zap->zap_ismicro ||
501 503 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
502 504 if (zap->zap_ismicro && tx && adding &&
503 505 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
504 506 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
505 507 if (newsz > MZAP_MAX_BLKSZ) {
506 508 dprintf("upgrading obj %llu: num_entries=%u\n",
507 509 obj, zap->zap_m.zap_num_entries);
508 510 *zapp = zap;
509 511 return (mzap_upgrade(zapp, tx, 0));
510 512 }
511 513 err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
512 514 ASSERT0(err);
513 515 zap->zap_m.zap_num_chunks =
514 516 db->db_size / MZAP_ENT_LEN - 1;
515 517 }
516 518
517 519 *zapp = zap;
518 520 return (0);
519 521 }
520 522
521 523 void
522 524 zap_unlockdir(zap_t *zap)
523 525 {
524 526 rw_exit(&zap->zap_rwlock);
525 527 dmu_buf_rele(zap->zap_dbuf, NULL);
526 528 }
527 529
528 530 static int
529 531 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
530 532 {
531 533 mzap_phys_t *mzp;
532 534 int i, sz, nchunks;
533 535 int err = 0;
534 536 zap_t *zap = *zapp;
535 537
536 538 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
537 539
538 540 sz = zap->zap_dbuf->db_size;
539 541 mzp = kmem_alloc(sz, KM_SLEEP);
540 542 bcopy(zap->zap_dbuf->db_data, mzp, sz);
541 543 nchunks = zap->zap_m.zap_num_chunks;
542 544
543 545 if (!flags) {
544 546 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
545 547 1ULL << fzap_default_block_shift, 0, tx);
546 548 if (err) {
547 549 kmem_free(mzp, sz);
548 550 return (err);
549 551 }
550 552 }
551 553
552 554 dprintf("upgrading obj=%llu with %u chunks\n",
553 555 zap->zap_object, nchunks);
554 556 /* XXX destroy the avl later, so we can use the stored hash value */
555 557 mze_destroy(zap);
556 558
557 559 fzap_upgrade(zap, tx, flags);
558 560
559 561 for (i = 0; i < nchunks; i++) {
560 562 mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
561 563 zap_name_t *zn;
562 564 if (mze->mze_name[0] == 0)
563 565 continue;
564 566 dprintf("adding %s=%llu\n",
565 567 mze->mze_name, mze->mze_value);
566 568 zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
567 569 err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
568 570 zap = zn->zn_zap; /* fzap_add_cd() may change zap */
569 571 zap_name_free(zn);
570 572 if (err)
571 573 break;
572 574 }
573 575 kmem_free(mzp, sz);
574 576 *zapp = zap;
575 577 return (err);
576 578 }
577 579
578 580 void
579 581 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
580 582 dmu_tx_t *tx)
581 583 {
582 584 dmu_buf_t *db;
583 585 mzap_phys_t *zp;
584 586
585 587 VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
586 588
587 589 #ifdef ZFS_DEBUG
588 590 {
589 591 dmu_object_info_t doi;
590 592 dmu_object_info_from_db(db, &doi);
591 593 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
592 594 }
593 595 #endif
594 596
595 597 dmu_buf_will_dirty(db, tx);
596 598 zp = db->db_data;
597 599 zp->mz_block_type = ZBT_MICRO;
598 600 zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
599 601 zp->mz_normflags = normflags;
600 602 dmu_buf_rele(db, FTAG);
601 603
602 604 if (flags != 0) {
603 605 zap_t *zap;
604 606 /* Only fat zap supports flags; upgrade immediately. */
605 607 VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
606 608 B_FALSE, B_FALSE, &zap));
607 609 VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
608 610 zap_unlockdir(zap);
609 611 }
610 612 }
611 613
612 614 int
613 615 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
614 616 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
615 617 {
616 618 return (zap_create_claim_norm(os, obj,
617 619 0, ot, bonustype, bonuslen, tx));
618 620 }
619 621
620 622 int
621 623 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
622 624 dmu_object_type_t ot,
623 625 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
624 626 {
625 627 int err;
626 628
627 629 err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
628 630 if (err != 0)
629 631 return (err);
630 632 mzap_create_impl(os, obj, normflags, 0, tx);
631 633 return (0);
632 634 }
633 635
634 636 uint64_t
635 637 zap_create(objset_t *os, dmu_object_type_t ot,
636 638 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
637 639 {
638 640 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
639 641 }
640 642
641 643 uint64_t
642 644 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
643 645 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
644 646 {
645 647 uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
646 648
647 649 mzap_create_impl(os, obj, normflags, 0, tx);
648 650 return (obj);
649 651 }
650 652
651 653 uint64_t
652 654 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
653 655 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
654 656 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
655 657 {
656 658 uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
657 659
658 660 ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
659 661 leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
660 662 indirect_blockshift >= SPA_MINBLOCKSHIFT &&
661 663 indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
662 664
663 665 VERIFY(dmu_object_set_blocksize(os, obj,
664 666 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
665 667
666 668 mzap_create_impl(os, obj, normflags, flags, tx);
667 669 return (obj);
668 670 }
669 671
670 672 int
671 673 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
|
↓ open down ↓ |
271 lines elided |
↑ open up ↑ |
672 674 {
673 675 /*
674 676 * dmu_object_free will free the object number and free the
675 677 * data. Freeing the data will cause our pageout function to be
676 678 * called, which will destroy our data (zap_leaf_t's and zap_t).
677 679 */
678 680
679 681 return (dmu_object_free(os, zapobj, tx));
680 682 }
681 683
682 -_NOTE(ARGSUSED(0))
683 684 void
684 -zap_evict(dmu_buf_t *db, void *vzap)
685 +zap_evict(void *dbu)
685 686 {
686 - zap_t *zap = vzap;
687 + zap_t *zap = dbu;
687 688
688 689 rw_destroy(&zap->zap_rwlock);
689 690
690 691 if (zap->zap_ismicro)
691 692 mze_destroy(zap);
692 693 else
693 694 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
694 695
695 696 kmem_free(zap, sizeof (zap_t));
696 697 }
697 698
698 699 int
699 700 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
700 701 {
701 702 zap_t *zap;
702 703 int err;
703 704
704 705 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
705 706 if (err)
706 707 return (err);
707 708 if (!zap->zap_ismicro) {
708 709 err = fzap_count(zap, count);
709 710 } else {
710 711 *count = zap->zap_m.zap_num_entries;
711 712 }
712 713 zap_unlockdir(zap);
713 714 return (err);
714 715 }
715 716
716 717 /*
717 718 * zn may be NULL; if not specified, it will be computed if needed.
718 719 * See also the comment above zap_entry_normalization_conflict().
719 720 */
720 721 static boolean_t
721 722 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
722 723 {
723 724 mzap_ent_t *other;
724 725 int direction = AVL_BEFORE;
725 726 boolean_t allocdzn = B_FALSE;
726 727
727 728 if (zap->zap_normflags == 0)
728 729 return (B_FALSE);
729 730
730 731 again:
731 732 for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
732 733 other && other->mze_hash == mze->mze_hash;
733 734 other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
734 735
735 736 if (zn == NULL) {
736 737 zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
737 738 MT_FIRST);
738 739 allocdzn = B_TRUE;
739 740 }
740 741 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
741 742 if (allocdzn)
742 743 zap_name_free(zn);
743 744 return (B_TRUE);
744 745 }
745 746 }
746 747
747 748 if (direction == AVL_BEFORE) {
748 749 direction = AVL_AFTER;
749 750 goto again;
750 751 }
751 752
752 753 if (allocdzn)
753 754 zap_name_free(zn);
754 755 return (B_FALSE);
755 756 }
756 757
757 758 /*
758 759 * Routines for manipulating attributes.
759 760 */
760 761
761 762 int
762 763 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
763 764 uint64_t integer_size, uint64_t num_integers, void *buf)
764 765 {
765 766 return (zap_lookup_norm(os, zapobj, name, integer_size,
766 767 num_integers, buf, MT_EXACT, NULL, 0, NULL));
767 768 }
768 769
769 770 int
770 771 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
771 772 uint64_t integer_size, uint64_t num_integers, void *buf,
772 773 matchtype_t mt, char *realname, int rn_len,
773 774 boolean_t *ncp)
774 775 {
775 776 zap_t *zap;
776 777 int err;
777 778 mzap_ent_t *mze;
778 779 zap_name_t *zn;
779 780
780 781 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
781 782 if (err)
782 783 return (err);
783 784 zn = zap_name_alloc(zap, name, mt);
784 785 if (zn == NULL) {
785 786 zap_unlockdir(zap);
786 787 return (SET_ERROR(ENOTSUP));
787 788 }
788 789
789 790 if (!zap->zap_ismicro) {
790 791 err = fzap_lookup(zn, integer_size, num_integers, buf,
791 792 realname, rn_len, ncp);
792 793 } else {
793 794 mze = mze_find(zn);
794 795 if (mze == NULL) {
795 796 err = SET_ERROR(ENOENT);
796 797 } else {
797 798 if (num_integers < 1) {
798 799 err = SET_ERROR(EOVERFLOW);
799 800 } else if (integer_size != 8) {
800 801 err = SET_ERROR(EINVAL);
801 802 } else {
802 803 *(uint64_t *)buf =
803 804 MZE_PHYS(zap, mze)->mze_value;
804 805 (void) strlcpy(realname,
805 806 MZE_PHYS(zap, mze)->mze_name, rn_len);
806 807 if (ncp) {
807 808 *ncp = mzap_normalization_conflict(zap,
808 809 zn, mze);
809 810 }
810 811 }
811 812 }
812 813 }
813 814 zap_name_free(zn);
814 815 zap_unlockdir(zap);
815 816 return (err);
816 817 }
817 818
818 819 int
819 820 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
820 821 int key_numints)
821 822 {
822 823 zap_t *zap;
823 824 int err;
824 825 zap_name_t *zn;
825 826
826 827 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
827 828 if (err)
828 829 return (err);
829 830 zn = zap_name_alloc_uint64(zap, key, key_numints);
830 831 if (zn == NULL) {
831 832 zap_unlockdir(zap);
832 833 return (SET_ERROR(ENOTSUP));
833 834 }
834 835
835 836 fzap_prefetch(zn);
836 837 zap_name_free(zn);
837 838 zap_unlockdir(zap);
838 839 return (err);
839 840 }
840 841
841 842 int
842 843 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
843 844 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
844 845 {
845 846 zap_t *zap;
846 847 int err;
847 848 zap_name_t *zn;
848 849
849 850 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
850 851 if (err)
851 852 return (err);
852 853 zn = zap_name_alloc_uint64(zap, key, key_numints);
853 854 if (zn == NULL) {
854 855 zap_unlockdir(zap);
855 856 return (SET_ERROR(ENOTSUP));
856 857 }
857 858
858 859 err = fzap_lookup(zn, integer_size, num_integers, buf,
859 860 NULL, 0, NULL);
860 861 zap_name_free(zn);
861 862 zap_unlockdir(zap);
862 863 return (err);
863 864 }
864 865
865 866 int
866 867 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
867 868 {
868 869 int err = zap_lookup_norm(os, zapobj, name, 0,
869 870 0, NULL, MT_EXACT, NULL, 0, NULL);
870 871 if (err == EOVERFLOW || err == EINVAL)
871 872 err = 0; /* found, but skipped reading the value */
872 873 return (err);
873 874 }
874 875
875 876 int
876 877 zap_length(objset_t *os, uint64_t zapobj, const char *name,
877 878 uint64_t *integer_size, uint64_t *num_integers)
878 879 {
879 880 zap_t *zap;
880 881 int err;
881 882 mzap_ent_t *mze;
882 883 zap_name_t *zn;
883 884
884 885 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
885 886 if (err)
886 887 return (err);
887 888 zn = zap_name_alloc(zap, name, MT_EXACT);
888 889 if (zn == NULL) {
889 890 zap_unlockdir(zap);
890 891 return (SET_ERROR(ENOTSUP));
891 892 }
892 893 if (!zap->zap_ismicro) {
893 894 err = fzap_length(zn, integer_size, num_integers);
894 895 } else {
895 896 mze = mze_find(zn);
896 897 if (mze == NULL) {
897 898 err = SET_ERROR(ENOENT);
898 899 } else {
899 900 if (integer_size)
900 901 *integer_size = 8;
901 902 if (num_integers)
902 903 *num_integers = 1;
903 904 }
904 905 }
905 906 zap_name_free(zn);
906 907 zap_unlockdir(zap);
907 908 return (err);
908 909 }
909 910
910 911 int
911 912 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
912 913 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
913 914 {
914 915 zap_t *zap;
915 916 int err;
916 917 zap_name_t *zn;
917 918
918 919 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
919 920 if (err)
920 921 return (err);
921 922 zn = zap_name_alloc_uint64(zap, key, key_numints);
922 923 if (zn == NULL) {
923 924 zap_unlockdir(zap);
924 925 return (SET_ERROR(ENOTSUP));
925 926 }
926 927 err = fzap_length(zn, integer_size, num_integers);
927 928 zap_name_free(zn);
928 929 zap_unlockdir(zap);
929 930 return (err);
930 931 }
931 932
932 933 static void
933 934 mzap_addent(zap_name_t *zn, uint64_t value)
934 935 {
935 936 int i;
936 937 zap_t *zap = zn->zn_zap;
937 938 int start = zap->zap_m.zap_alloc_next;
938 939 uint32_t cd;
939 940
940 941 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
941 942
942 943 #ifdef ZFS_DEBUG
943 944 for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
944 945 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
945 946 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
946 947 }
947 948 #endif
948 949
949 950 cd = mze_find_unused_cd(zap, zn->zn_hash);
950 951 /* given the limited size of the microzap, this can't happen */
951 952 ASSERT(cd < zap_maxcd(zap));
952 953
953 954 again:
954 955 for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
955 956 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
956 957 if (mze->mze_name[0] == 0) {
957 958 mze->mze_value = value;
958 959 mze->mze_cd = cd;
959 960 (void) strcpy(mze->mze_name, zn->zn_key_orig);
960 961 zap->zap_m.zap_num_entries++;
961 962 zap->zap_m.zap_alloc_next = i+1;
962 963 if (zap->zap_m.zap_alloc_next ==
963 964 zap->zap_m.zap_num_chunks)
964 965 zap->zap_m.zap_alloc_next = 0;
965 966 mze_insert(zap, i, zn->zn_hash);
966 967 return;
967 968 }
968 969 }
969 970 if (start != 0) {
970 971 start = 0;
971 972 goto again;
972 973 }
973 974 ASSERT(!"out of entries!");
974 975 }
975 976
976 977 int
977 978 zap_add(objset_t *os, uint64_t zapobj, const char *key,
978 979 int integer_size, uint64_t num_integers,
979 980 const void *val, dmu_tx_t *tx)
980 981 {
981 982 zap_t *zap;
982 983 int err;
983 984 mzap_ent_t *mze;
984 985 const uint64_t *intval = val;
985 986 zap_name_t *zn;
986 987
987 988 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
988 989 if (err)
989 990 return (err);
990 991 zn = zap_name_alloc(zap, key, MT_EXACT);
991 992 if (zn == NULL) {
992 993 zap_unlockdir(zap);
993 994 return (SET_ERROR(ENOTSUP));
994 995 }
995 996 if (!zap->zap_ismicro) {
996 997 err = fzap_add(zn, integer_size, num_integers, val, tx);
997 998 zap = zn->zn_zap; /* fzap_add() may change zap */
998 999 } else if (integer_size != 8 || num_integers != 1 ||
999 1000 strlen(key) >= MZAP_NAME_LEN) {
1000 1001 err = mzap_upgrade(&zn->zn_zap, tx, 0);
1001 1002 if (err == 0)
1002 1003 err = fzap_add(zn, integer_size, num_integers, val, tx);
1003 1004 zap = zn->zn_zap; /* fzap_add() may change zap */
1004 1005 } else {
1005 1006 mze = mze_find(zn);
1006 1007 if (mze != NULL) {
1007 1008 err = SET_ERROR(EEXIST);
1008 1009 } else {
1009 1010 mzap_addent(zn, *intval);
1010 1011 }
1011 1012 }
1012 1013 ASSERT(zap == zn->zn_zap);
1013 1014 zap_name_free(zn);
1014 1015 if (zap != NULL) /* may be NULL if fzap_add() failed */
1015 1016 zap_unlockdir(zap);
1016 1017 return (err);
1017 1018 }
1018 1019
1019 1020 int
1020 1021 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1021 1022 int key_numints, int integer_size, uint64_t num_integers,
1022 1023 const void *val, dmu_tx_t *tx)
1023 1024 {
1024 1025 zap_t *zap;
1025 1026 int err;
1026 1027 zap_name_t *zn;
1027 1028
1028 1029 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
1029 1030 if (err)
1030 1031 return (err);
1031 1032 zn = zap_name_alloc_uint64(zap, key, key_numints);
1032 1033 if (zn == NULL) {
1033 1034 zap_unlockdir(zap);
1034 1035 return (SET_ERROR(ENOTSUP));
1035 1036 }
1036 1037 err = fzap_add(zn, integer_size, num_integers, val, tx);
1037 1038 zap = zn->zn_zap; /* fzap_add() may change zap */
1038 1039 zap_name_free(zn);
1039 1040 if (zap != NULL) /* may be NULL if fzap_add() failed */
1040 1041 zap_unlockdir(zap);
1041 1042 return (err);
1042 1043 }
1043 1044
/*
 * Set the value for a string key, creating the entry if it does not
 * exist.  Like zap_add(), a microzap is upgraded to a fatzap when the
 * value does not fit a microzap slot.
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	mzap_ent_t *mze;
	uint64_t oldval;
	const uint64_t *intval = val;
	zap_name_t *zn;
	int err;

#ifdef ZFS_DEBUG
	/*
	 * If there is an old value, it shouldn't change across the
	 * lockdir (eg, due to bprewrite's xlation).
	 */
	if (integer_size == 8 && num_integers == 1)
		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* Value won't fit in a microzap; promote to fatzap first. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, tx, 0);
		if (err == 0)
			err = fzap_update(zn, integer_size, num_integers,
			    val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			/*
			 * Entry exists, so the debug lookup above succeeded
			 * and oldval was set; overwrite in place.
			 */
			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap);
	return (err);
}
1099 1100
1100 1101 int
1101 1102 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1102 1103 int key_numints,
1103 1104 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1104 1105 {
1105 1106 zap_t *zap;
1106 1107 zap_name_t *zn;
1107 1108 int err;
1108 1109
1109 1110 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
1110 1111 if (err)
1111 1112 return (err);
1112 1113 zn = zap_name_alloc_uint64(zap, key, key_numints);
1113 1114 if (zn == NULL) {
1114 1115 zap_unlockdir(zap);
1115 1116 return (SET_ERROR(ENOTSUP));
1116 1117 }
1117 1118 err = fzap_update(zn, integer_size, num_integers, val, tx);
1118 1119 zap = zn->zn_zap; /* fzap_update() may change zap */
1119 1120 zap_name_free(zn);
1120 1121 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1121 1122 zap_unlockdir(zap);
1122 1123 return (err);
1123 1124 }
1124 1125
/*
 * Remove an entry by exact string key; convenience wrapper around
 * zap_remove_norm() with MT_EXACT (no name normalization).
 */
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
}
1130 1131
/*
 * Remove an entry by string key, matched according to 'mt' (exact or
 * normalized).  Returns ENOENT if no matching entry exists.
 */
int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;
	mzap_ent_t *mze;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/*
			 * Clear the on-disk chunk and drop the in-core
			 * AVL entry that mirrors it.
			 */
			zap->zap_m.zap_num_entries--;
			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
			    sizeof (mzap_ent_phys_t));
			mze_remove(zap, mze);
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap);
	return (err);
}
1165 1166
1166 1167 int
1167 1168 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1168 1169 int key_numints, dmu_tx_t *tx)
1169 1170 {
1170 1171 zap_t *zap;
1171 1172 int err;
1172 1173 zap_name_t *zn;
1173 1174
1174 1175 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1175 1176 if (err)
1176 1177 return (err);
1177 1178 zn = zap_name_alloc_uint64(zap, key, key_numints);
1178 1179 if (zn == NULL) {
1179 1180 zap_unlockdir(zap);
1180 1181 return (SET_ERROR(ENOTSUP));
1181 1182 }
1182 1183 err = fzap_remove(zn, tx);
1183 1184 zap_name_free(zn);
1184 1185 zap_unlockdir(zap);
1185 1186 return (err);
1186 1187 }
1187 1188
1188 1189 /*
1189 1190 * Routines for iterating over the attributes.
1190 1191 */
1191 1192
1192 1193 void
1193 1194 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1194 1195 uint64_t serialized)
1195 1196 {
1196 1197 zc->zc_objset = os;
1197 1198 zc->zc_zap = NULL;
1198 1199 zc->zc_leaf = NULL;
1199 1200 zc->zc_zapobj = zapobj;
1200 1201 zc->zc_serialized = serialized;
1201 1202 zc->zc_hash = 0;
1202 1203 zc->zc_cd = 0;
1203 1204 }
1204 1205
/*
 * Initialize a cursor at the beginning of the zap object.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_serialized(zc, os, zapobj, 0);
}
1210 1211
/*
 * Release any zap and leaf holds the cursor acquired during iteration.
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		/*
		 * zap_unlockdir() is called with zap_rwlock held (as by
		 * zap_lockdir()), so reacquire it as reader first.
		 */
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		/* Likewise, zap_put_leaf() expects l_rwlock held. */
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}
1226 1227
1227 1228 uint64_t
1228 1229 zap_cursor_serialize(zap_cursor_t *zc)
1229 1230 {
1230 1231 if (zc->zc_hash == -1ULL)
1231 1232 return (-1ULL);
1232 1233 if (zc->zc_zap == NULL)
1233 1234 return (zc->zc_serialized);
1234 1235 ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1235 1236 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1236 1237
1237 1238 /*
1238 1239 * We want to keep the high 32 bits of the cursor zero if we can, so
1239 1240 * that 32-bit programs can access this. So usually use a small
1240 1241 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1241 1242 * of the cursor.
1242 1243 *
1243 1244 * [ collision differentiator | zap_hashbits()-bit hash value ]
1244 1245 */
1245 1246 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1246 1247 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1247 1248 }
1248 1249
/*
 * Fetch the attribute at the cursor's current position into 'za'.
 * Returns ENOENT when iteration is complete.  On first call after a
 * serialized init, decodes zc_serialized into the hash/cd position.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;
	avl_index_t idx;
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, &zc->zc_zap);
		if (err)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		/*
		 * Microzap: find the first entry at or after (hash, cd)
		 * in the in-core AVL tree, which is sorted in cursor order.
		 */
		mze_tofind.mze_hash = zc->zc_hash;
		mze_tofind.mze_cd = zc->zc_cd;

		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
		if (mze == NULL) {
			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
			    idx, AVL_AFTER);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strcpy(za->za_name, mzep->mze_name);
			/* Remember where we are so advance/retrieve resumes. */
			zc->zc_hash = mze->mze_hash;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			/* Past the last entry; mark the cursor exhausted. */
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}
1312 1313
1313 1314 void
1314 1315 zap_cursor_advance(zap_cursor_t *zc)
1315 1316 {
1316 1317 if (zc->zc_hash == -1ULL)
1317 1318 return;
1318 1319 zc->zc_cd++;
1319 1320 }
1320 1321
1321 1322 int
1322 1323 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1323 1324 {
1324 1325 int err;
1325 1326 zap_t *zap;
1326 1327
1327 1328 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1328 1329 if (err)
1329 1330 return (err);
1330 1331
1331 1332 bzero(zs, sizeof (zap_stats_t));
1332 1333
1333 1334 if (zap->zap_ismicro) {
1334 1335 zs->zs_blocksize = zap->zap_dbuf->db_size;
1335 1336 zs->zs_num_entries = zap->zap_m.zap_num_entries;
1336 1337 zs->zs_num_blocks = 1;
1337 1338 } else {
1338 1339 fzap_get_stats(zap, zs);
1339 1340 }
1340 1341 zap_unlockdir(zap);
1341 1342 return (0);
1342 1343 }
1343 1344
/*
 * Estimate, for dmu_tx accounting, how many bytes an add/remove of
 * 'name' would write (*towrite) or overwrite (*tooverwrite).  The
 * estimates are deliberately pessimistic; 'add' indicates whether the
 * operation may insert a new entry.
 */
int
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
    uint64_t *towrite, uint64_t *tooverwrite)
{
	zap_t *zap;
	int err = 0;

	/*
	 * Since, we don't have a name, we cannot figure out which blocks will
	 * be affected in this operation. So, account for the worst case :
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *	- 2 blocks for possibly split leaves,
	 *	- 2 grown ptrtbl blocks
	 *
	 * This also accommodates the case where an add operation to a fairly
	 * large microzap results in a promotion to fatzap.
	 */
	if (name == NULL) {
		*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
		return (err);
	}

	/*
	 * We lock the zap with adding == FALSE. Because, if we pass
	 * the actual value of add, it could trigger a mzap_upgrade().
	 * At present we are just evaluating the possibility of this operation
	 * and hence we do not want to trigger an upgrade.
	 */
	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
	if (err)
		return (err);

	if (!zap->zap_ismicro) {
		zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
		if (zn) {
			err = fzap_count_write(zn, add, towrite,
			    tooverwrite);
			zap_name_free(zn);
		} else {
			/*
			 * We treat this case as similar to (name == NULL)
			 */
			*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
		}
	} else {
		/*
		 * We are here if (name != NULL) and this is a micro-zap.
		 * We account for the header block depending on whether it
		 * is freeable.
		 *
		 * In case of an add-operation it is hard to find out
		 * if this add will promote this microzap to fatzap.
		 * Hence, we consider the worst case and account for the
		 * blocks assuming this microzap would be promoted to a
		 * fatzap.
		 *
		 * 1 block overwritten : header block
		 * 4 new blocks written : 2 new split leaf, 2 grown
		 *	ptrtbl blocks
		 */
		if (dmu_buf_freeable(zap->zap_dbuf))
			*tooverwrite += MZAP_MAX_BLKSZ;
		else
			*towrite += MZAP_MAX_BLKSZ;

		if (add) {
			*towrite += 4 * MZAP_MAX_BLKSZ;
		}
	}

	zap_unlockdir(zap);
	return (err);
}
|
↓ open down ↓ |
721 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX