Webrev: illumos issue 6842 — Fix empty xattr dir causing lockup
(change to usr/src/uts/common/fs/zfs/zap.c)
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
--- old/usr/src/uts/common/fs/zfs/zap.c
+++ new/usr/src/uts/common/fs/zfs/zap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 25 */
26 26
27 27 /*
28 28 * This file contains the top half of the zfs directory structure
29 29 * implementation. The bottom half is in zap_leaf.c.
30 30 *
31 31 * The zdir is an extendable hash data structure. There is a table of
32 32 * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
33 33 * each a constant size and hold a variable number of directory entries.
34 34 * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
35 35 *
36 36 * The pointer table holds a power of 2 number of pointers.
37 37 * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
38 38 * by the pointer at index i in the table holds entries whose hash value
39 39 * has a zd_prefix_len - bit prefix
40 40 */
41 41
42 42 #include <sys/spa.h>
43 43 #include <sys/dmu.h>
44 44 #include <sys/zfs_context.h>
45 45 #include <sys/zfs_znode.h>
46 46 #include <sys/fs/zfs.h>
47 47 #include <sys/zap.h>
48 48 #include <sys/refcount.h>
49 49 #include <sys/zap_impl.h>
50 50 #include <sys/zap_leaf.h>
51 51
52 52 int fzap_default_block_shift = 14; /* 16k blocksize */
53 53
54 54 extern inline zap_phys_t *zap_f_phys(zap_t *zap);
55 55
56 56 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
57 57
58 58 void
59 59 fzap_byteswap(void *vbuf, size_t size)
60 60 {
61 61 uint64_t block_type;
62 62
63 63 block_type = *(uint64_t *)vbuf;
64 64
65 65 if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
66 66 zap_leaf_byteswap(vbuf, size);
67 67 else {
68 68 /* it's a ptrtbl block */
69 69 byteswap_uint64_array(vbuf, size);
70 70 }
71 71 }
72 72
73 73 void
74 74 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
75 75 {
76 76 dmu_buf_t *db;
77 77 zap_leaf_t *l;
78 78 int i;
79 79 zap_phys_t *zp;
80 80
81 81 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
82 82 zap->zap_ismicro = FALSE;
83 83
84 84 zap->zap_dbu.dbu_evict_func = zap_evict;
85 85
86 86 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
87 87 zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
88 88
89 89 zp = zap_f_phys(zap);
90 90 /*
91 91 * explicitly zero it since it might be coming from an
92 92 * initialized microzap
93 93 */
94 94 bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
95 95 zp->zap_block_type = ZBT_HEADER;
96 96 zp->zap_magic = ZAP_MAGIC;
97 97
98 98 zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
99 99
100 100 zp->zap_freeblk = 2; /* block 1 will be the first leaf */
101 101 zp->zap_num_leafs = 1;
102 102 zp->zap_num_entries = 0;
103 103 zp->zap_salt = zap->zap_salt;
104 104 zp->zap_normflags = zap->zap_normflags;
105 105 zp->zap_flags = flags;
106 106
107 107 /* block 1 will be the first leaf */
108 108 for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
109 109 ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
110 110
111 111 /*
112 112 * set up block 1 - the first leaf
113 113 */
114 114 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
115 115 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
116 116 dmu_buf_will_dirty(db, tx);
117 117
118 118 l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
119 119 l->l_dbuf = db;
120 120
121 121 zap_leaf_init(l, zp->zap_normflags != 0);
122 122
123 123 kmem_free(l, sizeof (zap_leaf_t));
124 124 dmu_buf_rele(db, FTAG);
125 125 }
126 126
127 127 static int
128 128 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
129 129 {
130 130 if (RW_WRITE_HELD(&zap->zap_rwlock))
131 131 return (1);
132 132 if (rw_tryupgrade(&zap->zap_rwlock)) {
133 133 dmu_buf_will_dirty(zap->zap_dbuf, tx);
134 134 return (1);
135 135 }
136 136 return (0);
137 137 }
138 138
139 139 /*
140 140 * Generic routines for dealing with the pointer & cookie tables.
141 141 */
142 142
143 143 static int
144 144 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
145 145 void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
146 146 dmu_tx_t *tx)
147 147 {
148 148 uint64_t b, newblk;
149 149 dmu_buf_t *db_old, *db_new;
150 150 int err;
151 151 int bs = FZAP_BLOCK_SHIFT(zap);
152 152 int hepb = 1<<(bs-4);
153 153 /* hepb = half the number of entries in a block */
154 154
155 155 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
156 156 ASSERT(tbl->zt_blk != 0);
157 157 ASSERT(tbl->zt_numblks > 0);
158 158
159 159 if (tbl->zt_nextblk != 0) {
160 160 newblk = tbl->zt_nextblk;
161 161 } else {
162 162 newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
163 163 tbl->zt_nextblk = newblk;
164 164 ASSERT0(tbl->zt_blks_copied);
165 165 dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
166 166 tbl->zt_blk << bs, tbl->zt_numblks << bs,
167 167 ZIO_PRIORITY_SYNC_READ);
168 168 }
169 169
170 170 /*
171 171 * Copy the ptrtbl from the old to new location.
172 172 */
173 173
174 174 b = tbl->zt_blks_copied;
175 175 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
176 176 (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
177 177 if (err)
178 178 return (err);
179 179
180 180 /* first half of entries in old[b] go to new[2*b+0] */
181 181 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
182 182 (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
183 183 dmu_buf_will_dirty(db_new, tx);
184 184 transfer_func(db_old->db_data, db_new->db_data, hepb);
185 185 dmu_buf_rele(db_new, FTAG);
186 186
187 187 /* second half of entries in old[b] go to new[2*b+1] */
188 188 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
189 189 (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
190 190 dmu_buf_will_dirty(db_new, tx);
191 191 transfer_func((uint64_t *)db_old->db_data + hepb,
192 192 db_new->db_data, hepb);
193 193 dmu_buf_rele(db_new, FTAG);
194 194
195 195 dmu_buf_rele(db_old, FTAG);
196 196
197 197 tbl->zt_blks_copied++;
198 198
199 199 dprintf("copied block %llu of %llu\n",
200 200 tbl->zt_blks_copied, tbl->zt_numblks);
201 201
202 202 if (tbl->zt_blks_copied == tbl->zt_numblks) {
203 203 (void) dmu_free_range(zap->zap_objset, zap->zap_object,
204 204 tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
205 205
206 206 tbl->zt_blk = newblk;
207 207 tbl->zt_numblks *= 2;
208 208 tbl->zt_shift++;
209 209 tbl->zt_nextblk = 0;
210 210 tbl->zt_blks_copied = 0;
211 211
212 212 dprintf("finished; numblocks now %llu (%lluk entries)\n",
213 213 tbl->zt_numblks, 1<<(tbl->zt_shift-10));
214 214 }
215 215
216 216 return (0);
217 217 }
218 218
219 219 static int
220 220 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
221 221 dmu_tx_t *tx)
222 222 {
223 223 int err;
224 224 uint64_t blk, off;
225 225 int bs = FZAP_BLOCK_SHIFT(zap);
226 226 dmu_buf_t *db;
227 227
228 228 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
229 229 ASSERT(tbl->zt_blk != 0);
230 230
231 231 dprintf("storing %llx at index %llx\n", val, idx);
232 232
233 233 blk = idx >> (bs-3);
234 234 off = idx & ((1<<(bs-3))-1);
235 235
236 236 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
237 237 (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
238 238 if (err)
239 239 return (err);
240 240 dmu_buf_will_dirty(db, tx);
241 241
242 242 if (tbl->zt_nextblk != 0) {
243 243 uint64_t idx2 = idx * 2;
244 244 uint64_t blk2 = idx2 >> (bs-3);
245 245 uint64_t off2 = idx2 & ((1<<(bs-3))-1);
246 246 dmu_buf_t *db2;
247 247
248 248 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
249 249 (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
250 250 DMU_READ_NO_PREFETCH);
251 251 if (err) {
252 252 dmu_buf_rele(db, FTAG);
253 253 return (err);
254 254 }
255 255 dmu_buf_will_dirty(db2, tx);
256 256 ((uint64_t *)db2->db_data)[off2] = val;
257 257 ((uint64_t *)db2->db_data)[off2+1] = val;
258 258 dmu_buf_rele(db2, FTAG);
259 259 }
260 260
261 261 ((uint64_t *)db->db_data)[off] = val;
262 262 dmu_buf_rele(db, FTAG);
263 263
264 264 return (0);
265 265 }
266 266
267 267 static int
268 268 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
269 269 {
270 270 uint64_t blk, off;
271 271 int err;
272 272 dmu_buf_t *db;
273 273 int bs = FZAP_BLOCK_SHIFT(zap);
274 274
275 275 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
276 276
277 277 blk = idx >> (bs-3);
278 278 off = idx & ((1<<(bs-3))-1);
279 279
280 280 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
281 281 (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
282 282 if (err)
283 283 return (err);
284 284 *valp = ((uint64_t *)db->db_data)[off];
285 285 dmu_buf_rele(db, FTAG);
286 286
287 287 if (tbl->zt_nextblk != 0) {
288 288 /*
289 289 * read the nextblk for the sake of i/o error checking,
290 290 * so that zap_table_load() will catch errors for
291 291 * zap_table_store.
292 292 */
293 293 blk = (idx*2) >> (bs-3);
294 294
295 295 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
296 296 (tbl->zt_nextblk + blk) << bs, FTAG, &db,
297 297 DMU_READ_NO_PREFETCH);
298 298 if (err == 0)
299 299 dmu_buf_rele(db, FTAG);
300 300 }
301 301 return (err);
302 302 }
303 303
304 304 /*
305 305 * Routines for growing the ptrtbl.
306 306 */
307 307
308 308 static void
309 309 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
310 310 {
311 311 int i;
312 312 for (i = 0; i < n; i++) {
313 313 uint64_t lb = src[i];
314 314 dst[2*i+0] = lb;
315 315 dst[2*i+1] = lb;
316 316 }
317 317 }
318 318
319 319 static int
320 320 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
321 321 {
322 322 /*
323 323 * The pointer table should never use more hash bits than we
324 324 * have (otherwise we'd be using useless zero bits to index it).
325 325 * If we are within 2 bits of running out, stop growing, since
326 326 * this is already an aberrant condition.
327 327 */
328 328 if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
329 329 return (SET_ERROR(ENOSPC));
330 330
331 331 if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
332 332 /*
333 333 * We are outgrowing the "embedded" ptrtbl (the one
334 334 * stored in the header block). Give it its own entire
335 335 * block, which will double the size of the ptrtbl.
336 336 */
337 337 uint64_t newblk;
338 338 dmu_buf_t *db_new;
339 339 int err;
340 340
341 341 ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
342 342 ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
343 343 ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
344 344
345 345 newblk = zap_allocate_blocks(zap, 1);
346 346 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
347 347 newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
348 348 DMU_READ_NO_PREFETCH);
349 349 if (err)
350 350 return (err);
351 351 dmu_buf_will_dirty(db_new, tx);
352 352 zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
353 353 db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
354 354 dmu_buf_rele(db_new, FTAG);
355 355
356 356 zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
357 357 zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
358 358 zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
359 359
360 360 ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
361 361 zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
362 362 (FZAP_BLOCK_SHIFT(zap)-3));
363 363
364 364 return (0);
365 365 } else {
366 366 return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
367 367 zap_ptrtbl_transfer, tx));
368 368 }
369 369 }
370 370
371 371 static void
372 372 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
373 373 {
374 374 dmu_buf_will_dirty(zap->zap_dbuf, tx);
375 375 mutex_enter(&zap->zap_f.zap_num_entries_mtx);
376 376 ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
377 377 zap_f_phys(zap)->zap_num_entries += delta;
378 378 mutex_exit(&zap->zap_f.zap_num_entries_mtx);
379 379 }
380 380
381 381 static uint64_t
382 382 zap_allocate_blocks(zap_t *zap, int nblocks)
383 383 {
384 384 uint64_t newblk;
385 385 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
386 386 newblk = zap_f_phys(zap)->zap_freeblk;
387 387 zap_f_phys(zap)->zap_freeblk += nblocks;
388 388 return (newblk);
389 389 }
390 390
391 391 static void
392 392 zap_leaf_pageout(void *dbu)
393 393 {
394 394 zap_leaf_t *l = dbu;
395 395
396 396 rw_destroy(&l->l_rwlock);
397 397 kmem_free(l, sizeof (zap_leaf_t));
398 398 }
399 399
400 400 static zap_leaf_t *
401 401 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
402 402 {
403 403 void *winner;
404 404 zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
405 405
406 406 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
407 407
408 408 rw_init(&l->l_rwlock, 0, 0, 0);
409 409 rw_enter(&l->l_rwlock, RW_WRITER);
410 410 l->l_blkid = zap_allocate_blocks(zap, 1);
411 411 l->l_dbuf = NULL;
412 412
413 413 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
414 414 l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
415 415 DMU_READ_NO_PREFETCH));
416 416 dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
417 417 winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
418 418 ASSERT(winner == NULL);
419 419 dmu_buf_will_dirty(l->l_dbuf, tx);
420 420
421 421 zap_leaf_init(l, zap->zap_normflags != 0);
422 422
423 423 zap_f_phys(zap)->zap_num_leafs++;
424 424
425 425 return (l);
426 426 }
427 427
428 428 int
429 429 fzap_count(zap_t *zap, uint64_t *count)
430 430 {
431 431 ASSERT(!zap->zap_ismicro);
432 432 mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
433 433 *count = zap_f_phys(zap)->zap_num_entries;
434 434 mutex_exit(&zap->zap_f.zap_num_entries_mtx);
435 435 return (0);
436 436 }
437 437
438 438 /*
439 439 * Routines for obtaining zap_leaf_t's
440 440 */
441 441
442 442 void
443 443 zap_put_leaf(zap_leaf_t *l)
444 444 {
445 445 rw_exit(&l->l_rwlock);
446 446 dmu_buf_rele(l->l_dbuf, NULL);
447 447 }
448 448
449 449 static zap_leaf_t *
450 450 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
451 451 {
452 452 zap_leaf_t *l, *winner;
453 453
454 454 ASSERT(blkid != 0);
455 455
456 456 l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
457 457 rw_init(&l->l_rwlock, 0, 0, 0);
458 458 rw_enter(&l->l_rwlock, RW_WRITER);
459 459 l->l_blkid = blkid;
460 460 l->l_bs = highbit64(db->db_size) - 1;
461 461 l->l_dbuf = db;
462 462
463 463 dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
464 464 winner = dmu_buf_set_user(db, &l->l_dbu);
465 465
466 466 rw_exit(&l->l_rwlock);
467 467 if (winner != NULL) {
468 468 /* someone else set it first */
469 469 zap_leaf_pageout(&l->l_dbu);
470 470 l = winner;
471 471 }
472 472
473 473 /*
474 474 * lhr_pad was previously used for the next leaf in the leaf
475 475 * chain. There should be no chained leafs (as we have removed
476 476 * support for them).
477 477 */
478 478 ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
479 479
480 480 /*
481 481 * There should be more hash entries than there can be
482 482 * chunks to put in the hash table
483 483 */
484 484 ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
485 485
486 486 /* The chunks should begin at the end of the hash table */
487 487 ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
488 488 &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
489 489
490 490 /* The chunks should end at the end of the block */
491 491 ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
492 492 (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
493 493
494 494 return (l);
495 495 }
496 496
497 497 static int
498 498 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
499 499 zap_leaf_t **lp)
500 500 {
501 501 dmu_buf_t *db;
502 502 zap_leaf_t *l;
503 503 int bs = FZAP_BLOCK_SHIFT(zap);
504 504 int err;
505 505
506 506 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
507 507
508 508 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
509 509 blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
510 510 if (err)
511 511 return (err);
512 512
513 513 ASSERT3U(db->db_object, ==, zap->zap_object);
514 514 ASSERT3U(db->db_offset, ==, blkid << bs);
515 515 ASSERT3U(db->db_size, ==, 1 << bs);
516 516 ASSERT(blkid != 0);
517 517
518 518 l = dmu_buf_get_user(db);
519 519
520 520 if (l == NULL)
521 521 l = zap_open_leaf(blkid, db);
522 522
523 523 rw_enter(&l->l_rwlock, lt);
524 524 /*
525 525 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
526 526 * causing ASSERT below to fail.
527 527 */
528 528 if (lt == RW_WRITER)
529 529 dmu_buf_will_dirty(db, tx);
530 530 ASSERT3U(l->l_blkid, ==, blkid);
531 531 ASSERT3P(l->l_dbuf, ==, db);
532 532 ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
533 533 ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
534 534
535 535 *lp = l;
536 536 return (0);
537 537 }
538 538
539 539 static int
540 540 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
541 541 {
542 542 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
543 543
544 544 if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
545 545 ASSERT3U(idx, <,
546 546 (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
547 547 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
548 548 return (0);
549 549 } else {
550 550 return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
551 551 idx, valp));
552 552 }
553 553 }
554 554
555 555 static int
556 556 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
557 557 {
558 558 ASSERT(tx != NULL);
559 559 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
560 560
561 561 if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
562 562 ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
563 563 return (0);
564 564 } else {
565 565 return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
566 566 idx, blk, tx));
567 567 }
|
↓ open down ↓ |
567 lines elided |
↑ open up ↑ |
568 568 }
569 569
570 570 static int
571 571 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
572 572 {
573 573 uint64_t idx, blk;
574 574 int err;
575 575
576 576 ASSERT(zap->zap_dbuf == NULL ||
577 577 zap_f_phys(zap) == zap->zap_dbuf->db_data);
578 - ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
578 +
579 + /* Reality check for corrupt zap objects (leaf or header). */
580 + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
581 + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
582 + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
583 + return (SET_ERROR(EIO));
584 + }
585 +
579 586 idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
580 587 err = zap_idx_to_blk(zap, idx, &blk);
581 588 if (err != 0)
582 589 return (err);
583 590 err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
584 591
585 592 ASSERT(err ||
586 593 ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
587 594 zap_leaf_phys(*lp)->l_hdr.lh_prefix);
588 595 return (err);
589 596 }
590 597
591 598 static int
592 599 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
593 600 {
594 601 zap_t *zap = zn->zn_zap;
595 602 uint64_t hash = zn->zn_hash;
596 603 zap_leaf_t *nl;
597 604 int prefix_diff, i, err;
598 605 uint64_t sibling;
599 606 int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
600 607
601 608 ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
602 609 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
603 610
604 611 ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
605 612 zap_leaf_phys(l)->l_hdr.lh_prefix);
606 613
607 614 if (zap_tryupgradedir(zap, tx) == 0 ||
608 615 old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
609 616 /* We failed to upgrade, or need to grow the pointer table */
610 617 objset_t *os = zap->zap_objset;
611 618 uint64_t object = zap->zap_object;
612 619
613 620 zap_put_leaf(l);
614 621 zap_unlockdir(zap);
615 622 err = zap_lockdir(os, object, tx, RW_WRITER,
616 623 FALSE, FALSE, &zn->zn_zap);
617 624 zap = zn->zn_zap;
618 625 if (err)
619 626 return (err);
620 627 ASSERT(!zap->zap_ismicro);
621 628
622 629 while (old_prefix_len ==
623 630 zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
624 631 err = zap_grow_ptrtbl(zap, tx);
625 632 if (err)
626 633 return (err);
627 634 }
628 635
629 636 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
630 637 if (err)
631 638 return (err);
632 639
633 640 if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
634 641 /* it split while our locks were down */
635 642 *lp = l;
636 643 return (0);
637 644 }
638 645 }
639 646 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
640 647 ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
641 648 ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
642 649 zap_leaf_phys(l)->l_hdr.lh_prefix);
643 650
644 651 prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
645 652 (old_prefix_len + 1);
646 653 sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
647 654
648 655 /* check for i/o errors before doing zap_leaf_split */
649 656 for (i = 0; i < (1ULL<<prefix_diff); i++) {
650 657 uint64_t blk;
651 658 err = zap_idx_to_blk(zap, sibling+i, &blk);
652 659 if (err)
653 660 return (err);
654 661 ASSERT3U(blk, ==, l->l_blkid);
655 662 }
656 663
657 664 nl = zap_create_leaf(zap, tx);
658 665 zap_leaf_split(l, nl, zap->zap_normflags != 0);
659 666
660 667 /* set sibling pointers */
661 668 for (i = 0; i < (1ULL << prefix_diff); i++) {
662 669 err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
663 670 ASSERT0(err); /* we checked for i/o errors above */
664 671 }
665 672
666 673 if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
667 674 /* we want the sibling */
668 675 zap_put_leaf(l);
669 676 *lp = nl;
670 677 } else {
671 678 zap_put_leaf(nl);
672 679 *lp = l;
673 680 }
674 681
675 682 return (0);
676 683 }
677 684
678 685 static void
679 686 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
680 687 {
681 688 zap_t *zap = zn->zn_zap;
682 689 int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
683 690 int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
684 691 zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
685 692
686 693 zap_put_leaf(l);
687 694
688 695 if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
689 696 int err;
690 697
691 698 /*
692 699 * We are in the middle of growing the pointer table, or
693 700 * this leaf will soon make us grow it.
694 701 */
695 702 if (zap_tryupgradedir(zap, tx) == 0) {
696 703 objset_t *os = zap->zap_objset;
697 704 uint64_t zapobj = zap->zap_object;
698 705
699 706 zap_unlockdir(zap);
700 707 err = zap_lockdir(os, zapobj, tx,
701 708 RW_WRITER, FALSE, FALSE, &zn->zn_zap);
702 709 zap = zn->zn_zap;
703 710 if (err)
704 711 return;
705 712 }
706 713
707 714 /* could have finished growing while our locks were down */
708 715 if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
709 716 (void) zap_grow_ptrtbl(zap, tx);
710 717 }
711 718 }
712 719
713 720 static int
714 721 fzap_checkname(zap_name_t *zn)
715 722 {
716 723 if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
717 724 return (SET_ERROR(ENAMETOOLONG));
718 725 return (0);
719 726 }
720 727
721 728 static int
722 729 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
723 730 {
724 731 /* Only integer sizes supported by C */
725 732 switch (integer_size) {
726 733 case 1:
727 734 case 2:
728 735 case 4:
729 736 case 8:
730 737 break;
731 738 default:
732 739 return (SET_ERROR(EINVAL));
733 740 }
734 741
735 742 if (integer_size * num_integers > ZAP_MAXVALUELEN)
736 743 return (E2BIG);
737 744
738 745 return (0);
739 746 }
740 747
741 748 static int
742 749 fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
743 750 {
744 751 int err;
745 752
746 753 if ((err = fzap_checkname(zn)) != 0)
747 754 return (err);
748 755 return (fzap_checksize(integer_size, num_integers));
749 756 }
750 757
751 758 /*
752 759 * Routines for manipulating attributes.
753 760 */
754 761 int
755 762 fzap_lookup(zap_name_t *zn,
756 763 uint64_t integer_size, uint64_t num_integers, void *buf,
757 764 char *realname, int rn_len, boolean_t *ncp)
758 765 {
759 766 zap_leaf_t *l;
760 767 int err;
761 768 zap_entry_handle_t zeh;
762 769
763 770 if ((err = fzap_checkname(zn)) != 0)
764 771 return (err);
765 772
766 773 err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
767 774 if (err != 0)
768 775 return (err);
769 776 err = zap_leaf_lookup(l, zn, &zeh);
770 777 if (err == 0) {
771 778 if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
772 779 zap_put_leaf(l);
773 780 return (err);
774 781 }
775 782
776 783 err = zap_entry_read(&zeh, integer_size, num_integers, buf);
777 784 (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
778 785 if (ncp) {
779 786 *ncp = zap_entry_normalization_conflict(&zeh,
780 787 zn, NULL, zn->zn_zap);
781 788 }
782 789 }
783 790
784 791 zap_put_leaf(l);
785 792 return (err);
786 793 }
787 794
788 795 int
789 796 fzap_add_cd(zap_name_t *zn,
790 797 uint64_t integer_size, uint64_t num_integers,
791 798 const void *val, uint32_t cd, dmu_tx_t *tx)
792 799 {
793 800 zap_leaf_t *l;
794 801 int err;
795 802 zap_entry_handle_t zeh;
796 803 zap_t *zap = zn->zn_zap;
797 804
798 805 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
799 806 ASSERT(!zap->zap_ismicro);
800 807 ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
801 808
802 809 err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
803 810 if (err != 0)
804 811 return (err);
805 812 retry:
806 813 err = zap_leaf_lookup(l, zn, &zeh);
807 814 if (err == 0) {
808 815 err = SET_ERROR(EEXIST);
809 816 goto out;
810 817 }
811 818 if (err != ENOENT)
812 819 goto out;
813 820
814 821 err = zap_entry_create(l, zn, cd,
815 822 integer_size, num_integers, val, &zeh);
816 823
817 824 if (err == 0) {
818 825 zap_increment_num_entries(zap, 1, tx);
819 826 } else if (err == EAGAIN) {
820 827 err = zap_expand_leaf(zn, l, tx, &l);
821 828 zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
822 829 if (err == 0)
823 830 goto retry;
824 831 }
825 832
826 833 out:
827 834 if (zap != NULL)
828 835 zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
829 836 return (err);
830 837 }
831 838
832 839 int
833 840 fzap_add(zap_name_t *zn,
834 841 uint64_t integer_size, uint64_t num_integers,
835 842 const void *val, dmu_tx_t *tx)
836 843 {
837 844 int err = fzap_check(zn, integer_size, num_integers);
838 845 if (err != 0)
839 846 return (err);
840 847
841 848 return (fzap_add_cd(zn, integer_size, num_integers,
842 849 val, ZAP_NEED_CD, tx));
843 850 }
844 851
845 852 int
846 853 fzap_update(zap_name_t *zn,
847 854 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
848 855 {
849 856 zap_leaf_t *l;
850 857 int err, create;
851 858 zap_entry_handle_t zeh;
852 859 zap_t *zap = zn->zn_zap;
853 860
854 861 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
855 862 err = fzap_check(zn, integer_size, num_integers);
856 863 if (err != 0)
857 864 return (err);
858 865
859 866 err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
860 867 if (err != 0)
861 868 return (err);
862 869 retry:
863 870 err = zap_leaf_lookup(l, zn, &zeh);
864 871 create = (err == ENOENT);
865 872 ASSERT(err == 0 || err == ENOENT);
866 873
867 874 if (create) {
868 875 err = zap_entry_create(l, zn, ZAP_NEED_CD,
869 876 integer_size, num_integers, val, &zeh);
870 877 if (err == 0)
871 878 zap_increment_num_entries(zap, 1, tx);
872 879 } else {
873 880 err = zap_entry_update(&zeh, integer_size, num_integers, val);
874 881 }
875 882
876 883 if (err == EAGAIN) {
877 884 err = zap_expand_leaf(zn, l, tx, &l);
878 885 zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
879 886 if (err == 0)
880 887 goto retry;
881 888 }
882 889
883 890 if (zap != NULL)
884 891 zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
885 892 return (err);
886 893 }
887 894
888 895 int
889 896 fzap_length(zap_name_t *zn,
890 897 uint64_t *integer_size, uint64_t *num_integers)
891 898 {
892 899 zap_leaf_t *l;
893 900 int err;
894 901 zap_entry_handle_t zeh;
895 902
896 903 err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
897 904 if (err != 0)
898 905 return (err);
899 906 err = zap_leaf_lookup(l, zn, &zeh);
900 907 if (err != 0)
901 908 goto out;
902 909
903 910 if (integer_size)
904 911 *integer_size = zeh.zeh_integer_size;
905 912 if (num_integers)
906 913 *num_integers = zeh.zeh_num_integers;
907 914 out:
908 915 zap_put_leaf(l);
909 916 return (err);
910 917 }
911 918
912 919 int
913 920 fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
914 921 {
915 922 zap_leaf_t *l;
916 923 int err;
917 924 zap_entry_handle_t zeh;
918 925
919 926 err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
920 927 if (err != 0)
921 928 return (err);
922 929 err = zap_leaf_lookup(l, zn, &zeh);
923 930 if (err == 0) {
924 931 zap_entry_remove(&zeh);
925 932 zap_increment_num_entries(zn->zn_zap, -1, tx);
926 933 }
927 934 zap_put_leaf(l);
928 935 return (err);
929 936 }
930 937
931 938 void
932 939 fzap_prefetch(zap_name_t *zn)
933 940 {
934 941 uint64_t idx, blk;
935 942 zap_t *zap = zn->zn_zap;
936 943 int bs;
937 944
938 945 idx = ZAP_HASH_IDX(zn->zn_hash,
939 946 zap_f_phys(zap)->zap_ptrtbl.zt_shift);
940 947 if (zap_idx_to_blk(zap, idx, &blk) != 0)
941 948 return;
942 949 bs = FZAP_BLOCK_SHIFT(zap);
943 950 dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
944 951 ZIO_PRIORITY_SYNC_READ);
945 952 }
946 953
947 954 /*
948 955 * Helper functions for consumers.
949 956 */
950 957
951 958 uint64_t
952 959 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
953 960 const char *name, dmu_tx_t *tx)
954 961 {
955 962 uint64_t new_obj;
956 963
957 964 VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
958 965 VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
959 966 tx) == 0);
960 967
961 968 return (new_obj);
962 969 }
963 970
964 971 int
965 972 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
966 973 char *name)
967 974 {
968 975 zap_cursor_t zc;
969 976 zap_attribute_t *za;
970 977 int err;
971 978
972 979 if (mask == 0)
973 980 mask = -1ULL;
974 981
975 982 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
976 983 for (zap_cursor_init(&zc, os, zapobj);
977 984 (err = zap_cursor_retrieve(&zc, za)) == 0;
978 985 zap_cursor_advance(&zc)) {
979 986 if ((za->za_first_integer & mask) == (value & mask)) {
980 987 (void) strcpy(name, za->za_name);
981 988 break;
982 989 }
983 990 }
984 991 zap_cursor_fini(&zc);
985 992 kmem_free(za, sizeof (zap_attribute_t));
986 993 return (err);
987 994 }
988 995
989 996 int
990 997 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
991 998 {
992 999 zap_cursor_t zc;
993 1000 zap_attribute_t za;
994 1001 int err;
995 1002
996 1003 err = 0;
997 1004 for (zap_cursor_init(&zc, os, fromobj);
998 1005 zap_cursor_retrieve(&zc, &za) == 0;
999 1006 (void) zap_cursor_advance(&zc)) {
1000 1007 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1001 1008 err = SET_ERROR(EINVAL);
1002 1009 break;
1003 1010 }
1004 1011 err = zap_add(os, intoobj, za.za_name,
1005 1012 8, 1, &za.za_first_integer, tx);
1006 1013 if (err)
1007 1014 break;
1008 1015 }
1009 1016 zap_cursor_fini(&zc);
1010 1017 return (err);
1011 1018 }
1012 1019
1013 1020 int
1014 1021 zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1015 1022 uint64_t value, dmu_tx_t *tx)
1016 1023 {
1017 1024 zap_cursor_t zc;
1018 1025 zap_attribute_t za;
1019 1026 int err;
1020 1027
1021 1028 err = 0;
1022 1029 for (zap_cursor_init(&zc, os, fromobj);
1023 1030 zap_cursor_retrieve(&zc, &za) == 0;
1024 1031 (void) zap_cursor_advance(&zc)) {
1025 1032 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1026 1033 err = SET_ERROR(EINVAL);
1027 1034 break;
1028 1035 }
1029 1036 err = zap_add(os, intoobj, za.za_name,
1030 1037 8, 1, &value, tx);
1031 1038 if (err)
1032 1039 break;
1033 1040 }
1034 1041 zap_cursor_fini(&zc);
1035 1042 return (err);
1036 1043 }
1037 1044
1038 1045 int
1039 1046 zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1040 1047 dmu_tx_t *tx)
1041 1048 {
1042 1049 zap_cursor_t zc;
1043 1050 zap_attribute_t za;
1044 1051 int err;
1045 1052
1046 1053 err = 0;
1047 1054 for (zap_cursor_init(&zc, os, fromobj);
1048 1055 zap_cursor_retrieve(&zc, &za) == 0;
1049 1056 (void) zap_cursor_advance(&zc)) {
1050 1057 uint64_t delta = 0;
1051 1058
1052 1059 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1053 1060 err = SET_ERROR(EINVAL);
1054 1061 break;
1055 1062 }
1056 1063
1057 1064 err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
1058 1065 if (err != 0 && err != ENOENT)
1059 1066 break;
1060 1067 delta += za.za_first_integer;
1061 1068 err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
1062 1069 if (err)
1063 1070 break;
1064 1071 }
1065 1072 zap_cursor_fini(&zc);
1066 1073 return (err);
1067 1074 }
1068 1075
1069 1076 int
1070 1077 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1071 1078 {
1072 1079 char name[20];
1073 1080
1074 1081 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1075 1082 return (zap_add(os, obj, name, 8, 1, &value, tx));
1076 1083 }
1077 1084
1078 1085 int
1079 1086 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1080 1087 {
1081 1088 char name[20];
1082 1089
1083 1090 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1084 1091 return (zap_remove(os, obj, name, tx));
1085 1092 }
1086 1093
1087 1094 int
1088 1095 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
1089 1096 {
1090 1097 char name[20];
1091 1098
1092 1099 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1093 1100 return (zap_lookup(os, obj, name, 8, 1, &value));
1094 1101 }
1095 1102
1096 1103 int
1097 1104 zap_add_int_key(objset_t *os, uint64_t obj,
1098 1105 uint64_t key, uint64_t value, dmu_tx_t *tx)
1099 1106 {
1100 1107 char name[20];
1101 1108
1102 1109 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1103 1110 return (zap_add(os, obj, name, 8, 1, &value, tx));
1104 1111 }
1105 1112
1106 1113 int
1107 1114 zap_update_int_key(objset_t *os, uint64_t obj,
1108 1115 uint64_t key, uint64_t value, dmu_tx_t *tx)
1109 1116 {
1110 1117 char name[20];
1111 1118
1112 1119 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1113 1120 return (zap_update(os, obj, name, 8, 1, &value, tx));
1114 1121 }
1115 1122
1116 1123 int
1117 1124 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
1118 1125 {
1119 1126 char name[20];
1120 1127
1121 1128 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1122 1129 return (zap_lookup(os, obj, name, 8, 1, valuep));
1123 1130 }
1124 1131
1125 1132 int
1126 1133 zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
1127 1134 dmu_tx_t *tx)
1128 1135 {
1129 1136 uint64_t value = 0;
1130 1137 int err;
1131 1138
1132 1139 if (delta == 0)
1133 1140 return (0);
1134 1141
1135 1142 err = zap_lookup(os, obj, name, 8, 1, &value);
1136 1143 if (err != 0 && err != ENOENT)
1137 1144 return (err);
1138 1145 value += delta;
1139 1146 if (value == 0)
1140 1147 err = zap_remove(os, obj, name, tx);
1141 1148 else
1142 1149 err = zap_update(os, obj, name, 8, 1, &value, tx);
1143 1150 return (err);
1144 1151 }
1145 1152
1146 1153 int
1147 1154 zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
1148 1155 dmu_tx_t *tx)
1149 1156 {
1150 1157 char name[20];
1151 1158
1152 1159 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1153 1160 return (zap_increment(os, obj, name, delta, tx));
1154 1161 }
1155 1162
1156 1163 /*
1157 1164 * Routines for iterating over the attributes.
1158 1165 */
1159 1166
/*
 * Retrieve into *za the entry at or after the cursor's current position
 * (zc_hash/zc_cd), updating zc_hash/zc_cd to the entry found.  Returns
 * 0 on success, ENOENT when iteration is exhausted, or an error from
 * dereferencing the leaf.
 */
int
fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
{
	int err = ENOENT;
	zap_entry_handle_t zeh;
	zap_leaf_t *l;

	/* retrieve the next entry at or after zc_hash/zc_cd */
	/* if no entry, return ENOENT */

	/*
	 * If the cached leaf no longer covers zc_hash (the cursor has
	 * advanced past this leaf's hash-prefix range), drop it so the
	 * correct leaf is re-derived below.  The leaf is briefly
	 * read-locked because zap_put_leaf() expects a held lock.
	 */
	if (zc->zc_leaf &&
	    (ZAP_HASH_IDX(zc->zc_hash,
	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}

again:
	/*
	 * Either path leaves zc_leaf read-locked: zap_deref_leaf()
	 * returns a locked leaf, and the else-branch locks the cached
	 * one explicitly.  The lock is dropped at the bottom.
	 */
	if (zc->zc_leaf == NULL) {
		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
		    &zc->zc_leaf);
		if (err != 0)
			return (err);
	} else {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
	}
	l = zc->zc_leaf;

	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);

	if (err == ENOENT) {
		/*
		 * No entry at or after zc_hash in this leaf; advance
		 * zc_hash just past this leaf's hash range.  "nocare"
		 * masks the hash bits NOT covered by the leaf's prefix.
		 */
		uint64_t nocare =
		    (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
		zc->zc_cd = 0;
		/*
		 * Iteration is finished if this leaf covers the entire
		 * hash space (lh_prefix_len == 0, i.e. a single-leaf
		 * zap) or if the advance wrapped zc_hash to 0.  Without
		 * the prefix_len check, an empty single-leaf zap would
		 * re-derive the same leaf forever -- the lockup fixed
		 * by illumos 6842.
		 */
		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
		    zc->zc_hash == 0) {
			zc->zc_hash = -1ULL;
		} else {
			zap_put_leaf(zc->zc_leaf);
			zc->zc_leaf = NULL;
			goto again;
		}
	}

	if (err == 0) {
		/* Pin the cursor to the entry found and fill in *za. */
		zc->zc_hash = zeh.zeh_hash;
		zc->zc_cd = zeh.zeh_cd;
		za->za_integer_length = zeh.zeh_integer_size;
		za->za_num_integers = zeh.zeh_num_integers;
		if (zeh.zeh_num_integers == 0) {
			za->za_first_integer = 0;
		} else {
			/* EOVERFLOW just means the value is wider than 8x1. */
			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
			ASSERT(err == 0 || err == EOVERFLOW);
		}
		err = zap_entry_read_name(zap, &zeh,
		    sizeof (za->za_name), za->za_name);
		ASSERT(err == 0);

		za->za_normalization_conflict =
		    zap_entry_normalization_conflict(&zeh,
		    NULL, za->za_name, zap);
	}
	/* Drop the read lock taken at "again:" above. */
	rw_exit(&zc->zc_leaf->l_rwlock);
	return (err);
}
1229 1236
1230 1237 static void
1231 1238 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
1232 1239 {
1233 1240 int i, err;
1234 1241 uint64_t lastblk = 0;
1235 1242
1236 1243 /*
1237 1244 * NB: if a leaf has more pointers than an entire ptrtbl block
1238 1245 * can hold, then it'll be accounted for more than once, since
1239 1246 * we won't have lastblk.
1240 1247 */
1241 1248 for (i = 0; i < len; i++) {
1242 1249 zap_leaf_t *l;
1243 1250
1244 1251 if (tbl[i] == lastblk)
1245 1252 continue;
1246 1253 lastblk = tbl[i];
1247 1254
1248 1255 err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
1249 1256 if (err == 0) {
1250 1257 zap_leaf_stats(zap, l, zs);
1251 1258 zap_put_leaf(l);
1252 1259 }
1253 1260 }
1254 1261 }
1255 1262
/*
 * Fill in *zs with statistics for the fat zap: header fields copied
 * from the on-disk zap_phys_t, pointer-table geometry, and per-leaf
 * stats gathered by walking every pointer-table entry.
 */
void
fzap_get_stats(zap_t *zap, zap_stats_t *zs)
{
	int bs = FZAP_BLOCK_SHIFT(zap);
	zs->zs_blocksize = 1ULL << bs;

	/*
	 * Set zap_phys_t fields
	 */
	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
	zs->zs_magic = zap_f_phys(zap)->zap_magic;
	zs->zs_salt = zap_f_phys(zap)->zap_salt;

	/*
	 * Set zap_ptrtbl fields
	 */
	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
	zs->zs_ptrtbl_blks_copied =
	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;

	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
		/* the ptrtbl is entirely in the header block. */
		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
	} else {
		int b;

		/*
		 * External pointer table: prefetch all of its blocks,
		 * then walk them one dbuf at a time.
		 */
		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
		    ZIO_PRIORITY_SYNC_READ);

		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
		    b++) {
			dmu_buf_t *db;
			int err;

			err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
			    FTAG, &db, DMU_READ_NO_PREFETCH);
			if (err == 0) {
				/* 1<<(bs-3): 8-byte pointers per block. */
				zap_stats_ptrtbl(zap, db->db_data,
				    1<<(bs-3), zs);
				dmu_buf_rele(db, FTAG);
			}
		}
	}
}
1311 1318
1312 1319 int
1313 1320 fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
1314 1321 uint64_t *tooverwrite)
1315 1322 {
1316 1323 zap_t *zap = zn->zn_zap;
1317 1324 zap_leaf_t *l;
1318 1325 int err;
1319 1326
1320 1327 /*
1321 1328 * Account for the header block of the fatzap.
1322 1329 */
1323 1330 if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
1324 1331 *tooverwrite += zap->zap_dbuf->db_size;
1325 1332 } else {
1326 1333 *towrite += zap->zap_dbuf->db_size;
1327 1334 }
1328 1335
1329 1336 /*
1330 1337 * Account for the pointer table blocks.
1331 1338 * If we are adding we need to account for the following cases :
1332 1339 * - If the pointer table is embedded, this operation could force an
1333 1340 * external pointer table.
1334 1341 * - If this already has an external pointer table this operation
1335 1342 * could extend the table.
1336 1343 */
1337 1344 if (add) {
1338 1345 if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0)
1339 1346 *towrite += zap->zap_dbuf->db_size;
1340 1347 else
1341 1348 *towrite += (zap->zap_dbuf->db_size * 3);
1342 1349 }
1343 1350
1344 1351 /*
1345 1352 * Now, check if the block containing leaf is freeable
1346 1353 * and account accordingly.
1347 1354 */
1348 1355 err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
1349 1356 if (err != 0) {
1350 1357 return (err);
1351 1358 }
1352 1359
1353 1360 if (!add && dmu_buf_freeable(l->l_dbuf)) {
1354 1361 *tooverwrite += l->l_dbuf->db_size;
1355 1362 } else {
1356 1363 /*
1357 1364 * If this an add operation, the leaf block could split.
1358 1365 * Hence, we need to account for an additional leaf block.
1359 1366 */
1360 1367 *towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
1361 1368 }
1362 1369
1363 1370 zap_put_leaf(l);
1364 1371 return (0);
1365 1372 }
|
↓ open down ↓ |
777 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX