3469 dbuf_read_impl shows too much enthusiasm
Reviewed by: Dan McDonald <danmcd@omniti.com>
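In brief: dbuf_read_impl() allocates a DN_MAX_BONUSLEN-sized staging buffer for the bonus data, but the copy into it trusted the bonus length computed from the on-disk dnode, so byzantine on-disk corruption could turn into an in-kernel buffer overrun. The hunk below (old lines 644-645, new lines 644-662) clamps the bcopy() length to DN_MAX_BONUSLEN and fires an SDT probe, dbuf__read__impl__toolong, when the clamp engages. A minimal sketch of the clamping logic, paraphrasing the hunk rather than quoting it:

	int copylen = bonuslen;
	if (copylen > DN_MAX_BONUSLEN) {
		/* On-disk bonus length is implausibly large: note it and clamp. */
		DTRACE_PROBE3(dbuf__read__impl__toolong, int, bonuslen,
		    dnode_t *, dn, dmu_buf_impl_t *, db);
		copylen = DN_MAX_BONUSLEN;
	}
	bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, copylen);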
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 28 */
29 29
30 30 #include <sys/zfs_context.h>
31 31 #include <sys/dmu.h>
32 32 #include <sys/dmu_send.h>
33 33 #include <sys/dmu_impl.h>
34 34 #include <sys/dbuf.h>
35 35 #include <sys/dmu_objset.h>
36 36 #include <sys/dsl_dataset.h>
37 37 #include <sys/dsl_dir.h>
38 38 #include <sys/dmu_tx.h>
39 39 #include <sys/spa.h>
40 40 #include <sys/zio.h>
41 41 #include <sys/dmu_zfetch.h>
42 42 #include <sys/sa.h>
43 43 #include <sys/sa_impl.h>
44 44 #include <sys/zfeature.h>
45 45 #include <sys/blkptr.h>
46 46 #include <sys/range_tree.h>
47 47
48 48 /*
49 49 * Number of times that zfs_free_range() took the slow path while doing
50 50 * a zfs receive. A nonzero value indicates a potential performance problem.
51 51 */
52 52 uint64_t zfs_free_range_recv_miss;
53 53
54 54 static void dbuf_destroy(dmu_buf_impl_t *db);
55 55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
56 56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
57 57
58 58 #ifndef __lint
59 59 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
60 60 dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
61 61 #endif /* ! __lint */
62 62
63 63 /*
64 64 * Global data structures and functions for the dbuf cache.
65 65 */
66 66 static kmem_cache_t *dbuf_cache;
67 67 static taskq_t *dbu_evict_taskq;
68 68
69 69 /* ARGSUSED */
70 70 static int
71 71 dbuf_cons(void *vdb, void *unused, int kmflag)
72 72 {
73 73 dmu_buf_impl_t *db = vdb;
74 74 bzero(db, sizeof (dmu_buf_impl_t));
75 75
76 76 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
77 77 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
78 78 refcount_create(&db->db_holds);
79 79
80 80 return (0);
81 81 }
82 82
83 83 /* ARGSUSED */
84 84 static void
85 85 dbuf_dest(void *vdb, void *unused)
86 86 {
87 87 dmu_buf_impl_t *db = vdb;
88 88 mutex_destroy(&db->db_mtx);
89 89 cv_destroy(&db->db_changed);
90 90 refcount_destroy(&db->db_holds);
91 91 }
92 92
93 93 /*
94 94 * dbuf hash table routines
95 95 */
96 96 static dbuf_hash_table_t dbuf_hash_table;
97 97
98 98 static uint64_t dbuf_hash_count;
99 99
100 100 static uint64_t
101 101 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
102 102 {
103 103 uintptr_t osv = (uintptr_t)os;
104 104 uint64_t crc = -1ULL;
105 105
106 106 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
107 107 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
108 108 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
109 109 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
110 110 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
111 111 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
112 112 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
113 113
114 114 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
115 115
116 116 return (crc);
117 117 }
118 118
 119  119 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
120 120
121 121 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
122 122 ((dbuf)->db.db_object == (obj) && \
123 123 (dbuf)->db_objset == (os) && \
124 124 (dbuf)->db_level == (level) && \
125 125 (dbuf)->db_blkid == (blkid))
126 126
127 127 dmu_buf_impl_t *
128 128 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
129 129 {
130 130 dbuf_hash_table_t *h = &dbuf_hash_table;
131 131 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
132 132 uint64_t idx = hv & h->hash_table_mask;
133 133 dmu_buf_impl_t *db;
134 134
135 135 mutex_enter(DBUF_HASH_MUTEX(h, idx));
136 136 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
137 137 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
138 138 mutex_enter(&db->db_mtx);
139 139 if (db->db_state != DB_EVICTING) {
140 140 mutex_exit(DBUF_HASH_MUTEX(h, idx));
141 141 return (db);
142 142 }
143 143 mutex_exit(&db->db_mtx);
144 144 }
145 145 }
146 146 mutex_exit(DBUF_HASH_MUTEX(h, idx));
147 147 return (NULL);
148 148 }
149 149
150 150 static dmu_buf_impl_t *
151 151 dbuf_find_bonus(objset_t *os, uint64_t object)
152 152 {
153 153 dnode_t *dn;
154 154 dmu_buf_impl_t *db = NULL;
155 155
156 156 if (dnode_hold(os, object, FTAG, &dn) == 0) {
157 157 rw_enter(&dn->dn_struct_rwlock, RW_READER);
158 158 if (dn->dn_bonus != NULL) {
159 159 db = dn->dn_bonus;
160 160 mutex_enter(&db->db_mtx);
161 161 }
162 162 rw_exit(&dn->dn_struct_rwlock);
163 163 dnode_rele(dn, FTAG);
164 164 }
165 165 return (db);
166 166 }
167 167
168 168 /*
169 169 * Insert an entry into the hash table. If there is already an element
170 170 * equal to elem in the hash table, then the already existing element
171 171 * will be returned and the new element will not be inserted.
172 172 * Otherwise returns NULL.
173 173 */
174 174 static dmu_buf_impl_t *
175 175 dbuf_hash_insert(dmu_buf_impl_t *db)
176 176 {
177 177 dbuf_hash_table_t *h = &dbuf_hash_table;
178 178 objset_t *os = db->db_objset;
179 179 uint64_t obj = db->db.db_object;
180 180 int level = db->db_level;
181 181 uint64_t blkid = db->db_blkid;
182 182 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
183 183 uint64_t idx = hv & h->hash_table_mask;
184 184 dmu_buf_impl_t *dbf;
185 185
186 186 mutex_enter(DBUF_HASH_MUTEX(h, idx));
187 187 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
188 188 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
189 189 mutex_enter(&dbf->db_mtx);
190 190 if (dbf->db_state != DB_EVICTING) {
191 191 mutex_exit(DBUF_HASH_MUTEX(h, idx));
192 192 return (dbf);
193 193 }
194 194 mutex_exit(&dbf->db_mtx);
195 195 }
196 196 }
197 197
198 198 mutex_enter(&db->db_mtx);
199 199 db->db_hash_next = h->hash_table[idx];
200 200 h->hash_table[idx] = db;
201 201 mutex_exit(DBUF_HASH_MUTEX(h, idx));
202 202 atomic_inc_64(&dbuf_hash_count);
203 203
204 204 return (NULL);
205 205 }
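/*
 * Editorial note (illustrative sketch, not part of the original source):
 * callers use the NULL/non-NULL return to detect a lost instantiation race,
 * roughly as dbuf_create() does --
 *
 *	dmu_buf_impl_t *odb = dbuf_hash_insert(db);
 *	if (odb != NULL) {
 *		... an equal dbuf won the race; tear down db and use odb ...
 *	}
 *
 * so a NULL return means db is now the one visible to dbuf_find().
 */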
206 206
207 207 /*
208 208 * Remove an entry from the hash table. It must be in the EVICTING state.
209 209 */
210 210 static void
211 211 dbuf_hash_remove(dmu_buf_impl_t *db)
212 212 {
213 213 dbuf_hash_table_t *h = &dbuf_hash_table;
214 214 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
215 215 db->db_level, db->db_blkid);
216 216 uint64_t idx = hv & h->hash_table_mask;
217 217 dmu_buf_impl_t *dbf, **dbp;
218 218
219 219 /*
 220  220 	 * We mustn't hold db_mtx to maintain lock ordering:
221 221 * DBUF_HASH_MUTEX > db_mtx.
222 222 */
223 223 ASSERT(refcount_is_zero(&db->db_holds));
224 224 ASSERT(db->db_state == DB_EVICTING);
225 225 ASSERT(!MUTEX_HELD(&db->db_mtx));
226 226
227 227 mutex_enter(DBUF_HASH_MUTEX(h, idx));
228 228 dbp = &h->hash_table[idx];
229 229 while ((dbf = *dbp) != db) {
230 230 dbp = &dbf->db_hash_next;
231 231 ASSERT(dbf != NULL);
232 232 }
233 233 *dbp = db->db_hash_next;
234 234 db->db_hash_next = NULL;
235 235 mutex_exit(DBUF_HASH_MUTEX(h, idx));
236 236 atomic_dec_64(&dbuf_hash_count);
237 237 }
238 238
239 239 static arc_evict_func_t dbuf_do_evict;
240 240
241 241 typedef enum {
242 242 DBVU_EVICTING,
243 243 DBVU_NOT_EVICTING
244 244 } dbvu_verify_type_t;
245 245
246 246 static void
247 247 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
248 248 {
249 249 #ifdef ZFS_DEBUG
250 250 int64_t holds;
251 251
252 252 if (db->db_user == NULL)
253 253 return;
254 254
255 255 /* Only data blocks support the attachment of user data. */
256 256 ASSERT(db->db_level == 0);
257 257
258 258 /* Clients must resolve a dbuf before attaching user data. */
259 259 ASSERT(db->db.db_data != NULL);
260 260 ASSERT3U(db->db_state, ==, DB_CACHED);
261 261
262 262 holds = refcount_count(&db->db_holds);
263 263 if (verify_type == DBVU_EVICTING) {
264 264 /*
265 265 * Immediate eviction occurs when holds == dirtycnt.
266 266 * For normal eviction buffers, holds is zero on
267 267 * eviction, except when dbuf_fix_old_data() calls
268 268 * dbuf_clear_data(). However, the hold count can grow
269 269 * during eviction even though db_mtx is held (see
270 270 * dmu_bonus_hold() for an example), so we can only
271 271 * test the generic invariant that holds >= dirtycnt.
272 272 */
273 273 ASSERT3U(holds, >=, db->db_dirtycnt);
274 274 } else {
275 275 if (db->db_user_immediate_evict == TRUE)
276 276 ASSERT3U(holds, >=, db->db_dirtycnt);
277 277 else
278 278 ASSERT3U(holds, >, 0);
279 279 }
280 280 #endif
281 281 }
282 282
283 283 static void
284 284 dbuf_evict_user(dmu_buf_impl_t *db)
285 285 {
286 286 dmu_buf_user_t *dbu = db->db_user;
287 287
288 288 ASSERT(MUTEX_HELD(&db->db_mtx));
289 289
290 290 if (dbu == NULL)
291 291 return;
292 292
293 293 dbuf_verify_user(db, DBVU_EVICTING);
294 294 db->db_user = NULL;
295 295
296 296 #ifdef ZFS_DEBUG
297 297 if (dbu->dbu_clear_on_evict_dbufp != NULL)
298 298 *dbu->dbu_clear_on_evict_dbufp = NULL;
299 299 #endif
300 300
301 301 /*
302 302 * Invoke the callback from a taskq to avoid lock order reversals
303 303 * and limit stack depth.
304 304 */
305 305 taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
306 306 &dbu->dbu_tqent);
307 307 }
308 308
309 309 boolean_t
310 310 dbuf_is_metadata(dmu_buf_impl_t *db)
311 311 {
312 312 if (db->db_level > 0) {
313 313 return (B_TRUE);
314 314 } else {
315 315 boolean_t is_metadata;
316 316
317 317 DB_DNODE_ENTER(db);
318 318 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
319 319 DB_DNODE_EXIT(db);
320 320
321 321 return (is_metadata);
322 322 }
323 323 }
324 324
325 325 void
326 326 dbuf_evict(dmu_buf_impl_t *db)
327 327 {
328 328 ASSERT(MUTEX_HELD(&db->db_mtx));
329 329 ASSERT(db->db_buf == NULL);
330 330 ASSERT(db->db_data_pending == NULL);
331 331
332 332 dbuf_clear(db);
333 333 dbuf_destroy(db);
334 334 }
335 335
336 336 void
337 337 dbuf_init(void)
338 338 {
339 339 uint64_t hsize = 1ULL << 16;
340 340 dbuf_hash_table_t *h = &dbuf_hash_table;
341 341 int i;
342 342
343 343 /*
344 344 * The hash table is big enough to fill all of physical memory
345 345 * with an average 4K block size. The table will take up
346 346 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
347 347 */
348 348 while (hsize * 4096 < physmem * PAGESIZE)
349 349 hsize <<= 1;
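	/*
	 * Worked example (editorial, illustrative): with 64GB of physical
	 * memory the loop above stops at hsize = 16M buckets, since
	 * 16M * 4096 = 64GB; at 8 bytes per bucket pointer the table is
	 * 128MB -- the "2MB/GB" figure from the comment above.
	 */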
350 350
351 351 retry:
352 352 h->hash_table_mask = hsize - 1;
353 353 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
354 354 if (h->hash_table == NULL) {
355 355 /* XXX - we should really return an error instead of assert */
356 356 ASSERT(hsize > (1ULL << 10));
357 357 hsize >>= 1;
358 358 goto retry;
359 359 }
360 360
361 361 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
362 362 sizeof (dmu_buf_impl_t),
363 363 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
364 364
365 365 for (i = 0; i < DBUF_MUTEXES; i++)
366 366 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
367 367
368 368 /*
369 369 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
370 370 * configuration is not required.
371 371 */
372 372 dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
373 373 }
374 374
375 375 void
376 376 dbuf_fini(void)
377 377 {
378 378 dbuf_hash_table_t *h = &dbuf_hash_table;
379 379 int i;
380 380
381 381 for (i = 0; i < DBUF_MUTEXES; i++)
382 382 mutex_destroy(&h->hash_mutexes[i]);
383 383 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
384 384 kmem_cache_destroy(dbuf_cache);
385 385 taskq_destroy(dbu_evict_taskq);
386 386 }
387 387
388 388 /*
389 389 * Other stuff.
390 390 */
391 391
392 392 #ifdef ZFS_DEBUG
393 393 static void
394 394 dbuf_verify(dmu_buf_impl_t *db)
395 395 {
396 396 dnode_t *dn;
397 397 dbuf_dirty_record_t *dr;
398 398
399 399 ASSERT(MUTEX_HELD(&db->db_mtx));
400 400
401 401 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
402 402 return;
403 403
404 404 ASSERT(db->db_objset != NULL);
405 405 DB_DNODE_ENTER(db);
406 406 dn = DB_DNODE(db);
407 407 if (dn == NULL) {
408 408 ASSERT(db->db_parent == NULL);
409 409 ASSERT(db->db_blkptr == NULL);
410 410 } else {
411 411 ASSERT3U(db->db.db_object, ==, dn->dn_object);
412 412 ASSERT3P(db->db_objset, ==, dn->dn_objset);
413 413 ASSERT3U(db->db_level, <, dn->dn_nlevels);
414 414 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
415 415 db->db_blkid == DMU_SPILL_BLKID ||
416 416 !avl_is_empty(&dn->dn_dbufs));
417 417 }
418 418 if (db->db_blkid == DMU_BONUS_BLKID) {
419 419 ASSERT(dn != NULL);
420 420 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
421 421 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
422 422 } else if (db->db_blkid == DMU_SPILL_BLKID) {
423 423 ASSERT(dn != NULL);
424 424 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
425 425 ASSERT0(db->db.db_offset);
426 426 } else {
427 427 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
428 428 }
429 429
430 430 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
431 431 ASSERT(dr->dr_dbuf == db);
432 432
433 433 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
434 434 ASSERT(dr->dr_dbuf == db);
435 435
436 436 /*
437 437 * We can't assert that db_size matches dn_datablksz because it
438 438 * can be momentarily different when another thread is doing
439 439 * dnode_set_blksz().
440 440 */
441 441 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
442 442 dr = db->db_data_pending;
443 443 /*
444 444 * It should only be modified in syncing context, so
445 445 * make sure we only have one copy of the data.
446 446 */
447 447 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
448 448 }
449 449
450 450 /* verify db->db_blkptr */
451 451 if (db->db_blkptr) {
452 452 if (db->db_parent == dn->dn_dbuf) {
453 453 /* db is pointed to by the dnode */
454 454 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
455 455 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
456 456 ASSERT(db->db_parent == NULL);
457 457 else
458 458 ASSERT(db->db_parent != NULL);
459 459 if (db->db_blkid != DMU_SPILL_BLKID)
460 460 ASSERT3P(db->db_blkptr, ==,
461 461 &dn->dn_phys->dn_blkptr[db->db_blkid]);
462 462 } else {
463 463 /* db is pointed to by an indirect block */
464 464 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
465 465 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
466 466 ASSERT3U(db->db_parent->db.db_object, ==,
467 467 db->db.db_object);
468 468 /*
469 469 * dnode_grow_indblksz() can make this fail if we don't
470 470 * have the struct_rwlock. XXX indblksz no longer
471 471 * grows. safe to do this now?
472 472 */
473 473 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
474 474 ASSERT3P(db->db_blkptr, ==,
475 475 ((blkptr_t *)db->db_parent->db.db_data +
476 476 db->db_blkid % epb));
477 477 }
478 478 }
479 479 }
480 480 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
481 481 (db->db_buf == NULL || db->db_buf->b_data) &&
482 482 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
483 483 db->db_state != DB_FILL && !dn->dn_free_txg) {
484 484 /*
485 485 * If the blkptr isn't set but they have nonzero data,
486 486 * it had better be dirty, otherwise we'll lose that
487 487 * data when we evict this buffer.
488 488 */
489 489 if (db->db_dirtycnt == 0) {
490 490 uint64_t *buf = db->db.db_data;
491 491 int i;
492 492
493 493 for (i = 0; i < db->db.db_size >> 3; i++) {
494 494 ASSERT(buf[i] == 0);
495 495 }
496 496 }
497 497 }
498 498 DB_DNODE_EXIT(db);
499 499 }
500 500 #endif
501 501
502 502 static void
503 503 dbuf_clear_data(dmu_buf_impl_t *db)
504 504 {
505 505 ASSERT(MUTEX_HELD(&db->db_mtx));
506 506 dbuf_evict_user(db);
507 507 db->db_buf = NULL;
508 508 db->db.db_data = NULL;
509 509 if (db->db_state != DB_NOFILL)
510 510 db->db_state = DB_UNCACHED;
511 511 }
512 512
513 513 static void
514 514 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
515 515 {
516 516 ASSERT(MUTEX_HELD(&db->db_mtx));
517 517 ASSERT(buf != NULL);
518 518
519 519 db->db_buf = buf;
520 520 ASSERT(buf->b_data != NULL);
521 521 db->db.db_data = buf->b_data;
522 522 if (!arc_released(buf))
523 523 arc_set_callback(buf, dbuf_do_evict, db);
524 524 }
525 525
526 526 /*
527 527 * Loan out an arc_buf for read. Return the loaned arc_buf.
528 528 */
529 529 arc_buf_t *
530 530 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
531 531 {
532 532 arc_buf_t *abuf;
533 533
534 534 mutex_enter(&db->db_mtx);
535 535 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
536 536 int blksz = db->db.db_size;
537 537 spa_t *spa = db->db_objset->os_spa;
538 538
539 539 mutex_exit(&db->db_mtx);
540 540 abuf = arc_loan_buf(spa, blksz);
541 541 bcopy(db->db.db_data, abuf->b_data, blksz);
542 542 } else {
543 543 abuf = db->db_buf;
544 544 arc_loan_inuse_buf(abuf, db);
545 545 dbuf_clear_data(db);
546 546 mutex_exit(&db->db_mtx);
547 547 }
548 548 return (abuf);
549 549 }
550 550
551 551 /*
552 552 * Calculate which level n block references the data at the level 0 offset
553 553 * provided.
554 554 */
555 555 uint64_t
556 556 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
557 557 {
558 558 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
559 559 /*
560 560 * The level n blkid is equal to the level 0 blkid divided by
561 561 * the number of level 0s in a level n block.
562 562 *
563 563 * The level 0 blkid is offset >> datablkshift =
564 564 * offset / 2^datablkshift.
565 565 *
566 566 * The number of level 0s in a level n is the number of block
567 567 * pointers in an indirect block, raised to the power of level.
568 568 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
569 569 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
570 570 *
571 571 * Thus, the level n blkid is: offset /
572 572 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
573 573 * = offset / 2^(datablkshift + level *
574 574 * (indblkshift - SPA_BLKPTRSHIFT))
575 575 * = offset >> (datablkshift + level *
576 576 * (indblkshift - SPA_BLKPTRSHIFT))
577 577 */
578 578 return (offset >> (dn->dn_datablkshift + level *
579 579 (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
580 580 } else {
581 581 ASSERT3U(offset, <, dn->dn_datablksz);
582 582 return (0);
583 583 }
584 584 }
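/*
 * Worked example (editorial, illustrative): with 128K data blocks
 * (datablkshift = 17) and 128K indirect blocks (indblkshift = 17, so
 * 2^(17 - SPA_BLKPTRSHIFT) = 1024 block pointers per indirect block),
 * the level 1 blkid covering byte offset 1GB is
 * 2^30 >> (17 + 1 * 10) = 8, i.e. level 0 blkid 8192 divided by 1024.
 */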
585 585
586 586 static void
587 587 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
588 588 {
589 589 dmu_buf_impl_t *db = vdb;
590 590
591 591 mutex_enter(&db->db_mtx);
592 592 ASSERT3U(db->db_state, ==, DB_READ);
593 593 /*
594 594 * All reads are synchronous, so we must have a hold on the dbuf
595 595 */
596 596 ASSERT(refcount_count(&db->db_holds) > 0);
597 597 ASSERT(db->db_buf == NULL);
598 598 ASSERT(db->db.db_data == NULL);
599 599 if (db->db_level == 0 && db->db_freed_in_flight) {
600 600 /* we were freed in flight; disregard any error */
601 601 arc_release(buf, db);
602 602 bzero(buf->b_data, db->db.db_size);
603 603 arc_buf_freeze(buf);
604 604 db->db_freed_in_flight = FALSE;
605 605 dbuf_set_data(db, buf);
606 606 db->db_state = DB_CACHED;
607 607 } else if (zio == NULL || zio->io_error == 0) {
608 608 dbuf_set_data(db, buf);
609 609 db->db_state = DB_CACHED;
610 610 } else {
611 611 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
612 612 ASSERT3P(db->db_buf, ==, NULL);
613 613 VERIFY(arc_buf_remove_ref(buf, db));
614 614 db->db_state = DB_UNCACHED;
615 615 }
616 616 cv_broadcast(&db->db_changed);
617 617 dbuf_rele_and_unlock(db, NULL);
618 618 }
619 619
620 620 static void
621 621 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
622 622 {
623 623 dnode_t *dn;
624 624 zbookmark_phys_t zb;
625 625 arc_flags_t aflags = ARC_FLAG_NOWAIT;
626 626
627 627 DB_DNODE_ENTER(db);
628 628 dn = DB_DNODE(db);
629 629 ASSERT(!refcount_is_zero(&db->db_holds));
630 630 /* We need the struct_rwlock to prevent db_blkptr from changing. */
631 631 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
632 632 ASSERT(MUTEX_HELD(&db->db_mtx));
633 633 ASSERT(db->db_state == DB_UNCACHED);
634 634 ASSERT(db->db_buf == NULL);
635 635
636 636 if (db->db_blkid == DMU_BONUS_BLKID) {
637 637 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
638 638
639 639 ASSERT3U(bonuslen, <=, db->db.db_size);
640 640 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
641 641 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
642 642 if (bonuslen < DN_MAX_BONUSLEN)
643 643 bzero(db->db.db_data, DN_MAX_BONUSLEN);
644 - if (bonuslen)
645 - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
644 + if (bonuslen) {
645 + /*
646 + * Absent byzantine on-disk corruption, we fully expect
647 + * our bonuslen to be no more than DN_MAX_BONUSLEN --
648 + * but we nonetheless explicitly clamp it on the
649 + * bcopy() to prevent any on-disk corruption from
650 + * becoming rampant in-kernel corruption.
651 + */
652 + if (bonuslen > DN_MAX_BONUSLEN) {
653 + DTRACE_PROBE3(dbuf__read__impl__toolong, int,
654 + bonuslen, dnode_t *, dn, dmu_buf_impl_t *,
655 + db);
656 + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
657 + DN_MAX_BONUSLEN);
658 + } else {
659 + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
660 + bonuslen);
661 + }
662 + }
646 663 DB_DNODE_EXIT(db);
647 664 db->db_state = DB_CACHED;
648 665 mutex_exit(&db->db_mtx);
649 666 return;
650 667 }
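	/*
	 * Editorial note (illustrative, not part of the change): with
	 * 512-byte dnodes DN_MAX_BONUSLEN is 320 bytes, so a corrupt on-disk
	 * bonus length of, say, 4096 would previously have pushed the
	 * bcopy() well past the zio_buf_alloc(DN_MAX_BONUSLEN) buffer above;
	 * the clamp bounds the copy at 320 bytes and fires the
	 * dbuf__read__impl__toolong probe instead.
	 */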
651 668
652 669 /*
653 670 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
654 671 * processes the delete record and clears the bp while we are waiting
655 672 * for the dn_mtx (resulting in a "no" from block_freed).
656 673 */
657 674 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
658 675 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
659 676 BP_IS_HOLE(db->db_blkptr)))) {
660 677 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
661 678
662 679 DB_DNODE_EXIT(db);
663 680 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
664 681 db->db.db_size, db, type));
665 682 bzero(db->db.db_data, db->db.db_size);
666 683 db->db_state = DB_CACHED;
667 684 mutex_exit(&db->db_mtx);
668 685 return;
669 686 }
670 687
671 688 DB_DNODE_EXIT(db);
672 689
673 690 db->db_state = DB_READ;
674 691 mutex_exit(&db->db_mtx);
675 692
676 693 if (DBUF_IS_L2CACHEABLE(db))
677 694 aflags |= ARC_FLAG_L2CACHE;
678 695 if (DBUF_IS_L2COMPRESSIBLE(db))
679 696 aflags |= ARC_FLAG_L2COMPRESS;
680 697
681 698 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
682 699 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
683 700 db->db.db_object, db->db_level, db->db_blkid);
684 701
685 702 dbuf_add_ref(db, NULL);
686 703
687 704 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
688 705 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
689 706 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
690 707 &aflags, &zb);
691 708 }
692 709
693 710 int
694 711 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
695 712 {
696 713 int err = 0;
697 714 boolean_t havepzio = (zio != NULL);
698 715 boolean_t prefetch;
699 716 dnode_t *dn;
700 717
701 718 /*
702 719 * We don't have to hold the mutex to check db_state because it
703 720 * can't be freed while we have a hold on the buffer.
704 721 */
705 722 ASSERT(!refcount_is_zero(&db->db_holds));
706 723
707 724 if (db->db_state == DB_NOFILL)
708 725 return (SET_ERROR(EIO));
709 726
710 727 DB_DNODE_ENTER(db);
711 728 dn = DB_DNODE(db);
712 729 if ((flags & DB_RF_HAVESTRUCT) == 0)
713 730 rw_enter(&dn->dn_struct_rwlock, RW_READER);
714 731
715 732 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
716 733 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
717 734 DBUF_IS_CACHEABLE(db);
718 735
719 736 mutex_enter(&db->db_mtx);
720 737 if (db->db_state == DB_CACHED) {
721 738 mutex_exit(&db->db_mtx);
722 739 if (prefetch)
723 740 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
724 741 if ((flags & DB_RF_HAVESTRUCT) == 0)
725 742 rw_exit(&dn->dn_struct_rwlock);
726 743 DB_DNODE_EXIT(db);
727 744 } else if (db->db_state == DB_UNCACHED) {
728 745 spa_t *spa = dn->dn_objset->os_spa;
729 746
730 747 if (zio == NULL)
731 748 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
732 749 dbuf_read_impl(db, zio, flags);
733 750
734 751 /* dbuf_read_impl has dropped db_mtx for us */
735 752
736 753 if (prefetch)
737 754 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
738 755
739 756 if ((flags & DB_RF_HAVESTRUCT) == 0)
740 757 rw_exit(&dn->dn_struct_rwlock);
741 758 DB_DNODE_EXIT(db);
742 759
743 760 if (!havepzio)
744 761 err = zio_wait(zio);
745 762 } else {
746 763 /*
747 764 * Another reader came in while the dbuf was in flight
748 765 * between UNCACHED and CACHED. Either a writer will finish
749 766 * writing the buffer (sending the dbuf to CACHED) or the
750 767 * first reader's request will reach the read_done callback
751 768 * and send the dbuf to CACHED. Otherwise, a failure
752 769 * occurred and the dbuf went to UNCACHED.
753 770 */
754 771 mutex_exit(&db->db_mtx);
755 772 if (prefetch)
756 773 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
757 774 if ((flags & DB_RF_HAVESTRUCT) == 0)
758 775 rw_exit(&dn->dn_struct_rwlock);
759 776 DB_DNODE_EXIT(db);
760 777
761 778 /* Skip the wait per the caller's request. */
762 779 mutex_enter(&db->db_mtx);
763 780 if ((flags & DB_RF_NEVERWAIT) == 0) {
764 781 while (db->db_state == DB_READ ||
765 782 db->db_state == DB_FILL) {
766 783 ASSERT(db->db_state == DB_READ ||
767 784 (flags & DB_RF_HAVESTRUCT) == 0);
768 785 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
769 786 db, zio_t *, zio);
770 787 cv_wait(&db->db_changed, &db->db_mtx);
771 788 }
772 789 if (db->db_state == DB_UNCACHED)
773 790 err = SET_ERROR(EIO);
774 791 }
775 792 mutex_exit(&db->db_mtx);
776 793 }
777 794
778 795 ASSERT(err || havepzio || db->db_state == DB_CACHED);
779 796 return (err);
780 797 }
781 798
782 799 static void
783 800 dbuf_noread(dmu_buf_impl_t *db)
784 801 {
785 802 ASSERT(!refcount_is_zero(&db->db_holds));
786 803 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
787 804 mutex_enter(&db->db_mtx);
788 805 while (db->db_state == DB_READ || db->db_state == DB_FILL)
789 806 cv_wait(&db->db_changed, &db->db_mtx);
790 807 if (db->db_state == DB_UNCACHED) {
791 808 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
792 809 spa_t *spa = db->db_objset->os_spa;
793 810
794 811 ASSERT(db->db_buf == NULL);
795 812 ASSERT(db->db.db_data == NULL);
796 813 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
797 814 db->db_state = DB_FILL;
798 815 } else if (db->db_state == DB_NOFILL) {
799 816 dbuf_clear_data(db);
800 817 } else {
801 818 ASSERT3U(db->db_state, ==, DB_CACHED);
802 819 }
803 820 mutex_exit(&db->db_mtx);
804 821 }
805 822
806 823 /*
807 824 * This is our just-in-time copy function. It makes a copy of
 808  825 	 * buffers that have been modified in a previous transaction
809 826 * group, before we modify them in the current active group.
810 827 *
811 828 * This function is used in two places: when we are dirtying a
812 829 * buffer for the first time in a txg, and when we are freeing
813 830 * a range in a dnode that includes this buffer.
814 831 *
815 832 * Note that when we are called from dbuf_free_range() we do
816 833 * not put a hold on the buffer, we just traverse the active
817 834 * dbuf list for the dnode.
818 835 */
819 836 static void
820 837 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
821 838 {
822 839 dbuf_dirty_record_t *dr = db->db_last_dirty;
823 840
824 841 ASSERT(MUTEX_HELD(&db->db_mtx));
825 842 ASSERT(db->db.db_data != NULL);
826 843 ASSERT(db->db_level == 0);
827 844 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
828 845
829 846 if (dr == NULL ||
830 847 (dr->dt.dl.dr_data !=
831 848 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
832 849 return;
833 850
834 851 /*
835 852 * If the last dirty record for this dbuf has not yet synced
 836  853 	 * and it's referencing the dbuf data, either:
837 854 * reset the reference to point to a new copy,
 838  855 	 *    or (if there are no active holders)
839 856 * just null out the current db_data pointer.
840 857 */
841 858 ASSERT(dr->dr_txg >= txg - 2);
842 859 if (db->db_blkid == DMU_BONUS_BLKID) {
843 860 /* Note that the data bufs here are zio_bufs */
844 861 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
845 862 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
846 863 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
847 864 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
848 865 int size = db->db.db_size;
849 866 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
850 867 spa_t *spa = db->db_objset->os_spa;
851 868
852 869 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
853 870 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
854 871 } else {
855 872 dbuf_clear_data(db);
856 873 }
857 874 }
858 875
859 876 void
860 877 dbuf_unoverride(dbuf_dirty_record_t *dr)
861 878 {
862 879 dmu_buf_impl_t *db = dr->dr_dbuf;
863 880 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
864 881 uint64_t txg = dr->dr_txg;
865 882
866 883 ASSERT(MUTEX_HELD(&db->db_mtx));
867 884 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
868 885 ASSERT(db->db_level == 0);
869 886
870 887 if (db->db_blkid == DMU_BONUS_BLKID ||
871 888 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
872 889 return;
873 890
874 891 ASSERT(db->db_data_pending != dr);
875 892
876 893 /* free this block */
877 894 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
878 895 zio_free(db->db_objset->os_spa, txg, bp);
879 896
880 897 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
881 898 dr->dt.dl.dr_nopwrite = B_FALSE;
882 899
883 900 /*
884 901 * Release the already-written buffer, so we leave it in
885 902 * a consistent dirty state. Note that all callers are
886 903 * modifying the buffer, so they will immediately do
887 904 * another (redundant) arc_release(). Therefore, leave
888 905 * the buf thawed to save the effort of freezing &
889 906 * immediately re-thawing it.
890 907 */
891 908 arc_release(dr->dt.dl.dr_data, db);
892 909 }
893 910
894 911 /*
 895  912  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
896 913 * data blocks in the free range, so that any future readers will find
897 914 * empty blocks.
898 915 *
899 916 * This is a no-op if the dataset is in the middle of an incremental
900 917 * receive; see comment below for details.
901 918 */
902 919 void
903 920 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
904 921 dmu_tx_t *tx)
905 922 {
906 923 dmu_buf_impl_t db_search;
907 924 dmu_buf_impl_t *db, *db_next;
908 925 uint64_t txg = tx->tx_txg;
909 926 avl_index_t where;
910 927
911 928 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
912 929 end_blkid = dn->dn_maxblkid;
913 930 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
914 931
915 932 db_search.db_level = 0;
916 933 db_search.db_blkid = start_blkid;
917 934 db_search.db_state = DB_SEARCH;
918 935
919 936 mutex_enter(&dn->dn_dbufs_mtx);
920 937 if (start_blkid >= dn->dn_unlisted_l0_blkid) {
921 938 /* There can't be any dbufs in this range; no need to search. */
922 939 #ifdef DEBUG
923 940 db = avl_find(&dn->dn_dbufs, &db_search, &where);
924 941 ASSERT3P(db, ==, NULL);
925 942 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
926 943 ASSERT(db == NULL || db->db_level > 0);
927 944 #endif
928 945 mutex_exit(&dn->dn_dbufs_mtx);
929 946 return;
930 947 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
931 948 /*
932 949 * If we are receiving, we expect there to be no dbufs in
933 950 * the range to be freed, because receive modifies each
934 951 * block at most once, and in offset order. If this is
935 952 * not the case, it can lead to performance problems,
936 953 * so note that we unexpectedly took the slow path.
937 954 */
938 955 atomic_inc_64(&zfs_free_range_recv_miss);
939 956 }
940 957
941 958 db = avl_find(&dn->dn_dbufs, &db_search, &where);
942 959 ASSERT3P(db, ==, NULL);
943 960 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
944 961
945 962 for (; db != NULL; db = db_next) {
946 963 db_next = AVL_NEXT(&dn->dn_dbufs, db);
947 964 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
948 965
949 966 if (db->db_level != 0 || db->db_blkid > end_blkid) {
950 967 break;
951 968 }
952 969 ASSERT3U(db->db_blkid, >=, start_blkid);
953 970
954 971 /* found a level 0 buffer in the range */
955 972 mutex_enter(&db->db_mtx);
956 973 if (dbuf_undirty(db, tx)) {
957 974 /* mutex has been dropped and dbuf destroyed */
958 975 continue;
959 976 }
960 977
961 978 if (db->db_state == DB_UNCACHED ||
962 979 db->db_state == DB_NOFILL ||
963 980 db->db_state == DB_EVICTING) {
964 981 ASSERT(db->db.db_data == NULL);
965 982 mutex_exit(&db->db_mtx);
966 983 continue;
967 984 }
968 985 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
969 986 /* will be handled in dbuf_read_done or dbuf_rele */
970 987 db->db_freed_in_flight = TRUE;
971 988 mutex_exit(&db->db_mtx);
972 989 continue;
973 990 }
974 991 if (refcount_count(&db->db_holds) == 0) {
975 992 ASSERT(db->db_buf);
976 993 dbuf_clear(db);
977 994 continue;
978 995 }
979 996 /* The dbuf is referenced */
980 997
981 998 if (db->db_last_dirty != NULL) {
982 999 dbuf_dirty_record_t *dr = db->db_last_dirty;
983 1000
984 1001 if (dr->dr_txg == txg) {
985 1002 /*
986 1003 * This buffer is "in-use", re-adjust the file
987 1004 * size to reflect that this buffer may
988 1005 * contain new data when we sync.
989 1006 */
990 1007 if (db->db_blkid != DMU_SPILL_BLKID &&
991 1008 db->db_blkid > dn->dn_maxblkid)
992 1009 dn->dn_maxblkid = db->db_blkid;
993 1010 dbuf_unoverride(dr);
994 1011 } else {
995 1012 /*
996 1013 * This dbuf is not dirty in the open context.
 997 1014 			 * Either uncache it (if it's not referenced in
998 1015 * the open context) or reset its contents to
999 1016 * empty.
1000 1017 */
1001 1018 dbuf_fix_old_data(db, txg);
1002 1019 }
1003 1020 }
1004 1021 		/* clear the contents if it's cached */
1005 1022 if (db->db_state == DB_CACHED) {
1006 1023 ASSERT(db->db.db_data != NULL);
1007 1024 arc_release(db->db_buf, db);
1008 1025 bzero(db->db.db_data, db->db.db_size);
1009 1026 arc_buf_freeze(db->db_buf);
1010 1027 }
1011 1028
1012 1029 mutex_exit(&db->db_mtx);
1013 1030 }
1014 1031 mutex_exit(&dn->dn_dbufs_mtx);
1015 1032 }
1016 1033
1017 1034 static int
1018 1035 dbuf_block_freeable(dmu_buf_impl_t *db)
1019 1036 {
1020 1037 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1021 1038 uint64_t birth_txg = 0;
1022 1039
1023 1040 /*
1024 1041 * We don't need any locking to protect db_blkptr:
1025 1042 * If it's syncing, then db_last_dirty will be set
1026 1043 * so we'll ignore db_blkptr.
1027 1044 *
1028 1045 * This logic ensures that only block births for
1029 1046 * filled blocks are considered.
1030 1047 */
1031 1048 ASSERT(MUTEX_HELD(&db->db_mtx));
1032 1049 if (db->db_last_dirty && (db->db_blkptr == NULL ||
1033 1050 !BP_IS_HOLE(db->db_blkptr))) {
1034 1051 birth_txg = db->db_last_dirty->dr_txg;
1035 1052 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1036 1053 birth_txg = db->db_blkptr->blk_birth;
1037 1054 }
1038 1055
1039 1056 /*
1040 1057 	 * If this block doesn't exist or is in a snapshot, it can't be freed.
1041 1058 * Don't pass the bp to dsl_dataset_block_freeable() since we
1042 1059 * are holding the db_mtx lock and might deadlock if we are
1043 1060 * prefetching a dedup-ed block.
1044 1061 */
1045 1062 if (birth_txg != 0)
1046 1063 return (ds == NULL ||
1047 1064 dsl_dataset_block_freeable(ds, NULL, birth_txg));
1048 1065 else
1049 1066 return (B_FALSE);
1050 1067 }
1051 1068
1052 1069 void
1053 1070 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1054 1071 {
1055 1072 arc_buf_t *buf, *obuf;
1056 1073 int osize = db->db.db_size;
1057 1074 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1058 1075 dnode_t *dn;
1059 1076
1060 1077 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1061 1078
1062 1079 DB_DNODE_ENTER(db);
1063 1080 dn = DB_DNODE(db);
1064 1081
1065 1082 /* XXX does *this* func really need the lock? */
1066 1083 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1067 1084
1068 1085 /*
1069 1086 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1070 1087 * is OK, because there can be no other references to the db
1071 1088 * when we are changing its size, so no concurrent DB_FILL can
1072 1089 * be happening.
1073 1090 */
1074 1091 /*
1075 1092 * XXX we should be doing a dbuf_read, checking the return
1076 1093 * value and returning that up to our callers
1077 1094 */
1078 1095 dmu_buf_will_dirty(&db->db, tx);
1079 1096
1080 1097 /* create the data buffer for the new block */
1081 1098 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1082 1099
1083 1100 /* copy old block data to the new block */
1084 1101 obuf = db->db_buf;
1085 1102 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1086 1103 /* zero the remainder */
1087 1104 if (size > osize)
1088 1105 bzero((uint8_t *)buf->b_data + osize, size - osize);
1089 1106
1090 1107 mutex_enter(&db->db_mtx);
1091 1108 dbuf_set_data(db, buf);
1092 1109 VERIFY(arc_buf_remove_ref(obuf, db));
1093 1110 db->db.db_size = size;
1094 1111
1095 1112 if (db->db_level == 0) {
1096 1113 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1097 1114 db->db_last_dirty->dt.dl.dr_data = buf;
1098 1115 }
1099 1116 mutex_exit(&db->db_mtx);
1100 1117
1101 1118 dnode_willuse_space(dn, size-osize, tx);
1102 1119 DB_DNODE_EXIT(db);
1103 1120 }
1104 1121
1105 1122 void
1106 1123 dbuf_release_bp(dmu_buf_impl_t *db)
1107 1124 {
1108 1125 objset_t *os = db->db_objset;
1109 1126
1110 1127 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1111 1128 ASSERT(arc_released(os->os_phys_buf) ||
1112 1129 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1113 1130 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1114 1131
1115 1132 (void) arc_release(db->db_buf, db);
1116 1133 }
1117 1134
1118 1135 /*
1119 1136 * We already have a dirty record for this TXG, and we are being
1120 1137 * dirtied again.
1121 1138 */
1122 1139 static void
1123 1140 dbuf_redirty(dbuf_dirty_record_t *dr)
1124 1141 {
1125 1142 dmu_buf_impl_t *db = dr->dr_dbuf;
1126 1143
1127 1144 ASSERT(MUTEX_HELD(&db->db_mtx));
1128 1145
1129 1146 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1130 1147 /*
1131 1148 * If this buffer has already been written out,
1132 1149 * we now need to reset its state.
1133 1150 */
1134 1151 dbuf_unoverride(dr);
1135 1152 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1136 1153 db->db_state != DB_NOFILL) {
1137 1154 /* Already released on initial dirty, so just thaw. */
1138 1155 ASSERT(arc_released(db->db_buf));
1139 1156 arc_buf_thaw(db->db_buf);
1140 1157 }
1141 1158 }
1142 1159 }
1143 1160
1144 1161 dbuf_dirty_record_t *
1145 1162 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1146 1163 {
1147 1164 dnode_t *dn;
1148 1165 objset_t *os;
1149 1166 dbuf_dirty_record_t **drp, *dr;
1150 1167 int drop_struct_lock = FALSE;
1151 1168 boolean_t do_free_accounting = B_FALSE;
1152 1169 int txgoff = tx->tx_txg & TXG_MASK;
1153 1170
1154 1171 ASSERT(tx->tx_txg != 0);
1155 1172 ASSERT(!refcount_is_zero(&db->db_holds));
1156 1173 DMU_TX_DIRTY_BUF(tx, db);
1157 1174
1158 1175 DB_DNODE_ENTER(db);
1159 1176 dn = DB_DNODE(db);
1160 1177 /*
1161 1178 * Shouldn't dirty a regular buffer in syncing context. Private
1162 1179 * objects may be dirtied in syncing context, but only if they
1163 1180 * were already pre-dirtied in open context.
1164 1181 */
1165 1182 ASSERT(!dmu_tx_is_syncing(tx) ||
1166 1183 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1167 1184 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1168 1185 dn->dn_objset->os_dsl_dataset == NULL);
1169 1186 /*
1170 1187 * We make this assert for private objects as well, but after we
1171 1188 * check if we're already dirty. They are allowed to re-dirty
1172 1189 * in syncing context.
1173 1190 */
1174 1191 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1175 1192 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1176 1193 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1177 1194
1178 1195 mutex_enter(&db->db_mtx);
1179 1196 /*
1180 1197 * XXX make this true for indirects too? The problem is that
1181 1198 * transactions created with dmu_tx_create_assigned() from
1182 1199 * syncing context don't bother holding ahead.
1183 1200 */
1184 1201 ASSERT(db->db_level != 0 ||
1185 1202 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1186 1203 db->db_state == DB_NOFILL);
1187 1204
1188 1205 mutex_enter(&dn->dn_mtx);
1189 1206 /*
1190 1207 * Don't set dirtyctx to SYNC if we're just modifying this as we
1191 1208 * initialize the objset.
1192 1209 */
1193 1210 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1194 1211 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1195 1212 dn->dn_dirtyctx =
1196 1213 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1197 1214 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1198 1215 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1199 1216 }
1200 1217 mutex_exit(&dn->dn_mtx);
1201 1218
1202 1219 if (db->db_blkid == DMU_SPILL_BLKID)
1203 1220 dn->dn_have_spill = B_TRUE;
1204 1221
1205 1222 /*
1206 1223 * If this buffer is already dirty, we're done.
1207 1224 */
1208 1225 drp = &db->db_last_dirty;
1209 1226 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1210 1227 db->db.db_object == DMU_META_DNODE_OBJECT);
1211 1228 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1212 1229 drp = &dr->dr_next;
1213 1230 if (dr && dr->dr_txg == tx->tx_txg) {
1214 1231 DB_DNODE_EXIT(db);
1215 1232
1216 1233 dbuf_redirty(dr);
1217 1234 mutex_exit(&db->db_mtx);
1218 1235 return (dr);
1219 1236 }
1220 1237
1221 1238 /*
1222 1239 * Only valid if not already dirty.
1223 1240 */
1224 1241 ASSERT(dn->dn_object == 0 ||
1225 1242 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1226 1243 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1227 1244
1228 1245 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1229 1246 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1230 1247 dn->dn_phys->dn_nlevels > db->db_level ||
1231 1248 dn->dn_next_nlevels[txgoff] > db->db_level ||
1232 1249 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1233 1250 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1234 1251
1235 1252 /*
1236 1253 * We should only be dirtying in syncing context if it's the
1237 1254 * mos or we're initializing the os or it's a special object.
1238 1255 * However, we are allowed to dirty in syncing context provided
1239 1256 * we already dirtied it in open context. Hence we must make
1240 1257 * this assertion only if we're not already dirty.
1241 1258 */
1242 1259 os = dn->dn_objset;
1243 1260 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1244 1261 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1245 1262 ASSERT(db->db.db_size != 0);
1246 1263
1247 1264 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1248 1265
1249 1266 if (db->db_blkid != DMU_BONUS_BLKID) {
1250 1267 /*
1251 1268 * Update the accounting.
1252 1269 * Note: we delay "free accounting" until after we drop
1253 1270 * the db_mtx. This keeps us from grabbing other locks
1254 1271 * (and possibly deadlocking) in bp_get_dsize() while
1255 1272 * also holding the db_mtx.
1256 1273 */
1257 1274 dnode_willuse_space(dn, db->db.db_size, tx);
1258 1275 do_free_accounting = dbuf_block_freeable(db);
1259 1276 }
1260 1277
1261 1278 /*
1262 1279 * If this buffer is dirty in an old transaction group we need
1263 1280 * to make a copy of it so that the changes we make in this
1264 1281 * transaction group won't leak out when we sync the older txg.
1265 1282 */
1266 1283 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1267 1284 if (db->db_level == 0) {
1268 1285 void *data_old = db->db_buf;
1269 1286
1270 1287 if (db->db_state != DB_NOFILL) {
1271 1288 if (db->db_blkid == DMU_BONUS_BLKID) {
1272 1289 dbuf_fix_old_data(db, tx->tx_txg);
1273 1290 data_old = db->db.db_data;
1274 1291 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1275 1292 /*
1276 1293 * Release the data buffer from the cache so
1277 1294 * that we can modify it without impacting
1278 1295 * possible other users of this cached data
1279 1296 * block. Note that indirect blocks and
1280 1297 * private objects are not released until the
1281 1298 * syncing state (since they are only modified
1282 1299 * then).
1283 1300 */
1284 1301 arc_release(db->db_buf, db);
1285 1302 dbuf_fix_old_data(db, tx->tx_txg);
1286 1303 data_old = db->db_buf;
1287 1304 }
1288 1305 ASSERT(data_old != NULL);
1289 1306 }
1290 1307 dr->dt.dl.dr_data = data_old;
1291 1308 } else {
1292 1309 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1293 1310 list_create(&dr->dt.di.dr_children,
1294 1311 sizeof (dbuf_dirty_record_t),
1295 1312 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1296 1313 }
1297 1314 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1298 1315 dr->dr_accounted = db->db.db_size;
1299 1316 dr->dr_dbuf = db;
1300 1317 dr->dr_txg = tx->tx_txg;
1301 1318 dr->dr_next = *drp;
1302 1319 *drp = dr;
1303 1320
1304 1321 /*
1305 1322 * We could have been freed_in_flight between the dbuf_noread
1306 1323 * and dbuf_dirty. We win, as though the dbuf_noread() had
1307 1324 * happened after the free.
1308 1325 */
1309 1326 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1310 1327 db->db_blkid != DMU_SPILL_BLKID) {
1311 1328 mutex_enter(&dn->dn_mtx);
1312 1329 if (dn->dn_free_ranges[txgoff] != NULL) {
1313 1330 range_tree_clear(dn->dn_free_ranges[txgoff],
1314 1331 db->db_blkid, 1);
1315 1332 }
1316 1333 mutex_exit(&dn->dn_mtx);
1317 1334 db->db_freed_in_flight = FALSE;
1318 1335 }
1319 1336
1320 1337 /*
1321 1338 * This buffer is now part of this txg
1322 1339 */
1323 1340 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1324 1341 db->db_dirtycnt += 1;
1325 1342 ASSERT3U(db->db_dirtycnt, <=, 3);
1326 1343
1327 1344 mutex_exit(&db->db_mtx);
1328 1345
1329 1346 if (db->db_blkid == DMU_BONUS_BLKID ||
1330 1347 db->db_blkid == DMU_SPILL_BLKID) {
1331 1348 mutex_enter(&dn->dn_mtx);
1332 1349 ASSERT(!list_link_active(&dr->dr_dirty_node));
1333 1350 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1334 1351 mutex_exit(&dn->dn_mtx);
1335 1352 dnode_setdirty(dn, tx);
1336 1353 DB_DNODE_EXIT(db);
1337 1354 return (dr);
1338 1355 } else if (do_free_accounting) {
1339 1356 blkptr_t *bp = db->db_blkptr;
1340 1357 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1341 1358 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1342 1359 /*
1343 1360 * This is only a guess -- if the dbuf is dirty
1344 1361 * in a previous txg, we don't know how much
1345 1362 * space it will use on disk yet. We should
1346 1363 * really have the struct_rwlock to access
1347 1364 * db_blkptr, but since this is just a guess,
1348 1365 * it's OK if we get an odd answer.
1349 1366 */
1350 1367 ddt_prefetch(os->os_spa, bp);
1351 1368 dnode_willuse_space(dn, -willfree, tx);
1352 1369 }
1353 1370
1354 1371 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1355 1372 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1356 1373 drop_struct_lock = TRUE;
1357 1374 }
1358 1375
1359 1376 if (db->db_level == 0) {
1360 1377 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1361 1378 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1362 1379 }
1363 1380
1364 1381 if (db->db_level+1 < dn->dn_nlevels) {
1365 1382 dmu_buf_impl_t *parent = db->db_parent;
1366 1383 dbuf_dirty_record_t *di;
1367 1384 int parent_held = FALSE;
1368 1385
1369 1386 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1370 1387 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1371 1388
1372 1389 parent = dbuf_hold_level(dn, db->db_level+1,
1373 1390 db->db_blkid >> epbs, FTAG);
1374 1391 ASSERT(parent != NULL);
1375 1392 parent_held = TRUE;
1376 1393 }
1377 1394 if (drop_struct_lock)
1378 1395 rw_exit(&dn->dn_struct_rwlock);
1379 1396 ASSERT3U(db->db_level+1, ==, parent->db_level);
1380 1397 di = dbuf_dirty(parent, tx);
1381 1398 if (parent_held)
1382 1399 dbuf_rele(parent, FTAG);
1383 1400
1384 1401 mutex_enter(&db->db_mtx);
1385 1402 /*
1386 1403 * Since we've dropped the mutex, it's possible that
1387 1404 * dbuf_undirty() might have changed this out from under us.
1388 1405 */
1389 1406 if (db->db_last_dirty == dr ||
1390 1407 dn->dn_object == DMU_META_DNODE_OBJECT) {
1391 1408 mutex_enter(&di->dt.di.dr_mtx);
1392 1409 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1393 1410 ASSERT(!list_link_active(&dr->dr_dirty_node));
1394 1411 list_insert_tail(&di->dt.di.dr_children, dr);
1395 1412 mutex_exit(&di->dt.di.dr_mtx);
1396 1413 dr->dr_parent = di;
1397 1414 }
1398 1415 mutex_exit(&db->db_mtx);
1399 1416 } else {
1400 1417 ASSERT(db->db_level+1 == dn->dn_nlevels);
1401 1418 ASSERT(db->db_blkid < dn->dn_nblkptr);
1402 1419 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1403 1420 mutex_enter(&dn->dn_mtx);
1404 1421 ASSERT(!list_link_active(&dr->dr_dirty_node));
1405 1422 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1406 1423 mutex_exit(&dn->dn_mtx);
1407 1424 if (drop_struct_lock)
1408 1425 rw_exit(&dn->dn_struct_rwlock);
1409 1426 }
1410 1427
1411 1428 dnode_setdirty(dn, tx);
1412 1429 DB_DNODE_EXIT(db);
1413 1430 return (dr);
1414 1431 }
1415 1432
1416 1433 /*
1417 1434 * Undirty a buffer in the transaction group referenced by the given
1418 1435 * transaction. Return whether this evicted the dbuf.
1419 1436 */
1420 1437 static boolean_t
1421 1438 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1422 1439 {
1423 1440 dnode_t *dn;
1424 1441 uint64_t txg = tx->tx_txg;
1425 1442 dbuf_dirty_record_t *dr, **drp;
1426 1443
1427 1444 ASSERT(txg != 0);
1428 1445
1429 1446 /*
1430 1447 * Due to our use of dn_nlevels below, this can only be called
1431 1448 * in open context, unless we are operating on the MOS.
1432 1449 * From syncing context, dn_nlevels may be different from the
1433 1450 * dn_nlevels used when dbuf was dirtied.
1434 1451 */
1435 1452 ASSERT(db->db_objset ==
1436 1453 dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1437 1454 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
1438 1455 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1439 1456 ASSERT0(db->db_level);
1440 1457 ASSERT(MUTEX_HELD(&db->db_mtx));
1441 1458
1442 1459 /*
1443 1460 * If this buffer is not dirty, we're done.
1444 1461 */
1445 1462 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1446 1463 if (dr->dr_txg <= txg)
1447 1464 break;
1448 1465 if (dr == NULL || dr->dr_txg < txg)
1449 1466 return (B_FALSE);
1450 1467 ASSERT(dr->dr_txg == txg);
1451 1468 ASSERT(dr->dr_dbuf == db);
1452 1469
1453 1470 DB_DNODE_ENTER(db);
1454 1471 dn = DB_DNODE(db);
1455 1472
1456 1473 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1457 1474
1458 1475 ASSERT(db->db.db_size != 0);
1459 1476
1460 1477 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1461 1478 dr->dr_accounted, txg);
1462 1479
1463 1480 *drp = dr->dr_next;
1464 1481
1465 1482 /*
1466 1483 * Note that there are three places in dbuf_dirty()
1467 1484 * where this dirty record may be put on a list.
1468 1485 * Make sure to do a list_remove corresponding to
1469 1486 * every one of those list_insert calls.
1470 1487 */
1471 1488 if (dr->dr_parent) {
1472 1489 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1473 1490 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1474 1491 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1475 1492 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1476 1493 db->db_level + 1 == dn->dn_nlevels) {
1477 1494 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1478 1495 mutex_enter(&dn->dn_mtx);
1479 1496 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1480 1497 mutex_exit(&dn->dn_mtx);
1481 1498 }
1482 1499 DB_DNODE_EXIT(db);
1483 1500
1484 1501 if (db->db_state != DB_NOFILL) {
1485 1502 dbuf_unoverride(dr);
1486 1503
1487 1504 ASSERT(db->db_buf != NULL);
1488 1505 ASSERT(dr->dt.dl.dr_data != NULL);
1489 1506 if (dr->dt.dl.dr_data != db->db_buf)
1490 1507 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1491 1508 }
1492 1509
1493 1510 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1494 1511
1495 1512 ASSERT(db->db_dirtycnt > 0);
1496 1513 db->db_dirtycnt -= 1;
1497 1514
1498 1515 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1499 1516 arc_buf_t *buf = db->db_buf;
1500 1517
1501 1518 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1502 1519 dbuf_clear_data(db);
1503 1520 VERIFY(arc_buf_remove_ref(buf, db));
1504 1521 dbuf_evict(db);
1505 1522 return (B_TRUE);
1506 1523 }
1507 1524
1508 1525 return (B_FALSE);
1509 1526 }
1510 1527
1511 1528 void
1512 1529 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1513 1530 {
1514 1531 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1515 1532 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1516 1533
1517 1534 ASSERT(tx->tx_txg != 0);
1518 1535 ASSERT(!refcount_is_zero(&db->db_holds));
1519 1536
1520 1537 /*
1521 1538 	 * Quick check for dirtiness. For already dirty blocks, this
1522 1539 * reduces runtime of this function by >90%, and overall performance
1523 1540 * by 50% for some workloads (e.g. file deletion with indirect blocks
1524 1541 * cached).
1525 1542 */
1526 1543 mutex_enter(&db->db_mtx);
1527 1544 dbuf_dirty_record_t *dr;
1528 1545 for (dr = db->db_last_dirty;
1529 1546 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1530 1547 /*
1531 1548 * It's possible that it is already dirty but not cached,
1532 1549 * because there are some calls to dbuf_dirty() that don't
1533 1550 * go through dmu_buf_will_dirty().
1534 1551 */
1535 1552 if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1536 1553 /* This dbuf is already dirty and cached. */
1537 1554 dbuf_redirty(dr);
1538 1555 mutex_exit(&db->db_mtx);
1539 1556 return;
1540 1557 }
1541 1558 }
1542 1559 mutex_exit(&db->db_mtx);
1543 1560
1544 1561 DB_DNODE_ENTER(db);
1545 1562 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1546 1563 rf |= DB_RF_HAVESTRUCT;
1547 1564 DB_DNODE_EXIT(db);
1548 1565 (void) dbuf_read(db, NULL, rf);
1549 1566 (void) dbuf_dirty(db, tx);
1550 1567 }
1551 1568
1552 1569 void
1553 1570 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1554 1571 {
1555 1572 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1556 1573
1557 1574 db->db_state = DB_NOFILL;
1558 1575
1559 1576 dmu_buf_will_fill(db_fake, tx);
1560 1577 }
1561 1578
1562 1579 void
1563 1580 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1564 1581 {
1565 1582 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1566 1583
1567 1584 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1568 1585 ASSERT(tx->tx_txg != 0);
1569 1586 ASSERT(db->db_level == 0);
1570 1587 ASSERT(!refcount_is_zero(&db->db_holds));
1571 1588
1572 1589 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1573 1590 dmu_tx_private_ok(tx));
1574 1591
1575 1592 dbuf_noread(db);
1576 1593 (void) dbuf_dirty(db, tx);
1577 1594 }
1578 1595
1579 1596 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1580 1597 /* ARGSUSED */
1581 1598 void
1582 1599 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1583 1600 {
1584 1601 mutex_enter(&db->db_mtx);
1585 1602 DBUF_VERIFY(db);
1586 1603
1587 1604 if (db->db_state == DB_FILL) {
1588 1605 if (db->db_level == 0 && db->db_freed_in_flight) {
1589 1606 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1590 1607 /* we were freed while filling */
1591 1608 /* XXX dbuf_undirty? */
1592 1609 bzero(db->db.db_data, db->db.db_size);
1593 1610 db->db_freed_in_flight = FALSE;
1594 1611 }
1595 1612 db->db_state = DB_CACHED;
1596 1613 cv_broadcast(&db->db_changed);
1597 1614 }
1598 1615 mutex_exit(&db->db_mtx);
1599 1616 }
1600 1617
1601 1618 void
1602 1619 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1603 1620 bp_embedded_type_t etype, enum zio_compress comp,
1604 1621 int uncompressed_size, int compressed_size, int byteorder,
1605 1622 dmu_tx_t *tx)
1606 1623 {
1607 1624 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1608 1625 struct dirty_leaf *dl;
1609 1626 dmu_object_type_t type;
1610 1627
1611 1628 if (etype == BP_EMBEDDED_TYPE_DATA) {
1612 1629 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
1613 1630 SPA_FEATURE_EMBEDDED_DATA));
1614 1631 }
1615 1632
1616 1633 DB_DNODE_ENTER(db);
1617 1634 type = DB_DNODE(db)->dn_type;
1618 1635 DB_DNODE_EXIT(db);
1619 1636
1620 1637 ASSERT0(db->db_level);
1621 1638 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1622 1639
1623 1640 dmu_buf_will_not_fill(dbuf, tx);
1624 1641
1625 1642 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1626 1643 dl = &db->db_last_dirty->dt.dl;
1627 1644 encode_embedded_bp_compressed(&dl->dr_overridden_by,
1628 1645 data, comp, uncompressed_size, compressed_size);
1629 1646 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1630 1647 BP_SET_TYPE(&dl->dr_overridden_by, type);
1631 1648 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1632 1649 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1633 1650
1634 1651 dl->dr_override_state = DR_OVERRIDDEN;
1635 1652 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1636 1653 }
1637 1654
1638 1655 /*
1639 1656 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1640 1657 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1641 1658 */
1642 1659 void
1643 1660 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1644 1661 {
1645 1662 ASSERT(!refcount_is_zero(&db->db_holds));
1646 1663 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1647 1664 ASSERT(db->db_level == 0);
1648 1665 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1649 1666 ASSERT(buf != NULL);
1650 1667 ASSERT(arc_buf_size(buf) == db->db.db_size);
1651 1668 ASSERT(tx->tx_txg != 0);
1652 1669
1653 1670 arc_return_buf(buf, db);
1654 1671 ASSERT(arc_released(buf));
1655 1672
1656 1673 mutex_enter(&db->db_mtx);
1657 1674
1658 1675 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1659 1676 cv_wait(&db->db_changed, &db->db_mtx);
1660 1677
1661 1678 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1662 1679
1663 1680 if (db->db_state == DB_CACHED &&
1664 1681 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1665 1682 mutex_exit(&db->db_mtx);
1666 1683 (void) dbuf_dirty(db, tx);
1667 1684 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1668 1685 VERIFY(arc_buf_remove_ref(buf, db));
1669 1686 xuio_stat_wbuf_copied();
1670 1687 return;
1671 1688 }
1672 1689
1673 1690 xuio_stat_wbuf_nocopy();
1674 1691 if (db->db_state == DB_CACHED) {
1675 1692 dbuf_dirty_record_t *dr = db->db_last_dirty;
1676 1693
1677 1694 ASSERT(db->db_buf != NULL);
1678 1695 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1679 1696 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1680 1697 if (!arc_released(db->db_buf)) {
1681 1698 ASSERT(dr->dt.dl.dr_override_state ==
1682 1699 DR_OVERRIDDEN);
1683 1700 arc_release(db->db_buf, db);
1684 1701 }
1685 1702 dr->dt.dl.dr_data = buf;
1686 1703 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1687 1704 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1688 1705 arc_release(db->db_buf, db);
1689 1706 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1690 1707 }
1691 1708 db->db_buf = NULL;
1692 1709 }
1693 1710 ASSERT(db->db_buf == NULL);
1694 1711 dbuf_set_data(db, buf);
1695 1712 db->db_state = DB_FILL;
1696 1713 mutex_exit(&db->db_mtx);
1697 1714 (void) dbuf_dirty(db, tx);
1698 1715 dmu_buf_fill_done(&db->db, tx);
1699 1716 }
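Editorial note (not part of this change): a hedged sketch of how a loaned ARC buffer
typically reaches dbuf_assign_arcbuf() from open context, via the dmu_request_arcbuf()/
dmu_assign_arcbuf() wrappers in dmu.c; the offset, blksz and fill step shown here are
illustrative only:

	arc_buf_t *abuf = dmu_request_arcbuf(db_fake, blksz);
	/* ... fill abuf->b_data with blksz bytes of file data ... */
	dmu_assign_arcbuf(db_fake, offset, abuf, tx);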
1700 1717
1701 1718 /*
1702 1719 * "Clear" the contents of this dbuf. This will mark the dbuf
1703 1720 * EVICTING and clear *most* of its references. Unfortunately,
1704 1721 * when we are not holding the dn_dbufs_mtx, we can't clear the
1705 1722 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1706 1723 * in this case. For callers from the DMU we will usually see:
1707 1724 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1708 1725 * For the arc callback, we will usually see:
1709 1726 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1710 1727 * Sometimes, though, we will get a mix of these two:
1711 1728 * DMU: dbuf_clear()->arc_clear_callback()
1712 1729 * ARC: dbuf_do_evict()->dbuf_destroy()
1713 1730 *
1714 1731 * This routine will dissociate the dbuf from the arc, by calling
1715 1732 * arc_clear_callback(), but will not evict the data from the ARC.
1716 1733 */
1717 1734 void
1718 1735 dbuf_clear(dmu_buf_impl_t *db)
1719 1736 {
1720 1737 dnode_t *dn;
1721 1738 dmu_buf_impl_t *parent = db->db_parent;
1722 1739 dmu_buf_impl_t *dndb;
1723 1740 boolean_t dbuf_gone = B_FALSE;
1724 1741
1725 1742 ASSERT(MUTEX_HELD(&db->db_mtx));
1726 1743 ASSERT(refcount_is_zero(&db->db_holds));
1727 1744
1728 1745 dbuf_evict_user(db);
1729 1746
1730 1747 if (db->db_state == DB_CACHED) {
1731 1748 ASSERT(db->db.db_data != NULL);
1732 1749 if (db->db_blkid == DMU_BONUS_BLKID) {
1733 1750 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1734 1751 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1735 1752 }
1736 1753 db->db.db_data = NULL;
1737 1754 db->db_state = DB_UNCACHED;
1738 1755 }
1739 1756
1740 1757 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1741 1758 ASSERT(db->db_data_pending == NULL);
1742 1759
1743 1760 db->db_state = DB_EVICTING;
1744 1761 db->db_blkptr = NULL;
1745 1762
1746 1763 DB_DNODE_ENTER(db);
1747 1764 dn = DB_DNODE(db);
1748 1765 dndb = dn->dn_dbuf;
1749 1766 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1750 1767 avl_remove(&dn->dn_dbufs, db);
1751 1768 atomic_dec_32(&dn->dn_dbufs_count);
1752 1769 membar_producer();
1753 1770 DB_DNODE_EXIT(db);
1754 1771 /*
1755 1772 * Decrementing the dbuf count means that the hold corresponding
1756 1773 * to the removed dbuf is no longer discounted in dnode_move(),
1757 1774 * so the dnode cannot be moved until after we release the hold.
1758 1775 * The membar_producer() ensures visibility of the decremented
1759 1776 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1760 1777 * release any lock.
1761 1778 */
1762 1779 dnode_rele(dn, db);
1763 1780 db->db_dnode_handle = NULL;
1764 1781 } else {
1765 1782 DB_DNODE_EXIT(db);
1766 1783 }
1767 1784
1768 1785 if (db->db_buf)
1769 1786 dbuf_gone = arc_clear_callback(db->db_buf);
1770 1787
1771 1788 if (!dbuf_gone)
1772 1789 mutex_exit(&db->db_mtx);
1773 1790
1774 1791 /*
1775 1792 * If this dbuf is referenced from an indirect dbuf,
1776 1793 * decrement the ref count on the indirect dbuf.
1777 1794 */
1778 1795 if (parent && parent != dndb)
1779 1796 dbuf_rele(parent, db);
1780 1797 }
1781 1798
1782 1799 /*
1783 1800 * Note: While bpp will always be updated if the function returns success,
1784 1801 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
1785 1802 * this happens when the dnode is the meta-dnode, or a userused or groupused
1786 1803 * object.
1787 1804 */
1788 1805 static int
1789 1806 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1790 1807 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1791 1808 {
1792 1809 int nlevels, epbs;
1793 1810
1794 1811 *parentp = NULL;
1795 1812 *bpp = NULL;
1796 1813
1797 1814 ASSERT(blkid != DMU_BONUS_BLKID);
1798 1815
1799 1816 if (blkid == DMU_SPILL_BLKID) {
1800 1817 mutex_enter(&dn->dn_mtx);
1801 1818 if (dn->dn_have_spill &&
1802 1819 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1803 1820 *bpp = &dn->dn_phys->dn_spill;
1804 1821 else
1805 1822 *bpp = NULL;
1806 1823 dbuf_add_ref(dn->dn_dbuf, NULL);
1807 1824 *parentp = dn->dn_dbuf;
1808 1825 mutex_exit(&dn->dn_mtx);
1809 1826 return (0);
1810 1827 }
1811 1828
1812 1829 if (dn->dn_phys->dn_nlevels == 0)
1813 1830 nlevels = 1;
1814 1831 else
1815 1832 nlevels = dn->dn_phys->dn_nlevels;
1816 1833
1817 1834 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1818 1835
1819 1836 ASSERT3U(level * epbs, <, 64);
1820 1837 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1821 1838 if (level >= nlevels ||
1822 1839 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1823 1840 /* the buffer has no parent yet */
1824 1841 return (SET_ERROR(ENOENT));
1825 1842 } else if (level < nlevels-1) {
1826 1843 /* this block is referenced from an indirect block */
1827 1844 int err = dbuf_hold_impl(dn, level+1,
1828 1845 blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
1829 1846 if (err)
1830 1847 return (err);
1831 1848 err = dbuf_read(*parentp, NULL,
1832 1849 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1833 1850 if (err) {
1834 1851 dbuf_rele(*parentp, NULL);
1835 1852 *parentp = NULL;
1836 1853 return (err);
1837 1854 }
1838 1855 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1839 1856 (blkid & ((1ULL << epbs) - 1));
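		/*
		 * Editorial note: a worked example with illustrative values,
		 * assuming 16K indirect blocks (epbs == 14 - 7 == 7): for a
		 * level-0 block with blkid == 300, the parent is the level-1
		 * dbuf holding blkid 300 >> 7 == 2, and the child's bp sits
		 * at index 300 & 127 == 44 within that parent's data.
		 */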
1840 1857 return (0);
1841 1858 } else {
1842 1859 /* the block is referenced from the dnode */
1843 1860 ASSERT3U(level, ==, nlevels-1);
1844 1861 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1845 1862 blkid < dn->dn_phys->dn_nblkptr);
1846 1863 if (dn->dn_dbuf) {
1847 1864 dbuf_add_ref(dn->dn_dbuf, NULL);
1848 1865 *parentp = dn->dn_dbuf;
1849 1866 }
1850 1867 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1851 1868 return (0);
1852 1869 }
1853 1870 }
1854 1871
1855 1872 static dmu_buf_impl_t *
1856 1873 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1857 1874 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1858 1875 {
1859 1876 objset_t *os = dn->dn_objset;
1860 1877 dmu_buf_impl_t *db, *odb;
1861 1878
1862 1879 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1863 1880 ASSERT(dn->dn_type != DMU_OT_NONE);
1864 1881
1865 1882 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1866 1883
1867 1884 db->db_objset = os;
1868 1885 db->db.db_object = dn->dn_object;
1869 1886 db->db_level = level;
1870 1887 db->db_blkid = blkid;
1871 1888 db->db_last_dirty = NULL;
1872 1889 db->db_dirtycnt = 0;
1873 1890 db->db_dnode_handle = dn->dn_handle;
1874 1891 db->db_parent = parent;
1875 1892 db->db_blkptr = blkptr;
1876 1893
1877 1894 db->db_user = NULL;
1878 1895 db->db_user_immediate_evict = FALSE;
1879 1896 db->db_freed_in_flight = FALSE;
1880 1897 db->db_pending_evict = FALSE;
1881 1898
1882 1899 if (blkid == DMU_BONUS_BLKID) {
1883 1900 ASSERT3P(parent, ==, dn->dn_dbuf);
1884 1901 db->db.db_size = DN_MAX_BONUSLEN -
1885 1902 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
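		/*
		 * Editorial note: illustrative arithmetic, assuming the usual
		 * on-disk sizes (DN_MAX_BONUSLEN == 320, sizeof (blkptr_t) ==
		 * 128): a dnode with dn_nblkptr == 3 gets a 320 - 2 * 128 ==
		 * 64 byte bonus dbuf, while dn_nblkptr == 1 gets the full 320.
		 */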
1886 1903 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1887 1904 db->db.db_offset = DMU_BONUS_BLKID;
1888 1905 db->db_state = DB_UNCACHED;
1889 1906 /* the bonus dbuf is not placed in the hash table */
1890 1907 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1891 1908 return (db);
1892 1909 } else if (blkid == DMU_SPILL_BLKID) {
1893 1910 db->db.db_size = (blkptr != NULL) ?
1894 1911 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1895 1912 db->db.db_offset = 0;
1896 1913 } else {
1897 1914 int blocksize =
1898 1915 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1899 1916 db->db.db_size = blocksize;
1900 1917 db->db.db_offset = db->db_blkid * blocksize;
1901 1918 }
1902 1919
1903 1920 /*
1904 1921	 * Hold the dn_dbufs_mtx while we insert the new dbuf into
1905 1922	 * the hash table *and* add it to the dn_dbufs list.
1906 1923	 * This prevents a possible deadlock with someone
1907 1924	 * trying to look up this dbuf before it's added to the
1908 1925	 * dn_dbufs list.
1909 1926 */
1910 1927 mutex_enter(&dn->dn_dbufs_mtx);
1911 1928 db->db_state = DB_EVICTING;
1912 1929 if ((odb = dbuf_hash_insert(db)) != NULL) {
1913 1930 /* someone else inserted it first */
1914 1931 kmem_cache_free(dbuf_cache, db);
1915 1932 mutex_exit(&dn->dn_dbufs_mtx);
1916 1933 return (odb);
1917 1934 }
1918 1935 avl_add(&dn->dn_dbufs, db);
1919 1936 if (db->db_level == 0 && db->db_blkid >=
1920 1937 dn->dn_unlisted_l0_blkid)
1921 1938 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1922 1939 db->db_state = DB_UNCACHED;
1923 1940 mutex_exit(&dn->dn_dbufs_mtx);
1924 1941 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1925 1942
1926 1943 if (parent && parent != dn->dn_dbuf)
1927 1944 dbuf_add_ref(parent, db);
1928 1945
1929 1946 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1930 1947 refcount_count(&dn->dn_holds) > 0);
1931 1948 (void) refcount_add(&dn->dn_holds, db);
1932 1949 atomic_inc_32(&dn->dn_dbufs_count);
1933 1950
1934 1951 dprintf_dbuf(db, "db=%p\n", db);
1935 1952
1936 1953 return (db);
1937 1954 }
1938 1955
1939 1956 static int
1940 1957 dbuf_do_evict(void *private)
1941 1958 {
1942 1959 dmu_buf_impl_t *db = private;
1943 1960
1944 1961 if (!MUTEX_HELD(&db->db_mtx))
1945 1962 mutex_enter(&db->db_mtx);
1946 1963
1947 1964 ASSERT(refcount_is_zero(&db->db_holds));
1948 1965
1949 1966 if (db->db_state != DB_EVICTING) {
1950 1967 ASSERT(db->db_state == DB_CACHED);
1951 1968 DBUF_VERIFY(db);
1952 1969 db->db_buf = NULL;
1953 1970 dbuf_evict(db);
1954 1971 } else {
1955 1972 mutex_exit(&db->db_mtx);
1956 1973 dbuf_destroy(db);
1957 1974 }
1958 1975 return (0);
1959 1976 }
1960 1977
1961 1978 static void
1962 1979 dbuf_destroy(dmu_buf_impl_t *db)
1963 1980 {
1964 1981 ASSERT(refcount_is_zero(&db->db_holds));
1965 1982
1966 1983 if (db->db_blkid != DMU_BONUS_BLKID) {
1967 1984 /*
1968 1985 * If this dbuf is still on the dn_dbufs list,
1969 1986 * remove it from that list.
1970 1987 */
1971 1988 if (db->db_dnode_handle != NULL) {
1972 1989 dnode_t *dn;
1973 1990
1974 1991 DB_DNODE_ENTER(db);
1975 1992 dn = DB_DNODE(db);
1976 1993 mutex_enter(&dn->dn_dbufs_mtx);
1977 1994 avl_remove(&dn->dn_dbufs, db);
1978 1995 atomic_dec_32(&dn->dn_dbufs_count);
1979 1996 mutex_exit(&dn->dn_dbufs_mtx);
1980 1997 DB_DNODE_EXIT(db);
1981 1998 /*
1982 1999 * Decrementing the dbuf count means that the hold
1983 2000 * corresponding to the removed dbuf is no longer
1984 2001 * discounted in dnode_move(), so the dnode cannot be
1985 2002 * moved until after we release the hold.
1986 2003 */
1987 2004 dnode_rele(dn, db);
1988 2005 db->db_dnode_handle = NULL;
1989 2006 }
1990 2007 dbuf_hash_remove(db);
1991 2008 }
1992 2009 db->db_parent = NULL;
1993 2010 db->db_buf = NULL;
1994 2011
1995 2012 ASSERT(db->db.db_data == NULL);
1996 2013 ASSERT(db->db_hash_next == NULL);
1997 2014 ASSERT(db->db_blkptr == NULL);
1998 2015 ASSERT(db->db_data_pending == NULL);
1999 2016
2000 2017 kmem_cache_free(dbuf_cache, db);
2001 2018 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2002 2019 }
2003 2020
2004 2021 typedef struct dbuf_prefetch_arg {
2005 2022 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2006 2023 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2007 2024 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2008 2025 int dpa_curlevel; /* The current level that we're reading */
2009 2026 zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2010 2027 zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2011 2028 arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2012 2029 } dbuf_prefetch_arg_t;
2013 2030
2014 2031 /*
2015 2032 * Actually issue the prefetch read for the block given.
2016 2033 */
2017 2034 static void
2018 2035 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2019 2036 {
2020 2037 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2021 2038 return;
2022 2039
2023 2040 arc_flags_t aflags =
2024 2041 dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2025 2042
2026 2043 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2027 2044 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2028 2045 ASSERT(dpa->dpa_zio != NULL);
2029 2046 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2030 2047 dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2031 2048 &aflags, &dpa->dpa_zb);
2032 2049 }
2033 2050
2034 2051 /*
2035 2052 * Called when an indirect block above our prefetch target is read in. This
2036 2053 * will either read in the next indirect block down the tree or issue the actual
2037 2054 * prefetch if the next block down is our target.
2038 2055 */
2039 2056 static void
2040 2057 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
2041 2058 {
2042 2059 dbuf_prefetch_arg_t *dpa = private;
2043 2060
2044 2061 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2045 2062 ASSERT3S(dpa->dpa_curlevel, >, 0);
2046 2063 if (zio != NULL) {
2047 2064 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2048 2065 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2049 2066 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2050 2067 }
2051 2068
2052 2069 dpa->dpa_curlevel--;
2053 2070
2054 2071 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2055 2072 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2056 2073 blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2057 2074 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
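	/*
	 * Editorial note: a worked example with illustrative values, assuming
	 * epbs == 7 (128 blkptrs per indirect block) and a level-0 target with
	 * zb_blkid == 5000: when the level-2 indirect completes and
	 * dpa_curlevel drops to 1, nextblkid == 5000 >> 7 == 39 and the bp for
	 * that level-1 block sits at index 39 & 127 == 39 in the buffer just
	 * read.
	 */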
2058 2075 if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
2059 2076 kmem_free(dpa, sizeof (*dpa));
2060 2077 } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2061 2078 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2062 2079 dbuf_issue_final_prefetch(dpa, bp);
2063 2080 kmem_free(dpa, sizeof (*dpa));
2064 2081 } else {
2065 2082 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2066 2083 zbookmark_phys_t zb;
2067 2084
2068 2085 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2069 2086
2070 2087 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2071 2088 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
2072 2089
2073 2090 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2074 2091 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2075 2092 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2076 2093 &iter_aflags, &zb);
2077 2094 }
2078 2095 (void) arc_buf_remove_ref(abuf, private);
2079 2096 }
2080 2097
2081 2098 /*
2082 2099 * Issue prefetch reads for the given block on the given level. If the indirect
2083 2100 * blocks above that block are not in memory, we will read them in
2084 2101 * asynchronously. As a result, this call never blocks waiting for a read to
2085 2102 * complete.
2086 2103 */
2087 2104 void
2088 2105 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2089 2106 arc_flags_t aflags)
2090 2107 {
2091 2108 blkptr_t bp;
2092 2109 int epbs, nlevels, curlevel;
2093 2110 uint64_t curblkid;
2094 2111
2095 2112 ASSERT(blkid != DMU_BONUS_BLKID);
2096 2113 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2097 2114
2098 2115 if (blkid > dn->dn_maxblkid)
2099 2116 return;
2100 2117
2101 2118 if (dnode_block_freed(dn, blkid))
2102 2119 return;
2103 2120
2104 2121 /*
2105 2122 * This dnode hasn't been written to disk yet, so there's nothing to
2106 2123 * prefetch.
2107 2124 */
2108 2125 nlevels = dn->dn_phys->dn_nlevels;
2109 2126 if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2110 2127 return;
2111 2128
2112 2129 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2113 2130 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2114 2131 return;
2115 2132
2116 2133 dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2117 2134 level, blkid);
2118 2135 if (db != NULL) {
2119 2136 mutex_exit(&db->db_mtx);
2120 2137 /*
2121 2138 * This dbuf already exists. It is either CACHED, or
2122 2139 * (we assume) about to be read or filled.
2123 2140 */
2124 2141 return;
2125 2142 }
2126 2143
2127 2144 /*
2128 2145 * Find the closest ancestor (indirect block) of the target block
2129 2146 * that is present in the cache. In this indirect block, we will
2130 2147 * find the bp that is at curlevel, curblkid.
2131 2148 */
2132 2149 curlevel = level;
2133 2150 curblkid = blkid;
2134 2151 while (curlevel < nlevels - 1) {
2135 2152 int parent_level = curlevel + 1;
2136 2153 uint64_t parent_blkid = curblkid >> epbs;
2137 2154 dmu_buf_impl_t *db;
2138 2155
2139 2156 if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2140 2157 FALSE, TRUE, FTAG, &db) == 0) {
2141 2158 blkptr_t *bpp = db->db_buf->b_data;
2142 2159 bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2143 2160 dbuf_rele(db, FTAG);
2144 2161 break;
2145 2162 }
2146 2163
2147 2164 curlevel = parent_level;
2148 2165 curblkid = parent_blkid;
2149 2166 }
2150 2167
2151 2168 if (curlevel == nlevels - 1) {
2152 2169 /* No cached indirect blocks found. */
2153 2170 ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2154 2171 bp = dn->dn_phys->dn_blkptr[curblkid];
2155 2172 }
2156 2173 if (BP_IS_HOLE(&bp))
2157 2174 return;
2158 2175
2159 2176 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2160 2177
2161 2178 zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2162 2179 ZIO_FLAG_CANFAIL);
2163 2180
2164 2181 dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2165 2182 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2166 2183 SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2167 2184 dn->dn_object, level, blkid);
2168 2185 dpa->dpa_curlevel = curlevel;
2169 2186 dpa->dpa_prio = prio;
2170 2187 dpa->dpa_aflags = aflags;
2171 2188 dpa->dpa_spa = dn->dn_objset->os_spa;
2172 2189 dpa->dpa_epbs = epbs;
2173 2190 dpa->dpa_zio = pio;
2174 2191
2175 2192 /*
2176 2193 * If we have the indirect just above us, no need to do the asynchronous
2177 2194 * prefetch chain; we'll just run the last step ourselves. If we're at
2178 2195 * a higher level, though, we want to issue the prefetches for all the
2179 2196 * indirect blocks asynchronously, so we can go on with whatever we were
2180 2197 * doing.
2181 2198 */
2182 2199 if (curlevel == level) {
2183 2200 ASSERT3U(curblkid, ==, blkid);
2184 2201 dbuf_issue_final_prefetch(dpa, &bp);
2185 2202 kmem_free(dpa, sizeof (*dpa));
2186 2203 } else {
2187 2204 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2188 2205 zbookmark_phys_t zb;
2189 2206
2190 2207 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2191 2208 dn->dn_object, curlevel, curblkid);
2192 2209 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2193 2210 &bp, dbuf_prefetch_indirect_done, dpa, prio,
2194 2211 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2195 2212 &iter_aflags, &zb);
2196 2213 }
2197 2214 /*
2198 2215 * We use pio here instead of dpa_zio since it's possible that
2199 2216 * dpa may have already been freed.
2200 2217 */
2201 2218 zio_nowait(pio);
2202 2219 }
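Editorial note (not part of this change): a minimal sketch of a caller issuing a
level-0 prefetch, assuming it holds a dnode and takes dn_struct_rwlock as reader
around the call; the surrounding locals are the caller's own:

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
	rw_exit(&dn->dn_struct_rwlock);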
2203 2220
2204 2221 /*
2205 2222 * Returns with db_holds incremented, and db_mtx not held.
2206 2223 * Note: dn_struct_rwlock must be held.
2207 2224 */
2208 2225 int
2209 2226 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2210 2227 boolean_t fail_sparse, boolean_t fail_uncached,
2211 2228 void *tag, dmu_buf_impl_t **dbp)
2212 2229 {
2213 2230 dmu_buf_impl_t *db, *parent = NULL;
2214 2231
2215 2232 ASSERT(blkid != DMU_BONUS_BLKID);
2216 2233 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2217 2234 ASSERT3U(dn->dn_nlevels, >, level);
2218 2235
2219 2236 *dbp = NULL;
2220 2237 top:
2221 2238 /* dbuf_find() returns with db_mtx held */
2222 2239 db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2223 2240
2224 2241 if (db == NULL) {
2225 2242 blkptr_t *bp = NULL;
2226 2243 int err;
2227 2244
2228 2245 if (fail_uncached)
2229 2246 return (SET_ERROR(ENOENT));
2230 2247
2231 2248 ASSERT3P(parent, ==, NULL);
2232 2249 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2233 2250 if (fail_sparse) {
2234 2251 if (err == 0 && bp && BP_IS_HOLE(bp))
2235 2252 err = SET_ERROR(ENOENT);
2236 2253 if (err) {
2237 2254 if (parent)
2238 2255 dbuf_rele(parent, NULL);
2239 2256 return (err);
2240 2257 }
2241 2258 }
2242 2259 if (err && err != ENOENT)
2243 2260 return (err);
2244 2261 db = dbuf_create(dn, level, blkid, parent, bp);
2245 2262 }
2246 2263
2247 2264 if (fail_uncached && db->db_state != DB_CACHED) {
2248 2265 mutex_exit(&db->db_mtx);
2249 2266 return (SET_ERROR(ENOENT));
2250 2267 }
2251 2268
2252 2269 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
2253 2270 arc_buf_add_ref(db->db_buf, db);
2254 2271 if (db->db_buf->b_data == NULL) {
2255 2272 dbuf_clear(db);
2256 2273 if (parent) {
2257 2274 dbuf_rele(parent, NULL);
2258 2275 parent = NULL;
2259 2276 }
2260 2277 goto top;
2261 2278 }
2262 2279 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2263 2280 }
2264 2281
2265 2282 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2266 2283
2267 2284 /*
2268 2285	 * If this buffer is currently syncing out, and we are
2269 2286 * still referencing it from db_data, we need to make a copy
2270 2287 * of it in case we decide we want to dirty it again in this txg.
2271 2288 */
2272 2289 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2273 2290 dn->dn_object != DMU_META_DNODE_OBJECT &&
2274 2291 db->db_state == DB_CACHED && db->db_data_pending) {
2275 2292 dbuf_dirty_record_t *dr = db->db_data_pending;
2276 2293
2277 2294 if (dr->dt.dl.dr_data == db->db_buf) {
2278 2295 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2279 2296
2280 2297 dbuf_set_data(db,
2281 2298 arc_buf_alloc(dn->dn_objset->os_spa,
2282 2299 db->db.db_size, db, type));
2283 2300 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2284 2301 db->db.db_size);
2285 2302 }
2286 2303 }
2287 2304
2288 2305 (void) refcount_add(&db->db_holds, tag);
2289 2306 DBUF_VERIFY(db);
2290 2307 mutex_exit(&db->db_mtx);
2291 2308
2292 2309 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2293 2310 if (parent)
2294 2311 dbuf_rele(parent, NULL);
2295 2312
2296 2313 ASSERT3P(DB_DNODE(db), ==, dn);
2297 2314 ASSERT3U(db->db_blkid, ==, blkid);
2298 2315 ASSERT3U(db->db_level, ==, level);
2299 2316 *dbp = db;
2300 2317
2301 2318 return (0);
2302 2319 }
2303 2320
2304 2321 dmu_buf_impl_t *
2305 2322 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2306 2323 {
2307 2324 return (dbuf_hold_level(dn, 0, blkid, tag));
2308 2325 }
2309 2326
2310 2327 dmu_buf_impl_t *
2311 2328 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2312 2329 {
2313 2330 dmu_buf_impl_t *db;
2314 2331 int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2315 2332 return (err ? NULL : db);
2316 2333 }
2317 2334
2318 2335 void
2319 2336 dbuf_create_bonus(dnode_t *dn)
2320 2337 {
2321 2338 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2322 2339
2323 2340 ASSERT(dn->dn_bonus == NULL);
2324 2341 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2325 2342 }
2326 2343
2327 2344 int
2328 2345 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2329 2346 {
2330 2347 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2331 2348 dnode_t *dn;
2332 2349
2333 2350 if (db->db_blkid != DMU_SPILL_BLKID)
2334 2351 return (SET_ERROR(ENOTSUP));
2335 2352 if (blksz == 0)
2336 2353 blksz = SPA_MINBLOCKSIZE;
2337 2354 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2338 2355 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2339 2356
2340 2357 DB_DNODE_ENTER(db);
2341 2358 dn = DB_DNODE(db);
2342 2359 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2343 2360 dbuf_new_size(db, blksz, tx);
2344 2361 rw_exit(&dn->dn_struct_rwlock);
2345 2362 DB_DNODE_EXIT(db);
2346 2363
2347 2364 return (0);
2348 2365 }
2349 2366
2350 2367 void
2351 2368 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2352 2369 {
2353 2370 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2354 2371 }
2355 2372
2356 2373 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2357 2374 void
2358 2375 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2359 2376 {
2360 2377 int64_t holds = refcount_add(&db->db_holds, tag);
2361 2378 ASSERT(holds > 1);
2362 2379 }
2363 2380
2364 2381 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2365 2382 boolean_t
2366 2383 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2367 2384 void *tag)
2368 2385 {
2369 2386 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2370 2387 dmu_buf_impl_t *found_db;
2371 2388 boolean_t result = B_FALSE;
2372 2389
2373 2390 if (db->db_blkid == DMU_BONUS_BLKID)
2374 2391 found_db = dbuf_find_bonus(os, obj);
2375 2392 else
2376 2393 found_db = dbuf_find(os, obj, 0, blkid);
2377 2394
2378 2395 if (found_db != NULL) {
2379 2396 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2380 2397 (void) refcount_add(&db->db_holds, tag);
2381 2398 result = B_TRUE;
2382 2399 }
2383 2400 mutex_exit(&db->db_mtx);
2384 2401 }
2385 2402 return (result);
2386 2403 }
2387 2404
2388 2405 /*
2389 2406 * If you call dbuf_rele() you had better not be referencing the dnode handle
2390 2407 * unless you have some other direct or indirect hold on the dnode. (An indirect
2391 2408 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2392 2409 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2393 2410 * dnode's parent dbuf evicting its dnode handles.
2394 2411 */
2395 2412 void
2396 2413 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2397 2414 {
2398 2415 mutex_enter(&db->db_mtx);
2399 2416 dbuf_rele_and_unlock(db, tag);
2400 2417 }
2401 2418
2402 2419 void
2403 2420 dmu_buf_rele(dmu_buf_t *db, void *tag)
2404 2421 {
2405 2422 dbuf_rele((dmu_buf_impl_t *)db, tag);
2406 2423 }
2407 2424
2408 2425 /*
2409 2426 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2410 2427 * db_dirtycnt and db_holds to be updated atomically.
2411 2428 */
2412 2429 void
2413 2430 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2414 2431 {
2415 2432 int64_t holds;
2416 2433
2417 2434 ASSERT(MUTEX_HELD(&db->db_mtx));
2418 2435 DBUF_VERIFY(db);
2419 2436
2420 2437 /*
2421 2438 * Remove the reference to the dbuf before removing its hold on the
2422 2439 * dnode so we can guarantee in dnode_move() that a referenced bonus
2423 2440 * buffer has a corresponding dnode hold.
2424 2441 */
2425 2442 holds = refcount_remove(&db->db_holds, tag);
2426 2443 ASSERT(holds >= 0);
2427 2444
2428 2445 /*
2429 2446 * We can't freeze indirects if there is a possibility that they
2430 2447 * may be modified in the current syncing context.
2431 2448 */
2432 2449 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2433 2450 arc_buf_freeze(db->db_buf);
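	/*
	 * Editorial note: the sync path holds each dirty buffer with a
	 * reference tagged by its txg (see the dbuf_rele_and_unlock() calls
	 * elsewhere in this file that pass the txg as the tag), so for a
	 * level-0 dbuf, holds == db_dirtycnt is taken to mean that the only
	 * remaining holds belong to dirty records and the data may be frozen;
	 * indirect blocks can still be rewritten while syncing, hence the 0.
	 */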
2434 2451
2435 2452 if (holds == db->db_dirtycnt &&
2436 2453 db->db_level == 0 && db->db_user_immediate_evict)
2437 2454 dbuf_evict_user(db);
2438 2455
2439 2456 if (holds == 0) {
2440 2457 if (db->db_blkid == DMU_BONUS_BLKID) {
2441 2458 dnode_t *dn;
2442 2459 boolean_t evict_dbuf = db->db_pending_evict;
2443 2460
2444 2461 /*
2445 2462 * If the dnode moves here, we cannot cross this
2446 2463 * barrier until the move completes.
2447 2464 */
2448 2465 DB_DNODE_ENTER(db);
2449 2466
2450 2467 dn = DB_DNODE(db);
2451 2468 atomic_dec_32(&dn->dn_dbufs_count);
2452 2469
2453 2470 /*
2454 2471 * Decrementing the dbuf count means that the bonus
2455 2472 * buffer's dnode hold is no longer discounted in
2456 2473 * dnode_move(). The dnode cannot move until after
2457 2474 * the dnode_rele() below.
2458 2475 */
2459 2476 DB_DNODE_EXIT(db);
2460 2477
2461 2478 /*
2462 2479 * Do not reference db after its lock is dropped.
2463 2480 * Another thread may evict it.
2464 2481 */
2465 2482 mutex_exit(&db->db_mtx);
2466 2483
2467 2484 if (evict_dbuf)
2468 2485 dnode_evict_bonus(dn);
2469 2486
2470 2487 dnode_rele(dn, db);
2471 2488 } else if (db->db_buf == NULL) {
2472 2489 /*
2473 2490 * This is a special case: we never associated this
2474 2491 * dbuf with any data allocated from the ARC.
2475 2492 */
2476 2493 ASSERT(db->db_state == DB_UNCACHED ||
2477 2494 db->db_state == DB_NOFILL);
2478 2495 dbuf_evict(db);
2479 2496 } else if (arc_released(db->db_buf)) {
2480 2497 arc_buf_t *buf = db->db_buf;
2481 2498 /*
2482 2499 * This dbuf has anonymous data associated with it.
2483 2500 */
2484 2501 dbuf_clear_data(db);
2485 2502 VERIFY(arc_buf_remove_ref(buf, db));
2486 2503 dbuf_evict(db);
2487 2504 } else {
2488 2505 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2489 2506
2490 2507 /*
2491 2508 * A dbuf will be eligible for eviction if either the
2492 2509 * 'primarycache' property is set or a duplicate
2493 2510 * copy of this buffer is already cached in the arc.
2494 2511 *
2495 2512 * In the case of the 'primarycache' a buffer
2496 2513 * is considered for eviction if it matches the
2497 2514 * criteria set in the property.
2498 2515 *
2499 2516 * To decide if our buffer is considered a
2500 2517 * duplicate, we must call into the arc to determine
2501 2518 * if multiple buffers are referencing the same
2502 2519 * block on-disk. If so, then we simply evict
2503 2520 * ourselves.
2504 2521 */
2505 2522 if (!DBUF_IS_CACHEABLE(db)) {
2506 2523 if (db->db_blkptr != NULL &&
2507 2524 !BP_IS_HOLE(db->db_blkptr) &&
2508 2525 !BP_IS_EMBEDDED(db->db_blkptr)) {
2509 2526 spa_t *spa =
2510 2527 dmu_objset_spa(db->db_objset);
2511 2528 blkptr_t bp = *db->db_blkptr;
2512 2529 dbuf_clear(db);
2513 2530 arc_freed(spa, &bp);
2514 2531 } else {
2515 2532 dbuf_clear(db);
2516 2533 }
2517 2534 } else if (db->db_pending_evict ||
2518 2535 arc_buf_eviction_needed(db->db_buf)) {
2519 2536 dbuf_clear(db);
2520 2537 } else {
2521 2538 mutex_exit(&db->db_mtx);
2522 2539 }
2523 2540 }
2524 2541 } else {
2525 2542 mutex_exit(&db->db_mtx);
2526 2543 }
2527 2544 }
2528 2545
2529 2546 #pragma weak dmu_buf_refcount = dbuf_refcount
2530 2547 uint64_t
2531 2548 dbuf_refcount(dmu_buf_impl_t *db)
2532 2549 {
2533 2550 return (refcount_count(&db->db_holds));
2534 2551 }
2535 2552
2536 2553 void *
2537 2554 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2538 2555 dmu_buf_user_t *new_user)
2539 2556 {
2540 2557 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2541 2558
2542 2559 mutex_enter(&db->db_mtx);
2543 2560 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2544 2561 if (db->db_user == old_user)
2545 2562 db->db_user = new_user;
2546 2563 else
2547 2564 old_user = db->db_user;
2548 2565 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2549 2566 mutex_exit(&db->db_mtx);
2550 2567
2551 2568 return (old_user);
2552 2569 }
2553 2570
2554 2571 void *
2555 2572 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2556 2573 {
2557 2574 return (dmu_buf_replace_user(db_fake, NULL, user));
2558 2575 }
2559 2576
2560 2577 void *
2561 2578 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2562 2579 {
2563 2580 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2564 2581
2565 2582 db->db_user_immediate_evict = TRUE;
2566 2583 return (dmu_buf_set_user(db_fake, user));
2567 2584 }
2568 2585
2569 2586 void *
2570 2587 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2571 2588 {
2572 2589 return (dmu_buf_replace_user(db_fake, user, NULL));
2573 2590 }
2574 2591
2575 2592 void *
2576 2593 dmu_buf_get_user(dmu_buf_t *db_fake)
2577 2594 {
2578 2595 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2579 2596
2580 2597 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2581 2598 return (db->db_user);
2582 2599 }
2583 2600
2584 2601 void
2585 2602 dmu_buf_user_evict_wait()
2586 2603 {
2587 2604 taskq_wait(dbu_evict_taskq);
2588 2605 }
2589 2606
2590 2607 boolean_t
2591 2608 dmu_buf_freeable(dmu_buf_t *dbuf)
2592 2609 {
2593 2610 boolean_t res = B_FALSE;
2594 2611 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2595 2612
2596 2613 if (db->db_blkptr)
2597 2614 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2598 2615 db->db_blkptr, db->db_blkptr->blk_birth);
2599 2616
2600 2617 return (res);
2601 2618 }
2602 2619
2603 2620 blkptr_t *
2604 2621 dmu_buf_get_blkptr(dmu_buf_t *db)
2605 2622 {
2606 2623 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2607 2624 return (dbi->db_blkptr);
2608 2625 }
2609 2626
2610 2627 static void
2611 2628 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2612 2629 {
2613 2630 /* ASSERT(dmu_tx_is_syncing(tx) */
2614 2631 ASSERT(MUTEX_HELD(&db->db_mtx));
2615 2632
2616 2633 if (db->db_blkptr != NULL)
2617 2634 return;
2618 2635
2619 2636 if (db->db_blkid == DMU_SPILL_BLKID) {
2620 2637 db->db_blkptr = &dn->dn_phys->dn_spill;
2621 2638 BP_ZERO(db->db_blkptr);
2622 2639 return;
2623 2640 }
2624 2641 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2625 2642 /*
2626 2643		 * This buffer was allocated at a time when there were
2627 2644		 * no available blkptrs from the dnode, or it was
2628 2645		 * inappropriate to hook it in (i.e., nlevels mismatch).
2629 2646 */
2630 2647 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2631 2648 ASSERT(db->db_parent == NULL);
2632 2649 db->db_parent = dn->dn_dbuf;
2633 2650 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2634 2651 DBUF_VERIFY(db);
2635 2652 } else {
2636 2653 dmu_buf_impl_t *parent = db->db_parent;
2637 2654 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2638 2655
2639 2656 ASSERT(dn->dn_phys->dn_nlevels > 1);
2640 2657 if (parent == NULL) {
2641 2658 mutex_exit(&db->db_mtx);
2642 2659 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2643 2660 parent = dbuf_hold_level(dn, db->db_level + 1,
2644 2661 db->db_blkid >> epbs, db);
2645 2662 rw_exit(&dn->dn_struct_rwlock);
2646 2663 mutex_enter(&db->db_mtx);
2647 2664 db->db_parent = parent;
2648 2665 }
2649 2666 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2650 2667 (db->db_blkid & ((1ULL << epbs) - 1));
2651 2668 DBUF_VERIFY(db);
2652 2669 }
2653 2670 }
2654 2671
2655 2672 static void
2656 2673 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2657 2674 {
2658 2675 dmu_buf_impl_t *db = dr->dr_dbuf;
2659 2676 dnode_t *dn;
2660 2677 zio_t *zio;
2661 2678
2662 2679 ASSERT(dmu_tx_is_syncing(tx));
2663 2680
2664 2681 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2665 2682
2666 2683 mutex_enter(&db->db_mtx);
2667 2684
2668 2685 ASSERT(db->db_level > 0);
2669 2686 DBUF_VERIFY(db);
2670 2687
2671 2688 /* Read the block if it hasn't been read yet. */
2672 2689 if (db->db_buf == NULL) {
2673 2690 mutex_exit(&db->db_mtx);
2674 2691 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2675 2692 mutex_enter(&db->db_mtx);
2676 2693 }
2677 2694 ASSERT3U(db->db_state, ==, DB_CACHED);
2678 2695 ASSERT(db->db_buf != NULL);
2679 2696
2680 2697 DB_DNODE_ENTER(db);
2681 2698 dn = DB_DNODE(db);
2682 2699 /* Indirect block size must match what the dnode thinks it is. */
2683 2700 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2684 2701 dbuf_check_blkptr(dn, db);
2685 2702 DB_DNODE_EXIT(db);
2686 2703
2687 2704 /* Provide the pending dirty record to child dbufs */
2688 2705 db->db_data_pending = dr;
2689 2706
2690 2707 mutex_exit(&db->db_mtx);
2691 2708 dbuf_write(dr, db->db_buf, tx);
2692 2709
2693 2710 zio = dr->dr_zio;
2694 2711 mutex_enter(&dr->dt.di.dr_mtx);
2695 2712 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
2696 2713 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2697 2714 mutex_exit(&dr->dt.di.dr_mtx);
2698 2715 zio_nowait(zio);
2699 2716 }
2700 2717
2701 2718 static void
2702 2719 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2703 2720 {
2704 2721 arc_buf_t **datap = &dr->dt.dl.dr_data;
2705 2722 dmu_buf_impl_t *db = dr->dr_dbuf;
2706 2723 dnode_t *dn;
2707 2724 objset_t *os;
2708 2725 uint64_t txg = tx->tx_txg;
2709 2726
2710 2727 ASSERT(dmu_tx_is_syncing(tx));
2711 2728
2712 2729 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2713 2730
2714 2731 mutex_enter(&db->db_mtx);
2715 2732 /*
2716 2733 * To be synced, we must be dirtied. But we
2717 2734 * might have been freed after the dirty.
2718 2735 */
2719 2736 if (db->db_state == DB_UNCACHED) {
2720 2737 /* This buffer has been freed since it was dirtied */
2721 2738 ASSERT(db->db.db_data == NULL);
2722 2739 } else if (db->db_state == DB_FILL) {
2723 2740 /* This buffer was freed and is now being re-filled */
2724 2741 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2725 2742 } else {
2726 2743 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2727 2744 }
2728 2745 DBUF_VERIFY(db);
2729 2746
2730 2747 DB_DNODE_ENTER(db);
2731 2748 dn = DB_DNODE(db);
2732 2749
2733 2750 if (db->db_blkid == DMU_SPILL_BLKID) {
2734 2751 mutex_enter(&dn->dn_mtx);
2735 2752 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2736 2753 mutex_exit(&dn->dn_mtx);
2737 2754 }
2738 2755
2739 2756 /*
2740 2757 * If this is a bonus buffer, simply copy the bonus data into the
2741 2758 * dnode. It will be written out when the dnode is synced (and it
2742 2759 * will be synced, since it must have been dirty for dbuf_sync to
2743 2760 * be called).
2744 2761 */
2745 2762 if (db->db_blkid == DMU_BONUS_BLKID) {
2746 2763 dbuf_dirty_record_t **drp;
2747 2764
2748 2765 ASSERT(*datap != NULL);
2749 2766 ASSERT0(db->db_level);
2750 2767 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2751 2768 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2752 2769 DB_DNODE_EXIT(db);
2753 2770
2754 2771 if (*datap != db->db.db_data) {
2755 2772 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2756 2773 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2757 2774 }
2758 2775 db->db_data_pending = NULL;
2759 2776 drp = &db->db_last_dirty;
2760 2777 while (*drp != dr)
2761 2778 drp = &(*drp)->dr_next;
2762 2779 ASSERT(dr->dr_next == NULL);
2763 2780 ASSERT(dr->dr_dbuf == db);
2764 2781 *drp = dr->dr_next;
2765 2782 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2766 2783 ASSERT(db->db_dirtycnt > 0);
2767 2784 db->db_dirtycnt -= 1;
2768 2785 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2769 2786 return;
2770 2787 }
2771 2788
2772 2789 os = dn->dn_objset;
2773 2790
2774 2791 /*
2775 2792 * This function may have dropped the db_mtx lock allowing a dmu_sync
2776 2793 * operation to sneak in. As a result, we need to ensure that we
2777 2794 * don't check the dr_override_state until we have returned from
2778 2795 * dbuf_check_blkptr.
2779 2796 */
2780 2797 dbuf_check_blkptr(dn, db);
2781 2798
2782 2799 /*
2783 2800 * If this buffer is in the middle of an immediate write,
2784 2801 * wait for the synchronous IO to complete.
2785 2802 */
2786 2803 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2787 2804 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2788 2805 cv_wait(&db->db_changed, &db->db_mtx);
2789 2806 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2790 2807 }
2791 2808
2792 2809 if (db->db_state != DB_NOFILL &&
2793 2810 dn->dn_object != DMU_META_DNODE_OBJECT &&
2794 2811 refcount_count(&db->db_holds) > 1 &&
2795 2812 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2796 2813 *datap == db->db_buf) {
2797 2814 /*
2798 2815 * If this buffer is currently "in use" (i.e., there
2799 2816 * are active holds and db_data still references it),
2800 2817 * then make a copy before we start the write so that
2801 2818 * any modifications from the open txg will not leak
2802 2819 * into this write.
2803 2820 *
2804 2821 * NOTE: this copy does not need to be made for
2805 2822 * objects only modified in the syncing context (e.g.
2806 2823		 * DMU_OT_DNODE blocks).
2807 2824 */
2808 2825 int blksz = arc_buf_size(*datap);
2809 2826 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2810 2827 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2811 2828 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2812 2829 }
2813 2830 db->db_data_pending = dr;
2814 2831
2815 2832 mutex_exit(&db->db_mtx);
2816 2833
2817 2834 dbuf_write(dr, *datap, tx);
2818 2835
2819 2836 ASSERT(!list_link_active(&dr->dr_dirty_node));
2820 2837 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2821 2838 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2822 2839 DB_DNODE_EXIT(db);
2823 2840 } else {
2824 2841 /*
2825 2842 * Although zio_nowait() does not "wait for an IO", it does
2826 2843		 * initiate the IO. If this is an empty write, it seems plausible
2827 2844 * that the IO could actually be completed before the nowait
2828 2845 * returns. We need to DB_DNODE_EXIT() first in case
2829 2846 * zio_nowait() invalidates the dbuf.
2830 2847 */
2831 2848 DB_DNODE_EXIT(db);
2832 2849 zio_nowait(dr->dr_zio);
2833 2850 }
2834 2851 }
2835 2852
2836 2853 void
2837 2854 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
2838 2855 {
2839 2856 dbuf_dirty_record_t *dr;
2840 2857
2841 2858 while (dr = list_head(list)) {
2842 2859 if (dr->dr_zio != NULL) {
2843 2860 /*
2844 2861 * If we find an already initialized zio then we
2845 2862 * are processing the meta-dnode, and we have finished.
2846 2863 * The dbufs for all dnodes are put back on the list
2847 2864 * during processing, so that we can zio_wait()
2848 2865 * these IOs after initiating all child IOs.
2849 2866 */
2850 2867 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2851 2868 DMU_META_DNODE_OBJECT);
2852 2869 break;
2853 2870 }
2854 2871 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2855 2872 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
2856 2873 VERIFY3U(dr->dr_dbuf->db_level, ==, level);
2857 2874 }
2858 2875 list_remove(list, dr);
2859 2876 if (dr->dr_dbuf->db_level > 0)
2860 2877 dbuf_sync_indirect(dr, tx);
2861 2878 else
2862 2879 dbuf_sync_leaf(dr, tx);
2863 2880 }
2864 2881 }
2865 2882
2866 2883 /* ARGSUSED */
2867 2884 static void
2868 2885 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2869 2886 {
2870 2887 dmu_buf_impl_t *db = vdb;
2871 2888 dnode_t *dn;
2872 2889 blkptr_t *bp = zio->io_bp;
2873 2890 blkptr_t *bp_orig = &zio->io_bp_orig;
2874 2891 spa_t *spa = zio->io_spa;
2875 2892 int64_t delta;
2876 2893 uint64_t fill = 0;
2877 2894 int i;
2878 2895
2879 2896 ASSERT3P(db->db_blkptr, ==, bp);
2880 2897
2881 2898 DB_DNODE_ENTER(db);
2882 2899 dn = DB_DNODE(db);
2883 2900 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2884 2901 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2885 2902 zio->io_prev_space_delta = delta;
2886 2903
2887 2904 if (bp->blk_birth != 0) {
2888 2905 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2889 2906 BP_GET_TYPE(bp) == dn->dn_type) ||
2890 2907 (db->db_blkid == DMU_SPILL_BLKID &&
2891 2908 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2892 2909 BP_IS_EMBEDDED(bp));
2893 2910 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2894 2911 }
2895 2912
2896 2913 mutex_enter(&db->db_mtx);
2897 2914
2898 2915 #ifdef ZFS_DEBUG
2899 2916 if (db->db_blkid == DMU_SPILL_BLKID) {
2900 2917 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2901 2918 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2902 2919 db->db_blkptr == &dn->dn_phys->dn_spill);
2903 2920 }
2904 2921 #endif
2905 2922
2906 2923 if (db->db_level == 0) {
2907 2924 mutex_enter(&dn->dn_mtx);
2908 2925 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2909 2926 db->db_blkid != DMU_SPILL_BLKID)
2910 2927 dn->dn_phys->dn_maxblkid = db->db_blkid;
2911 2928 mutex_exit(&dn->dn_mtx);
2912 2929
2913 2930 if (dn->dn_type == DMU_OT_DNODE) {
2914 2931 dnode_phys_t *dnp = db->db.db_data;
2915 2932 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2916 2933 i--, dnp++) {
2917 2934 if (dnp->dn_type != DMU_OT_NONE)
2918 2935 fill++;
2919 2936 }
2920 2937 } else {
2921 2938 if (BP_IS_HOLE(bp)) {
2922 2939 fill = 0;
2923 2940 } else {
2924 2941 fill = 1;
2925 2942 }
2926 2943 }
2927 2944 } else {
2928 2945 blkptr_t *ibp = db->db.db_data;
2929 2946 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2930 2947 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2931 2948 if (BP_IS_HOLE(ibp))
2932 2949 continue;
2933 2950 fill += BP_GET_FILL(ibp);
2934 2951 }
2935 2952 }
2936 2953 DB_DNODE_EXIT(db);
2937 2954
2938 2955 if (!BP_IS_EMBEDDED(bp))
2939 2956 bp->blk_fill = fill;
2940 2957
2941 2958 mutex_exit(&db->db_mtx);
2942 2959 }
2943 2960
2944 2961 /*
2945 2962 * The SPA will call this callback several times for each zio - once
2946 2963 * for every physical child i/o (zio->io_phys_children times). This
2947 2964 * allows the DMU to monitor the progress of each logical i/o. For example,
2948 2965 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2949 2966 * block. There may be a long delay before all copies/fragments are completed,
2950 2967 * so this callback allows us to retire dirty space gradually, as the physical
2951 2968 * i/os complete.
2952 2969 */
2953 2970 /* ARGSUSED */
2954 2971 static void
2955 2972 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2956 2973 {
2957 2974 dmu_buf_impl_t *db = arg;
2958 2975 objset_t *os = db->db_objset;
2959 2976 dsl_pool_t *dp = dmu_objset_pool(os);
2960 2977 dbuf_dirty_record_t *dr;
2961 2978 int delta = 0;
2962 2979
2963 2980 dr = db->db_data_pending;
2964 2981 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2965 2982
2966 2983 /*
2967 2984 * The callback will be called io_phys_children times. Retire one
2968 2985 * portion of our dirty space each time we are called. Any rounding
2969 2986 * error will be cleaned up by dsl_pool_sync()'s call to
2970 2987 * dsl_pool_undirty_space().
2971 2988 */
2972 2989 delta = dr->dr_accounted / zio->io_phys_children;
2973 2990 dsl_pool_undirty_space(dp, delta, zio->io_txg);
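	/*
	 * Editorial note: worked example with illustrative numbers: for
	 * dr_accounted == 131072 and io_phys_children == 3, each of the three
	 * callbacks undirties 131072 / 3 == 43690 bytes; the 2 bytes lost to
	 * integer truncation are the rounding error that dsl_pool_sync()
	 * later cleans up via dsl_pool_undirty_space().
	 */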
2974 2991 }
2975 2992
2976 2993 /* ARGSUSED */
2977 2994 static void
2978 2995 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2979 2996 {
2980 2997 dmu_buf_impl_t *db = vdb;
2981 2998 blkptr_t *bp_orig = &zio->io_bp_orig;
2982 2999 blkptr_t *bp = db->db_blkptr;
2983 3000 objset_t *os = db->db_objset;
2984 3001 dmu_tx_t *tx = os->os_synctx;
2985 3002 dbuf_dirty_record_t **drp, *dr;
2986 3003
2987 3004 ASSERT0(zio->io_error);
2988 3005 ASSERT(db->db_blkptr == bp);
2989 3006
2990 3007 /*
2991 3008 * For nopwrites and rewrites we ensure that the bp matches our
2992 3009 * original and bypass all the accounting.
2993 3010 */
2994 3011 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2995 3012 ASSERT(BP_EQUAL(bp, bp_orig));
2996 3013 } else {
2997 3014 dsl_dataset_t *ds = os->os_dsl_dataset;
2998 3015 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2999 3016 dsl_dataset_block_born(ds, bp, tx);
3000 3017 }
3001 3018
3002 3019 mutex_enter(&db->db_mtx);
3003 3020
3004 3021 DBUF_VERIFY(db);
3005 3022
3006 3023 drp = &db->db_last_dirty;
3007 3024 while ((dr = *drp) != db->db_data_pending)
3008 3025 drp = &dr->dr_next;
3009 3026 ASSERT(!list_link_active(&dr->dr_dirty_node));
3010 3027 ASSERT(dr->dr_dbuf == db);
3011 3028 ASSERT(dr->dr_next == NULL);
3012 3029 *drp = dr->dr_next;
3013 3030
3014 3031 #ifdef ZFS_DEBUG
3015 3032 if (db->db_blkid == DMU_SPILL_BLKID) {
3016 3033 dnode_t *dn;
3017 3034
3018 3035 DB_DNODE_ENTER(db);
3019 3036 dn = DB_DNODE(db);
3020 3037 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3021 3038 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
3022 3039 db->db_blkptr == &dn->dn_phys->dn_spill);
3023 3040 DB_DNODE_EXIT(db);
3024 3041 }
3025 3042 #endif
3026 3043
3027 3044 if (db->db_level == 0) {
3028 3045 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3029 3046 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
3030 3047 if (db->db_state != DB_NOFILL) {
3031 3048 if (dr->dt.dl.dr_data != db->db_buf)
3032 3049 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
3033 3050 db));
3034 3051 else if (!arc_released(db->db_buf))
3035 3052 arc_set_callback(db->db_buf, dbuf_do_evict, db);
3036 3053 }
3037 3054 } else {
3038 3055 dnode_t *dn;
3039 3056
3040 3057 DB_DNODE_ENTER(db);
3041 3058 dn = DB_DNODE(db);
3042 3059 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3043 3060 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
3044 3061 if (!BP_IS_HOLE(db->db_blkptr)) {
3045 3062 int epbs =
3046 3063 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3047 3064 ASSERT3U(db->db_blkid, <=,
3048 3065 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
3049 3066 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
3050 3067 db->db.db_size);
3051 3068 if (!arc_released(db->db_buf))
3052 3069 arc_set_callback(db->db_buf, dbuf_do_evict, db);
3053 3070 }
3054 3071 DB_DNODE_EXIT(db);
3055 3072 mutex_destroy(&dr->dt.di.dr_mtx);
3056 3073 list_destroy(&dr->dt.di.dr_children);
3057 3074 }
3058 3075 kmem_free(dr, sizeof (dbuf_dirty_record_t));
3059 3076
3060 3077 cv_broadcast(&db->db_changed);
3061 3078 ASSERT(db->db_dirtycnt > 0);
3062 3079 db->db_dirtycnt -= 1;
3063 3080 db->db_data_pending = NULL;
3064 3081 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
3065 3082 }
3066 3083
3067 3084 static void
3068 3085 dbuf_write_nofill_ready(zio_t *zio)
3069 3086 {
3070 3087 dbuf_write_ready(zio, NULL, zio->io_private);
3071 3088 }
3072 3089
3073 3090 static void
3074 3091 dbuf_write_nofill_done(zio_t *zio)
3075 3092 {
3076 3093 dbuf_write_done(zio, NULL, zio->io_private);
3077 3094 }
3078 3095
3079 3096 static void
3080 3097 dbuf_write_override_ready(zio_t *zio)
3081 3098 {
3082 3099 dbuf_dirty_record_t *dr = zio->io_private;
3083 3100 dmu_buf_impl_t *db = dr->dr_dbuf;
3084 3101
3085 3102 dbuf_write_ready(zio, NULL, db);
3086 3103 }
3087 3104
3088 3105 static void
3089 3106 dbuf_write_override_done(zio_t *zio)
3090 3107 {
3091 3108 dbuf_dirty_record_t *dr = zio->io_private;
3092 3109 dmu_buf_impl_t *db = dr->dr_dbuf;
3093 3110 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3094 3111
3095 3112 mutex_enter(&db->db_mtx);
3096 3113 if (!BP_EQUAL(zio->io_bp, obp)) {
3097 3114 if (!BP_IS_HOLE(obp))
3098 3115 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3099 3116 arc_release(dr->dt.dl.dr_data, db);
3100 3117 }
3101 3118 mutex_exit(&db->db_mtx);
3102 3119
3103 3120 dbuf_write_done(zio, NULL, db);
3104 3121 }
3105 3122
3106 3123 /* Issue I/O to commit a dirty buffer to disk. */
3107 3124 static void
3108 3125 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3109 3126 {
3110 3127 dmu_buf_impl_t *db = dr->dr_dbuf;
3111 3128 dnode_t *dn;
3112 3129 objset_t *os;
3113 3130 dmu_buf_impl_t *parent = db->db_parent;
3114 3131 uint64_t txg = tx->tx_txg;
3115 3132 zbookmark_phys_t zb;
3116 3133 zio_prop_t zp;
3117 3134 zio_t *zio;
3118 3135 int wp_flag = 0;
3119 3136
3120 3137 DB_DNODE_ENTER(db);
3121 3138 dn = DB_DNODE(db);
3122 3139 os = dn->dn_objset;
3123 3140
3124 3141 if (db->db_state != DB_NOFILL) {
3125 3142 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3126 3143 /*
3127 3144 * Private object buffers are released here rather
3128 3145 * than in dbuf_dirty() since they are only modified
3129 3146 * in the syncing context and we don't want the
3130 3147 * overhead of making multiple copies of the data.
3131 3148 */
3132 3149 if (BP_IS_HOLE(db->db_blkptr)) {
3133 3150 arc_buf_thaw(data);
3134 3151 } else {
3135 3152 dbuf_release_bp(db);
3136 3153 }
3137 3154 }
3138 3155 }
3139 3156
3140 3157 if (parent != dn->dn_dbuf) {
3141 3158 /* Our parent is an indirect block. */
3142 3159 /* We have a dirty parent that has been scheduled for write. */
3143 3160 ASSERT(parent && parent->db_data_pending);
3144 3161 /* Our parent's buffer is one level closer to the dnode. */
3145 3162 ASSERT(db->db_level == parent->db_level-1);
3146 3163 /*
3147 3164 * We're about to modify our parent's db_data by modifying
3148 3165 * our block pointer, so the parent must be released.
3149 3166 */
3150 3167 ASSERT(arc_released(parent->db_buf));
3151 3168 zio = parent->db_data_pending->dr_zio;
3152 3169 } else {
3153 3170 /* Our parent is the dnode itself. */
3154 3171 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3155 3172 db->db_blkid != DMU_SPILL_BLKID) ||
3156 3173 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3157 3174 if (db->db_blkid != DMU_SPILL_BLKID)
3158 3175 ASSERT3P(db->db_blkptr, ==,
3159 3176 &dn->dn_phys->dn_blkptr[db->db_blkid]);
3160 3177 zio = dn->dn_zio;
3161 3178 }
3162 3179
3163 3180 ASSERT(db->db_level == 0 || data == db->db_buf);
3164 3181 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3165 3182 ASSERT(zio);
3166 3183
3167 3184 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3168 3185 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3169 3186 db->db.db_object, db->db_level, db->db_blkid);
3170 3187
3171 3188 if (db->db_blkid == DMU_SPILL_BLKID)
3172 3189 wp_flag = WP_SPILL;
3173 3190 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3174 3191
3175 3192 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3176 3193 DB_DNODE_EXIT(db);
3177 3194
3178 3195 if (db->db_level == 0 &&
3179 3196 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3180 3197 /*
3181 3198 * The BP for this block has been provided by open context
3182 3199 * (by dmu_sync() or dmu_buf_write_embedded()).
3183 3200 */
3184 3201 void *contents = (data != NULL) ? data->b_data : NULL;
3185 3202
3186 3203 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3187 3204 db->db_blkptr, contents, db->db.db_size, &zp,
3188 3205 dbuf_write_override_ready, NULL, dbuf_write_override_done,
3189 3206 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3190 3207 mutex_enter(&db->db_mtx);
3191 3208 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3192 3209 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3193 3210 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3194 3211 mutex_exit(&db->db_mtx);
3195 3212 } else if (db->db_state == DB_NOFILL) {
3196 3213 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3197 3214 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3198 3215 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3199 3216 db->db_blkptr, NULL, db->db.db_size, &zp,
3200 3217 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
3201 3218 ZIO_PRIORITY_ASYNC_WRITE,
3202 3219 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3203 3220 } else {
3204 3221 ASSERT(arc_released(data));
3205 3222 dr->dr_zio = arc_write(zio, os->os_spa, txg,
3206 3223 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
3207 3224 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
3208 3225 dbuf_write_physdone, dbuf_write_done, db,
3209 3226 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3210 3227 }
3211 3228 }