Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 - * Copyright (c) 2013 by Delphix. All rights reserved.
24 + * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/dmu.h>
31 31 #include <sys/dmu_send.h>
32 32 #include <sys/dmu_impl.h>
33 33 #include <sys/dbuf.h>
34 34 #include <sys/dmu_objset.h>
35 35 #include <sys/dsl_dataset.h>
36 36 #include <sys/dsl_dir.h>
37 37 #include <sys/dmu_tx.h>
38 38 #include <sys/spa.h>
39 39 #include <sys/zio.h>
40 40 #include <sys/dmu_zfetch.h>
41 41 #include <sys/sa.h>
42 42 #include <sys/sa_impl.h>
43 +#include <sys/range_tree.h>
43 44
44 45 /*
45 46 * Number of times that zfs_free_range() took the slow path while doing
46 47 * a zfs receive. A nonzero value indicates a potential performance problem.
47 48 */
48 49 uint64_t zfs_free_range_recv_miss;
49 50
50 51 static void dbuf_destroy(dmu_buf_impl_t *db);
51 52 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
52 53 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
53 54
54 55 /*
55 56 * Global data structures and functions for the dbuf cache.
56 57 */
57 58 static kmem_cache_t *dbuf_cache;
58 59
59 60 /* ARGSUSED */
60 61 static int
61 62 dbuf_cons(void *vdb, void *unused, int kmflag)
62 63 {
63 64 dmu_buf_impl_t *db = vdb;
64 65 bzero(db, sizeof (dmu_buf_impl_t));
65 66
66 67 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
67 68 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
68 69 refcount_create(&db->db_holds);
69 70 return (0);
70 71 }
71 72
72 73 /* ARGSUSED */
73 74 static void
74 75 dbuf_dest(void *vdb, void *unused)
75 76 {
76 77 dmu_buf_impl_t *db = vdb;
77 78 mutex_destroy(&db->db_mtx);
78 79 cv_destroy(&db->db_changed);
79 80 refcount_destroy(&db->db_holds);
80 81 }
81 82
82 83 /*
83 84 * dbuf hash table routines
84 85 */
85 86 static dbuf_hash_table_t dbuf_hash_table;
86 87
87 88 static uint64_t dbuf_hash_count;
88 89
89 90 static uint64_t
90 91 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
91 92 {
92 93 uintptr_t osv = (uintptr_t)os;
93 94 uint64_t crc = -1ULL;
94 95
95 96 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
96 97 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
97 98 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
98 99 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
99 100 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
100 101 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
101 102 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
102 103
103 104 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
104 105
105 106 return (crc);
106 107 }
107 108
108 109 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
109 110
110 111 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
111 112 ((dbuf)->db.db_object == (obj) && \
112 113 (dbuf)->db_objset == (os) && \
113 114 (dbuf)->db_level == (level) && \
114 115 (dbuf)->db_blkid == (blkid))
115 116
116 117 dmu_buf_impl_t *
117 118 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
118 119 {
119 120 dbuf_hash_table_t *h = &dbuf_hash_table;
120 121 objset_t *os = dn->dn_objset;
121 122 uint64_t obj = dn->dn_object;
122 123 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
123 124 uint64_t idx = hv & h->hash_table_mask;
124 125 dmu_buf_impl_t *db;
125 126
126 127 mutex_enter(DBUF_HASH_MUTEX(h, idx));
127 128 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
128 129 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
129 130 mutex_enter(&db->db_mtx);
130 131 if (db->db_state != DB_EVICTING) {
131 132 mutex_exit(DBUF_HASH_MUTEX(h, idx));
132 133 return (db);
133 134 }
134 135 mutex_exit(&db->db_mtx);
135 136 }
136 137 }
137 138 mutex_exit(DBUF_HASH_MUTEX(h, idx));
138 139 return (NULL);
139 140 }
140 141
141 142 /*
142 143 * Insert an entry into the hash table. If there is already an element
143 144 * equal to elem in the hash table, then the already existing element
144 145 * will be returned and the new element will not be inserted.
145 146 * Otherwise returns NULL.
146 147 */
147 148 static dmu_buf_impl_t *
148 149 dbuf_hash_insert(dmu_buf_impl_t *db)
149 150 {
150 151 dbuf_hash_table_t *h = &dbuf_hash_table;
151 152 objset_t *os = db->db_objset;
152 153 uint64_t obj = db->db.db_object;
153 154 int level = db->db_level;
154 155 uint64_t blkid = db->db_blkid;
155 156 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
156 157 uint64_t idx = hv & h->hash_table_mask;
157 158 dmu_buf_impl_t *dbf;
158 159
159 160 mutex_enter(DBUF_HASH_MUTEX(h, idx));
160 161 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
161 162 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
162 163 mutex_enter(&dbf->db_mtx);
163 164 if (dbf->db_state != DB_EVICTING) {
164 165 mutex_exit(DBUF_HASH_MUTEX(h, idx));
165 166 return (dbf);
166 167 }
167 168 mutex_exit(&dbf->db_mtx);
168 169 }
169 170 }
170 171
171 172 mutex_enter(&db->db_mtx);
172 173 db->db_hash_next = h->hash_table[idx];
173 174 h->hash_table[idx] = db;
174 175 mutex_exit(DBUF_HASH_MUTEX(h, idx));
175 176 atomic_add_64(&dbuf_hash_count, 1);
176 177
177 178 return (NULL);
178 179 }
179 180
180 181 /*
181 182 * Remove an entry from the hash table. This operation will
182 183 * fail if there are any existing holds on the db.
183 184 */
184 185 static void
185 186 dbuf_hash_remove(dmu_buf_impl_t *db)
186 187 {
187 188 dbuf_hash_table_t *h = &dbuf_hash_table;
188 189 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
189 190 db->db_level, db->db_blkid);
190 191 uint64_t idx = hv & h->hash_table_mask;
191 192 dmu_buf_impl_t *dbf, **dbp;
192 193
193 194 /*
 194 195 	 * We mustn't hold db_mtx to maintain lock ordering:
195 196 * DBUF_HASH_MUTEX > db_mtx.
196 197 */
197 198 ASSERT(refcount_is_zero(&db->db_holds));
198 199 ASSERT(db->db_state == DB_EVICTING);
199 200 ASSERT(!MUTEX_HELD(&db->db_mtx));
200 201
201 202 mutex_enter(DBUF_HASH_MUTEX(h, idx));
202 203 dbp = &h->hash_table[idx];
203 204 while ((dbf = *dbp) != db) {
204 205 dbp = &dbf->db_hash_next;
205 206 ASSERT(dbf != NULL);
206 207 }
207 208 *dbp = db->db_hash_next;
208 209 db->db_hash_next = NULL;
209 210 mutex_exit(DBUF_HASH_MUTEX(h, idx));
210 211 atomic_add_64(&dbuf_hash_count, -1);
211 212 }
212 213
213 214 static arc_evict_func_t dbuf_do_evict;
214 215
215 216 static void
216 217 dbuf_evict_user(dmu_buf_impl_t *db)
217 218 {
218 219 ASSERT(MUTEX_HELD(&db->db_mtx));
219 220
220 221 if (db->db_level != 0 || db->db_evict_func == NULL)
221 222 return;
222 223
223 224 if (db->db_user_data_ptr_ptr)
224 225 *db->db_user_data_ptr_ptr = db->db.db_data;
225 226 db->db_evict_func(&db->db, db->db_user_ptr);
226 227 db->db_user_ptr = NULL;
227 228 db->db_user_data_ptr_ptr = NULL;
228 229 db->db_evict_func = NULL;
229 230 }
230 231
231 232 boolean_t
232 233 dbuf_is_metadata(dmu_buf_impl_t *db)
233 234 {
234 235 if (db->db_level > 0) {
235 236 return (B_TRUE);
236 237 } else {
237 238 boolean_t is_metadata;
238 239
239 240 DB_DNODE_ENTER(db);
240 241 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
241 242 DB_DNODE_EXIT(db);
242 243
243 244 return (is_metadata);
244 245 }
245 246 }
246 247
247 248 void
248 249 dbuf_evict(dmu_buf_impl_t *db)
249 250 {
250 251 ASSERT(MUTEX_HELD(&db->db_mtx));
251 252 ASSERT(db->db_buf == NULL);
252 253 ASSERT(db->db_data_pending == NULL);
253 254
254 255 dbuf_clear(db);
255 256 dbuf_destroy(db);
256 257 }
257 258
258 259 void
259 260 dbuf_init(void)
260 261 {
261 262 uint64_t hsize = 1ULL << 16;
262 263 dbuf_hash_table_t *h = &dbuf_hash_table;
263 264 int i;
264 265
265 266 /*
266 267 * The hash table is big enough to fill all of physical memory
267 268 * with an average 4K block size. The table will take up
268 269 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
269 270 */
270 271 while (hsize * 4096 < physmem * PAGESIZE)
271 272 hsize <<= 1;
272 273
273 274 retry:
274 275 h->hash_table_mask = hsize - 1;
275 276 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
276 277 if (h->hash_table == NULL) {
277 278 /* XXX - we should really return an error instead of assert */
278 279 ASSERT(hsize > (1ULL << 10));
279 280 hsize >>= 1;
280 281 goto retry;
281 282 }
282 283
283 284 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
284 285 sizeof (dmu_buf_impl_t),
285 286 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
286 287
287 288 for (i = 0; i < DBUF_MUTEXES; i++)
288 289 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
289 290 }
290 291
291 292 void
292 293 dbuf_fini(void)
293 294 {
294 295 dbuf_hash_table_t *h = &dbuf_hash_table;
295 296 int i;
296 297
297 298 for (i = 0; i < DBUF_MUTEXES; i++)
298 299 mutex_destroy(&h->hash_mutexes[i]);
299 300 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
300 301 kmem_cache_destroy(dbuf_cache);
301 302 }
302 303
303 304 /*
304 305 * Other stuff.
305 306 */
306 307
307 308 #ifdef ZFS_DEBUG
308 309 static void
309 310 dbuf_verify(dmu_buf_impl_t *db)
310 311 {
311 312 dnode_t *dn;
312 313 dbuf_dirty_record_t *dr;
313 314
314 315 ASSERT(MUTEX_HELD(&db->db_mtx));
315 316
316 317 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
317 318 return;
318 319
319 320 ASSERT(db->db_objset != NULL);
320 321 DB_DNODE_ENTER(db);
321 322 dn = DB_DNODE(db);
322 323 if (dn == NULL) {
323 324 ASSERT(db->db_parent == NULL);
324 325 ASSERT(db->db_blkptr == NULL);
325 326 } else {
326 327 ASSERT3U(db->db.db_object, ==, dn->dn_object);
327 328 ASSERT3P(db->db_objset, ==, dn->dn_objset);
328 329 ASSERT3U(db->db_level, <, dn->dn_nlevels);
329 330 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
330 331 db->db_blkid == DMU_SPILL_BLKID ||
331 332 !list_is_empty(&dn->dn_dbufs));
332 333 }
333 334 if (db->db_blkid == DMU_BONUS_BLKID) {
334 335 ASSERT(dn != NULL);
335 336 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
336 337 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
337 338 } else if (db->db_blkid == DMU_SPILL_BLKID) {
338 339 ASSERT(dn != NULL);
339 340 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
340 341 ASSERT0(db->db.db_offset);
341 342 } else {
342 343 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
343 344 }
344 345
345 346 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
346 347 ASSERT(dr->dr_dbuf == db);
347 348
348 349 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
349 350 ASSERT(dr->dr_dbuf == db);
350 351
351 352 /*
352 353 * We can't assert that db_size matches dn_datablksz because it
353 354 * can be momentarily different when another thread is doing
354 355 * dnode_set_blksz().
355 356 */
356 357 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
357 358 dr = db->db_data_pending;
358 359 /*
359 360 * It should only be modified in syncing context, so
360 361 * make sure we only have one copy of the data.
361 362 */
362 363 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
363 364 }
364 365
365 366 /* verify db->db_blkptr */
366 367 if (db->db_blkptr) {
367 368 if (db->db_parent == dn->dn_dbuf) {
368 369 /* db is pointed to by the dnode */
369 370 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
370 371 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
371 372 ASSERT(db->db_parent == NULL);
372 373 else
373 374 ASSERT(db->db_parent != NULL);
374 375 if (db->db_blkid != DMU_SPILL_BLKID)
375 376 ASSERT3P(db->db_blkptr, ==,
376 377 &dn->dn_phys->dn_blkptr[db->db_blkid]);
377 378 } else {
378 379 /* db is pointed to by an indirect block */
379 380 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
380 381 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
381 382 ASSERT3U(db->db_parent->db.db_object, ==,
382 383 db->db.db_object);
383 384 /*
384 385 * dnode_grow_indblksz() can make this fail if we don't
385 386 * have the struct_rwlock. XXX indblksz no longer
386 387 * grows. safe to do this now?
387 388 */
388 389 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
389 390 ASSERT3P(db->db_blkptr, ==,
390 391 ((blkptr_t *)db->db_parent->db.db_data +
391 392 db->db_blkid % epb));
392 393 }
393 394 }
394 395 }
395 396 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
396 397 (db->db_buf == NULL || db->db_buf->b_data) &&
397 398 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
398 399 db->db_state != DB_FILL && !dn->dn_free_txg) {
399 400 /*
400 401 * If the blkptr isn't set but they have nonzero data,
401 402 * it had better be dirty, otherwise we'll lose that
402 403 * data when we evict this buffer.
403 404 */
404 405 if (db->db_dirtycnt == 0) {
405 406 uint64_t *buf = db->db.db_data;
406 407 int i;
407 408
408 409 for (i = 0; i < db->db.db_size >> 3; i++) {
409 410 ASSERT(buf[i] == 0);
410 411 }
411 412 }
412 413 }
413 414 DB_DNODE_EXIT(db);
414 415 }
415 416 #endif
416 417
417 418 static void
418 419 dbuf_update_data(dmu_buf_impl_t *db)
419 420 {
420 421 ASSERT(MUTEX_HELD(&db->db_mtx));
421 422 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
422 423 ASSERT(!refcount_is_zero(&db->db_holds));
423 424 *db->db_user_data_ptr_ptr = db->db.db_data;
424 425 }
425 426 }
426 427
427 428 static void
428 429 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
429 430 {
430 431 ASSERT(MUTEX_HELD(&db->db_mtx));
431 432 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
432 433 db->db_buf = buf;
433 434 if (buf != NULL) {
434 435 ASSERT(buf->b_data != NULL);
435 436 db->db.db_data = buf->b_data;
436 437 if (!arc_released(buf))
437 438 arc_set_callback(buf, dbuf_do_evict, db);
438 439 dbuf_update_data(db);
439 440 } else {
440 441 dbuf_evict_user(db);
441 442 db->db.db_data = NULL;
442 443 if (db->db_state != DB_NOFILL)
443 444 db->db_state = DB_UNCACHED;
444 445 }
445 446 }
446 447
447 448 /*
448 449 * Loan out an arc_buf for read. Return the loaned arc_buf.
449 450 */
450 451 arc_buf_t *
451 452 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
452 453 {
453 454 arc_buf_t *abuf;
454 455
455 456 mutex_enter(&db->db_mtx);
456 457 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
457 458 int blksz = db->db.db_size;
458 459 spa_t *spa = db->db_objset->os_spa;
459 460
460 461 mutex_exit(&db->db_mtx);
461 462 abuf = arc_loan_buf(spa, blksz);
462 463 bcopy(db->db.db_data, abuf->b_data, blksz);
463 464 } else {
464 465 abuf = db->db_buf;
465 466 arc_loan_inuse_buf(abuf, db);
466 467 dbuf_set_data(db, NULL);
467 468 mutex_exit(&db->db_mtx);
468 469 }
469 470 return (abuf);
470 471 }
471 472
472 473 uint64_t
473 474 dbuf_whichblock(dnode_t *dn, uint64_t offset)
474 475 {
475 476 if (dn->dn_datablkshift) {
476 477 return (offset >> dn->dn_datablkshift);
477 478 } else {
478 479 ASSERT3U(offset, <, dn->dn_datablksz);
479 480 return (0);
480 481 }
481 482 }
482 483
483 484 static void
484 485 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
485 486 {
486 487 dmu_buf_impl_t *db = vdb;
487 488
488 489 mutex_enter(&db->db_mtx);
489 490 ASSERT3U(db->db_state, ==, DB_READ);
490 491 /*
491 492 * All reads are synchronous, so we must have a hold on the dbuf
492 493 */
493 494 ASSERT(refcount_count(&db->db_holds) > 0);
494 495 ASSERT(db->db_buf == NULL);
495 496 ASSERT(db->db.db_data == NULL);
496 497 if (db->db_level == 0 && db->db_freed_in_flight) {
497 498 /* we were freed in flight; disregard any error */
498 499 arc_release(buf, db);
499 500 bzero(buf->b_data, db->db.db_size);
500 501 arc_buf_freeze(buf);
501 502 db->db_freed_in_flight = FALSE;
502 503 dbuf_set_data(db, buf);
503 504 db->db_state = DB_CACHED;
504 505 } else if (zio == NULL || zio->io_error == 0) {
505 506 dbuf_set_data(db, buf);
506 507 db->db_state = DB_CACHED;
507 508 } else {
508 509 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
509 510 ASSERT3P(db->db_buf, ==, NULL);
510 511 VERIFY(arc_buf_remove_ref(buf, db));
511 512 db->db_state = DB_UNCACHED;
512 513 }
513 514 cv_broadcast(&db->db_changed);
514 515 dbuf_rele_and_unlock(db, NULL);
515 516 }
516 517
517 518 static void
518 519 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
519 520 {
520 521 dnode_t *dn;
521 522 zbookmark_t zb;
522 523 uint32_t aflags = ARC_NOWAIT;
523 524
524 525 DB_DNODE_ENTER(db);
525 526 dn = DB_DNODE(db);
526 527 ASSERT(!refcount_is_zero(&db->db_holds));
527 528 /* We need the struct_rwlock to prevent db_blkptr from changing. */
528 529 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
529 530 ASSERT(MUTEX_HELD(&db->db_mtx));
530 531 ASSERT(db->db_state == DB_UNCACHED);
531 532 ASSERT(db->db_buf == NULL);
532 533
533 534 if (db->db_blkid == DMU_BONUS_BLKID) {
534 535 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
535 536
536 537 ASSERT3U(bonuslen, <=, db->db.db_size);
537 538 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
538 539 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
539 540 if (bonuslen < DN_MAX_BONUSLEN)
540 541 bzero(db->db.db_data, DN_MAX_BONUSLEN);
541 542 if (bonuslen)
542 543 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
543 544 DB_DNODE_EXIT(db);
544 545 dbuf_update_data(db);
545 546 db->db_state = DB_CACHED;
546 547 mutex_exit(&db->db_mtx);
547 548 return;
548 549 }
549 550
550 551 /*
551 552 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
552 553 * processes the delete record and clears the bp while we are waiting
553 554 * for the dn_mtx (resulting in a "no" from block_freed).
554 555 */
555 556 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
556 557 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
557 558 BP_IS_HOLE(db->db_blkptr)))) {
558 559 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
559 560
560 561 DB_DNODE_EXIT(db);
561 562 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
562 563 db->db.db_size, db, type));
563 564 bzero(db->db.db_data, db->db.db_size);
564 565 db->db_state = DB_CACHED;
565 566 *flags |= DB_RF_CACHED;
566 567 mutex_exit(&db->db_mtx);
567 568 return;
568 569 }
569 570
570 571 DB_DNODE_EXIT(db);
571 572
572 573 db->db_state = DB_READ;
573 574 mutex_exit(&db->db_mtx);
574 575
575 576 if (DBUF_IS_L2CACHEABLE(db))
576 577 aflags |= ARC_L2CACHE;
577 578 if (DBUF_IS_L2COMPRESSIBLE(db))
578 579 aflags |= ARC_L2COMPRESS;
579 580
580 581 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
581 582 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
582 583 db->db.db_object, db->db_level, db->db_blkid);
583 584
584 585 dbuf_add_ref(db, NULL);
585 586
586 587 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
587 588 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
588 589 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
589 590 &aflags, &zb);
590 591 if (aflags & ARC_CACHED)
591 592 *flags |= DB_RF_CACHED;
592 593 }
593 594
594 595 int
595 596 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
596 597 {
597 598 int err = 0;
598 599 boolean_t havepzio = (zio != NULL);
599 600 boolean_t prefetch;
600 601 dnode_t *dn;
601 602
602 603 /*
603 604 * We don't have to hold the mutex to check db_state because it
604 605 * can't be freed while we have a hold on the buffer.
605 606 */
606 607 ASSERT(!refcount_is_zero(&db->db_holds));
607 608
608 609 if (db->db_state == DB_NOFILL)
609 610 return (SET_ERROR(EIO));
610 611
611 612 DB_DNODE_ENTER(db);
612 613 dn = DB_DNODE(db);
613 614 if ((flags & DB_RF_HAVESTRUCT) == 0)
614 615 rw_enter(&dn->dn_struct_rwlock, RW_READER);
615 616
616 617 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
617 618 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
618 619 DBUF_IS_CACHEABLE(db);
619 620
620 621 mutex_enter(&db->db_mtx);
621 622 if (db->db_state == DB_CACHED) {
622 623 mutex_exit(&db->db_mtx);
623 624 if (prefetch)
624 625 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
625 626 db->db.db_size, TRUE);
626 627 if ((flags & DB_RF_HAVESTRUCT) == 0)
627 628 rw_exit(&dn->dn_struct_rwlock);
628 629 DB_DNODE_EXIT(db);
629 630 } else if (db->db_state == DB_UNCACHED) {
630 631 spa_t *spa = dn->dn_objset->os_spa;
631 632
632 633 if (zio == NULL)
633 634 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
634 635 dbuf_read_impl(db, zio, &flags);
635 636
636 637 /* dbuf_read_impl has dropped db_mtx for us */
637 638
638 639 if (prefetch)
639 640 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
640 641 db->db.db_size, flags & DB_RF_CACHED);
641 642
642 643 if ((flags & DB_RF_HAVESTRUCT) == 0)
643 644 rw_exit(&dn->dn_struct_rwlock);
644 645 DB_DNODE_EXIT(db);
645 646
646 647 if (!havepzio)
647 648 err = zio_wait(zio);
648 649 } else {
649 650 /*
650 651 * Another reader came in while the dbuf was in flight
651 652 * between UNCACHED and CACHED. Either a writer will finish
652 653 * writing the buffer (sending the dbuf to CACHED) or the
653 654 * first reader's request will reach the read_done callback
654 655 * and send the dbuf to CACHED. Otherwise, a failure
655 656 * occurred and the dbuf went to UNCACHED.
656 657 */
657 658 mutex_exit(&db->db_mtx);
658 659 if (prefetch)
659 660 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
660 661 db->db.db_size, TRUE);
661 662 if ((flags & DB_RF_HAVESTRUCT) == 0)
662 663 rw_exit(&dn->dn_struct_rwlock);
663 664 DB_DNODE_EXIT(db);
664 665
665 666 /* Skip the wait per the caller's request. */
666 667 mutex_enter(&db->db_mtx);
667 668 if ((flags & DB_RF_NEVERWAIT) == 0) {
668 669 while (db->db_state == DB_READ ||
669 670 db->db_state == DB_FILL) {
670 671 ASSERT(db->db_state == DB_READ ||
671 672 (flags & DB_RF_HAVESTRUCT) == 0);
672 673 cv_wait(&db->db_changed, &db->db_mtx);
673 674 }
674 675 if (db->db_state == DB_UNCACHED)
675 676 err = SET_ERROR(EIO);
676 677 }
677 678 mutex_exit(&db->db_mtx);
678 679 }
679 680
680 681 ASSERT(err || havepzio || db->db_state == DB_CACHED);
681 682 return (err);
682 683 }
683 684
684 685 static void
685 686 dbuf_noread(dmu_buf_impl_t *db)
686 687 {
687 688 ASSERT(!refcount_is_zero(&db->db_holds));
688 689 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
689 690 mutex_enter(&db->db_mtx);
690 691 while (db->db_state == DB_READ || db->db_state == DB_FILL)
691 692 cv_wait(&db->db_changed, &db->db_mtx);
692 693 if (db->db_state == DB_UNCACHED) {
693 694 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
694 695 spa_t *spa = db->db_objset->os_spa;
695 696
696 697 ASSERT(db->db_buf == NULL);
697 698 ASSERT(db->db.db_data == NULL);
698 699 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
699 700 db->db_state = DB_FILL;
700 701 } else if (db->db_state == DB_NOFILL) {
701 702 dbuf_set_data(db, NULL);
702 703 } else {
703 704 ASSERT3U(db->db_state, ==, DB_CACHED);
704 705 }
705 706 mutex_exit(&db->db_mtx);
706 707 }
707 708
708 709 /*
709 710 * This is our just-in-time copy function. It makes a copy of
710 711 * buffers, that have been modified in a previous transaction
711 712 * group, before we modify them in the current active group.
712 713 *
713 714 * This function is used in two places: when we are dirtying a
714 715 * buffer for the first time in a txg, and when we are freeing
715 716 * a range in a dnode that includes this buffer.
716 717 *
717 718 * Note that when we are called from dbuf_free_range() we do
718 719 * not put a hold on the buffer, we just traverse the active
719 720 * dbuf list for the dnode.
720 721 */
721 722 static void
722 723 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
723 724 {
724 725 dbuf_dirty_record_t *dr = db->db_last_dirty;
725 726
726 727 ASSERT(MUTEX_HELD(&db->db_mtx));
727 728 ASSERT(db->db.db_data != NULL);
728 729 ASSERT(db->db_level == 0);
729 730 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
730 731
731 732 if (dr == NULL ||
732 733 (dr->dt.dl.dr_data !=
733 734 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
734 735 return;
735 736
736 737 /*
737 738 * If the last dirty record for this dbuf has not yet synced
 738 739 	 * and it's referencing the dbuf data, either:
739 740 * reset the reference to point to a new copy,
 740 741 	 * or (if there are no active holders)
741 742 * just null out the current db_data pointer.
742 743 */
743 744 ASSERT(dr->dr_txg >= txg - 2);
744 745 if (db->db_blkid == DMU_BONUS_BLKID) {
745 746 /* Note that the data bufs here are zio_bufs */
746 747 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
747 748 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
748 749 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
749 750 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
750 751 int size = db->db.db_size;
751 752 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
752 753 spa_t *spa = db->db_objset->os_spa;
753 754
754 755 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
755 756 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
756 757 } else {
757 758 dbuf_set_data(db, NULL);
758 759 }
759 760 }
760 761
761 762 void
762 763 dbuf_unoverride(dbuf_dirty_record_t *dr)
763 764 {
764 765 dmu_buf_impl_t *db = dr->dr_dbuf;
765 766 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
766 767 uint64_t txg = dr->dr_txg;
767 768
768 769 ASSERT(MUTEX_HELD(&db->db_mtx));
769 770 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
770 771 ASSERT(db->db_level == 0);
771 772
772 773 if (db->db_blkid == DMU_BONUS_BLKID ||
773 774 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
774 775 return;
775 776
776 777 ASSERT(db->db_data_pending != dr);
777 778
778 779 /* free this block */
779 780 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
780 781 zio_free(db->db_objset->os_spa, txg, bp);
781 782
782 783 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
783 784 dr->dt.dl.dr_nopwrite = B_FALSE;
784 785
785 786 /*
786 787 * Release the already-written buffer, so we leave it in
787 788 * a consistent dirty state. Note that all callers are
788 789 * modifying the buffer, so they will immediately do
789 790 * another (redundant) arc_release(). Therefore, leave
790 791 * the buf thawed to save the effort of freezing &
791 792 * immediately re-thawing it.
792 793 */
793 794 arc_release(dr->dt.dl.dr_data, db);
794 795 }
795 796
796 797 /*
 797 798  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
798 799 * data blocks in the free range, so that any future readers will find
799 800 * empty blocks.
800 801 *
801 802 * This is a no-op if the dataset is in the middle of an incremental
802 803 * receive; see comment below for details.
803 804 */
804 805 void
805 806 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
806 807 {
807 808 dmu_buf_impl_t *db, *db_next;
808 809 uint64_t txg = tx->tx_txg;
809 810
810 811 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
811 812 end = dn->dn_maxblkid;
812 813 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
813 814
814 815 mutex_enter(&dn->dn_dbufs_mtx);
815 816 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
816 817 /* There can't be any dbufs in this range; no need to search. */
817 818 mutex_exit(&dn->dn_dbufs_mtx);
818 819 return;
819 820 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
820 821 /*
821 822 * If we are receiving, we expect there to be no dbufs in
822 823 * the range to be freed, because receive modifies each
823 824 * block at most once, and in offset order. If this is
824 825 * not the case, it can lead to performance problems,
825 826 * so note that we unexpectedly took the slow path.
826 827 */
827 828 atomic_inc_64(&zfs_free_range_recv_miss);
828 829 }
829 830
830 831 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
831 832 db_next = list_next(&dn->dn_dbufs, db);
832 833 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
833 834
834 835 if (db->db_level != 0)
835 836 continue;
836 837 if (db->db_blkid < start || db->db_blkid > end)
837 838 continue;
838 839
839 840 /* found a level 0 buffer in the range */
840 841 mutex_enter(&db->db_mtx);
841 842 if (dbuf_undirty(db, tx)) {
842 843 /* mutex has been dropped and dbuf destroyed */
843 844 continue;
844 845 }
845 846
846 847 if (db->db_state == DB_UNCACHED ||
847 848 db->db_state == DB_NOFILL ||
848 849 db->db_state == DB_EVICTING) {
849 850 ASSERT(db->db.db_data == NULL);
850 851 mutex_exit(&db->db_mtx);
851 852 continue;
852 853 }
853 854 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
854 855 /* will be handled in dbuf_read_done or dbuf_rele */
855 856 db->db_freed_in_flight = TRUE;
856 857 mutex_exit(&db->db_mtx);
857 858 continue;
858 859 }
859 860 if (refcount_count(&db->db_holds) == 0) {
860 861 ASSERT(db->db_buf);
861 862 dbuf_clear(db);
862 863 continue;
863 864 }
864 865 /* The dbuf is referenced */
865 866
866 867 if (db->db_last_dirty != NULL) {
867 868 dbuf_dirty_record_t *dr = db->db_last_dirty;
868 869
869 870 if (dr->dr_txg == txg) {
870 871 /*
871 872 * This buffer is "in-use", re-adjust the file
872 873 * size to reflect that this buffer may
873 874 * contain new data when we sync.
874 875 */
875 876 if (db->db_blkid != DMU_SPILL_BLKID &&
876 877 db->db_blkid > dn->dn_maxblkid)
877 878 dn->dn_maxblkid = db->db_blkid;
878 879 dbuf_unoverride(dr);
879 880 } else {
880 881 /*
881 882 * This dbuf is not dirty in the open context.
 882 883 			 * Either uncache it (if it's not referenced in
883 884 * the open context) or reset its contents to
884 885 * empty.
885 886 */
886 887 dbuf_fix_old_data(db, txg);
887 888 }
888 889 }
 889 890 		/* clear the contents if it's cached */
890 891 if (db->db_state == DB_CACHED) {
891 892 ASSERT(db->db.db_data != NULL);
892 893 arc_release(db->db_buf, db);
893 894 bzero(db->db.db_data, db->db.db_size);
894 895 arc_buf_freeze(db->db_buf);
895 896 }
896 897
897 898 mutex_exit(&db->db_mtx);
898 899 }
899 900 mutex_exit(&dn->dn_dbufs_mtx);
900 901 }
901 902
902 903 static int
903 904 dbuf_block_freeable(dmu_buf_impl_t *db)
904 905 {
905 906 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
906 907 uint64_t birth_txg = 0;
907 908
908 909 /*
909 910 * We don't need any locking to protect db_blkptr:
910 911 * If it's syncing, then db_last_dirty will be set
911 912 * so we'll ignore db_blkptr.
912 913 *
913 914 * This logic ensures that only block births for
914 915 * filled blocks are considered.
915 916 */
916 917 ASSERT(MUTEX_HELD(&db->db_mtx));
917 918 if (db->db_last_dirty && (db->db_blkptr == NULL ||
918 919 !BP_IS_HOLE(db->db_blkptr))) {
919 920 birth_txg = db->db_last_dirty->dr_txg;
920 921 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
921 922 birth_txg = db->db_blkptr->blk_birth;
922 923 }
923 924
924 925 /*
 925 926 	 * If this block doesn't exist or is in a snapshot, it can't be freed.
926 927 * Don't pass the bp to dsl_dataset_block_freeable() since we
927 928 * are holding the db_mtx lock and might deadlock if we are
928 929 * prefetching a dedup-ed block.
929 930 */
930 931 if (birth_txg != 0)
931 932 return (ds == NULL ||
932 933 dsl_dataset_block_freeable(ds, NULL, birth_txg));
933 934 else
934 935 return (B_FALSE);
935 936 }
936 937
/*
 * Resize a (non-bonus) dbuf to 'size' bytes in transaction 'tx':
 * dirty the buffer, allocate a new ARC buffer of the new size, copy the
 * old contents, zero any grown tail, and swap it in as the dbuf's data.
 * Caller must hold dn_struct_rwlock as writer (asserted below).
 */
937 938 void
938 939 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
939 940 {
940 941 	arc_buf_t *buf, *obuf;
941 942 	int osize = db->db.db_size;
942 943 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
943 944 	dnode_t *dn;
944 945 
945 946 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
946 947 
947 948 	DB_DNODE_ENTER(db);
948 949 	dn = DB_DNODE(db);
949 950 
950 951 	/* XXX does *this* func really need the lock? */
951 952 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
952 953 
953 954 	/*
954 955 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
955 956 	 * is OK, because there can be no other references to the db
956 957 	 * when we are changing its size, so no concurrent DB_FILL can
957 958 	 * be happening.
958 959 	 */
959 960 	/*
960 961 	 * XXX we should be doing a dbuf_read, checking the return
961 962 	 * value and returning that up to our callers
962 963 	 */
963 964 	dmu_buf_will_dirty(&db->db, tx);
964 965 
965 966 	/* create the data buffer for the new block */
966 967 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
967 968 
968 969 	/* copy old block data to the new block */
969 970 	obuf = db->db_buf;
970 971 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
971 972 	/* zero the remainder */
972 973 	if (size > osize)
973 974 		bzero((uint8_t *)buf->b_data + osize, size - osize);
974 975 
975 976 	mutex_enter(&db->db_mtx);
976 977 	dbuf_set_data(db, buf);
977 978 	VERIFY(arc_buf_remove_ref(obuf, db));
978 979 	db->db.db_size = size;
979 980 
980 981 	if (db->db_level == 0) {
981 982 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
982 983 		db->db_last_dirty->dt.dl.dr_data = buf;
983 984 	}
984 985 	mutex_exit(&db->db_mtx);
985 986 
	/* account for the size delta (may be negative when shrinking) */
986 987 	dnode_willuse_space(dn, size-osize, tx);
987 988 	DB_DNODE_EXIT(db);
988 989 }
989 990
/*
 * Release this dbuf's ARC buffer so its block pointer may be
 * overwritten.  Only legal in syncing context, once the objset's
 * phys buf (or the dataset's synced link) shows the sync has begun;
 * the parent's buffer must already be released (asserted below).
 */
990 991 void
991 992 dbuf_release_bp(dmu_buf_impl_t *db)
992 993 {
993 994 	objset_t *os = db->db_objset;
994 995 
995 996 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
996 997 	ASSERT(arc_released(os->os_phys_buf) ||
997 998 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
998 999 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
999 1000 
1000 1001 	(void) arc_release(db->db_buf, db);
1001 1002 }
1002 1003
/*
 * Mark this dbuf dirty in transaction 'tx'.  If the dbuf is already
 * dirty in tx's txg the existing dirty record is reused (after
 * resetting any override state); otherwise a new dirty record is
 * created, old data is preserved for earlier txgs, the record is
 * linked into the parent's (or dnode's) per-txg dirty list, and the
 * dnode itself is marked dirty.  Returns the dirty record; a hold on
 * db is added for the txg.
 */
1003 1004 dbuf_dirty_record_t *
1004 1005 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1005 1006 {
1006 1007 	dnode_t *dn;
1007 1008 	objset_t *os;
1008 1009 	dbuf_dirty_record_t **drp, *dr;
1009 1010 	int drop_struct_lock = FALSE;
1010 1011 	boolean_t do_free_accounting = B_FALSE;
1011 1012 	int txgoff = tx->tx_txg & TXG_MASK;
1012 1013 
1013 1014 	ASSERT(tx->tx_txg != 0);
1014 1015 	ASSERT(!refcount_is_zero(&db->db_holds));
1015 1016 	DMU_TX_DIRTY_BUF(tx, db);
1016 1017 
1017 1018 	DB_DNODE_ENTER(db);
1018 1019 	dn = DB_DNODE(db);
1019 1020 	/*
1020 1021 	 * Shouldn't dirty a regular buffer in syncing context.  Private
1021 1022 	 * objects may be dirtied in syncing context, but only if they
1022 1023 	 * were already pre-dirtied in open context.
1023 1024 	 */
1024 1025 	ASSERT(!dmu_tx_is_syncing(tx) ||
1025 1026 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1026 1027 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1027 1028 	    dn->dn_objset->os_dsl_dataset == NULL);
1028 1029 	/*
1029 1030 	 * We make this assert for private objects as well, but after we
1030 1031 	 * check if we're already dirty.  They are allowed to re-dirty
1031 1032 	 * in syncing context.
1032 1033 	 */
1033 1034 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1034 1035 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1035 1036 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1036 1037 
1037 1038 	mutex_enter(&db->db_mtx);
1038 1039 	/*
1039 1040 	 * XXX make this true for indirects too?  The problem is that
1040 1041 	 * transactions created with dmu_tx_create_assigned() from
1041 1042 	 * syncing context don't bother holding ahead.
1042 1043 	 */
1043 1044 	ASSERT(db->db_level != 0 ||
1044 1045 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1045 1046 	    db->db_state == DB_NOFILL);
1046 1047 
1047 1048 	mutex_enter(&dn->dn_mtx);
1048 1049 	/*
1049 1050 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1050 1051 	 * initialize the objset.
1051 1052 	 */
1052 1053 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1053 1054 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1054 1055 		dn->dn_dirtyctx =
1055 1056 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1056 1057 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1057 1058 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1058 1059 	}
1059 1060 	mutex_exit(&dn->dn_mtx);
1060 1061 
1061 1062 	if (db->db_blkid == DMU_SPILL_BLKID)
1062 1063 		dn->dn_have_spill = B_TRUE;
1063 1064 
1064 1065 	/*
1065 1066 	 * If this buffer is already dirty, we're done.
1066 1067 	 */
1067 1068 	drp = &db->db_last_dirty;
1068 1069 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1069 1070 	    db->db.db_object == DMU_META_DNODE_OBJECT);
	/* walk the dirty-record list (newest first) to tx's txg */
1070 1071 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1071 1072 		drp = &dr->dr_next;
1072 1073 	if (dr && dr->dr_txg == tx->tx_txg) {
1073 1074 		DB_DNODE_EXIT(db);
1074 1075 
1075 1076 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1076 1077 			/*
1077 1078 			 * If this buffer has already been written out,
1078 1079 			 * we now need to reset its state.
1079 1080 			 */
1080 1081 			dbuf_unoverride(dr);
1081 1082 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1082 1083 			    db->db_state != DB_NOFILL)
1083 1084 				arc_buf_thaw(db->db_buf);
1084 1085 		}
1085 1086 		mutex_exit(&db->db_mtx);
1086 1087 		return (dr);
1087 1088 	}
1088 1089 
1089 1090 	/*
1090 1091 	 * Only valid if not already dirty.
1091 1092 	 */
1092 1093 	ASSERT(dn->dn_object == 0 ||
1093 1094 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1094 1095 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1095 1096 
1096 1097 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1097 1098 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1098 1099 	    dn->dn_phys->dn_nlevels > db->db_level ||
1099 1100 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1100 1101 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1101 1102 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1102 1103 
1103 1104 	/*
1104 1105 	 * We should only be dirtying in syncing context if it's the
1105 1106 	 * mos or we're initializing the os or it's a special object.
1106 1107 	 * However, we are allowed to dirty in syncing context provided
1107 1108 	 * we already dirtied it in open context.  Hence we must make
1108 1109 	 * this assertion only if we're not already dirty.
1109 1110 	 */
1110 1111 	os = dn->dn_objset;
1111 1112 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1112 1113 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1113 1114 	ASSERT(db->db.db_size != 0);
1114 1115 
1115 1116 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1116 1117 
1117 1118 	if (db->db_blkid != DMU_BONUS_BLKID) {
1118 1119 		/*
1119 1120 		 * Update the accounting.
1120 1121 		 * Note: we delay "free accounting" until after we drop
1121 1122 		 * the db_mtx.  This keeps us from grabbing other locks
1122 1123 		 * (and possibly deadlocking) in bp_get_dsize() while
1123 1124 		 * also holding the db_mtx.
1124 1125 		 */
1125 1126 		dnode_willuse_space(dn, db->db.db_size, tx);
1126 1127 		do_free_accounting = dbuf_block_freeable(db);
1127 1128 	}
1128 1129 
1129 1130 	/*
1130 1131 	 * If this buffer is dirty in an old transaction group we need
1131 1132 	 * to make a copy of it so that the changes we make in this
1132 1133 	 * transaction group won't leak out when we sync the older txg.
1133 1134 	 */
1134 1135 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1135 1136 	if (db->db_level == 0) {
1136 1137 		void *data_old = db->db_buf;
1137 1138 
1138 1139 		if (db->db_state != DB_NOFILL) {
1139 1140 			if (db->db_blkid == DMU_BONUS_BLKID) {
1140 1141 				dbuf_fix_old_data(db, tx->tx_txg);
1141 1142 				data_old = db->db.db_data;
1142 1143 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1143 1144 				/*
1144 1145 				 * Release the data buffer from the cache so
1145 1146 				 * that we can modify it without impacting
1146 1147 				 * possible other users of this cached data
1147 1148 				 * block.  Note that indirect blocks and
1148 1149 				 * private objects are not released until the
1149 1150 				 * syncing state (since they are only modified
1150 1151 				 * then).
1151 1152 				 */
1152 1153 				arc_release(db->db_buf, db);
1153 1154 				dbuf_fix_old_data(db, tx->tx_txg);
1154 1155 				data_old = db->db_buf;
1155 1156 			}
1156 1157 			ASSERT(data_old != NULL);
1157 1158 		}
1158 1159 		dr->dt.dl.dr_data = data_old;
1159 1160 	} else {
1160 1161 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1161 1162 		list_create(&dr->dt.di.dr_children,
1162 1163 		    sizeof (dbuf_dirty_record_t),
1163 1164 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1164 1165 	}
1165 1166 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1166 1167 		dr->dr_accounted = db->db.db_size;
1167 1168 	dr->dr_dbuf = db;
1168 1169 	dr->dr_txg = tx->tx_txg;
1169 1170 	dr->dr_next = *drp;
|
↓ open down ↓ |
1117 lines elided |
↑ open up ↑ |
1170 1171 	*drp = dr;
1171 1172 
1172 1173 	/*
1173 1174 	 * We could have been freed_in_flight between the dbuf_noread
1174 1175 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1175 1176 	 * happened after the free.
1176 1177 	 */
1177 1178 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1178 1179 	    db->db_blkid != DMU_SPILL_BLKID) {
1179 1180 		mutex_enter(&dn->dn_mtx);
	/*
	 * 4374: dn_free_ranges is now a range_tree_t per txg; remove
	 * this block from the pending-free range since it is being
	 * rewritten in this txg.
	 */
1180 - 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1181 + 		if (dn->dn_free_ranges[txgoff] != NULL) {
1182 + 			range_tree_clear(dn->dn_free_ranges[txgoff],
1183 + 			    db->db_blkid, 1);
1184 + 		}
1181 1185 		mutex_exit(&dn->dn_mtx);
1182 1186 		db->db_freed_in_flight = FALSE;
1183 1187 	}
1184 1188 
1185 1189 	/*
1186 1190 	 * This buffer is now part of this txg
1187 1191 	 */
1188 1192 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1189 1193 	db->db_dirtycnt += 1;
1190 1194 	ASSERT3U(db->db_dirtycnt, <=, 3);
1191 1195 
1192 1196 	mutex_exit(&db->db_mtx);
1193 1197 
1194 1198 	if (db->db_blkid == DMU_BONUS_BLKID ||
1195 1199 	    db->db_blkid == DMU_SPILL_BLKID) {
1196 1200 		mutex_enter(&dn->dn_mtx);
1197 1201 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1198 1202 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1199 1203 		mutex_exit(&dn->dn_mtx);
1200 1204 		dnode_setdirty(dn, tx);
1201 1205 		DB_DNODE_EXIT(db);
1202 1206 		return (dr);
1203 1207 	} else if (do_free_accounting) {
1204 1208 		blkptr_t *bp = db->db_blkptr;
1205 1209 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1206 1210 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1207 1211 		/*
1208 1212 		 * This is only a guess -- if the dbuf is dirty
1209 1213 		 * in a previous txg, we don't know how much
1210 1214 		 * space it will use on disk yet.  We should
1211 1215 		 * really have the struct_rwlock to access
1212 1216 		 * db_blkptr, but since this is just a guess,
1213 1217 		 * it's OK if we get an odd answer.
1214 1218 		 */
1215 1219 		ddt_prefetch(os->os_spa, bp);
1216 1220 		dnode_willuse_space(dn, -willfree, tx);
1217 1221 	}
1218 1222 
1219 1223 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1220 1224 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1221 1225 		drop_struct_lock = TRUE;
1222 1226 	}
1223 1227 
1224 1228 	if (db->db_level == 0) {
1225 1229 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1226 1230 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1227 1231 	}
1228 1232 
1229 1233 	if (db->db_level+1 < dn->dn_nlevels) {
1230 1234 		dmu_buf_impl_t *parent = db->db_parent;
1231 1235 		dbuf_dirty_record_t *di;
1232 1236 		int parent_held = FALSE;
1233 1237 
1234 1238 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1235 1239 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1236 1240 
1237 1241 			parent = dbuf_hold_level(dn, db->db_level+1,
1238 1242 			    db->db_blkid >> epbs, FTAG);
1239 1243 			ASSERT(parent != NULL);
1240 1244 			parent_held = TRUE;
1241 1245 		}
1242 1246 		if (drop_struct_lock)
1243 1247 			rw_exit(&dn->dn_struct_rwlock);
1244 1248 		ASSERT3U(db->db_level+1, ==, parent->db_level);
		/* recursively dirty the parent indirect block */
1245 1249 		di = dbuf_dirty(parent, tx);
1246 1250 		if (parent_held)
1247 1251 			dbuf_rele(parent, FTAG);
1248 1252 
1249 1253 		mutex_enter(&db->db_mtx);
1250 1254 		/*
1251 1255 		 * Since we've dropped the mutex, it's possible that
1252 1256 		 * dbuf_undirty() might have changed this out from under us.
1253 1257 		 */
1254 1258 		if (db->db_last_dirty == dr ||
1255 1259 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1256 1260 			mutex_enter(&di->dt.di.dr_mtx);
1257 1261 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1258 1262 			ASSERT(!list_link_active(&dr->dr_dirty_node));
1259 1263 			list_insert_tail(&di->dt.di.dr_children, dr);
1260 1264 			mutex_exit(&di->dt.di.dr_mtx);
1261 1265 			dr->dr_parent = di;
1262 1266 		}
1263 1267 		mutex_exit(&db->db_mtx);
1264 1268 	} else {
1265 1269 		ASSERT(db->db_level+1 == dn->dn_nlevels);
1266 1270 		ASSERT(db->db_blkid < dn->dn_nblkptr);
1267 1271 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1268 1272 		mutex_enter(&dn->dn_mtx);
1269 1273 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1270 1274 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1271 1275 		mutex_exit(&dn->dn_mtx);
1272 1276 		if (drop_struct_lock)
1273 1277 			rw_exit(&dn->dn_struct_rwlock);
1274 1278 	}
1275 1279 
1276 1280 	dnode_setdirty(dn, tx);
1277 1281 	DB_DNODE_EXIT(db);
1278 1282 	return (dr);
1279 1283 }
1280 1284
1281 1285 /*
1282 1286  * Undirty a buffer in the transaction group referenced by the given
1283 1287  * transaction.  Return whether this evicted the dbuf.
1284 1288  */
/*
 * Caller must hold db_mtx (asserted); only level-0, non-bonus dbufs
 * may be undirtied.  Removes the dirty record from whichever dirty
 * list dbuf_dirty() put it on, releases the preserved data buffer,
 * and drops the per-txg hold; if that was the last hold the dbuf is
 * evicted and B_TRUE is returned.
 */
1285 1289 static boolean_t
1286 1290 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1287 1291 {
1288 1292 	dnode_t *dn;
1289 1293 	uint64_t txg = tx->tx_txg;
1290 1294 	dbuf_dirty_record_t *dr, **drp;
1291 1295 
1292 1296 	ASSERT(txg != 0);
1293 1297 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1294 1298 	ASSERT0(db->db_level);
1295 1299 	ASSERT(MUTEX_HELD(&db->db_mtx));
1296 1300 
1297 1301 	/*
1298 1302 	 * If this buffer is not dirty, we're done.
1299 1303 	 */
1300 1304 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1301 1305 		if (dr->dr_txg <= txg)
1302 1306 			break;
1303 1307 	if (dr == NULL || dr->dr_txg < txg)
1304 1308 		return (B_FALSE);
1305 1309 	ASSERT(dr->dr_txg == txg);
1306 1310 	ASSERT(dr->dr_dbuf == db);
1307 1311 
1308 1312 	DB_DNODE_ENTER(db);
1309 1313 	dn = DB_DNODE(db);
1310 1314 
1311 1315 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1312 1316 
1313 1317 	ASSERT(db->db.db_size != 0);
1314 1318 
1315 1319 	/*
1316 1320 	 * Any space we accounted for in dp_dirty_* will be cleaned up by
1317 1321 	 * dsl_pool_sync().  This is relatively rare so the discrepancy
1318 1322 	 * is not a big deal.
1319 1323 	 */
1320 1324 
	/* unlink the dirty record from the dbuf's list */
1321 1325 	*drp = dr->dr_next;
1322 1326 
1323 1327 	/*
1324 1328 	 * Note that there are three places in dbuf_dirty()
1325 1329 	 * where this dirty record may be put on a list.
1326 1330 	 * Make sure to do a list_remove corresponding to
1327 1331 	 * every one of those list_insert calls.
1328 1332 	 */
1329 1333 	if (dr->dr_parent) {
1330 1334 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1331 1335 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1332 1336 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1333 1337 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1334 1338 	    db->db_level+1 == dn->dn_nlevels) {
1335 1339 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1336 1340 		mutex_enter(&dn->dn_mtx);
1337 1341 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1338 1342 		mutex_exit(&dn->dn_mtx);
1339 1343 	}
1340 1344 	DB_DNODE_EXIT(db);
1341 1345 
1342 1346 	if (db->db_state != DB_NOFILL) {
1343 1347 		dbuf_unoverride(dr);
1344 1348 
1345 1349 		ASSERT(db->db_buf != NULL);
1346 1350 		ASSERT(dr->dt.dl.dr_data != NULL);
1347 1351 		if (dr->dt.dl.dr_data != db->db_buf)
1348 1352 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1349 1353 	}
1350 1354 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1351 1355 
1352 1356 	ASSERT(db->db_dirtycnt > 0);
1353 1357 	db->db_dirtycnt -= 1;
1354 1358 
1355 1359 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1356 1360 		arc_buf_t *buf = db->db_buf;
1357 1361 
1358 1362 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1359 1363 		dbuf_set_data(db, NULL);
1360 1364 		VERIFY(arc_buf_remove_ref(buf, db));
1361 1365 		dbuf_evict(db);
1362 1366 		return (B_TRUE);
1363 1367 	}
1364 1368 
1365 1369 	return (B_FALSE);
1366 1370 }
1367 1371
/*
 * Public entry point: read the buffer in (must succeed) and then mark
 * it dirty in 'tx'.  Passes DB_RF_HAVESTRUCT when the caller already
 * holds dn_struct_rwlock as writer.
 */
1368 1372 void
1369 1373 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1370 1374 {
1371 1375 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1372 1376 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1373 1377 
1374 1378 	ASSERT(tx->tx_txg != 0);
1375 1379 	ASSERT(!refcount_is_zero(&db->db_holds));
1376 1380 
1377 1381 	DB_DNODE_ENTER(db);
1378 1382 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1379 1383 		rf |= DB_RF_HAVESTRUCT;
1380 1384 	DB_DNODE_EXIT(db);
1381 1385 	(void) dbuf_read(db, NULL, rf);
1382 1386 	(void) dbuf_dirty(db, tx);
1383 1387 }
1384 1388
/*
 * Mark the buffer DB_NOFILL (its contents will not be read or
 * initialized) and then run the normal will-fill path to dirty it.
 */
1385 1389 void
1386 1390 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1387 1391 {
1388 1392 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1389 1393 
1390 1394 	db->db_state = DB_NOFILL;
1391 1395 
1392 1396 	dmu_buf_will_fill(db_fake, tx);
1393 1397 }
1394 1398
/*
 * Prepare a level-0, non-bonus buffer to be completely overwritten:
 * skip reading the old contents (dbuf_noread) and dirty it in 'tx'.
 * The caller must later call dmu_buf_fill_done().
 */
1395 1399 void
1396 1400 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1397 1401 {
1398 1402 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1399 1403 
1400 1404 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1401 1405 	ASSERT(tx->tx_txg != 0);
1402 1406 	ASSERT(db->db_level == 0);
1403 1407 	ASSERT(!refcount_is_zero(&db->db_holds));
1404 1408 
1405 1409 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1406 1410 	    dmu_tx_private_ok(tx));
1407 1411 
1408 1412 	dbuf_noread(db);
1409 1413 	(void) dbuf_dirty(db, tx);
1410 1414 }
1411 1415
/*
 * Complete a fill started by dmu_buf_will_fill(): if the block was
 * freed while the fill was in flight, zero it; then mark the dbuf
 * DB_CACHED and wake any waiters blocked on db_changed.
 */
1412 1416 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1413 1417 /* ARGSUSED */
1414 1418 void
1415 1419 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1416 1420 {
1417 1421 	mutex_enter(&db->db_mtx);
1418 1422 	DBUF_VERIFY(db);
1419 1423 
1420 1424 	if (db->db_state == DB_FILL) {
1421 1425 		if (db->db_level == 0 && db->db_freed_in_flight) {
1422 1426 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1423 1427 			/* we were freed while filling */
1424 1428 			/* XXX dbuf_undirty? */
1425 1429 			bzero(db->db.db_data, db->db.db_size);
1426 1430 			db->db_freed_in_flight = FALSE;
1427 1431 		}
1428 1432 		db->db_state = DB_CACHED;
1429 1433 		cv_broadcast(&db->db_changed);
1430 1434 	}
1431 1435 	mutex_exit(&db->db_mtx);
1432 1436 }
1433 1437
1434 1438 /*
1435 1439  * Directly assign a provided arc buf to a given dbuf if it's not referenced
1436 1440  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1437 1441  */
/*
 * 'buf' must be the same size as the dbuf and must have been returned
 * to the ARC (arc_return_buf).  On the copy path the provided buf is
 * released; on the assign path any previously cached buf is released
 * and replaced by 'buf', then the fill is completed.
 */
1438 1442 void
1439 1443 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1440 1444 {
1441 1445 	ASSERT(!refcount_is_zero(&db->db_holds));
1442 1446 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1443 1447 	ASSERT(db->db_level == 0);
1444 1448 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1445 1449 	ASSERT(buf != NULL);
1446 1450 	ASSERT(arc_buf_size(buf) == db->db.db_size);
1447 1451 	ASSERT(tx->tx_txg != 0);
1448 1452 
1449 1453 	arc_return_buf(buf, db);
1450 1454 	ASSERT(arc_released(buf));
1451 1455 
1452 1456 	mutex_enter(&db->db_mtx);
1453 1457 
	/* wait for any in-flight read or fill to finish */
1454 1458 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1455 1459 		cv_wait(&db->db_changed, &db->db_mtx);
1456 1460 
1457 1461 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1458 1462 
	/* other holders exist: fall back to copying into the dbuf */
1459 1463 	if (db->db_state == DB_CACHED &&
1460 1464 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1461 1465 		mutex_exit(&db->db_mtx);
1462 1466 		(void) dbuf_dirty(db, tx);
1463 1467 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1464 1468 		VERIFY(arc_buf_remove_ref(buf, db));
1465 1469 		xuio_stat_wbuf_copied();
1466 1470 		return;
1467 1471 	}
1468 1472 
1469 1473 	xuio_stat_wbuf_nocopy();
1470 1474 	if (db->db_state == DB_CACHED) {
1471 1475 		dbuf_dirty_record_t *dr = db->db_last_dirty;
1472 1476 
1473 1477 		ASSERT(db->db_buf != NULL);
1474 1478 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1475 1479 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1476 1480 			if (!arc_released(db->db_buf)) {
1477 1481 				ASSERT(dr->dt.dl.dr_override_state ==
1478 1482 				    DR_OVERRIDDEN);
1479 1483 				arc_release(db->db_buf, db);
1480 1484 			}
1481 1485 			dr->dt.dl.dr_data = buf;
1482 1486 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1483 1487 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1484 1488 			arc_release(db->db_buf, db);
1485 1489 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1486 1490 		}
1487 1491 		db->db_buf = NULL;
1488 1492 	}
1489 1493 	ASSERT(db->db_buf == NULL);
1490 1494 	dbuf_set_data(db, buf);
1491 1495 	db->db_state = DB_FILL;
1492 1496 	mutex_exit(&db->db_mtx);
1493 1497 	(void) dbuf_dirty(db, tx);
1494 1498 	dmu_buf_fill_done(&db->db, tx);
1495 1499 }
1496 1500
1497 1501 /*
1498 1502  * "Clear" the contents of this dbuf.  This will mark the dbuf
1499 1503  * EVICTING and clear *most* of its references.  Unfortunately,
1500 1504  * when we are not holding the dn_dbufs_mtx, we can't clear the
1501 1505  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1502 1506  * in this case.  For callers from the DMU we will usually see:
1503 1507  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1504 1508  * For the arc callback, we will usually see:
1505 1509  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1506 1510  * Sometimes, though, we will get a mix of these two:
1507 1511  *	DMU: dbuf_clear()->arc_buf_evict()
1508 1512  *	ARC: dbuf_do_evict()->dbuf_destroy()
1509 1513  */
/* Caller holds db_mtx; db must have no holds (asserted below). */
1510 1514 void
1511 1515 dbuf_clear(dmu_buf_impl_t *db)
1512 1516 {
1513 1517 	dnode_t *dn;
1514 1518 	dmu_buf_impl_t *parent = db->db_parent;
1515 1519 	dmu_buf_impl_t *dndb;
1516 1520 	int dbuf_gone = FALSE;
1517 1521 
1518 1522 	ASSERT(MUTEX_HELD(&db->db_mtx));
1519 1523 	ASSERT(refcount_is_zero(&db->db_holds));
1520 1524 
1521 1525 	dbuf_evict_user(db);
1522 1526 
1523 1527 	if (db->db_state == DB_CACHED) {
1524 1528 		ASSERT(db->db.db_data != NULL);
1525 1529 		if (db->db_blkid == DMU_BONUS_BLKID) {
1526 1530 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1527 1531 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1528 1532 		}
1529 1533 		db->db.db_data = NULL;
1530 1534 		db->db_state = DB_UNCACHED;
1531 1535 	}
1532 1536 
1533 1537 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1534 1538 	ASSERT(db->db_data_pending == NULL);
1535 1539 
1536 1540 	db->db_state = DB_EVICTING;
1537 1541 	db->db_blkptr = NULL;
1538 1542 
1539 1543 	DB_DNODE_ENTER(db);
1540 1544 	dn = DB_DNODE(db);
1541 1545 	dndb = dn->dn_dbuf;
1542 1546 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1543 1547 		list_remove(&dn->dn_dbufs, db);
1544 1548 		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1545 1549 		membar_producer();
1546 1550 		DB_DNODE_EXIT(db);
1547 1551 		/*
1548 1552 		 * Decrementing the dbuf count means that the hold corresponding
1549 1553 		 * to the removed dbuf is no longer discounted in dnode_move(),
1550 1554 		 * so the dnode cannot be moved until after we release the hold.
1551 1555 		 * The membar_producer() ensures visibility of the decremented
1552 1556 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1553 1557 		 * release any lock.
1554 1558 		 */
1555 1559 		dnode_rele(dn, db);
1556 1560 		db->db_dnode_handle = NULL;
1557 1561 	} else {
1558 1562 		DB_DNODE_EXIT(db);
1559 1563 	}
1560 1564 
1561 1565 	if (db->db_buf)
1562 1566 		dbuf_gone = arc_buf_evict(db->db_buf);
1563 1567 
	/* if arc_buf_evict() already destroyed db, its mutex is gone */
1564 1568 	if (!dbuf_gone)
1565 1569 		mutex_exit(&db->db_mtx);
1566 1570 
1567 1571 	/*
1568 1572 	 * If this dbuf is referenced from an indirect dbuf,
1569 1573 	 * decrement the ref count on the indirect dbuf.
1570 1574 	 */
1571 1575 	if (parent && parent != dndb)
1572 1576 		dbuf_rele(parent, db);
1573 1577 }
1574 1578
/*
 * Locate the parent dbuf and block pointer for (level, blkid) of dnode
 * 'dn'.  On success *parentp holds a referenced parent dbuf (or NULL
 * when the bp lives directly in the dnode phys) and *bpp points at the
 * block pointer.  Returns ENOENT when no parent exists yet, or the
 * error from reading the parent indirect block.  Caller holds
 * dn_struct_rwlock (asserted below).
 */
1575 1579 static int
1576 1580 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1577 1581     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1578 1582 {
1579 1583 	int nlevels, epbs;
1580 1584 
1581 1585 	*parentp = NULL;
1582 1586 	*bpp = NULL;
1583 1587 
1584 1588 	ASSERT(blkid != DMU_BONUS_BLKID);
1585 1589 
1586 1590 	if (blkid == DMU_SPILL_BLKID) {
1587 1591 		mutex_enter(&dn->dn_mtx);
1588 1592 		if (dn->dn_have_spill &&
1589 1593 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1590 1594 			*bpp = &dn->dn_phys->dn_spill;
1591 1595 		else
1592 1596 			*bpp = NULL;
1593 1597 		dbuf_add_ref(dn->dn_dbuf, NULL);
1594 1598 		*parentp = dn->dn_dbuf;
1595 1599 		mutex_exit(&dn->dn_mtx);
1596 1600 		return (0);
1597 1601 	}
1598 1602 
1599 1603 	if (dn->dn_phys->dn_nlevels == 0)
1600 1604 		nlevels = 1;
1601 1605 	else
1602 1606 		nlevels = dn->dn_phys->dn_nlevels;
1603 1607 
	/* entries per indirect block, as a shift */
1604 1608 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1605 1609 
1606 1610 	ASSERT3U(level * epbs, <, 64);
1607 1611 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1608 1612 	if (level >= nlevels ||
1609 1613 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1610 1614 		/* the buffer has no parent yet */
1611 1615 		return (SET_ERROR(ENOENT));
1612 1616 	} else if (level < nlevels-1) {
1613 1617 		/* this block is referenced from an indirect block */
1614 1618 		int err = dbuf_hold_impl(dn, level+1,
1615 1619 		    blkid >> epbs, fail_sparse, NULL, parentp);
1616 1620 		if (err)
1617 1621 			return (err);
1618 1622 		err = dbuf_read(*parentp, NULL,
1619 1623 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1620 1624 		if (err) {
1621 1625 			dbuf_rele(*parentp, NULL);
1622 1626 			*parentp = NULL;
1623 1627 			return (err);
1624 1628 		}
1625 1629 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1626 1630 		    (blkid & ((1ULL << epbs) - 1));
1627 1631 		return (0);
1628 1632 	} else {
1629 1633 		/* the block is referenced from the dnode */
1630 1634 		ASSERT3U(level, ==, nlevels-1);
1631 1635 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1632 1636 		    blkid < dn->dn_phys->dn_nblkptr);
1633 1637 		if (dn->dn_dbuf) {
1634 1638 			dbuf_add_ref(dn->dn_dbuf, NULL);
1635 1639 			*parentp = dn->dn_dbuf;
1636 1640 		}
1637 1641 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1638 1642 		return (0);
1639 1643 	}
1640 1644 }
1641 1645
/*
 * Allocate and initialize a new dbuf for (level, blkid) of dnode 'dn',
 * insert it into the dbuf hash table and the dnode's dn_dbufs list,
 * and take a hold on the dnode.  Bonus dbufs skip the hash table.  If
 * another thread inserts the same dbuf first, the duplicate is freed
 * and the existing one returned.  Caller holds dn_struct_rwlock.
 */
1642 1646 static dmu_buf_impl_t *
1643 1647 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1644 1648     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1645 1649 {
1646 1650 	objset_t *os = dn->dn_objset;
1647 1651 	dmu_buf_impl_t *db, *odb;
1648 1652 
1649 1653 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1650 1654 	ASSERT(dn->dn_type != DMU_OT_NONE);
1651 1655 
1652 1656 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1653 1657 
1654 1658 	db->db_objset = os;
1655 1659 	db->db.db_object = dn->dn_object;
1656 1660 	db->db_level = level;
1657 1661 	db->db_blkid = blkid;
1658 1662 	db->db_last_dirty = NULL;
1659 1663 	db->db_dirtycnt = 0;
1660 1664 	db->db_dnode_handle = dn->dn_handle;
1661 1665 	db->db_parent = parent;
1662 1666 	db->db_blkptr = blkptr;
1663 1667 
1664 1668 	db->db_user_ptr = NULL;
1665 1669 	db->db_user_data_ptr_ptr = NULL;
1666 1670 	db->db_evict_func = NULL;
1667 1671 	db->db_immediate_evict = 0;
1668 1672 	db->db_freed_in_flight = 0;
1669 1673 
1670 1674 	if (blkid == DMU_BONUS_BLKID) {
1671 1675 		ASSERT3P(parent, ==, dn->dn_dbuf);
1672 1676 		db->db.db_size = DN_MAX_BONUSLEN -
1673 1677 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1674 1678 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1675 1679 		db->db.db_offset = DMU_BONUS_BLKID;
1676 1680 		db->db_state = DB_UNCACHED;
1677 1681 		/* the bonus dbuf is not placed in the hash table */
1678 1682 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1679 1683 		return (db);
1680 1684 	} else if (blkid == DMU_SPILL_BLKID) {
1681 1685 		db->db.db_size = (blkptr != NULL) ?
1682 1686 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1683 1687 		db->db.db_offset = 0;
1684 1688 	} else {
1685 1689 		int blocksize =
1686 1690 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1687 1691 		db->db.db_size = blocksize;
1688 1692 		db->db.db_offset = db->db_blkid * blocksize;
1689 1693 	}
1690 1694 
1691 1695 	/*
1692 1696 	 * Hold the dn_dbufs_mtx while we get the new dbuf
1693 1697 	 * in the hash table *and* added to the dbufs list.
1694 1698 	 * This prevents a possible deadlock with someone
1695 1699 	 * trying to look up this dbuf before its added to the
1696 1700 	 * dn_dbufs list.
1697 1701 	 */
1698 1702 	mutex_enter(&dn->dn_dbufs_mtx);
1699 1703 	db->db_state = DB_EVICTING;
1700 1704 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1701 1705 		/* someone else inserted it first */
1702 1706 		kmem_cache_free(dbuf_cache, db);
1703 1707 		mutex_exit(&dn->dn_dbufs_mtx);
1704 1708 		return (odb);
1705 1709 	}
1706 1710 	list_insert_head(&dn->dn_dbufs, db);
	/* track the highest level-0 blkid not yet on dn_dbufs */
1707 1711 	if (db->db_level == 0 && db->db_blkid >=
1708 1712 	    dn->dn_unlisted_l0_blkid)
1709 1713 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1710 1714 	db->db_state = DB_UNCACHED;
1711 1715 	mutex_exit(&dn->dn_dbufs_mtx);
1712 1716 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1713 1717 
1714 1718 	if (parent && parent != dn->dn_dbuf)
1715 1719 		dbuf_add_ref(parent, db);
1716 1720 
1717 1721 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1718 1722 	    refcount_count(&dn->dn_holds) > 0);
1719 1723 	(void) refcount_add(&dn->dn_holds, db);
1720 1724 	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1721 1725 
1722 1726 	dprintf_dbuf(db, "db=%p\n", db);
1723 1727 
1724 1728 	return (db);
1725 1729 }
1726 1730
/*
 * ARC eviction callback for a dbuf's buffer.  If the dbuf is still
 * CACHED, detach the buffer and evict the dbuf; if it is already
 * EVICTING (dbuf_clear ran first), just destroy it.
 */
1727 1731 static int
1728 1732 dbuf_do_evict(void *private)
1729 1733 {
1730 1734 	arc_buf_t *buf = private;
1731 1735 	dmu_buf_impl_t *db = buf->b_private;
1732 1736 
1733 1737 	if (!MUTEX_HELD(&db->db_mtx))
1734 1738 		mutex_enter(&db->db_mtx);
1735 1739 
1736 1740 	ASSERT(refcount_is_zero(&db->db_holds));
1737 1741 
1738 1742 	if (db->db_state != DB_EVICTING) {
1739 1743 		ASSERT(db->db_state == DB_CACHED);
1740 1744 		DBUF_VERIFY(db);
1741 1745 		db->db_buf = NULL;
1742 1746 		dbuf_evict(db);
1743 1747 	} else {
1744 1748 		mutex_exit(&db->db_mtx);
1745 1749 		dbuf_destroy(db);
1746 1750 	}
1747 1751 	return (0);
1748 1752 }
1749 1753
/*
 * Final teardown of a dbuf with no holds: remove it from the dnode's
 * dn_dbufs list (if dbuf_clear could not, i.e. db_dnode_handle is
 * still set) and from the hash table, drop the dnode hold, and return
 * the structure to the kmem cache.
 */
1750 1754 static void
1751 1755 dbuf_destroy(dmu_buf_impl_t *db)
1752 1756 {
1753 1757 	ASSERT(refcount_is_zero(&db->db_holds));
1754 1758 
1755 1759 	if (db->db_blkid != DMU_BONUS_BLKID) {
1756 1760 		/*
1757 1761 		 * If this dbuf is still on the dn_dbufs list,
1758 1762 		 * remove it from that list.
1759 1763 		 */
1760 1764 		if (db->db_dnode_handle != NULL) {
1761 1765 			dnode_t *dn;
1762 1766 
1763 1767 			DB_DNODE_ENTER(db);
1764 1768 			dn = DB_DNODE(db);
1765 1769 			mutex_enter(&dn->dn_dbufs_mtx);
1766 1770 			list_remove(&dn->dn_dbufs, db);
1767 1771 			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1768 1772 			mutex_exit(&dn->dn_dbufs_mtx);
1769 1773 			DB_DNODE_EXIT(db);
1770 1774 			/*
1771 1775 			 * Decrementing the dbuf count means that the hold
1772 1776 			 * corresponding to the removed dbuf is no longer
1773 1777 			 * discounted in dnode_move(), so the dnode cannot be
1774 1778 			 * moved until after we release the hold.
1775 1779 			 */
1776 1780 			dnode_rele(dn, db);
1777 1781 			db->db_dnode_handle = NULL;
1778 1782 		}
1779 1783 		dbuf_hash_remove(db);
1780 1784 	}
1781 1785 	db->db_parent = NULL;
1782 1786 	db->db_buf = NULL;
1783 1787 
1784 1788 	ASSERT(!list_link_active(&db->db_link));
1785 1789 	ASSERT(db->db.db_data == NULL);
1786 1790 	ASSERT(db->db_hash_next == NULL);
1787 1791 	ASSERT(db->db_blkptr == NULL);
1788 1792 	ASSERT(db->db_data_pending == NULL);
1789 1793 
1790 1794 	kmem_cache_free(dbuf_cache, db);
1791 1795 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1792 1796 }
1793 1797
/*
 * Issue a speculative ARC read for level-0 block 'blkid' of dnode
 * 'dn' at the given priority.  A no-op if the block was freed, is
 * already cached as a dbuf, or is a hole.  Caller holds
 * dn_struct_rwlock (asserted below).
 */
1794 1798 void
1795 1799 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1796 1800 {
1797 1801 	dmu_buf_impl_t *db = NULL;
1798 1802 	blkptr_t *bp = NULL;
1799 1803 
1800 1804 	ASSERT(blkid != DMU_BONUS_BLKID);
1801 1805 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1802 1806 
1803 1807 	if (dnode_block_freed(dn, blkid))
1804 1808 		return;
1805 1809 
1806 1810 	/* dbuf_find() returns with db_mtx held */
1807 1811 	if (db = dbuf_find(dn, 0, blkid)) {
1808 1812 		/*
1809 1813 		 * This dbuf is already in the cache.  We assume that
1810 1814 		 * it is already CACHED, or else about to be either
1811 1815 		 * read or filled.
1812 1816 		 */
1813 1817 		mutex_exit(&db->db_mtx);
1814 1818 		return;
1815 1819 	}
1816 1820 
1817 1821 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1818 1822 		if (bp && !BP_IS_HOLE(bp)) {
1819 1823 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1820 1824 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1821 1825 			zbookmark_t zb;
1822 1826 
1823 1827 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1824 1828 			    dn->dn_object, 0, blkid);
1825 1829 
			/* fire-and-forget read; done callback is NULL */
1826 1830 			(void) arc_read(NULL, dn->dn_objset->os_spa,
1827 1831 			    bp, NULL, NULL, prio,
1828 1832 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1829 1833 			    &aflags, &zb);
1830 1834 		}
1831 1835 		if (db)
1832 1836 			dbuf_rele(db, NULL);
1833 1837 	}
1834 1838 }
1835 1839
/*
 * Look up (creating if necessary) the dbuf for block "blkid" at
 * indirection level "level" of dnode "dn".  If fail_sparse is set,
 * holes produce ENOENT instead of a dbuf.  On success, *dbp holds
 * the dbuf.
 *
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		/* Locate the parent and bp, then instantiate the dbuf. */
		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	/*
	 * If the dbuf has no holds, try to re-reference its ARC buffer.
	 * If the data has already been reclaimed, clear the dbuf and
	 * retry the lookup from scratch.
	 */
	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
1927 1931
1928 1932 dmu_buf_impl_t *
1929 1933 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1930 1934 {
1931 1935 dmu_buf_impl_t *db;
1932 1936 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1933 1937 return (err ? NULL : db);
1934 1938 }
1935 1939
1936 1940 dmu_buf_impl_t *
1937 1941 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1938 1942 {
1939 1943 dmu_buf_impl_t *db;
1940 1944 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1941 1945 return (err ? NULL : db);
1942 1946 }
1943 1947
/*
 * Create the bonus-buffer dbuf for "dn".  The caller must hold
 * dn_struct_rwlock as writer, and the dnode must not already have
 * a bonus dbuf.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
1952 1956
1953 1957 int
1954 1958 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1955 1959 {
1956 1960 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1957 1961 dnode_t *dn;
1958 1962
1959 1963 if (db->db_blkid != DMU_SPILL_BLKID)
1960 1964 return (SET_ERROR(ENOTSUP));
1961 1965 if (blksz == 0)
1962 1966 blksz = SPA_MINBLOCKSIZE;
1963 1967 if (blksz > SPA_MAXBLOCKSIZE)
1964 1968 blksz = SPA_MAXBLOCKSIZE;
1965 1969 else
1966 1970 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1967 1971
1968 1972 DB_DNODE_ENTER(db);
1969 1973 dn = DB_DNODE(db);
1970 1974 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1971 1975 dbuf_new_size(db, blksz, tx);
1972 1976 rw_exit(&dn->dn_struct_rwlock);
1973 1977 DB_DNODE_EXIT(db);
1974 1978
1975 1979 return (0);
1976 1980 }
1977 1981
/* Free the dnode's spill block, if any, in transaction "tx". */
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
1983 1987
#pragma weak dmu_buf_add_ref = dbuf_add_ref
/*
 * Add a hold on an already-held dbuf (the assertion requires that at
 * least one other hold exists, so the dbuf cannot be evicted from
 * under us).
 */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}
1991 1995
/*
 * Release a hold on a dbuf.
 *
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
2005 2009
/* Public (dmu_buf_t) wrapper around dbuf_rele(). */
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}
2011 2015
/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 *
 * Entered with db_mtx held; the lock is dropped on every return path
 * (the explicit paths call mutex_exit(); the dbuf_evict()/dbuf_clear()
 * paths are presumed to release it — confirm against those functions).
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	/*
	 * Once only dirty-record holds remain, a level-0 dbuf marked for
	 * immediate eviction may run its user eviction callback.
	 */
	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move(). The dnode cannot move until after
			 * the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
2104 2108
#pragma weak dmu_buf_refcount = dbuf_refcount
/* Return the current number of holds on the dbuf. */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}
2111 2115
/*
 * Attach user data and an eviction callback to a dbuf.  Implemented as
 * dmu_buf_update_user() with a NULL expected old pointer, so this only
 * takes effect if no user data is currently set; otherwise the current
 * (unchanged) user pointer is returned.
 */
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2119 2123
/*
 * Like dmu_buf_set_user(), but additionally marks the dbuf for
 * "immediate eviction": with db_immediate_evict set, the user eviction
 * callback fires as soon as the hold count drops to the dirty count
 * (see dbuf_rele_and_unlock()).
 */
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2130 2134
/*
 * Compare-and-swap of a dbuf's user data, performed under db_mtx:
 * if the current user pointer equals old_user_ptr, install user_ptr,
 * user_data_ptr_ptr and evict_func and return old_user_ptr; otherwise
 * change nothing and return the current user pointer.  Only valid for
 * level-0 dbufs.
 */
void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	/* A user pointer and its eviction callback come and go together. */
	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}
2155 2159
/*
 * Return the dbuf's user pointer.  The caller must already hold the
 * dbuf (asserted), which keeps the user data from being evicted.
 */
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}
2164 2168
2165 2169 boolean_t
2166 2170 dmu_buf_freeable(dmu_buf_t *dbuf)
2167 2171 {
2168 2172 boolean_t res = B_FALSE;
2169 2173 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2170 2174
2171 2175 if (db->db_blkptr)
2172 2176 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2173 2177 db->db_blkptr, db->db_blkptr->blk_birth);
2174 2178
2175 2179 return (res);
2176 2180 }
2177 2181
/*
 * Return the block pointer backing this buffer (may be NULL if one has
 * not yet been assigned — see dbuf_check_blkptr()).
 */
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}
2184 2188
/*
 * Ensure db->db_blkptr points at the right location for this dbuf:
 * the dnode's spill blkptr, one of the blkptrs embedded in the dnode
 * phys (for top-level blocks), or a slot inside the parent indirect
 * block, which is held (and read in) here if necessary.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/*
			 * db_mtx must be dropped while taking a hold on
			 * the parent indirect block one level up.
			 */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
2229 2233
/*
 * Sync an indirect-block dirty record: make sure the block is in
 * memory and linked to its parent, issue its write zio, then sync all
 * of its dirty children into that zio before starting it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/*
	 * All children must be synced (their zios chained under ours)
	 * before our zio is allowed to run.
	 */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2275 2279
/*
 * Sync a level-0 dirty record: copy bonus-buffer data directly into
 * the dnode phys, or issue the write zio for a regular/spill block.
 * Called from dbuf_sync_list() in syncing context.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* A dirty spill block means the dnode now carries a spill bp. */
	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* Free the snapshot copy made when the bonus was dirtied. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this dirty record from db_last_dirty and free it. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
2410 2414
2411 2415 void
2412 2416 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2413 2417 {
2414 2418 dbuf_dirty_record_t *dr;
2415 2419
2416 2420 while (dr = list_head(list)) {
2417 2421 if (dr->dr_zio != NULL) {
2418 2422 /*
2419 2423 * If we find an already initialized zio then we
2420 2424 * are processing the meta-dnode, and we have finished.
2421 2425 * The dbufs for all dnodes are put back on the list
2422 2426 * during processing, so that we can zio_wait()
2423 2427 * these IOs after initiating all child IOs.
2424 2428 */
2425 2429 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2426 2430 DMU_META_DNODE_OBJECT);
2427 2431 break;
2428 2432 }
2429 2433 list_remove(list, dr);
2430 2434 if (dr->dr_dbuf->db_level > 0)
2431 2435 dbuf_sync_indirect(dr, tx);
2432 2436 else
2433 2437 dbuf_sync_leaf(dr, tx);
2434 2438 }
2435 2439 }
2436 2440
/*
 * zio "ready" callback for a dbuf write: account for the space delta
 * of the newly allocated block pointer and recompute bp->blk_fill
 * from the block's contents (count of in-use dnodes, 1/0 for a leaf
 * block, or the sum of the children's fill counts for an indirect).
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in allocated size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Count the dnodes in this block that are in use. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Indirect block: sum the fill counts of the children. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
2512 2516
/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times). This
 * allows the DMU to monitor the progress of each logical i/o. For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block. There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times. Retire one
	 * portion of our dirty space each time we are called. Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
2544 2548
/*
 * zio "done" callback for a dbuf write: perform block birth/kill
 * accounting (skipped for nopwrites and rewrites), unlink and free the
 * completed dirty record, and drop the hold that was taken when the
 * dbuf was dirtied in this txg.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Find and unlink the dirty record this write completed. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * Release the syncing copy if one was made, or
			 * arrange for ARC eviction of the shared buffer.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
2634 2638
/* zio callback shim: adapt dbuf_write_ready() for NOFILL writes. */
static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}
2640 2644
/* zio callback shim: adapt dbuf_write_done() for NOFILL writes. */
static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}
2646 2650
/*
 * zio callback shim for overridden (e.g. dmu_sync) writes: the zio's
 * private data is the dirty record, not the dbuf, so unwrap it first.
 */
static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}
2655 2659
/*
 * zio "done" callback for overridden writes: if the block actually
 * written differs from the override bp, free the override block (it
 * was superseded) and release the data buffer, then run the common
 * completion path.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}
2673 2677
/*
 * Issue I/O to commit a dirty buffer to disk.  The dirty record's
 * dr_zio is created as a child of the parent indirect block's zio (or
 * the dnode's zio for top-level blocks), using one of three paths:
 * an overridden write (dmu_sync), a NOFILL write, or a normal
 * arc_write().  Only the override path starts its zio here; callers
 * start the other two (see dbuf_sync_leaf()/dbuf_sync_indirect()).
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	/* Derive checksum/compression/copies policy for this write. */
	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/* The data was already written by dmu_sync(); reuse its bp. */
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
|
↓ open down ↓ |
1583 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX