Print this page
6288 dmu_buf_will_dirty could be faster
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 28 */
29 29
30 30 #include <sys/zfs_context.h>
31 31 #include <sys/dmu.h>
32 32 #include <sys/dmu_send.h>
33 33 #include <sys/dmu_impl.h>
34 34 #include <sys/dbuf.h>
35 35 #include <sys/dmu_objset.h>
36 36 #include <sys/dsl_dataset.h>
37 37 #include <sys/dsl_dir.h>
38 38 #include <sys/dmu_tx.h>
39 39 #include <sys/spa.h>
40 40 #include <sys/zio.h>
41 41 #include <sys/dmu_zfetch.h>
42 42 #include <sys/sa.h>
43 43 #include <sys/sa_impl.h>
44 44 #include <sys/zfeature.h>
45 45 #include <sys/blkptr.h>
46 46 #include <sys/range_tree.h>
47 47
48 48 /*
49 49 * Number of times that zfs_free_range() took the slow path while doing
50 50 * a zfs receive. A nonzero value indicates a potential performance problem.
51 51 */
52 52 uint64_t zfs_free_range_recv_miss;
53 53
54 54 static void dbuf_destroy(dmu_buf_impl_t *db);
55 55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
56 56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
57 57
58 58 #ifndef __lint
59 59 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
60 60 dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
61 61 #endif /* ! __lint */
62 62
63 63 /*
64 64 * Global data structures and functions for the dbuf cache.
65 65 */
66 66 static kmem_cache_t *dbuf_cache;
67 67 static taskq_t *dbu_evict_taskq;
68 68
69 69 /* ARGSUSED */
70 70 static int
71 71 dbuf_cons(void *vdb, void *unused, int kmflag)
72 72 {
73 73 dmu_buf_impl_t *db = vdb;
74 74 bzero(db, sizeof (dmu_buf_impl_t));
75 75
76 76 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
77 77 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
78 78 refcount_create(&db->db_holds);
79 79
80 80 return (0);
81 81 }
82 82
83 83 /* ARGSUSED */
84 84 static void
85 85 dbuf_dest(void *vdb, void *unused)
86 86 {
87 87 dmu_buf_impl_t *db = vdb;
88 88 mutex_destroy(&db->db_mtx);
89 89 cv_destroy(&db->db_changed);
90 90 refcount_destroy(&db->db_holds);
91 91 }
92 92
93 93 /*
94 94 * dbuf hash table routines
95 95 */
96 96 static dbuf_hash_table_t dbuf_hash_table;
97 97
98 98 static uint64_t dbuf_hash_count;
99 99
100 100 static uint64_t
101 101 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
102 102 {
103 103 uintptr_t osv = (uintptr_t)os;
104 104 uint64_t crc = -1ULL;
105 105
106 106 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
107 107 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
108 108 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
109 109 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
110 110 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
111 111 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
112 112 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
113 113
114 114 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
115 115
116 116 return (crc);
117 117 }
118 118
119 119 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
120 120
/*
 * True iff dbuf matches the given identity tuple
 * (objset, object, level, blkid).
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
126 126
/*
 * Look up a dbuf in the hash table by identity.  On success the dbuf
 * is returned with its db_mtx held; buffers in DB_EVICTING state are
 * treated as absent.  Returns NULL if no live match exists.
 */
dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			/*
			 * Take db_mtx before dropping the chain lock so
			 * the state check below cannot race an evictor.
			 */
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}
149 149
/*
 * Find the bonus dbuf for an object, if it is currently instantiated.
 * Returns the bonus dbuf with its db_mtx held, or NULL if the object
 * cannot be held or has no bonus dbuf.  dn_struct_rwlock is held
 * across the dn_bonus check so the pointer cannot change under us.
 */
static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}
167 167
/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned (with its db_mtx held) and the new element will not
 * be inserted.  Otherwise db is linked onto its hash chain with its
 * db_mtx held and NULL is returned.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			/* Existing live entry wins; hand it back locked. */
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
}
206 206
207 207 /*
208 208 * Remove an entry from the hash table. It must be in the EVICTING state.
209 209 */
210 210 static void
211 211 dbuf_hash_remove(dmu_buf_impl_t *db)
212 212 {
213 213 dbuf_hash_table_t *h = &dbuf_hash_table;
214 214 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
215 215 db->db_level, db->db_blkid);
216 216 uint64_t idx = hv & h->hash_table_mask;
217 217 dmu_buf_impl_t *dbf, **dbp;
218 218
219 219 /*
220 220 * We musn't hold db_mtx to maintain lock ordering:
221 221 * DBUF_HASH_MUTEX > db_mtx.
222 222 */
223 223 ASSERT(refcount_is_zero(&db->db_holds));
224 224 ASSERT(db->db_state == DB_EVICTING);
225 225 ASSERT(!MUTEX_HELD(&db->db_mtx));
226 226
227 227 mutex_enter(DBUF_HASH_MUTEX(h, idx));
228 228 dbp = &h->hash_table[idx];
229 229 while ((dbf = *dbp) != db) {
230 230 dbp = &dbf->db_hash_next;
231 231 ASSERT(dbf != NULL);
232 232 }
233 233 *dbp = db->db_hash_next;
234 234 db->db_hash_next = NULL;
235 235 mutex_exit(DBUF_HASH_MUTEX(h, idx));
236 236 atomic_dec_64(&dbuf_hash_count);
237 237 }
238 238
239 239 static arc_evict_func_t dbuf_do_evict;
240 240
/* Which invariant set dbuf_verify_user() should apply. */
typedef enum {
	DBVU_EVICTING,		/* called from the user-eviction path */
	DBVU_NOT_EVICTING	/* called from any other context */
} dbvu_verify_type_t;
245 245
246 246 static void
247 247 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
248 248 {
249 249 #ifdef ZFS_DEBUG
250 250 int64_t holds;
251 251
252 252 if (db->db_user == NULL)
253 253 return;
254 254
255 255 /* Only data blocks support the attachment of user data. */
256 256 ASSERT(db->db_level == 0);
257 257
258 258 /* Clients must resolve a dbuf before attaching user data. */
259 259 ASSERT(db->db.db_data != NULL);
260 260 ASSERT3U(db->db_state, ==, DB_CACHED);
261 261
262 262 holds = refcount_count(&db->db_holds);
263 263 if (verify_type == DBVU_EVICTING) {
264 264 /*
|
↓ open down ↓ |
264 lines elided |
↑ open up ↑ |
265 265 * Immediate eviction occurs when holds == dirtycnt.
266 266 * For normal eviction buffers, holds is zero on
267 267 * eviction, except when dbuf_fix_old_data() calls
268 268 * dbuf_clear_data(). However, the hold count can grow
269 269 * during eviction even though db_mtx is held (see
270 270 * dmu_bonus_hold() for an example), so we can only
271 271 * test the generic invariant that holds >= dirtycnt.
272 272 */
273 273 ASSERT3U(holds, >=, db->db_dirtycnt);
274 274 } else {
275 - if (db->db_immediate_evict == TRUE)
275 + if (db->db_user_immediate_evict == TRUE)
276 276 ASSERT3U(holds, >=, db->db_dirtycnt);
277 277 else
278 278 ASSERT3U(holds, >, 0);
279 279 }
280 280 #endif
281 281 }
282 282
/*
 * Detach the user record (if any) from a dbuf and dispatch its
 * eviction callback asynchronously.  Caller must hold db_mtx.
 */
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	/* Null the user's back-pointer so stale use is detectable. */
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	/*
	 * Invoke the callback from a taskq to avoid lock order reversals
	 * and limit stack depth.
	 */
	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
	    &dbu->dbu_tqent);
}
308 308
309 309 boolean_t
310 310 dbuf_is_metadata(dmu_buf_impl_t *db)
311 311 {
312 312 if (db->db_level > 0) {
313 313 return (B_TRUE);
314 314 } else {
315 315 boolean_t is_metadata;
316 316
317 317 DB_DNODE_ENTER(db);
318 318 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
319 319 DB_DNODE_EXIT(db);
320 320
321 321 return (is_metadata);
322 322 }
323 323 }
324 324
/*
 * Evict a dbuf that has no data buffer and no write in flight:
 * detach it (dbuf_clear) and free it (dbuf_destroy).  Caller must
 * hold db_mtx on entry.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}
335 335
/*
 * One-time initialization of the dbuf subsystem: size and allocate
 * the global hash table, create the dmu_buf_impl_t kmem cache, the
 * per-bucket-group mutexes, and the user-eviction taskq.
 */
void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	/* KM_NOSLEEP may fail; halve the table and retry if so. */
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
}
374 374
375 375 void
376 376 dbuf_fini(void)
377 377 {
378 378 dbuf_hash_table_t *h = &dbuf_hash_table;
379 379 int i;
380 380
381 381 for (i = 0; i < DBUF_MUTEXES; i++)
382 382 mutex_destroy(&h->hash_mutexes[i]);
383 383 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
384 384 kmem_cache_destroy(dbuf_cache);
385 385 taskq_destroy(dbu_evict_taskq);
386 386 }
387 387
388 388 /*
389 389 * Other stuff.
390 390 */
391 391
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency checks on a dbuf: identity vs. its dnode,
 * dirty-record back-pointers, and the linkage of db_blkptr into its
 * parent (dnode phys or indirect block).  Caller must hold db_mtx.
 * No-op unless ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
501 501
/*
 * Disassociate the dbuf from its data buffer.  The user record is
 * evicted first (before the data it may reference goes away), then
 * the buffer pointers are nulled.  NOFILL buffers keep their state;
 * all others transition to DB_UNCACHED.  Caller must hold db_mtx.
 */
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	db->db_buf = NULL;
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}
512 512
/*
 * Attach an ARC buffer as this dbuf's data.  If the buffer has not
 * been released from the ARC, register dbuf_do_evict so the dbuf is
 * notified when the ARC wants the buffer back.  Caller must hold
 * db_mtx.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
	if (!arc_released(buf))
		arc_set_callback(buf, dbuf_do_evict, db);
}
525 525
/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.  If the
 * dbuf's buffer is shared (released to the ARC, or held more than
 * once), the loaned buffer is a fresh copy; otherwise the dbuf's own
 * buffer is loaned and the dbuf is disassociated from its data.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		/* Drop db_mtx across the (possibly blocking) allocation. */
		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}
550 550
551 551 uint64_t
552 552 dbuf_whichblock(dnode_t *dn, uint64_t offset)
553 553 {
554 554 if (dn->dn_datablkshift) {
555 555 return (offset >> dn->dn_datablkshift);
556 556 } else {
557 557 ASSERT3U(offset, <, dn->dn_datablksz);
558 558 return (0);
559 559 }
560 560 }
561 561
/*
 * ARC read-completion callback.  Moves the dbuf from DB_READ to
 * DB_CACHED on success; if the block was freed while the read was in
 * flight, the buffer is zeroed and cached regardless of any error.
 * On failure the buffer is dropped and the dbuf returns to
 * DB_UNCACHED.  Finally, waiters are woken and the hold taken in
 * dbuf_read_impl() is released.
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}
595 595
/*
 * Start resolving a DB_UNCACHED dbuf.  Bonus buffers are copied
 * straight out of the dnode phys; holes and freed blocks are
 * materialized as zero-filled buffers; anything else is read through
 * the ARC with dbuf_read_done() as the completion callback.  Called
 * with db_mtx held and the dnode's struct_rwlock held; db_mtx is
 * dropped before return on every path.  *flags gains DB_RF_CACHED
 * when the data was satisfied without disk I/O.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		/* Bonus buffers always get a max-size allocation. */
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_FLAG_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Hold the dbuf across the read; released in dbuf_read_done(). */
	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_FLAG_CACHED)
		*flags |= DB_RF_CACHED;
}
671 671
/*
 * Resolve a dbuf's contents, reading from disk if necessary.  The
 * DB_RF_* bits in flags control prefetching, struct_rwlock handling
 * (DB_RF_HAVESTRUCT), and whether failure is tolerated
 * (DB_RF_CANFAIL) or waiting skipped (DB_RF_NEVERWAIT).  If the
 * caller passes a NULL zio, a root zio is created and waited on, so
 * the read is synchronous; otherwise the read is issued as a child of
 * the caller's zio and the caller is responsible for waiting.
 * Returns 0 or an errno (EIO for NOFILL buffers or failed reads).
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Already resolved; just kick off prefetch if wanted. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
763 763
/*
 * Prepare a dbuf to be completely overwritten without reading its old
 * contents: wait out any in-flight READ/FILL, then attach a fresh ARC
 * buffer and move to DB_FILL.  NOFILL buffers just have their data
 * cleared; already-CACHED buffers are left alone.  Not valid for
 * bonus buffers.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}
787 787
/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.  Caller must hold db_mtx; txg is the
 * currently-open transaction group.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/* Nothing to do unless the last dirty record shares our data. */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and its referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there a no active holders)
	 * just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_clear_data(db);
	}
}
840 840
/*
 * Undo an override (e.g. set up by dmu_sync()) on a dirty record:
 * free the already-written block unless it was a nopwrite, reset the
 * override state, and re-release the dirty data buffer so it is
 * writable again.  Caller must hold db_mtx.  No-op for bonus buffers
 * or records that were never overridden.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
875 875
876 876 /*
877 877  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
878 878 * data blocks in the free range, so that any future readers will find
879 879 * empty blocks.
880 880 *
881 881 * This is a no-op if the dataset is in the middle of an incremental
882 882 * receive; see comment below for details.
883 883 */
884 884 void
885 885 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
886 886 dmu_tx_t *tx)
887 887 {
888 888 dmu_buf_impl_t db_search;
889 889 dmu_buf_impl_t *db, *db_next;
890 890 uint64_t txg = tx->tx_txg;
891 891 avl_index_t where;
892 892
893 893 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
894 894 end_blkid = dn->dn_maxblkid;
895 895 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
896 896
897 897 db_search.db_level = 0;
898 898 db_search.db_blkid = start_blkid;
899 899 db_search.db_state = DB_SEARCH;
900 900
901 901 mutex_enter(&dn->dn_dbufs_mtx);
902 902 if (start_blkid >= dn->dn_unlisted_l0_blkid) {
903 903 /* There can't be any dbufs in this range; no need to search. */
904 904 #ifdef DEBUG
905 905 db = avl_find(&dn->dn_dbufs, &db_search, &where);
906 906 ASSERT3P(db, ==, NULL);
907 907 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
908 908 ASSERT(db == NULL || db->db_level > 0);
909 909 #endif
910 910 mutex_exit(&dn->dn_dbufs_mtx);
911 911 return;
912 912 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
913 913 /*
914 914 * If we are receiving, we expect there to be no dbufs in
915 915 * the range to be freed, because receive modifies each
916 916 * block at most once, and in offset order. If this is
917 917 * not the case, it can lead to performance problems,
918 918 * so note that we unexpectedly took the slow path.
919 919 */
920 920 atomic_inc_64(&zfs_free_range_recv_miss);
921 921 }
922 922
923 923 db = avl_find(&dn->dn_dbufs, &db_search, &where);
924 924 ASSERT3P(db, ==, NULL);
925 925 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
926 926
927 927 for (; db != NULL; db = db_next) {
928 928 db_next = AVL_NEXT(&dn->dn_dbufs, db);
929 929 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
930 930
931 931 if (db->db_level != 0 || db->db_blkid > end_blkid) {
932 932 break;
933 933 }
934 934 ASSERT3U(db->db_blkid, >=, start_blkid);
935 935
936 936 /* found a level 0 buffer in the range */
937 937 mutex_enter(&db->db_mtx);
938 938 if (dbuf_undirty(db, tx)) {
939 939 /* mutex has been dropped and dbuf destroyed */
940 940 continue;
941 941 }
942 942
943 943 if (db->db_state == DB_UNCACHED ||
944 944 db->db_state == DB_NOFILL ||
945 945 db->db_state == DB_EVICTING) {
946 946 ASSERT(db->db.db_data == NULL);
947 947 mutex_exit(&db->db_mtx);
948 948 continue;
949 949 }
950 950 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
951 951 /* will be handled in dbuf_read_done or dbuf_rele */
952 952 db->db_freed_in_flight = TRUE;
953 953 mutex_exit(&db->db_mtx);
954 954 continue;
955 955 }
956 956 if (refcount_count(&db->db_holds) == 0) {
957 957 ASSERT(db->db_buf);
958 958 dbuf_clear(db);
959 959 continue;
960 960 }
961 961 /* The dbuf is referenced */
962 962
963 963 if (db->db_last_dirty != NULL) {
964 964 dbuf_dirty_record_t *dr = db->db_last_dirty;
965 965
966 966 if (dr->dr_txg == txg) {
967 967 /*
968 968 * This buffer is "in-use", re-adjust the file
969 969 * size to reflect that this buffer may
970 970 * contain new data when we sync.
971 971 */
972 972 if (db->db_blkid != DMU_SPILL_BLKID &&
973 973 db->db_blkid > dn->dn_maxblkid)
974 974 dn->dn_maxblkid = db->db_blkid;
975 975 dbuf_unoverride(dr);
976 976 } else {
977 977 /*
978 978 * This dbuf is not dirty in the open context.
 979 979 * Either uncache it (if it's not referenced in
980 980 * the open context) or reset its contents to
981 981 * empty.
982 982 */
983 983 dbuf_fix_old_data(db, txg);
984 984 }
985 985 }
 986 986 /* clear the contents if it's cached */
987 987 if (db->db_state == DB_CACHED) {
988 988 ASSERT(db->db.db_data != NULL);
989 989 arc_release(db->db_buf, db);
990 990 bzero(db->db.db_data, db->db.db_size);
991 991 arc_buf_freeze(db->db_buf);
992 992 }
993 993
994 994 mutex_exit(&db->db_mtx);
995 995 }
996 996 mutex_exit(&dn->dn_dbufs_mtx);
997 997 }
998 998
999 999 static int
1000 1000 dbuf_block_freeable(dmu_buf_impl_t *db)
1001 1001 {
1002 1002 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1003 1003 uint64_t birth_txg = 0;
1004 1004
1005 1005 /*
1006 1006 * We don't need any locking to protect db_blkptr:
1007 1007 * If it's syncing, then db_last_dirty will be set
1008 1008 * so we'll ignore db_blkptr.
1009 1009 *
1010 1010 * This logic ensures that only block births for
1011 1011 * filled blocks are considered.
1012 1012 */
1013 1013 ASSERT(MUTEX_HELD(&db->db_mtx));
1014 1014 if (db->db_last_dirty && (db->db_blkptr == NULL ||
1015 1015 !BP_IS_HOLE(db->db_blkptr))) {
1016 1016 birth_txg = db->db_last_dirty->dr_txg;
1017 1017 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1018 1018 birth_txg = db->db_blkptr->blk_birth;
1019 1019 }
1020 1020
1021 1021 /*
 1022 1022 * If this block doesn't exist or is in a snapshot, it can't be freed.
1023 1023 * Don't pass the bp to dsl_dataset_block_freeable() since we
1024 1024 * are holding the db_mtx lock and might deadlock if we are
1025 1025 * prefetching a dedup-ed block.
1026 1026 */
1027 1027 if (birth_txg != 0)
1028 1028 return (ds == NULL ||
1029 1029 dsl_dataset_block_freeable(ds, NULL, birth_txg));
1030 1030 else
1031 1031 return (B_FALSE);
1032 1032 }
1033 1033
1034 1034 void
1035 1035 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1036 1036 {
1037 1037 arc_buf_t *buf, *obuf;
1038 1038 int osize = db->db.db_size;
1039 1039 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1040 1040 dnode_t *dn;
1041 1041
1042 1042 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1043 1043
1044 1044 DB_DNODE_ENTER(db);
1045 1045 dn = DB_DNODE(db);
1046 1046
1047 1047 /* XXX does *this* func really need the lock? */
1048 1048 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1049 1049
1050 1050 /*
1051 1051 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1052 1052 * is OK, because there can be no other references to the db
1053 1053 * when we are changing its size, so no concurrent DB_FILL can
1054 1054 * be happening.
1055 1055 */
1056 1056 /*
1057 1057 * XXX we should be doing a dbuf_read, checking the return
1058 1058 * value and returning that up to our callers
1059 1059 */
1060 1060 dmu_buf_will_dirty(&db->db, tx);
1061 1061
1062 1062 /* create the data buffer for the new block */
1063 1063 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1064 1064
1065 1065 /* copy old block data to the new block */
1066 1066 obuf = db->db_buf;
1067 1067 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1068 1068 /* zero the remainder */
1069 1069 if (size > osize)
1070 1070 bzero((uint8_t *)buf->b_data + osize, size - osize);
1071 1071
1072 1072 mutex_enter(&db->db_mtx);
1073 1073 dbuf_set_data(db, buf);
1074 1074 VERIFY(arc_buf_remove_ref(obuf, db));
1075 1075 db->db.db_size = size;
1076 1076
1077 1077 if (db->db_level == 0) {
1078 1078 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1079 1079 db->db_last_dirty->dt.dl.dr_data = buf;
1080 1080 }
1081 1081 mutex_exit(&db->db_mtx);
1082 1082
1083 1083 dnode_willuse_space(dn, size-osize, tx);
1084 1084 DB_DNODE_EXIT(db);
1085 1085 }
1086 1086
1087 1087 void
1088 1088 dbuf_release_bp(dmu_buf_impl_t *db)
1089 1089 {
|
↓ open down ↓ |
804 lines elided |
↑ open up ↑ |
1090 1090 objset_t *os = db->db_objset;
1091 1091
1092 1092 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1093 1093 ASSERT(arc_released(os->os_phys_buf) ||
1094 1094 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1095 1095 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1096 1096
1097 1097 (void) arc_release(db->db_buf, db);
1098 1098 }
1099 1099
1100 +/*
1101 + * We already have a dirty record for this TXG, and we are being
1102 + * dirtied again.
1103 + */
1104 +static void
1105 +dbuf_redirty(dbuf_dirty_record_t *dr)
1106 +{
1107 + dmu_buf_impl_t *db = dr->dr_dbuf;
1108 +
1109 + ASSERT(MUTEX_HELD(&db->db_mtx));
1110 +
1111 + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1112 + /*
1113 + * If this buffer has already been written out,
1114 + * we now need to reset its state.
1115 + */
1116 + dbuf_unoverride(dr);
1117 + if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1118 + db->db_state != DB_NOFILL) {
1119 + /* Already released on initial dirty, so just thaw. */
1120 + ASSERT(arc_released(db->db_buf));
1121 + arc_buf_thaw(db->db_buf);
1122 + }
1123 + }
1124 +}
1125 +
1100 1126 dbuf_dirty_record_t *
1101 1127 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1102 1128 {
1103 1129 dnode_t *dn;
1104 1130 objset_t *os;
1105 1131 dbuf_dirty_record_t **drp, *dr;
1106 1132 int drop_struct_lock = FALSE;
1107 1133 boolean_t do_free_accounting = B_FALSE;
1108 1134 int txgoff = tx->tx_txg & TXG_MASK;
1109 1135
1110 1136 ASSERT(tx->tx_txg != 0);
1111 1137 ASSERT(!refcount_is_zero(&db->db_holds));
1112 1138 DMU_TX_DIRTY_BUF(tx, db);
1113 1139
1114 1140 DB_DNODE_ENTER(db);
1115 1141 dn = DB_DNODE(db);
1116 1142 /*
1117 1143 * Shouldn't dirty a regular buffer in syncing context. Private
1118 1144 * objects may be dirtied in syncing context, but only if they
1119 1145 * were already pre-dirtied in open context.
1120 1146 */
1121 1147 ASSERT(!dmu_tx_is_syncing(tx) ||
1122 1148 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1123 1149 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1124 1150 dn->dn_objset->os_dsl_dataset == NULL);
1125 1151 /*
1126 1152 * We make this assert for private objects as well, but after we
1127 1153 * check if we're already dirty. They are allowed to re-dirty
1128 1154 * in syncing context.
1129 1155 */
1130 1156 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1131 1157 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1132 1158 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1133 1159
1134 1160 mutex_enter(&db->db_mtx);
1135 1161 /*
1136 1162 * XXX make this true for indirects too? The problem is that
1137 1163 * transactions created with dmu_tx_create_assigned() from
1138 1164 * syncing context don't bother holding ahead.
1139 1165 */
1140 1166 ASSERT(db->db_level != 0 ||
1141 1167 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1142 1168 db->db_state == DB_NOFILL);
1143 1169
1144 1170 mutex_enter(&dn->dn_mtx);
1145 1171 /*
1146 1172 * Don't set dirtyctx to SYNC if we're just modifying this as we
1147 1173 * initialize the objset.
1148 1174 */
1149 1175 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1150 1176 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1151 1177 dn->dn_dirtyctx =
1152 1178 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1153 1179 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1154 1180 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1155 1181 }
1156 1182 mutex_exit(&dn->dn_mtx);
1157 1183
1158 1184 if (db->db_blkid == DMU_SPILL_BLKID)
1159 1185 dn->dn_have_spill = B_TRUE;
1160 1186
1161 1187 /*
|
↓ open down ↓ |
52 lines elided |
↑ open up ↑ |
1162 1188 * If this buffer is already dirty, we're done.
1163 1189 */
1164 1190 drp = &db->db_last_dirty;
1165 1191 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1166 1192 db->db.db_object == DMU_META_DNODE_OBJECT);
1167 1193 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1168 1194 drp = &dr->dr_next;
1169 1195 if (dr && dr->dr_txg == tx->tx_txg) {
1170 1196 DB_DNODE_EXIT(db);
1171 1197
1172 - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1173 - /*
1174 - * If this buffer has already been written out,
1175 - * we now need to reset its state.
1176 - */
1177 - dbuf_unoverride(dr);
1178 - if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1179 - db->db_state != DB_NOFILL)
1180 - arc_buf_thaw(db->db_buf);
1181 - }
1198 + dbuf_redirty(dr);
1182 1199 mutex_exit(&db->db_mtx);
1183 1200 return (dr);
1184 1201 }
1185 1202
1186 1203 /*
1187 1204 * Only valid if not already dirty.
1188 1205 */
1189 1206 ASSERT(dn->dn_object == 0 ||
1190 1207 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1191 1208 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1192 1209
1193 1210 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1194 1211 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1195 1212 dn->dn_phys->dn_nlevels > db->db_level ||
1196 1213 dn->dn_next_nlevels[txgoff] > db->db_level ||
1197 1214 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1198 1215 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1199 1216
1200 1217 /*
1201 1218 * We should only be dirtying in syncing context if it's the
1202 1219 * mos or we're initializing the os or it's a special object.
1203 1220 * However, we are allowed to dirty in syncing context provided
1204 1221 * we already dirtied it in open context. Hence we must make
1205 1222 * this assertion only if we're not already dirty.
1206 1223 */
1207 1224 os = dn->dn_objset;
1208 1225 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1209 1226 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1210 1227 ASSERT(db->db.db_size != 0);
1211 1228
1212 1229 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1213 1230
1214 1231 if (db->db_blkid != DMU_BONUS_BLKID) {
1215 1232 /*
1216 1233 * Update the accounting.
1217 1234 * Note: we delay "free accounting" until after we drop
1218 1235 * the db_mtx. This keeps us from grabbing other locks
1219 1236 * (and possibly deadlocking) in bp_get_dsize() while
1220 1237 * also holding the db_mtx.
1221 1238 */
1222 1239 dnode_willuse_space(dn, db->db.db_size, tx);
1223 1240 do_free_accounting = dbuf_block_freeable(db);
1224 1241 }
1225 1242
1226 1243 /*
1227 1244 * If this buffer is dirty in an old transaction group we need
1228 1245 * to make a copy of it so that the changes we make in this
1229 1246 * transaction group won't leak out when we sync the older txg.
1230 1247 */
1231 1248 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1232 1249 if (db->db_level == 0) {
1233 1250 void *data_old = db->db_buf;
1234 1251
1235 1252 if (db->db_state != DB_NOFILL) {
1236 1253 if (db->db_blkid == DMU_BONUS_BLKID) {
1237 1254 dbuf_fix_old_data(db, tx->tx_txg);
1238 1255 data_old = db->db.db_data;
1239 1256 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1240 1257 /*
1241 1258 * Release the data buffer from the cache so
1242 1259 * that we can modify it without impacting
1243 1260 * possible other users of this cached data
1244 1261 * block. Note that indirect blocks and
1245 1262 * private objects are not released until the
1246 1263 * syncing state (since they are only modified
1247 1264 * then).
1248 1265 */
1249 1266 arc_release(db->db_buf, db);
1250 1267 dbuf_fix_old_data(db, tx->tx_txg);
1251 1268 data_old = db->db_buf;
1252 1269 }
1253 1270 ASSERT(data_old != NULL);
1254 1271 }
1255 1272 dr->dt.dl.dr_data = data_old;
1256 1273 } else {
1257 1274 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1258 1275 list_create(&dr->dt.di.dr_children,
1259 1276 sizeof (dbuf_dirty_record_t),
1260 1277 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1261 1278 }
1262 1279 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1263 1280 dr->dr_accounted = db->db.db_size;
1264 1281 dr->dr_dbuf = db;
1265 1282 dr->dr_txg = tx->tx_txg;
1266 1283 dr->dr_next = *drp;
1267 1284 *drp = dr;
1268 1285
1269 1286 /*
1270 1287 * We could have been freed_in_flight between the dbuf_noread
1271 1288 * and dbuf_dirty. We win, as though the dbuf_noread() had
1272 1289 * happened after the free.
1273 1290 */
1274 1291 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1275 1292 db->db_blkid != DMU_SPILL_BLKID) {
1276 1293 mutex_enter(&dn->dn_mtx);
1277 1294 if (dn->dn_free_ranges[txgoff] != NULL) {
1278 1295 range_tree_clear(dn->dn_free_ranges[txgoff],
1279 1296 db->db_blkid, 1);
1280 1297 }
1281 1298 mutex_exit(&dn->dn_mtx);
1282 1299 db->db_freed_in_flight = FALSE;
1283 1300 }
1284 1301
1285 1302 /*
1286 1303 * This buffer is now part of this txg
1287 1304 */
1288 1305 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1289 1306 db->db_dirtycnt += 1;
1290 1307 ASSERT3U(db->db_dirtycnt, <=, 3);
1291 1308
1292 1309 mutex_exit(&db->db_mtx);
1293 1310
1294 1311 if (db->db_blkid == DMU_BONUS_BLKID ||
1295 1312 db->db_blkid == DMU_SPILL_BLKID) {
1296 1313 mutex_enter(&dn->dn_mtx);
1297 1314 ASSERT(!list_link_active(&dr->dr_dirty_node));
1298 1315 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1299 1316 mutex_exit(&dn->dn_mtx);
1300 1317 dnode_setdirty(dn, tx);
1301 1318 DB_DNODE_EXIT(db);
1302 1319 return (dr);
1303 1320 }
1304 1321
1305 1322 /*
1306 1323 * The dn_struct_rwlock prevents db_blkptr from changing
1307 1324 * due to a write from syncing context completing
1308 1325 * while we are running, so we want to acquire it before
1309 1326 * looking at db_blkptr.
1310 1327 */
1311 1328 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1312 1329 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1313 1330 drop_struct_lock = TRUE;
1314 1331 }
1315 1332
1316 1333 if (do_free_accounting) {
1317 1334 blkptr_t *bp = db->db_blkptr;
1318 1335 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1319 1336 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1320 1337 /*
1321 1338 * This is only a guess -- if the dbuf is dirty
1322 1339 * in a previous txg, we don't know how much
1323 1340 * space it will use on disk yet. We should
1324 1341 * really have the struct_rwlock to access
1325 1342 * db_blkptr, but since this is just a guess,
1326 1343 * it's OK if we get an odd answer.
1327 1344 */
1328 1345 ddt_prefetch(os->os_spa, bp);
1329 1346 dnode_willuse_space(dn, -willfree, tx);
1330 1347 }
1331 1348
1332 1349 if (db->db_level == 0) {
1333 1350 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1334 1351 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1335 1352 }
1336 1353
1337 1354 if (db->db_level+1 < dn->dn_nlevels) {
1338 1355 dmu_buf_impl_t *parent = db->db_parent;
1339 1356 dbuf_dirty_record_t *di;
1340 1357 int parent_held = FALSE;
1341 1358
1342 1359 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1343 1360 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1344 1361
1345 1362 parent = dbuf_hold_level(dn, db->db_level+1,
1346 1363 db->db_blkid >> epbs, FTAG);
1347 1364 ASSERT(parent != NULL);
1348 1365 parent_held = TRUE;
1349 1366 }
1350 1367 if (drop_struct_lock)
1351 1368 rw_exit(&dn->dn_struct_rwlock);
1352 1369 ASSERT3U(db->db_level+1, ==, parent->db_level);
1353 1370 di = dbuf_dirty(parent, tx);
1354 1371 if (parent_held)
1355 1372 dbuf_rele(parent, FTAG);
1356 1373
1357 1374 mutex_enter(&db->db_mtx);
1358 1375 /*
1359 1376 * Since we've dropped the mutex, it's possible that
1360 1377 * dbuf_undirty() might have changed this out from under us.
1361 1378 */
1362 1379 if (db->db_last_dirty == dr ||
1363 1380 dn->dn_object == DMU_META_DNODE_OBJECT) {
1364 1381 mutex_enter(&di->dt.di.dr_mtx);
1365 1382 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1366 1383 ASSERT(!list_link_active(&dr->dr_dirty_node));
1367 1384 list_insert_tail(&di->dt.di.dr_children, dr);
1368 1385 mutex_exit(&di->dt.di.dr_mtx);
1369 1386 dr->dr_parent = di;
1370 1387 }
1371 1388 mutex_exit(&db->db_mtx);
1372 1389 } else {
1373 1390 ASSERT(db->db_level+1 == dn->dn_nlevels);
1374 1391 ASSERT(db->db_blkid < dn->dn_nblkptr);
1375 1392 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1376 1393 mutex_enter(&dn->dn_mtx);
1377 1394 ASSERT(!list_link_active(&dr->dr_dirty_node));
1378 1395 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1379 1396 mutex_exit(&dn->dn_mtx);
1380 1397 if (drop_struct_lock)
1381 1398 rw_exit(&dn->dn_struct_rwlock);
1382 1399 }
1383 1400
1384 1401 dnode_setdirty(dn, tx);
1385 1402 DB_DNODE_EXIT(db);
1386 1403 return (dr);
1387 1404 }
1388 1405
1389 1406 /*
1390 1407 * Undirty a buffer in the transaction group referenced by the given
1391 1408 * transaction. Return whether this evicted the dbuf.
1392 1409 */
1393 1410 static boolean_t
1394 1411 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1395 1412 {
1396 1413 dnode_t *dn;
1397 1414 uint64_t txg = tx->tx_txg;
1398 1415 dbuf_dirty_record_t *dr, **drp;
1399 1416
1400 1417 ASSERT(txg != 0);
1401 1418 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1402 1419 ASSERT0(db->db_level);
1403 1420 ASSERT(MUTEX_HELD(&db->db_mtx));
1404 1421
1405 1422 /*
1406 1423 * If this buffer is not dirty, we're done.
1407 1424 */
1408 1425 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1409 1426 if (dr->dr_txg <= txg)
1410 1427 break;
1411 1428 if (dr == NULL || dr->dr_txg < txg)
1412 1429 return (B_FALSE);
1413 1430 ASSERT(dr->dr_txg == txg);
1414 1431 ASSERT(dr->dr_dbuf == db);
1415 1432
1416 1433 DB_DNODE_ENTER(db);
1417 1434 dn = DB_DNODE(db);
1418 1435
1419 1436 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1420 1437
1421 1438 ASSERT(db->db.db_size != 0);
1422 1439
1423 1440 /*
1424 1441 * Any space we accounted for in dp_dirty_* will be cleaned up by
1425 1442 * dsl_pool_sync(). This is relatively rare so the discrepancy
1426 1443 * is not a big deal.
1427 1444 */
1428 1445
1429 1446 *drp = dr->dr_next;
1430 1447
1431 1448 /*
1432 1449 * Note that there are three places in dbuf_dirty()
1433 1450 * where this dirty record may be put on a list.
1434 1451 * Make sure to do a list_remove corresponding to
1435 1452 * every one of those list_insert calls.
1436 1453 */
1437 1454 if (dr->dr_parent) {
1438 1455 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1439 1456 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1440 1457 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1441 1458 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1442 1459 db->db_level+1 == dn->dn_nlevels) {
1443 1460 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1444 1461 mutex_enter(&dn->dn_mtx);
1445 1462 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1446 1463 mutex_exit(&dn->dn_mtx);
1447 1464 }
1448 1465 DB_DNODE_EXIT(db);
1449 1466
1450 1467 if (db->db_state != DB_NOFILL) {
1451 1468 dbuf_unoverride(dr);
1452 1469
1453 1470 ASSERT(db->db_buf != NULL);
1454 1471 ASSERT(dr->dt.dl.dr_data != NULL);
1455 1472 if (dr->dt.dl.dr_data != db->db_buf)
1456 1473 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1457 1474 }
1458 1475
1459 1476 if (db->db_level != 0) {
1460 1477 mutex_destroy(&dr->dt.di.dr_mtx);
1461 1478 list_destroy(&dr->dt.di.dr_children);
1462 1479 }
1463 1480
1464 1481 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1465 1482
1466 1483 ASSERT(db->db_dirtycnt > 0);
1467 1484 db->db_dirtycnt -= 1;
1468 1485
1469 1486 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1470 1487 arc_buf_t *buf = db->db_buf;
1471 1488
1472 1489 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1473 1490 dbuf_clear_data(db);
1474 1491 VERIFY(arc_buf_remove_ref(buf, db));
1475 1492 dbuf_evict(db);
1476 1493 return (B_TRUE);
1477 1494 }
1478 1495
1479 1496 return (B_FALSE);
1480 1497 }
|
↓ open down ↓ |
289 lines elided |
↑ open up ↑ |
1481 1498
1482 1499 void
1483 1500 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1484 1501 {
1485 1502 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1486 1503 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1487 1504
1488 1505 ASSERT(tx->tx_txg != 0);
1489 1506 ASSERT(!refcount_is_zero(&db->db_holds));
1490 1507
1508 + /*
 1509 + * Quick check for dirtiness. For already dirty blocks, this
1510 + * reduces runtime of this function by >90%, and overall performance
1511 + * by 50% for some workloads (e.g. file deletion with indirect blocks
1512 + * cached).
1513 + */
1514 + mutex_enter(&db->db_mtx);
1515 + dbuf_dirty_record_t *dr;
1516 + for (dr = db->db_last_dirty;
1517 + dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1518 + /*
1519 + * It's possible that it is already dirty but not cached,
1520 + * because there are some calls to dbuf_dirty() that don't
1521 + * go through dmu_buf_will_dirty().
1522 + */
1523 + if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1524 + /* This dbuf is already dirty and cached. */
1525 + dbuf_redirty(dr);
1526 + mutex_exit(&db->db_mtx);
1527 + return;
1528 + }
1529 + }
1530 + mutex_exit(&db->db_mtx);
1531 +
1491 1532 DB_DNODE_ENTER(db);
1492 1533 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1493 1534 rf |= DB_RF_HAVESTRUCT;
1494 1535 DB_DNODE_EXIT(db);
1495 1536 (void) dbuf_read(db, NULL, rf);
1496 1537 (void) dbuf_dirty(db, tx);
1497 1538 }
1498 1539
1499 1540 void
1500 1541 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1501 1542 {
1502 1543 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1503 1544
1504 1545 db->db_state = DB_NOFILL;
1505 1546
1506 1547 dmu_buf_will_fill(db_fake, tx);
1507 1548 }
1508 1549
1509 1550 void
1510 1551 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1511 1552 {
1512 1553 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1513 1554
1514 1555 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1515 1556 ASSERT(tx->tx_txg != 0);
1516 1557 ASSERT(db->db_level == 0);
1517 1558 ASSERT(!refcount_is_zero(&db->db_holds));
1518 1559
1519 1560 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1520 1561 dmu_tx_private_ok(tx));
1521 1562
1522 1563 dbuf_noread(db);
1523 1564 (void) dbuf_dirty(db, tx);
1524 1565 }
1525 1566
1526 1567 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1527 1568 /* ARGSUSED */
1528 1569 void
1529 1570 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1530 1571 {
1531 1572 mutex_enter(&db->db_mtx);
1532 1573 DBUF_VERIFY(db);
1533 1574
1534 1575 if (db->db_state == DB_FILL) {
1535 1576 if (db->db_level == 0 && db->db_freed_in_flight) {
1536 1577 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1537 1578 /* we were freed while filling */
1538 1579 /* XXX dbuf_undirty? */
1539 1580 bzero(db->db.db_data, db->db.db_size);
1540 1581 db->db_freed_in_flight = FALSE;
1541 1582 }
1542 1583 db->db_state = DB_CACHED;
1543 1584 cv_broadcast(&db->db_changed);
1544 1585 }
1545 1586 mutex_exit(&db->db_mtx);
1546 1587 }
1547 1588
1548 1589 void
1549 1590 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1550 1591 bp_embedded_type_t etype, enum zio_compress comp,
1551 1592 int uncompressed_size, int compressed_size, int byteorder,
1552 1593 dmu_tx_t *tx)
1553 1594 {
1554 1595 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1555 1596 struct dirty_leaf *dl;
1556 1597 dmu_object_type_t type;
1557 1598
1558 1599 DB_DNODE_ENTER(db);
1559 1600 type = DB_DNODE(db)->dn_type;
1560 1601 DB_DNODE_EXIT(db);
1561 1602
1562 1603 ASSERT0(db->db_level);
1563 1604 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1564 1605
1565 1606 dmu_buf_will_not_fill(dbuf, tx);
1566 1607
1567 1608 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1568 1609 dl = &db->db_last_dirty->dt.dl;
1569 1610 encode_embedded_bp_compressed(&dl->dr_overridden_by,
1570 1611 data, comp, uncompressed_size, compressed_size);
1571 1612 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1572 1613 BP_SET_TYPE(&dl->dr_overridden_by, type);
1573 1614 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1574 1615 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1575 1616
1576 1617 dl->dr_override_state = DR_OVERRIDDEN;
1577 1618 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1578 1619 }
1579 1620
1580 1621 /*
1581 1622 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1582 1623 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1583 1624 */
1584 1625 void
1585 1626 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1586 1627 {
1587 1628 ASSERT(!refcount_is_zero(&db->db_holds));
1588 1629 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1589 1630 ASSERT(db->db_level == 0);
1590 1631 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1591 1632 ASSERT(buf != NULL);
1592 1633 ASSERT(arc_buf_size(buf) == db->db.db_size);
1593 1634 ASSERT(tx->tx_txg != 0);
1594 1635
1595 1636 arc_return_buf(buf, db);
1596 1637 ASSERT(arc_released(buf));
1597 1638
1598 1639 mutex_enter(&db->db_mtx);
1599 1640
1600 1641 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1601 1642 cv_wait(&db->db_changed, &db->db_mtx);
1602 1643
1603 1644 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1604 1645
1605 1646 if (db->db_state == DB_CACHED &&
1606 1647 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1607 1648 mutex_exit(&db->db_mtx);
1608 1649 (void) dbuf_dirty(db, tx);
1609 1650 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1610 1651 VERIFY(arc_buf_remove_ref(buf, db));
1611 1652 xuio_stat_wbuf_copied();
1612 1653 return;
1613 1654 }
1614 1655
1615 1656 xuio_stat_wbuf_nocopy();
1616 1657 if (db->db_state == DB_CACHED) {
1617 1658 dbuf_dirty_record_t *dr = db->db_last_dirty;
1618 1659
1619 1660 ASSERT(db->db_buf != NULL);
1620 1661 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1621 1662 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1622 1663 if (!arc_released(db->db_buf)) {
1623 1664 ASSERT(dr->dt.dl.dr_override_state ==
1624 1665 DR_OVERRIDDEN);
1625 1666 arc_release(db->db_buf, db);
1626 1667 }
1627 1668 dr->dt.dl.dr_data = buf;
1628 1669 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1629 1670 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1630 1671 arc_release(db->db_buf, db);
1631 1672 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1632 1673 }
1633 1674 db->db_buf = NULL;
1634 1675 }
1635 1676 ASSERT(db->db_buf == NULL);
1636 1677 dbuf_set_data(db, buf);
1637 1678 db->db_state = DB_FILL;
1638 1679 mutex_exit(&db->db_mtx);
1639 1680 (void) dbuf_dirty(db, tx);
1640 1681 dmu_buf_fill_done(&db->db, tx);
1641 1682 }
1642 1683
1643 1684 /*
1644 1685 * "Clear" the contents of this dbuf. This will mark the dbuf
1645 1686 * EVICTING and clear *most* of its references. Unfortunately,
1646 1687 * when we are not holding the dn_dbufs_mtx, we can't clear the
1647 1688 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1648 1689 * in this case. For callers from the DMU we will usually see:
1649 1690 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1650 1691 * For the arc callback, we will usually see:
1651 1692 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1652 1693 * Sometimes, though, we will get a mix of these two:
1653 1694 * DMU: dbuf_clear()->arc_clear_callback()
1654 1695 * ARC: dbuf_do_evict()->dbuf_destroy()
1655 1696 *
1656 1697 * This routine will dissociate the dbuf from the arc, by calling
1657 1698 * arc_clear_callback(), but will not evict the data from the ARC.
1658 1699 */
1659 1700 void
1660 1701 dbuf_clear(dmu_buf_impl_t *db)
1661 1702 {
1662 1703 dnode_t *dn;
1663 1704 dmu_buf_impl_t *parent = db->db_parent;
1664 1705 dmu_buf_impl_t *dndb;
1665 1706 boolean_t dbuf_gone = B_FALSE;
1666 1707
1667 1708 ASSERT(MUTEX_HELD(&db->db_mtx));
1668 1709 ASSERT(refcount_is_zero(&db->db_holds));
1669 1710
1670 1711 dbuf_evict_user(db);
1671 1712
1672 1713 if (db->db_state == DB_CACHED) {
1673 1714 ASSERT(db->db.db_data != NULL);
1674 1715 if (db->db_blkid == DMU_BONUS_BLKID) {
1675 1716 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1676 1717 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1677 1718 }
1678 1719 db->db.db_data = NULL;
1679 1720 db->db_state = DB_UNCACHED;
1680 1721 }
1681 1722
1682 1723 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1683 1724 ASSERT(db->db_data_pending == NULL);
1684 1725
1685 1726 db->db_state = DB_EVICTING;
1686 1727 db->db_blkptr = NULL;
1687 1728
1688 1729 DB_DNODE_ENTER(db);
1689 1730 dn = DB_DNODE(db);
1690 1731 dndb = dn->dn_dbuf;
1691 1732 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1692 1733 avl_remove(&dn->dn_dbufs, db);
1693 1734 atomic_dec_32(&dn->dn_dbufs_count);
1694 1735 membar_producer();
1695 1736 DB_DNODE_EXIT(db);
1696 1737 /*
1697 1738 * Decrementing the dbuf count means that the hold corresponding
1698 1739 * to the removed dbuf is no longer discounted in dnode_move(),
1699 1740 * so the dnode cannot be moved until after we release the hold.
1700 1741 * The membar_producer() ensures visibility of the decremented
1701 1742 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1702 1743 * release any lock.
1703 1744 */
1704 1745 dnode_rele(dn, db);
1705 1746 db->db_dnode_handle = NULL;
1706 1747 } else {
1707 1748 DB_DNODE_EXIT(db);
1708 1749 }
1709 1750
1710 1751 if (db->db_buf)
1711 1752 dbuf_gone = arc_clear_callback(db->db_buf);
1712 1753
1713 1754 if (!dbuf_gone)
1714 1755 mutex_exit(&db->db_mtx);
1715 1756
1716 1757 /*
1717 1758 * If this dbuf is referenced from an indirect dbuf,
1718 1759 * decrement the ref count on the indirect dbuf.
1719 1760 */
1720 1761 if (parent && parent != dndb)
1721 1762 dbuf_rele(parent, db);
1722 1763 }
1723 1764
/*
 * Locate the parent dbuf and block pointer that reference block
 * (level, blkid) of dn.  On success, *parentp (if set) carries a hold
 * that the caller must drop with dbuf_rele(), and *bpp points at the
 * referencing blkptr within the parent (or the dnode_phys_t).  Returns
 * ENOENT if the block has no parent at this level.  The caller must
 * hold dn_struct_rwlock.
 */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		/*
		 * The spill block is referenced directly from the dnode;
		 * its parent is the dnode's own dbuf.  *bpp may be NULL
		 * if no spill blkptr currently exists.
		 */
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	/* Treat an uninitialized on-disk dnode as single-level. */
	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	/* Block pointers per indirect block, as a shift. */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			/* Drop the hold taken above before failing. */
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}
1790 1831
/*
 * Allocate and initialize a dbuf for block (level, blkid) of dn, insert
 * it into the dbuf hash table and the dnode's dn_dbufs AVL tree, and
 * take a hold on the dnode for it.  If another thread races us and
 * inserts a dbuf for the same block first, the freshly allocated dbuf
 * is discarded and the existing one is returned instead.  The caller
 * must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* Bonus size is whatever the blkptr array doesn't use. */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
1873 1915
/*
 * Evict a hold-free dbuf: either clear it (DB_CACHED) or, if it is
 * already in DB_EVICTING, finish tearing it down.  May be entered with
 * db_mtx already held by the caller; it acquires the lock only if not.
 * NOTE(review): this appears to be the ARC clear-callback path (see
 * arc_clear_callback() use above) -- confirm against the registration
 * site before relying on that.
 */
static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		/* Already being evicted; just free the structure. */
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}
1895 1937
/*
 * Free the in-memory state of a hold-free dbuf.  Non-bonus dbufs are
 * unhooked from the dnode's dn_dbufs tree (if still linked) and from
 * the dbuf hash table before the structure is returned to the cache.
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
1938 1980
/*
 * Issue an asynchronous speculative read of level-0 block blkid of dn
 * at the given priority.  No dbuf is created for the block; the data
 * only lands in the ARC.  No-op if the block was freed in this txg or
 * is already cached as a dbuf.  Caller must hold dn_struct_rwlock.
 */
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	/* Nothing useful to read if the block is already freed. */
	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) {
		/*
		 * This dbuf is already in the cache. We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			arc_flags_t aflags =
			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			/* Fire-and-forget; errors are acceptable here. */
			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		/* dbuf_findbp() may have handed back a held parent dbuf. */
		if (db)
			dbuf_rele(db, NULL);
	}
}
1981 2023
/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 *
 * Looks up (or creates) the dbuf for block (level, blkid) of dn and
 * takes a hold on it for `tag'.  On success *dbp is the held dbuf and
 * 0 is returned; on failure a nonzero errno is returned and *dbp is
 * NULL.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* A hole counts as missing when fail_sparse is set. */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			/*
			 * The ARC buffer was evicted out from under us;
			 * drop this dbuf and retry the lookup from scratch.
			 */
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
2072 2114
2073 2115 dmu_buf_impl_t *
2074 2116 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2075 2117 {
2076 2118 dmu_buf_impl_t *db;
2077 2119 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
2078 2120 return (err ? NULL : db);
2079 2121 }
2080 2122
2081 2123 dmu_buf_impl_t *
2082 2124 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2083 2125 {
2084 2126 dmu_buf_impl_t *db;
2085 2127 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
2086 2128 return (err ? NULL : db);
2087 2129 }
2088 2130
/*
 * Create the in-memory bonus dbuf for dn.  The bonus dbuf is parented
 * by the dnode's own dbuf and is not entered into the hash table (see
 * dbuf_create()).  Caller must hold dn_struct_rwlock as writer and must
 * not already have a bonus dbuf attached.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
2097 2139
/*
 * Change the block size of an object's spill block.  Returns ENOTSUP if
 * db is not the spill dbuf.  A blksz of 0 selects SPA_MINBLOCKSIZE; the
 * size is rounded up to a SPA_MINBLOCKSIZE multiple and must not exceed
 * the pool's maximum block size.
 */
int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (SET_ERROR(ENOTSUP));
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* dbuf_new_size() requires dn_struct_rwlock held as writer. */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}
2120 2162
2121 2163 void
2122 2164 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2123 2165 {
2124 2166 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2125 2167 }
2126 2168
2127 2169 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2128 2170 void
2129 2171 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2130 2172 {
2131 2173 int64_t holds = refcount_add(&db->db_holds, tag);
2132 2174 ASSERT(holds > 1);
2133 2175 }
2134 2176
2135 2177 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2136 2178 boolean_t
2137 2179 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2138 2180 void *tag)
2139 2181 {
2140 2182 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2141 2183 dmu_buf_impl_t *found_db;
2142 2184 boolean_t result = B_FALSE;
2143 2185
2144 2186 if (db->db_blkid == DMU_BONUS_BLKID)
2145 2187 found_db = dbuf_find_bonus(os, obj);
2146 2188 else
2147 2189 found_db = dbuf_find(os, obj, 0, blkid);
2148 2190
2149 2191 if (found_db != NULL) {
2150 2192 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2151 2193 (void) refcount_add(&db->db_holds, tag);
2152 2194 result = B_TRUE;
2153 2195 }
2154 2196 mutex_exit(&db->db_mtx);
2155 2197 }
2156 2198 return (result);
2157 2199 }
2158 2200
/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
 *
 * Drop one hold on db; must be called without db_mtx held.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
2172 2214
2173 2215 void
2174 2216 dmu_buf_rele(dmu_buf_t *db, void *tag)
2175 2217 {
2176 2218 dbuf_rele((dmu_buf_impl_t *)db, tag);
2177 2219 }
2178 2220
/*
 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 *
 * Drops one hold and, when the last hold goes away, decides the fate of
 * the dbuf: bonus buffers release their dnode hold (evicting the bonus
 * first if eviction is pending); other dbufs are evicted, cleared, or
 * left cached depending on their ARC state and cacheability.  Always
 * exits db_mtx.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_user_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			dnode_t *dn;
			/* Sample the flag before db can be freed. */
			boolean_t evict_dbuf = db->db_pending_evict;

			/*
			 * If the dnode moves here, we cannot cross this
			 * barrier until the move completes.
			 */
			DB_DNODE_ENTER(db);

			dn = DB_DNODE(db);
			atomic_dec_32(&dn->dn_dbufs_count);

			/*
			 * Decrementing the dbuf count means that the bonus
			 * buffer's dnode hold is no longer discounted in
			 * dnode_move(). The dnode cannot move until after
			 * the dnode_rele() below.
			 */
			DB_DNODE_EXIT(db);

			/*
			 * Do not reference db after its lock is dropped.
			 * Another thread may evict it.
			 */
			mutex_exit(&db->db_mtx);

			if (evict_dbuf)
				dnode_evict_bonus(dn);

			dnode_rele(dn, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_clear_data(db);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db)) {
				if (db->db_blkptr != NULL &&
				    !BP_IS_HOLE(db->db_blkptr) &&
				    !BP_IS_EMBEDDED(db->db_blkptr)) {
					/*
					 * Copy the bp before dbuf_clear()
					 * frees our reference to it.
					 */
					spa_t *spa =
					    dmu_objset_spa(db->db_objset);
					blkptr_t bp = *db->db_blkptr;
					dbuf_clear(db);
					arc_freed(spa, &bp);
				} else {
					dbuf_clear(db);
				}
			} else if (db->db_pending_evict ||
			    arc_buf_eviction_needed(db->db_buf)) {
				dbuf_clear(db);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
2323 2341
2324 2342 #pragma weak dmu_buf_refcount = dbuf_refcount
2325 2343 uint64_t
2326 2344 dbuf_refcount(dmu_buf_impl_t *db)
2327 2345 {
2328 2346 return (refcount_count(&db->db_holds));
2329 2347 }
2330 2348
2331 2349 void *
2332 2350 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2333 2351 dmu_buf_user_t *new_user)
2334 2352 {
2335 2353 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2336 2354
2337 2355 mutex_enter(&db->db_mtx);
2338 2356 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2339 2357 if (db->db_user == old_user)
2340 2358 db->db_user = new_user;
2341 2359 else
2342 2360 old_user = db->db_user;
2343 2361 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2344 2362 mutex_exit(&db->db_mtx);
2345 2363
2346 2364 return (old_user);
2347 2365 }
2348 2366
2349 2367 void *
|
↓ open down ↓ |
27 lines elided |
↑ open up ↑ |
2350 2368 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2351 2369 {
2352 2370 return (dmu_buf_replace_user(db_fake, NULL, user));
2353 2371 }
2354 2372
2355 2373 void *
2356 2374 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2357 2375 {
2358 2376 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2359 2377
2360 - db->db_immediate_evict = TRUE;
2378 + db->db_user_immediate_evict = TRUE;
2361 2379 return (dmu_buf_set_user(db_fake, user));
2362 2380 }
2363 2381
2364 2382 void *
2365 2383 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2366 2384 {
2367 2385 return (dmu_buf_replace_user(db_fake, user, NULL));
2368 2386 }
2369 2387
2370 2388 void *
2371 2389 dmu_buf_get_user(dmu_buf_t *db_fake)
2372 2390 {
2373 2391 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2374 2392
2375 2393 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2376 2394 return (db->db_user);
2377 2395 }
2378 2396
2379 2397 void
2380 2398 dmu_buf_user_evict_wait()
2381 2399 {
2382 2400 taskq_wait(dbu_evict_taskq);
2383 2401 }
2384 2402
2385 2403 boolean_t
2386 2404 dmu_buf_freeable(dmu_buf_t *dbuf)
2387 2405 {
2388 2406 boolean_t res = B_FALSE;
2389 2407 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2390 2408
2391 2409 if (db->db_blkptr)
2392 2410 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2393 2411 db->db_blkptr, db->db_blkptr->blk_birth);
2394 2412
2395 2413 return (res);
2396 2414 }
2397 2415
2398 2416 blkptr_t *
2399 2417 dmu_buf_get_blkptr(dmu_buf_t *db)
2400 2418 {
2401 2419 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2402 2420 return (dbi->db_blkptr);
2403 2421 }
2404 2422
/*
 * Ensure db has a valid db_blkptr, hooking it up to its parent (the
 * dnode's dbuf or an indirect dbuf) if necessary.  May temporarily drop
 * and reacquire db_mtx to obtain the parent dbuf -- callers must not
 * rely on state examined before this call (see dbuf_sync_leaf()).
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/* db_mtx dropped here; see function comment. */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
2449 2467
/*
 * Sync an indirect dbuf for the given dirty record: make sure the block
 * is resident and hooked to its parent, issue its write, then sync the
 * dirty children recorded under the dirty record.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/* Sync all children beneath this zio, then issue it. */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2495 2513
/*
 * Sync a level-0 dbuf for the given dirty record.  Bonus buffers are
 * copied directly into the dnode_phys_t; all other buffers get a write
 * issued via dbuf_write(), possibly after copying the data so that
 * open-txg modifications cannot leak into this txg's write.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		/* Record that this dnode now carries a spill blkptr. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* Free the private copy made by dbuf_dirty, if any. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink and free this dirty record. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		/* Meta-dnode zios are collected and waited on later. */
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
2630 2648
/*
 * Sync every dirty record on the list, removing each as it is
 * processed.  Stops early when it encounters a record whose zio has
 * already been created (the meta-dnode case; see comment below).
 */
void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}
2656 2674
/* ARGSUSED */
/*
 * Write "ready" callback for a dbuf's zio: account the space delta in
 * the dnode, compute the new fill count for the block pointer, and
 * publish the bp (a copy held in the dirty record) back into the
 * dbuf's db_blkptr.
 */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT3P(db->db_blkptr, !=, NULL);
	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in allocated size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(bp)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Fill = number of allocated dnodes in the block. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Fill = sum of the fill counts of all non-hole children. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);

	/* Publish the finished bp into the dbuf's on-disk location. */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	*db->db_blkptr = *bp;
	rw_exit(&dn->dn_struct_rwlock);
}
2739 2757
2740 2758 /*
2741 2759 * The SPA will call this callback several times for each zio - once
2742 2760 * for every physical child i/o (zio->io_phys_children times). This
2743 2761 * allows the DMU to monitor the progress of each logical i/o. For example,
2744 2762 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2745 2763 * block. There may be a long delay before all copies/fragments are completed,
2746 2764 * so this callback allows us to retire dirty space gradually, as the physical
2747 2765 * i/os complete.
2748 2766 */
2749 2767 /* ARGSUSED */
2750 2768 static void
2751 2769 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2752 2770 {
2753 2771 dmu_buf_impl_t *db = arg;
2754 2772 objset_t *os = db->db_objset;
2755 2773 dsl_pool_t *dp = dmu_objset_pool(os);
2756 2774 dbuf_dirty_record_t *dr;
2757 2775 int delta = 0;
2758 2776
2759 2777 dr = db->db_data_pending;
2760 2778 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2761 2779
2762 2780 /*
2763 2781 * The callback will be called io_phys_children times. Retire one
2764 2782 * portion of our dirty space each time we are called. Any rounding
2765 2783 * error will be cleaned up by dsl_pool_sync()'s call to
2766 2784 * dsl_pool_undirty_space().
2767 2785 */
2768 2786 delta = dr->dr_accounted / zio->io_phys_children;
2769 2787 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2770 2788 }
2771 2789
/*
 * zio "done" callback for a dbuf write: the write and all of its
 * physical children have completed.  Update dataset block accounting,
 * unlink and free the dirty record being synced, and drop the dirty
 * hold taken on the dbuf when it was dirtied.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		/* Kill the old block and record the birth of the new one. */
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Unlink the dirty record being synced (db_data_pending). */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	/* It must be the oldest (tail) dirty record for this dbuf. */
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * If the syncer made a private copy of the data
			 * (dr_data != db_buf), release that copy now;
			 * otherwise arrange for ARC eviction of db_buf.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		/* All children of this indirect have finished syncing. */
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		/* Tear down the per-record child list and its lock. */
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	/* Wake anyone waiting in dbuf_read()/dbuf_will_dirty() on us. */
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	/* Drop the hold taken when the dbuf was dirtied; releases db_mtx. */
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
2862 2880
2863 2881 static void
2864 2882 dbuf_write_nofill_ready(zio_t *zio)
2865 2883 {
2866 2884 dbuf_write_ready(zio, NULL, zio->io_private);
2867 2885 }
2868 2886
2869 2887 static void
2870 2888 dbuf_write_nofill_done(zio_t *zio)
2871 2889 {
2872 2890 dbuf_write_done(zio, NULL, zio->io_private);
2873 2891 }
2874 2892
2875 2893 static void
2876 2894 dbuf_write_override_ready(zio_t *zio)
2877 2895 {
2878 2896 dbuf_dirty_record_t *dr = zio->io_private;
2879 2897 dmu_buf_impl_t *db = dr->dr_dbuf;
2880 2898
2881 2899 dbuf_write_ready(zio, NULL, db);
2882 2900 }
2883 2901
2884 2902 static void
2885 2903 dbuf_write_override_done(zio_t *zio)
2886 2904 {
2887 2905 dbuf_dirty_record_t *dr = zio->io_private;
2888 2906 dmu_buf_impl_t *db = dr->dr_dbuf;
2889 2907 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2890 2908
2891 2909 mutex_enter(&db->db_mtx);
2892 2910 if (!BP_EQUAL(zio->io_bp, obp)) {
2893 2911 if (!BP_IS_HOLE(obp))
2894 2912 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2895 2913 arc_release(dr->dt.dl.dr_data, db);
2896 2914 }
2897 2915 mutex_exit(&db->db_mtx);
2898 2916
2899 2917 dbuf_write_done(zio, NULL, db);
2900 2918 }
2901 2919
/*
 * Issue I/O to commit a dirty buffer to disk.
 *
 * Builds the write zio for dirty record "dr" (whose data is in "data"),
 * as a child of either the parent indirect block's zio or the dnode's
 * zio, and stores it in dr->dr_zio for the caller to zio_nowait()/wait.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	ASSERT(dmu_tx_is_syncing(tx));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * We copy the blkptr now (rather than when we instantiate the dirty
	 * record), because its value can change between open context and
	 * syncing context. We do not need to hold dn_struct_rwlock to read
	 * db_blkptr because we are in syncing context.
	 */
	dr->dr_bp_copy = *db->db_blkptr;

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync() or dmu_buf_write_embedded()).
		 */
		void *contents = (data != NULL) ? data->b_data : NULL;

		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/*
		 * DB_NOFILL: issue the write with no data payload
		 * (ZIO_FLAG_NODATA); the checksum must be off (or
		 * NOPARITY), as asserted below.
		 */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		/* Common case: write the ARC buffer via arc_write(). */
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
|
↓ open down ↓ |
647 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX