5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 28 */
28 29
29 30 #include <sys/zfs_context.h>
30 31 #include <sys/dmu.h>
31 32 #include <sys/dmu_send.h>
32 33 #include <sys/dmu_impl.h>
33 34 #include <sys/dbuf.h>
34 35 #include <sys/dmu_objset.h>
35 36 #include <sys/dsl_dataset.h>
36 37 #include <sys/dsl_dir.h>
37 38 #include <sys/dmu_tx.h>
38 39 #include <sys/spa.h>
39 40 #include <sys/zio.h>
40 41 #include <sys/dmu_zfetch.h>
41 42 #include <sys/sa.h>
42 43 #include <sys/sa_impl.h>
43 44 #include <sys/zfeature.h>
44 45 #include <sys/blkptr.h>
45 46 #include <sys/range_tree.h>
46 47
47 48 /*
48 49 * Number of times that zfs_free_range() took the slow path while doing
49 50 * a zfs receive. A nonzero value indicates a potential performance problem.
50 51 */
51 52 uint64_t zfs_free_range_recv_miss;
52 53
53 54 static void dbuf_destroy(dmu_buf_impl_t *db);
54 55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55 56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56 57
58 +#ifndef __lint
59 +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
60 + dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
61 +#endif /* ! __lint */
62 +
57 63 /*
58 64 * Global data structures and functions for the dbuf cache.
59 65 */
60 66 static kmem_cache_t *dbuf_cache;
67 +static taskq_t *dbu_evict_taskq;
61 68
62 69 /* ARGSUSED */
63 70 static int
64 71 dbuf_cons(void *vdb, void *unused, int kmflag)
65 72 {
66 73 dmu_buf_impl_t *db = vdb;
67 74 bzero(db, sizeof (dmu_buf_impl_t));
68 75
69 76 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
70 77 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
71 78 refcount_create(&db->db_holds);
72 79
73 80 return (0);
74 81 }
75 82
76 83 /* ARGSUSED */
77 84 static void
78 85 dbuf_dest(void *vdb, void *unused)
79 86 {
80 87 dmu_buf_impl_t *db = vdb;
81 88 mutex_destroy(&db->db_mtx);
82 89 cv_destroy(&db->db_changed);
83 90 refcount_destroy(&db->db_holds);
84 91 }
85 92
86 93 /*
87 94 * dbuf hash table routines
88 95 */
89 96 static dbuf_hash_table_t dbuf_hash_table;
90 97
91 98 static uint64_t dbuf_hash_count;
92 99
93 100 static uint64_t
94 101 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
95 102 {
96 103 uintptr_t osv = (uintptr_t)os;
97 104 uint64_t crc = -1ULL;
98 105
99 106 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
100 107 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
101 108 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
102 109 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
103 110 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
104 111 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
105 112 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
106 113
107 114 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
108 115
109 116 return (crc);
110 117 }
111 118
112 119 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
113 120
114 121 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
115 122 ((dbuf)->db.db_object == (obj) && \
116 123 (dbuf)->db_objset == (os) && \
117 124 (dbuf)->db_level == (level) && \
118 125 (dbuf)->db_blkid == (blkid))
119 126
120 127 dmu_buf_impl_t *
121 128 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
122 129 {
123 130 dbuf_hash_table_t *h = &dbuf_hash_table;
124 131 objset_t *os = dn->dn_objset;
125 132 uint64_t obj = dn->dn_object;
126 133 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
127 134 uint64_t idx = hv & h->hash_table_mask;
128 135 dmu_buf_impl_t *db;
129 136
130 137 mutex_enter(DBUF_HASH_MUTEX(h, idx));
131 138 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
132 139 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
133 140 mutex_enter(&db->db_mtx);
134 141 if (db->db_state != DB_EVICTING) {
135 142 mutex_exit(DBUF_HASH_MUTEX(h, idx));
136 143 return (db);
137 144 }
138 145 mutex_exit(&db->db_mtx);
139 146 }
140 147 }
141 148 mutex_exit(DBUF_HASH_MUTEX(h, idx));
142 149 return (NULL);
143 150 }
144 151
145 152 /*
146 153 * Insert an entry into the hash table. If there is already an element
147 154 * equal to elem in the hash table, then the already existing element
148 155 * will be returned and the new element will not be inserted.
149 156 * Otherwise returns NULL.
150 157 */
151 158 static dmu_buf_impl_t *
152 159 dbuf_hash_insert(dmu_buf_impl_t *db)
153 160 {
154 161 dbuf_hash_table_t *h = &dbuf_hash_table;
155 162 objset_t *os = db->db_objset;
156 163 uint64_t obj = db->db.db_object;
157 164 int level = db->db_level;
158 165 uint64_t blkid = db->db_blkid;
159 166 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
160 167 uint64_t idx = hv & h->hash_table_mask;
161 168 dmu_buf_impl_t *dbf;
162 169
163 170 mutex_enter(DBUF_HASH_MUTEX(h, idx));
164 171 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
165 172 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
166 173 mutex_enter(&dbf->db_mtx);
167 174 if (dbf->db_state != DB_EVICTING) {
168 175 mutex_exit(DBUF_HASH_MUTEX(h, idx));
169 176 return (dbf);
170 177 }
171 178 mutex_exit(&dbf->db_mtx);
172 179 }
173 180 }
174 181
175 182 mutex_enter(&db->db_mtx);
176 183 db->db_hash_next = h->hash_table[idx];
177 184 h->hash_table[idx] = db;
178 185 mutex_exit(DBUF_HASH_MUTEX(h, idx));
179 186 atomic_inc_64(&dbuf_hash_count);
180 187
181 188 return (NULL);
182 189 }
183 190
184 191 /*
185 192 * Remove an entry from the hash table. It must be in the EVICTING state.
186 193 */
187 194 static void
188 195 dbuf_hash_remove(dmu_buf_impl_t *db)
189 196 {
190 197 dbuf_hash_table_t *h = &dbuf_hash_table;
191 198 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
192 199 db->db_level, db->db_blkid);
193 200 uint64_t idx = hv & h->hash_table_mask;
194 201 dmu_buf_impl_t *dbf, **dbp;
195 202
196 203 /*
197 204 * We mustn't hold db_mtx to maintain lock ordering:
198 205 * DBUF_HASH_MUTEX > db_mtx.
199 206 */
200 207 ASSERT(refcount_is_zero(&db->db_holds));
201 208 ASSERT(db->db_state == DB_EVICTING);
202 209 ASSERT(!MUTEX_HELD(&db->db_mtx));
203 210
204 211 mutex_enter(DBUF_HASH_MUTEX(h, idx));
205 212 dbp = &h->hash_table[idx];
206 213 while ((dbf = *dbp) != db) {
207 214 dbp = &dbf->db_hash_next;
208 215 ASSERT(dbf != NULL);
209 216 }
210 217 *dbp = db->db_hash_next;
211 218 db->db_hash_next = NULL;
212 219 mutex_exit(DBUF_HASH_MUTEX(h, idx));
213 220 atomic_dec_64(&dbuf_hash_count);
214 221 }
215 222
216 223 static arc_evict_func_t dbuf_do_evict;
217 224
225 +typedef enum {
226 + DBVU_EVICTING,
227 + DBVU_NOT_EVICTING
228 +} dbvu_verify_type_t;
229 +
218 230 static void
231 +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
232 +{
233 +#ifdef ZFS_DEBUG
234 + int64_t holds;
235 +
236 + if (db->db_user == NULL)
237 + return;
238 +
239 + /* Only data blocks support the attachment of user data. */
240 + ASSERT(db->db_level == 0);
241 +
242 + /* Clients must resolve a dbuf before attaching user data. */
243 + ASSERT(db->db.db_data != NULL);
244 + ASSERT3U(db->db_state, ==, DB_CACHED);
245 +
246 + holds = refcount_count(&db->db_holds);
247 + if (verify_type == DBVU_EVICTING) {
248 + /*
249 + * Immediate eviction occurs when holds == dirtycnt.
250 + * For normal eviction buffers, holds is zero on
251 + * eviction, except when dbuf_fix_old_data() calls
252 + * dbuf_clear_data(). However, the hold count can grow
253 + * during eviction even though db_mtx is held (see
254 + * dmu_bonus_hold() for an example), so we can only
255 + * test the generic invariant that holds >= dirtycnt.
256 + */
257 + ASSERT3U(holds, >=, db->db_dirtycnt);
258 + } else {
259 + if (db->db_immediate_evict == TRUE)
260 + ASSERT3U(holds, >=, db->db_dirtycnt);
261 + else
262 + ASSERT3U(holds, >, 0);
263 + }
264 +#endif
265 +}
266 +
267 +static void
219 268 dbuf_evict_user(dmu_buf_impl_t *db)
220 269 {
270 + dmu_buf_user_t *dbu = db->db_user;
271 +
221 272 ASSERT(MUTEX_HELD(&db->db_mtx));
222 273
223 - if (db->db_level != 0 || db->db_evict_func == NULL)
274 + if (dbu == NULL)
224 275 return;
225 276
226 - db->db_evict_func(&db->db, db->db_user_ptr);
227 - db->db_user_ptr = NULL;
228 - db->db_evict_func = NULL;
277 + dbuf_verify_user(db, DBVU_EVICTING);
278 + db->db_user = NULL;
279 +
280 +#ifdef ZFS_DEBUG
281 + if (dbu->dbu_clear_on_evict_dbufp != NULL)
282 + *dbu->dbu_clear_on_evict_dbufp = NULL;
283 +#endif
284 +
285 + /*
286 + * Invoke the callback from a taskq to avoid lock order reversals
287 + * and limit stack depth.
288 + */
289 + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
290 + &dbu->dbu_tqent);
229 291 }
230 292
231 293 boolean_t
232 294 dbuf_is_metadata(dmu_buf_impl_t *db)
233 295 {
234 296 if (db->db_level > 0) {
235 297 return (B_TRUE);
236 298 } else {
237 299 boolean_t is_metadata;
238 300
239 301 DB_DNODE_ENTER(db);
240 302 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
241 303 DB_DNODE_EXIT(db);
242 304
243 305 return (is_metadata);
244 306 }
245 307 }
246 308
247 309 void
248 310 dbuf_evict(dmu_buf_impl_t *db)
249 311 {
250 312 ASSERT(MUTEX_HELD(&db->db_mtx));
251 313 ASSERT(db->db_buf == NULL);
252 314 ASSERT(db->db_data_pending == NULL);
253 315
254 316 dbuf_clear(db);
255 317 dbuf_destroy(db);
256 318 }
257 319
258 320 void
259 321 dbuf_init(void)
260 322 {
261 323 uint64_t hsize = 1ULL << 16;
262 324 dbuf_hash_table_t *h = &dbuf_hash_table;
263 325 int i;
264 326
265 327 /*
266 328 * The hash table is big enough to fill all of physical memory
267 329 * with an average 4K block size. The table will take up
268 330 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
269 331 */
270 332 while (hsize * 4096 < physmem * PAGESIZE)
271 333 hsize <<= 1;
272 334
273 335 retry:
274 336 h->hash_table_mask = hsize - 1;
275 337 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
276 338 if (h->hash_table == NULL) {
277 339 /* XXX - we should really return an error instead of assert */
278 340 ASSERT(hsize > (1ULL << 10));
279 341 hsize >>= 1;
280 342 goto retry;
281 343 }
282 344
283 345 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
284 346 sizeof (dmu_buf_impl_t),
285 347 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
286 348
287 349 for (i = 0; i < DBUF_MUTEXES; i++)
288 350 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
351 +
352 + /*
353 + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
354 + * configuration is not required.
355 + */
356 + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
289 357 }
290 358
291 359 void
292 360 dbuf_fini(void)
293 361 {
294 362 dbuf_hash_table_t *h = &dbuf_hash_table;
295 363 int i;
296 364
297 365 for (i = 0; i < DBUF_MUTEXES; i++)
298 366 mutex_destroy(&h->hash_mutexes[i]);
299 367 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
300 368 kmem_cache_destroy(dbuf_cache);
369 + taskq_destroy(dbu_evict_taskq);
301 370 }
302 371
303 372 /*
304 373 * Other stuff.
305 374 */
306 375
307 376 #ifdef ZFS_DEBUG
308 377 static void
309 378 dbuf_verify(dmu_buf_impl_t *db)
310 379 {
311 380 dnode_t *dn;
312 381 dbuf_dirty_record_t *dr;
313 382
314 383 ASSERT(MUTEX_HELD(&db->db_mtx));
315 384
316 385 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
317 386 return;
318 387
319 388 ASSERT(db->db_objset != NULL);
320 389 DB_DNODE_ENTER(db);
321 390 dn = DB_DNODE(db);
322 391 if (dn == NULL) {
323 392 ASSERT(db->db_parent == NULL);
324 393 ASSERT(db->db_blkptr == NULL);
325 394 } else {
326 395 ASSERT3U(db->db.db_object, ==, dn->dn_object);
327 396 ASSERT3P(db->db_objset, ==, dn->dn_objset);
328 397 ASSERT3U(db->db_level, <, dn->dn_nlevels);
329 398 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
330 399 db->db_blkid == DMU_SPILL_BLKID ||
331 400 !avl_is_empty(&dn->dn_dbufs));
332 401 }
333 402 if (db->db_blkid == DMU_BONUS_BLKID) {
334 403 ASSERT(dn != NULL);
335 404 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
336 405 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
337 406 } else if (db->db_blkid == DMU_SPILL_BLKID) {
338 407 ASSERT(dn != NULL);
339 408 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
340 409 ASSERT0(db->db.db_offset);
341 410 } else {
342 411 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
343 412 }
344 413
345 414 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
346 415 ASSERT(dr->dr_dbuf == db);
347 416
348 417 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
349 418 ASSERT(dr->dr_dbuf == db);
350 419
351 420 /*
352 421 * We can't assert that db_size matches dn_datablksz because it
353 422 * can be momentarily different when another thread is doing
354 423 * dnode_set_blksz().
355 424 */
356 425 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
357 426 dr = db->db_data_pending;
358 427 /*
359 428 * It should only be modified in syncing context, so
360 429 * make sure we only have one copy of the data.
361 430 */
362 431 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
363 432 }
364 433
365 434 /* verify db->db_blkptr */
366 435 if (db->db_blkptr) {
367 436 if (db->db_parent == dn->dn_dbuf) {
368 437 /* db is pointed to by the dnode */
369 438 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
370 439 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
371 440 ASSERT(db->db_parent == NULL);
372 441 else
373 442 ASSERT(db->db_parent != NULL);
374 443 if (db->db_blkid != DMU_SPILL_BLKID)
375 444 ASSERT3P(db->db_blkptr, ==,
376 445 &dn->dn_phys->dn_blkptr[db->db_blkid]);
377 446 } else {
378 447 /* db is pointed to by an indirect block */
379 448 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
380 449 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
381 450 ASSERT3U(db->db_parent->db.db_object, ==,
382 451 db->db.db_object);
383 452 /*
384 453 * dnode_grow_indblksz() can make this fail if we don't
385 454 * have the struct_rwlock. XXX indblksz no longer
386 455 * grows. safe to do this now?
387 456 */
388 457 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
389 458 ASSERT3P(db->db_blkptr, ==,
390 459 ((blkptr_t *)db->db_parent->db.db_data +
391 460 db->db_blkid % epb));
392 461 }
393 462 }
394 463 }
395 464 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
396 465 (db->db_buf == NULL || db->db_buf->b_data) &&
397 466 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
398 467 db->db_state != DB_FILL && !dn->dn_free_txg) {
399 468 /*
400 469 * If the blkptr isn't set but they have nonzero data,
401 470 * it had better be dirty, otherwise we'll lose that
402 471 * data when we evict this buffer.
403 472 */
404 473 if (db->db_dirtycnt == 0) {
405 474 uint64_t *buf = db->db.db_data;
406 475 int i;
407 476
408 477 for (i = 0; i < db->db.db_size >> 3; i++) {
409 478 ASSERT(buf[i] == 0);
410 479 }
411 480 }
412 481 }
413 482 DB_DNODE_EXIT(db);
414 483 }
415 484 #endif
416 485
417 486 static void
487 +dbuf_clear_data(dmu_buf_impl_t *db)
488 +{
489 + ASSERT(MUTEX_HELD(&db->db_mtx));
490 + dbuf_evict_user(db);
491 + db->db_buf = NULL;
492 + db->db.db_data = NULL;
493 + if (db->db_state != DB_NOFILL)
494 + db->db_state = DB_UNCACHED;
495 +}
496 +
497 +static void
418 498 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
419 499 {
420 500 ASSERT(MUTEX_HELD(&db->db_mtx));
501 + ASSERT(buf != NULL);
502 +
421 503 db->db_buf = buf;
422 - if (buf != NULL) {
423 - ASSERT(buf->b_data != NULL);
424 - db->db.db_data = buf->b_data;
425 - if (!arc_released(buf))
426 - arc_set_callback(buf, dbuf_do_evict, db);
427 - } else {
428 - dbuf_evict_user(db);
429 - db->db.db_data = NULL;
430 - if (db->db_state != DB_NOFILL)
431 - db->db_state = DB_UNCACHED;
432 - }
504 + ASSERT(buf->b_data != NULL);
505 + db->db.db_data = buf->b_data;
506 + if (!arc_released(buf))
507 + arc_set_callback(buf, dbuf_do_evict, db);
433 508 }
434 509
435 510 /*
436 511 * Loan out an arc_buf for read. Return the loaned arc_buf.
437 512 */
438 513 arc_buf_t *
439 514 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
440 515 {
441 516 arc_buf_t *abuf;
442 517
443 518 mutex_enter(&db->db_mtx);
444 519 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
445 520 int blksz = db->db.db_size;
446 521 spa_t *spa = db->db_objset->os_spa;
447 522
448 523 mutex_exit(&db->db_mtx);
449 524 abuf = arc_loan_buf(spa, blksz);
450 525 bcopy(db->db.db_data, abuf->b_data, blksz);
451 526 } else {
452 527 abuf = db->db_buf;
453 528 arc_loan_inuse_buf(abuf, db);
454 - dbuf_set_data(db, NULL);
529 + dbuf_clear_data(db);
455 530 mutex_exit(&db->db_mtx);
456 531 }
457 532 return (abuf);
458 533 }
459 534
460 535 uint64_t
461 536 dbuf_whichblock(dnode_t *dn, uint64_t offset)
462 537 {
463 538 if (dn->dn_datablkshift) {
464 539 return (offset >> dn->dn_datablkshift);
465 540 } else {
466 541 ASSERT3U(offset, <, dn->dn_datablksz);
467 542 return (0);
468 543 }
469 544 }
470 545
471 546 static void
472 547 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
473 548 {
474 549 dmu_buf_impl_t *db = vdb;
475 550
476 551 mutex_enter(&db->db_mtx);
477 552 ASSERT3U(db->db_state, ==, DB_READ);
478 553 /*
479 554 * All reads are synchronous, so we must have a hold on the dbuf
480 555 */
481 556 ASSERT(refcount_count(&db->db_holds) > 0);
482 557 ASSERT(db->db_buf == NULL);
483 558 ASSERT(db->db.db_data == NULL);
484 559 if (db->db_level == 0 && db->db_freed_in_flight) {
485 560 /* we were freed in flight; disregard any error */
486 561 arc_release(buf, db);
487 562 bzero(buf->b_data, db->db.db_size);
488 563 arc_buf_freeze(buf);
489 564 db->db_freed_in_flight = FALSE;
490 565 dbuf_set_data(db, buf);
491 566 db->db_state = DB_CACHED;
492 567 } else if (zio == NULL || zio->io_error == 0) {
493 568 dbuf_set_data(db, buf);
494 569 db->db_state = DB_CACHED;
495 570 } else {
496 571 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
497 572 ASSERT3P(db->db_buf, ==, NULL);
498 573 VERIFY(arc_buf_remove_ref(buf, db));
499 574 db->db_state = DB_UNCACHED;
500 575 }
501 576 cv_broadcast(&db->db_changed);
502 577 dbuf_rele_and_unlock(db, NULL);
503 578 }
504 579
505 580 static void
506 581 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
507 582 {
508 583 dnode_t *dn;
509 584 zbookmark_phys_t zb;
510 585 arc_flags_t aflags = ARC_FLAG_NOWAIT;
511 586
512 587 DB_DNODE_ENTER(db);
513 588 dn = DB_DNODE(db);
514 589 ASSERT(!refcount_is_zero(&db->db_holds));
515 590 /* We need the struct_rwlock to prevent db_blkptr from changing. */
516 591 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
517 592 ASSERT(MUTEX_HELD(&db->db_mtx));
518 593 ASSERT(db->db_state == DB_UNCACHED);
519 594 ASSERT(db->db_buf == NULL);
520 595
521 596 if (db->db_blkid == DMU_BONUS_BLKID) {
522 597 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
523 598
524 599 ASSERT3U(bonuslen, <=, db->db.db_size);
525 600 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
526 601 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
527 602 if (bonuslen < DN_MAX_BONUSLEN)
528 603 bzero(db->db.db_data, DN_MAX_BONUSLEN);
529 604 if (bonuslen)
530 605 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
531 606 DB_DNODE_EXIT(db);
532 607 db->db_state = DB_CACHED;
533 608 mutex_exit(&db->db_mtx);
534 609 return;
535 610 }
536 611
537 612 /*
538 613 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
539 614 * processes the delete record and clears the bp while we are waiting
540 615 * for the dn_mtx (resulting in a "no" from block_freed).
541 616 */
542 617 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
543 618 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
544 619 BP_IS_HOLE(db->db_blkptr)))) {
545 620 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
546 621
547 622 DB_DNODE_EXIT(db);
548 623 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
549 624 db->db.db_size, db, type));
550 625 bzero(db->db.db_data, db->db.db_size);
551 626 db->db_state = DB_CACHED;
552 627 *flags |= DB_RF_CACHED;
553 628 mutex_exit(&db->db_mtx);
554 629 return;
555 630 }
556 631
557 632 DB_DNODE_EXIT(db);
558 633
559 634 db->db_state = DB_READ;
560 635 mutex_exit(&db->db_mtx);
561 636
562 637 if (DBUF_IS_L2CACHEABLE(db))
563 638 aflags |= ARC_FLAG_L2CACHE;
564 639 if (DBUF_IS_L2COMPRESSIBLE(db))
565 640 aflags |= ARC_FLAG_L2COMPRESS;
566 641
567 642 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
568 643 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
569 644 db->db.db_object, db->db_level, db->db_blkid);
570 645
571 646 dbuf_add_ref(db, NULL);
572 647
573 648 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
574 649 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
575 650 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
576 651 &aflags, &zb);
577 652 if (aflags & ARC_FLAG_CACHED)
578 653 *flags |= DB_RF_CACHED;
579 654 }
580 655
581 656 int
582 657 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
583 658 {
584 659 int err = 0;
585 660 boolean_t havepzio = (zio != NULL);
586 661 boolean_t prefetch;
587 662 dnode_t *dn;
588 663
589 664 /*
590 665 * We don't have to hold the mutex to check db_state because it
591 666 * can't be freed while we have a hold on the buffer.
592 667 */
593 668 ASSERT(!refcount_is_zero(&db->db_holds));
594 669
595 670 if (db->db_state == DB_NOFILL)
596 671 return (SET_ERROR(EIO));
597 672
598 673 DB_DNODE_ENTER(db);
599 674 dn = DB_DNODE(db);
600 675 if ((flags & DB_RF_HAVESTRUCT) == 0)
601 676 rw_enter(&dn->dn_struct_rwlock, RW_READER);
602 677
603 678 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
604 679 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
605 680 DBUF_IS_CACHEABLE(db);
606 681
607 682 mutex_enter(&db->db_mtx);
608 683 if (db->db_state == DB_CACHED) {
609 684 mutex_exit(&db->db_mtx);
610 685 if (prefetch)
611 686 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
612 687 db->db.db_size, TRUE);
613 688 if ((flags & DB_RF_HAVESTRUCT) == 0)
614 689 rw_exit(&dn->dn_struct_rwlock);
615 690 DB_DNODE_EXIT(db);
616 691 } else if (db->db_state == DB_UNCACHED) {
617 692 spa_t *spa = dn->dn_objset->os_spa;
618 693
619 694 if (zio == NULL)
620 695 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
621 696 dbuf_read_impl(db, zio, &flags);
622 697
623 698 /* dbuf_read_impl has dropped db_mtx for us */
624 699
625 700 if (prefetch)
626 701 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
627 702 db->db.db_size, flags & DB_RF_CACHED);
628 703
629 704 if ((flags & DB_RF_HAVESTRUCT) == 0)
630 705 rw_exit(&dn->dn_struct_rwlock);
631 706 DB_DNODE_EXIT(db);
632 707
633 708 if (!havepzio)
634 709 err = zio_wait(zio);
635 710 } else {
636 711 /*
637 712 * Another reader came in while the dbuf was in flight
638 713 * between UNCACHED and CACHED. Either a writer will finish
639 714 * writing the buffer (sending the dbuf to CACHED) or the
640 715 * first reader's request will reach the read_done callback
641 716 * and send the dbuf to CACHED. Otherwise, a failure
642 717 * occurred and the dbuf went to UNCACHED.
643 718 */
644 719 mutex_exit(&db->db_mtx);
645 720 if (prefetch)
646 721 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
647 722 db->db.db_size, TRUE);
648 723 if ((flags & DB_RF_HAVESTRUCT) == 0)
649 724 rw_exit(&dn->dn_struct_rwlock);
650 725 DB_DNODE_EXIT(db);
651 726
652 727 /* Skip the wait per the caller's request. */
653 728 mutex_enter(&db->db_mtx);
654 729 if ((flags & DB_RF_NEVERWAIT) == 0) {
655 730 while (db->db_state == DB_READ ||
656 731 db->db_state == DB_FILL) {
657 732 ASSERT(db->db_state == DB_READ ||
658 733 (flags & DB_RF_HAVESTRUCT) == 0);
659 734 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
660 735 db, zio_t *, zio);
661 736 cv_wait(&db->db_changed, &db->db_mtx);
662 737 }
663 738 if (db->db_state == DB_UNCACHED)
664 739 err = SET_ERROR(EIO);
665 740 }
666 741 mutex_exit(&db->db_mtx);
667 742 }
668 743
669 744 ASSERT(err || havepzio || db->db_state == DB_CACHED);
670 745 return (err);
671 746 }
672 747
673 748 static void
674 749 dbuf_noread(dmu_buf_impl_t *db)
675 750 {
676 751 ASSERT(!refcount_is_zero(&db->db_holds));
677 752 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
678 753 mutex_enter(&db->db_mtx);
679 754 while (db->db_state == DB_READ || db->db_state == DB_FILL)
680 755 cv_wait(&db->db_changed, &db->db_mtx);
681 756 if (db->db_state == DB_UNCACHED) {
682 757 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
683 758 spa_t *spa = db->db_objset->os_spa;
684 759
685 760 ASSERT(db->db_buf == NULL);
686 761 ASSERT(db->db.db_data == NULL);
687 762 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
688 763 db->db_state = DB_FILL;
689 764 } else if (db->db_state == DB_NOFILL) {
690 - dbuf_set_data(db, NULL);
765 + dbuf_clear_data(db);
691 766 } else {
692 767 ASSERT3U(db->db_state, ==, DB_CACHED);
693 768 }
694 769 mutex_exit(&db->db_mtx);
695 770 }
696 771
697 772 /*
698 773 * This is our just-in-time copy function. It makes a copy of
699 774 * buffers that have been modified in a previous transaction
700 775 * group, before we modify them in the current active group.
701 776 *
702 777 * This function is used in two places: when we are dirtying a
703 778 * buffer for the first time in a txg, and when we are freeing
704 779 * a range in a dnode that includes this buffer.
705 780 *
706 781 * Note that when we are called from dbuf_free_range() we do
707 782 * not put a hold on the buffer, we just traverse the active
708 783 * dbuf list for the dnode.
709 784 */
710 785 static void
711 786 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
712 787 {
713 788 dbuf_dirty_record_t *dr = db->db_last_dirty;
714 789
715 790 ASSERT(MUTEX_HELD(&db->db_mtx));
716 791 ASSERT(db->db.db_data != NULL);
717 792 ASSERT(db->db_level == 0);
718 793 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
719 794
720 795 if (dr == NULL ||
721 796 (dr->dt.dl.dr_data !=
722 797 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
723 798 return;
724 799
725 800 /*
726 801 * If the last dirty record for this dbuf has not yet synced
727 802 * and it is referencing the dbuf data, either:
728 803 * reset the reference to point to a new copy,
729 804 * or (if there are no active holders)
730 805 * just null out the current db_data pointer.
731 806 */
732 807 ASSERT(dr->dr_txg >= txg - 2);
733 808 if (db->db_blkid == DMU_BONUS_BLKID) {
734 809 /* Note that the data bufs here are zio_bufs */
735 810 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
736 811 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
737 812 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
738 813 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
739 814 int size = db->db.db_size;
740 815 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
741 816 spa_t *spa = db->db_objset->os_spa;
742 817
743 818 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
744 819 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
745 820 } else {
746 - dbuf_set_data(db, NULL);
821 + dbuf_clear_data(db);
747 822 }
748 823 }
749 824
750 825 void
751 826 dbuf_unoverride(dbuf_dirty_record_t *dr)
752 827 {
753 828 dmu_buf_impl_t *db = dr->dr_dbuf;
754 829 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
755 830 uint64_t txg = dr->dr_txg;
756 831
757 832 ASSERT(MUTEX_HELD(&db->db_mtx));
758 833 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
759 834 ASSERT(db->db_level == 0);
760 835
761 836 if (db->db_blkid == DMU_BONUS_BLKID ||
762 837 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
763 838 return;
764 839
765 840 ASSERT(db->db_data_pending != dr);
766 841
767 842 /* free this block */
768 843 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
769 844 zio_free(db->db_objset->os_spa, txg, bp);
770 845
771 846 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
772 847 dr->dt.dl.dr_nopwrite = B_FALSE;
773 848
774 849 /*
775 850 * Release the already-written buffer, so we leave it in
776 851 * a consistent dirty state. Note that all callers are
777 852 * modifying the buffer, so they will immediately do
778 853 * another (redundant) arc_release(). Therefore, leave
779 854 * the buf thawed to save the effort of freezing &
780 855 * immediately re-thawing it.
781 856 */
782 857 arc_release(dr->dt.dl.dr_data, db);
783 858 }
784 859
785 860 /*
786 861 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
787 862 * data blocks in the free range, so that any future readers will find
788 863 * empty blocks.
789 864 *
790 865 * This is a no-op if the dataset is in the middle of an incremental
791 866 * receive; see comment below for details.
792 867 */
793 868 void
794 869 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
795 870 dmu_tx_t *tx)
796 871 {
797 - dmu_buf_impl_t *db, *db_next, db_search;
872 + dmu_buf_impl_t db_search;
873 + dmu_buf_impl_t *db, *db_next;
798 874 uint64_t txg = tx->tx_txg;
799 875 avl_index_t where;
800 876
801 877 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
802 878 end_blkid = dn->dn_maxblkid;
803 879 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
804 880
805 881 db_search.db_level = 0;
806 882 db_search.db_blkid = start_blkid;
807 883 db_search.db_state = DB_SEARCH;
808 884
809 885 mutex_enter(&dn->dn_dbufs_mtx);
810 886 if (start_blkid >= dn->dn_unlisted_l0_blkid) {
811 887 /* There can't be any dbufs in this range; no need to search. */
812 888 #ifdef DEBUG
813 889 db = avl_find(&dn->dn_dbufs, &db_search, &where);
814 890 ASSERT3P(db, ==, NULL);
815 891 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
816 892 ASSERT(db == NULL || db->db_level > 0);
817 893 #endif
818 894 mutex_exit(&dn->dn_dbufs_mtx);
819 895 return;
820 896 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
821 897 /*
822 898 * If we are receiving, we expect there to be no dbufs in
823 899 * the range to be freed, because receive modifies each
824 900 * block at most once, and in offset order. If this is
825 901 * not the case, it can lead to performance problems,
826 902 * so note that we unexpectedly took the slow path.
827 903 */
828 904 atomic_inc_64(&zfs_free_range_recv_miss);
829 905 }
830 906
831 907 db = avl_find(&dn->dn_dbufs, &db_search, &where);
832 908 ASSERT3P(db, ==, NULL);
833 909 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
834 910
835 911 for (; db != NULL; db = db_next) {
836 912 db_next = AVL_NEXT(&dn->dn_dbufs, db);
837 913 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
838 914
839 915 if (db->db_level != 0 || db->db_blkid > end_blkid) {
840 916 break;
841 917 }
842 918 ASSERT3U(db->db_blkid, >=, start_blkid);
843 919
844 920 /* found a level 0 buffer in the range */
845 921 mutex_enter(&db->db_mtx);
846 922 if (dbuf_undirty(db, tx)) {
847 923 /* mutex has been dropped and dbuf destroyed */
848 924 continue;
849 925 }
850 926
851 927 if (db->db_state == DB_UNCACHED ||
852 928 db->db_state == DB_NOFILL ||
853 929 db->db_state == DB_EVICTING) {
854 930 ASSERT(db->db.db_data == NULL);
855 931 mutex_exit(&db->db_mtx);
856 932 continue;
857 933 }
858 934 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
859 935 /* will be handled in dbuf_read_done or dbuf_rele */
860 936 db->db_freed_in_flight = TRUE;
861 937 mutex_exit(&db->db_mtx);
862 938 continue;
863 939 }
864 940 if (refcount_count(&db->db_holds) == 0) {
865 941 ASSERT(db->db_buf);
866 942 dbuf_clear(db);
867 943 continue;
868 944 }
869 945 /* The dbuf is referenced */
870 946
871 947 if (db->db_last_dirty != NULL) {
872 948 dbuf_dirty_record_t *dr = db->db_last_dirty;
873 949
874 950 if (dr->dr_txg == txg) {
875 951 /*
876 952 * This buffer is "in-use"; re-adjust the file
877 953 * size to reflect that this buffer may
878 954 * contain new data when we sync.
879 955 */
880 956 if (db->db_blkid != DMU_SPILL_BLKID &&
881 957 db->db_blkid > dn->dn_maxblkid)
882 958 dn->dn_maxblkid = db->db_blkid;
883 959 dbuf_unoverride(dr);
884 960 } else {
885 961 /*
886 962 * This dbuf is not dirty in the open context.
887 963 * Either uncache it (if it's not referenced in
888 964 * the open context) or reset its contents to
889 965 * empty.
890 966 */
891 967 dbuf_fix_old_data(db, txg);
892 968 }
893 969 }
894 970 /* clear the contents if it's cached */
895 971 if (db->db_state == DB_CACHED) {
896 972 ASSERT(db->db.db_data != NULL);
897 973 arc_release(db->db_buf, db);
898 974 bzero(db->db.db_data, db->db.db_size);
899 975 arc_buf_freeze(db->db_buf);
900 976 }
901 977
902 978 mutex_exit(&db->db_mtx);
903 979 }
904 980 mutex_exit(&dn->dn_dbufs_mtx);
905 981 }
906 982
907 983 static int
908 984 dbuf_block_freeable(dmu_buf_impl_t *db)
909 985 {
910 986 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
911 987 uint64_t birth_txg = 0;
912 988
913 989 /*
914 990 * We don't need any locking to protect db_blkptr:
915 991 * If it's syncing, then db_last_dirty will be set
916 992 * so we'll ignore db_blkptr.
917 993 *
918 994 * This logic ensures that only block births for
919 995 * filled blocks are considered.
920 996 */
921 997 ASSERT(MUTEX_HELD(&db->db_mtx));
922 998 if (db->db_last_dirty && (db->db_blkptr == NULL ||
923 999 !BP_IS_HOLE(db->db_blkptr))) {
924 1000 birth_txg = db->db_last_dirty->dr_txg;
925 1001 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
926 1002 birth_txg = db->db_blkptr->blk_birth;
927 1003 }
928 1004
929 1005 /*
930 1006 * If this block doesn't exist or is in a snapshot, it can't be freed.
931 1007 * Don't pass the bp to dsl_dataset_block_freeable() since we
932 1008 * are holding the db_mtx lock and might deadlock if we are
933 1009 * prefetching a dedup-ed block.
934 1010 */
935 1011 if (birth_txg != 0)
936 1012 return (ds == NULL ||
937 1013 dsl_dataset_block_freeable(ds, NULL, birth_txg));
938 1014 else
939 1015 return (B_FALSE);
940 1016 }
941 1017
942 1018 void
943 1019 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
944 1020 {
945 1021 arc_buf_t *buf, *obuf;
946 1022 int osize = db->db.db_size;
947 1023 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
948 1024 dnode_t *dn;
949 1025
950 1026 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
951 1027
952 1028 DB_DNODE_ENTER(db);
953 1029 dn = DB_DNODE(db);
954 1030
955 1031 /* XXX does *this* func really need the lock? */
956 1032 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
957 1033
958 1034 /*
959 1035 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
960 1036 * is OK, because there can be no other references to the db
961 1037 * when we are changing its size, so no concurrent DB_FILL can
962 1038 * be happening.
963 1039 */
964 1040 /*
965 1041 * XXX we should be doing a dbuf_read, checking the return
966 1042 * value and returning that up to our callers
967 1043 */
968 1044 dmu_buf_will_dirty(&db->db, tx);
969 1045
970 1046 /* create the data buffer for the new block */
971 1047 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
972 1048
973 1049 /* copy old block data to the new block */
974 1050 obuf = db->db_buf;
975 1051 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
976 1052 /* zero the remainder */
977 1053 if (size > osize)
978 1054 bzero((uint8_t *)buf->b_data + osize, size - osize);
979 1055
980 1056 mutex_enter(&db->db_mtx);
981 1057 dbuf_set_data(db, buf);
982 1058 VERIFY(arc_buf_remove_ref(obuf, db));
983 1059 db->db.db_size = size;
984 1060
985 1061 if (db->db_level == 0) {
986 1062 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
987 1063 db->db_last_dirty->dt.dl.dr_data = buf;
988 1064 }
989 1065 mutex_exit(&db->db_mtx);
990 1066
991 1067 dnode_willuse_space(dn, size-osize, tx);
992 1068 DB_DNODE_EXIT(db);
993 1069 }
994 1070
995 1071 void
996 1072 dbuf_release_bp(dmu_buf_impl_t *db)
997 1073 {
998 1074 objset_t *os = db->db_objset;
999 1075
1000 1076 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1001 1077 ASSERT(arc_released(os->os_phys_buf) ||
1002 1078 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1003 1079 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1004 1080
1005 1081 (void) arc_release(db->db_buf, db);
1006 1082 }
1007 1083
1008 1084 dbuf_dirty_record_t *
1009 1085 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1010 1086 {
1011 1087 dnode_t *dn;
1012 1088 objset_t *os;
1013 1089 dbuf_dirty_record_t **drp, *dr;
1014 1090 int drop_struct_lock = FALSE;
1015 1091 boolean_t do_free_accounting = B_FALSE;
1016 1092 int txgoff = tx->tx_txg & TXG_MASK;
1017 1093
1018 1094 ASSERT(tx->tx_txg != 0);
1019 1095 ASSERT(!refcount_is_zero(&db->db_holds));
1020 1096 DMU_TX_DIRTY_BUF(tx, db);
1021 1097
1022 1098 DB_DNODE_ENTER(db);
1023 1099 dn = DB_DNODE(db);
1024 1100 /*
1025 1101 * Shouldn't dirty a regular buffer in syncing context. Private
1026 1102 * objects may be dirtied in syncing context, but only if they
1027 1103 * were already pre-dirtied in open context.
1028 1104 */
1029 1105 ASSERT(!dmu_tx_is_syncing(tx) ||
1030 1106 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1031 1107 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1032 1108 dn->dn_objset->os_dsl_dataset == NULL);
1033 1109 /*
1034 1110 * We make this assert for private objects as well, but after we
1035 1111 * check if we're already dirty. They are allowed to re-dirty
1036 1112 * in syncing context.
1037 1113 */
1038 1114 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1039 1115 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1040 1116 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1041 1117
1042 1118 mutex_enter(&db->db_mtx);
1043 1119 /*
1044 1120 * XXX make this true for indirects too? The problem is that
1045 1121 * transactions created with dmu_tx_create_assigned() from
1046 1122 * syncing context don't bother holding ahead.
1047 1123 */
1048 1124 ASSERT(db->db_level != 0 ||
1049 1125 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1050 1126 db->db_state == DB_NOFILL);
1051 1127
1052 1128 mutex_enter(&dn->dn_mtx);
1053 1129 /*
1054 1130 * Don't set dirtyctx to SYNC if we're just modifying this as we
1055 1131 * initialize the objset.
1056 1132 */
1057 1133 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1058 1134 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1059 1135 dn->dn_dirtyctx =
1060 1136 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1061 1137 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1062 1138 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1063 1139 }
1064 1140 mutex_exit(&dn->dn_mtx);
1065 1141
1066 1142 if (db->db_blkid == DMU_SPILL_BLKID)
1067 1143 dn->dn_have_spill = B_TRUE;
1068 1144
1069 1145 /*
1070 1146 * If this buffer is already dirty, we're done.
1071 1147 */
1072 1148 drp = &db->db_last_dirty;
1073 1149 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1074 1150 db->db.db_object == DMU_META_DNODE_OBJECT);
1075 1151 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1076 1152 drp = &dr->dr_next;
1077 1153 if (dr && dr->dr_txg == tx->tx_txg) {
1078 1154 DB_DNODE_EXIT(db);
1079 1155
1080 1156 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1081 1157 /*
1082 1158 * If this buffer has already been written out,
1083 1159 * we now need to reset its state.
1084 1160 */
1085 1161 dbuf_unoverride(dr);
1086 1162 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1087 1163 db->db_state != DB_NOFILL)
1088 1164 arc_buf_thaw(db->db_buf);
1089 1165 }
1090 1166 mutex_exit(&db->db_mtx);
1091 1167 return (dr);
1092 1168 }
1093 1169
1094 1170 /*
1095 1171 * Only valid if not already dirty.
1096 1172 */
1097 1173 ASSERT(dn->dn_object == 0 ||
1098 1174 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1099 1175 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1100 1176
1101 1177 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1102 1178 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1103 1179 dn->dn_phys->dn_nlevels > db->db_level ||
1104 1180 dn->dn_next_nlevels[txgoff] > db->db_level ||
1105 1181 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1106 1182 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1107 1183
1108 1184 /*
1109 1185 * We should only be dirtying in syncing context if it's the
1110 1186 * mos or we're initializing the os or it's a special object.
1111 1187 * However, we are allowed to dirty in syncing context provided
1112 1188 * we already dirtied it in open context. Hence we must make
1113 1189 * this assertion only if we're not already dirty.
1114 1190 */
1115 1191 os = dn->dn_objset;
1116 1192 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1117 1193 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1118 1194 ASSERT(db->db.db_size != 0);
1119 1195
1120 1196 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1121 1197
1122 1198 if (db->db_blkid != DMU_BONUS_BLKID) {
1123 1199 /*
1124 1200 * Update the accounting.
1125 1201 * Note: we delay "free accounting" until after we drop
1126 1202 * the db_mtx. This keeps us from grabbing other locks
1127 1203 * (and possibly deadlocking) in bp_get_dsize() while
1128 1204 * also holding the db_mtx.
1129 1205 */
1130 1206 dnode_willuse_space(dn, db->db.db_size, tx);
1131 1207 do_free_accounting = dbuf_block_freeable(db);
1132 1208 }
1133 1209
1134 1210 /*
1135 1211 * If this buffer is dirty in an old transaction group we need
1136 1212 * to make a copy of it so that the changes we make in this
1137 1213 * transaction group won't leak out when we sync the older txg.
1138 1214 */
1139 1215 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1140 1216 if (db->db_level == 0) {
1141 1217 void *data_old = db->db_buf;
1142 1218
1143 1219 if (db->db_state != DB_NOFILL) {
1144 1220 if (db->db_blkid == DMU_BONUS_BLKID) {
1145 1221 dbuf_fix_old_data(db, tx->tx_txg);
1146 1222 data_old = db->db.db_data;
1147 1223 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1148 1224 /*
1149 1225 * Release the data buffer from the cache so
1150 1226 * that we can modify it without impacting
1151 1227 * possible other users of this cached data
1152 1228 * block. Note that indirect blocks and
1153 1229 * private objects are not released until the
1154 1230 * syncing state (since they are only modified
1155 1231 * then).
1156 1232 */
1157 1233 arc_release(db->db_buf, db);
1158 1234 dbuf_fix_old_data(db, tx->tx_txg);
1159 1235 data_old = db->db_buf;
1160 1236 }
1161 1237 ASSERT(data_old != NULL);
1162 1238 }
1163 1239 dr->dt.dl.dr_data = data_old;
1164 1240 } else {
1165 1241 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1166 1242 list_create(&dr->dt.di.dr_children,
1167 1243 sizeof (dbuf_dirty_record_t),
1168 1244 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1169 1245 }
1170 1246 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1171 1247 dr->dr_accounted = db->db.db_size;
1172 1248 dr->dr_dbuf = db;
1173 1249 dr->dr_txg = tx->tx_txg;
1174 1250 dr->dr_next = *drp;
1175 1251 *drp = dr;
1176 1252
1177 1253 /*
1178 1254 * We could have been freed_in_flight between the dbuf_noread
1179 1255 * and dbuf_dirty. We win, as though the dbuf_noread() had
1180 1256 * happened after the free.
1181 1257 */
1182 1258 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1183 1259 db->db_blkid != DMU_SPILL_BLKID) {
1184 1260 mutex_enter(&dn->dn_mtx);
1185 1261 if (dn->dn_free_ranges[txgoff] != NULL) {
1186 1262 range_tree_clear(dn->dn_free_ranges[txgoff],
1187 1263 db->db_blkid, 1);
1188 1264 }
1189 1265 mutex_exit(&dn->dn_mtx);
1190 1266 db->db_freed_in_flight = FALSE;
1191 1267 }
1192 1268
1193 1269 /*
1194 1270 * This buffer is now part of this txg
1195 1271 */
1196 1272 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1197 1273 db->db_dirtycnt += 1;
1198 1274 ASSERT3U(db->db_dirtycnt, <=, 3);
1199 1275
1200 1276 mutex_exit(&db->db_mtx);
1201 1277
1202 1278 if (db->db_blkid == DMU_BONUS_BLKID ||
1203 1279 db->db_blkid == DMU_SPILL_BLKID) {
1204 1280 mutex_enter(&dn->dn_mtx);
1205 1281 ASSERT(!list_link_active(&dr->dr_dirty_node));
1206 1282 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1207 1283 mutex_exit(&dn->dn_mtx);
1208 1284 dnode_setdirty(dn, tx);
1209 1285 DB_DNODE_EXIT(db);
1210 1286 return (dr);
1211 1287 } else if (do_free_accounting) {
1212 1288 blkptr_t *bp = db->db_blkptr;
1213 1289 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1214 1290 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1215 1291 /*
1216 1292 * This is only a guess -- if the dbuf is dirty
1217 1293 * in a previous txg, we don't know how much
1218 1294 * space it will use on disk yet. We should
1219 1295 * really have the struct_rwlock to access
1220 1296 * db_blkptr, but since this is just a guess,
1221 1297 * it's OK if we get an odd answer.
1222 1298 */
1223 1299 ddt_prefetch(os->os_spa, bp);
1224 1300 dnode_willuse_space(dn, -willfree, tx);
1225 1301 }
1226 1302
1227 1303 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1228 1304 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1229 1305 drop_struct_lock = TRUE;
1230 1306 }
1231 1307
1232 1308 if (db->db_level == 0) {
1233 1309 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1234 1310 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1235 1311 }
1236 1312
1237 1313 if (db->db_level+1 < dn->dn_nlevels) {
1238 1314 dmu_buf_impl_t *parent = db->db_parent;
1239 1315 dbuf_dirty_record_t *di;
1240 1316 int parent_held = FALSE;
1241 1317
1242 1318 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1243 1319 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1244 1320
1245 1321 parent = dbuf_hold_level(dn, db->db_level+1,
1246 1322 db->db_blkid >> epbs, FTAG);
1247 1323 ASSERT(parent != NULL);
1248 1324 parent_held = TRUE;
1249 1325 }
1250 1326 if (drop_struct_lock)
1251 1327 rw_exit(&dn->dn_struct_rwlock);
1252 1328 ASSERT3U(db->db_level+1, ==, parent->db_level);
1253 1329 di = dbuf_dirty(parent, tx);
1254 1330 if (parent_held)
1255 1331 dbuf_rele(parent, FTAG);
1256 1332
1257 1333 mutex_enter(&db->db_mtx);
1258 1334 /*
1259 1335 * Since we've dropped the mutex, it's possible that
1260 1336 * dbuf_undirty() might have changed this out from under us.
1261 1337 */
1262 1338 if (db->db_last_dirty == dr ||
1263 1339 dn->dn_object == DMU_META_DNODE_OBJECT) {
1264 1340 mutex_enter(&di->dt.di.dr_mtx);
1265 1341 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1266 1342 ASSERT(!list_link_active(&dr->dr_dirty_node));
1267 1343 list_insert_tail(&di->dt.di.dr_children, dr);
1268 1344 mutex_exit(&di->dt.di.dr_mtx);
1269 1345 dr->dr_parent = di;
1270 1346 }
1271 1347 mutex_exit(&db->db_mtx);
1272 1348 } else {
1273 1349 ASSERT(db->db_level+1 == dn->dn_nlevels);
1274 1350 ASSERT(db->db_blkid < dn->dn_nblkptr);
1275 1351 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1276 1352 mutex_enter(&dn->dn_mtx);
1277 1353 ASSERT(!list_link_active(&dr->dr_dirty_node));
1278 1354 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1279 1355 mutex_exit(&dn->dn_mtx);
1280 1356 if (drop_struct_lock)
1281 1357 rw_exit(&dn->dn_struct_rwlock);
1282 1358 }
1283 1359
1284 1360 dnode_setdirty(dn, tx);
1285 1361 DB_DNODE_EXIT(db);
1286 1362 return (dr);
1287 1363 }
1288 1364
1289 1365 /*
1290 1366 * Undirty a buffer in the transaction group referenced by the given
1291 1367 * transaction. Return whether this evicted the dbuf.
1292 1368 */
1293 1369 static boolean_t
1294 1370 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1295 1371 {
1296 1372 dnode_t *dn;
1297 1373 uint64_t txg = tx->tx_txg;
1298 1374 dbuf_dirty_record_t *dr, **drp;
1299 1375
1300 1376 ASSERT(txg != 0);
1301 1377 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1302 1378 ASSERT0(db->db_level);
1303 1379 ASSERT(MUTEX_HELD(&db->db_mtx));
1304 1380
1305 1381 /*
1306 1382 * If this buffer is not dirty, we're done.
1307 1383 */
1308 1384 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1309 1385 if (dr->dr_txg <= txg)
1310 1386 break;
1311 1387 if (dr == NULL || dr->dr_txg < txg)
1312 1388 return (B_FALSE);
1313 1389 ASSERT(dr->dr_txg == txg);
1314 1390 ASSERT(dr->dr_dbuf == db);
1315 1391
1316 1392 DB_DNODE_ENTER(db);
1317 1393 dn = DB_DNODE(db);
1318 1394
1319 1395 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1320 1396
1321 1397 ASSERT(db->db.db_size != 0);
1322 1398
1323 1399 /*
1324 1400 * Any space we accounted for in dp_dirty_* will be cleaned up by
1325 1401 * dsl_pool_sync(). This is relatively rare so the discrepancy
1326 1402 * is not a big deal.
1327 1403 */
1328 1404
1329 1405 *drp = dr->dr_next;
1330 1406
1331 1407 /*
1332 1408 * Note that there are three places in dbuf_dirty()
1333 1409 * where this dirty record may be put on a list.
1334 1410 * Make sure to do a list_remove corresponding to
1335 1411 * every one of those list_insert calls.
1336 1412 */
1337 1413 if (dr->dr_parent) {
1338 1414 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1339 1415 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1340 1416 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1341 1417 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1342 1418 db->db_level+1 == dn->dn_nlevels) {
1343 1419 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1344 1420 mutex_enter(&dn->dn_mtx);
1345 1421 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1346 1422 mutex_exit(&dn->dn_mtx);
1347 1423 }
1348 1424 DB_DNODE_EXIT(db);
1349 1425
1350 1426 if (db->db_state != DB_NOFILL) {
1351 1427 dbuf_unoverride(dr);
1352 1428
1353 1429 ASSERT(db->db_buf != NULL);
1354 1430 ASSERT(dr->dt.dl.dr_data != NULL);
1355 1431 if (dr->dt.dl.dr_data != db->db_buf)
1356 1432 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1357 1433 }
1358 1434
1359 1435 if (db->db_level != 0) {
1360 1436 mutex_destroy(&dr->dt.di.dr_mtx);
1361 1437 list_destroy(&dr->dt.di.dr_children);
1362 1438 }
1363 1439
1364 1440 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1365 1441
1366 1442 ASSERT(db->db_dirtycnt > 0);
1367 1443 db->db_dirtycnt -= 1;
1368 1444
1369 1445 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1370 1446 arc_buf_t *buf = db->db_buf;
1371 1447
1372 1448 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1373 - dbuf_set_data(db, NULL);
1449 + dbuf_clear_data(db);
1374 1450 VERIFY(arc_buf_remove_ref(buf, db));
1375 1451 dbuf_evict(db);
1376 1452 return (B_TRUE);
1377 1453 }
1378 1454
1379 1455 return (B_FALSE);
1380 1456 }
1381 1457
1382 1458 void
1383 1459 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1384 1460 {
1385 1461 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1386 1462 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1387 1463
1388 1464 ASSERT(tx->tx_txg != 0);
1389 1465 ASSERT(!refcount_is_zero(&db->db_holds));
1390 1466
1391 1467 DB_DNODE_ENTER(db);
1392 1468 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1393 1469 rf |= DB_RF_HAVESTRUCT;
1394 1470 DB_DNODE_EXIT(db);
1395 1471 (void) dbuf_read(db, NULL, rf);
1396 1472 (void) dbuf_dirty(db, tx);
1397 1473 }
1398 1474
1399 1475 void
1400 1476 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1401 1477 {
1402 1478 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1403 1479
1404 1480 db->db_state = DB_NOFILL;
1405 1481
1406 1482 dmu_buf_will_fill(db_fake, tx);
1407 1483 }
1408 1484
1409 1485 void
1410 1486 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1411 1487 {
1412 1488 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1413 1489
1414 1490 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1415 1491 ASSERT(tx->tx_txg != 0);
1416 1492 ASSERT(db->db_level == 0);
1417 1493 ASSERT(!refcount_is_zero(&db->db_holds));
1418 1494
1419 1495 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1420 1496 dmu_tx_private_ok(tx));
1421 1497
1422 1498 dbuf_noread(db);
1423 1499 (void) dbuf_dirty(db, tx);
1424 1500 }
1425 1501
1426 1502 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1427 1503 /* ARGSUSED */
1428 1504 void
1429 1505 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1430 1506 {
1431 1507 mutex_enter(&db->db_mtx);
1432 1508 DBUF_VERIFY(db);
1433 1509
1434 1510 if (db->db_state == DB_FILL) {
1435 1511 if (db->db_level == 0 && db->db_freed_in_flight) {
1436 1512 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1437 1513 /* we were freed while filling */
1438 1514 /* XXX dbuf_undirty? */
1439 1515 bzero(db->db.db_data, db->db.db_size);
1440 1516 db->db_freed_in_flight = FALSE;
1441 1517 }
1442 1518 db->db_state = DB_CACHED;
1443 1519 cv_broadcast(&db->db_changed);
1444 1520 }
1445 1521 mutex_exit(&db->db_mtx);
1446 1522 }
1447 1523
1448 1524 void
1449 1525 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1450 1526 bp_embedded_type_t etype, enum zio_compress comp,
1451 1527 int uncompressed_size, int compressed_size, int byteorder,
1452 1528 dmu_tx_t *tx)
1453 1529 {
1454 1530 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1455 1531 struct dirty_leaf *dl;
1456 1532 dmu_object_type_t type;
1457 1533
1458 1534 DB_DNODE_ENTER(db);
1459 1535 type = DB_DNODE(db)->dn_type;
1460 1536 DB_DNODE_EXIT(db);
1461 1537
1462 1538 ASSERT0(db->db_level);
1463 1539 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1464 1540
1465 1541 dmu_buf_will_not_fill(dbuf, tx);
1466 1542
1467 1543 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1468 1544 dl = &db->db_last_dirty->dt.dl;
1469 1545 encode_embedded_bp_compressed(&dl->dr_overridden_by,
1470 1546 data, comp, uncompressed_size, compressed_size);
1471 1547 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1472 1548 BP_SET_TYPE(&dl->dr_overridden_by, type);
1473 1549 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1474 1550 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1475 1551
1476 1552 dl->dr_override_state = DR_OVERRIDDEN;
1477 1553 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1478 1554 }
1479 1555
1480 1556 /*
1481 1557 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1482 1558 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1483 1559 */
1484 1560 void
1485 1561 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1486 1562 {
1487 1563 ASSERT(!refcount_is_zero(&db->db_holds));
1488 1564 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1489 1565 ASSERT(db->db_level == 0);
1490 1566 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1491 1567 ASSERT(buf != NULL);
1492 1568 ASSERT(arc_buf_size(buf) == db->db.db_size);
1493 1569 ASSERT(tx->tx_txg != 0);
1494 1570
1495 1571 arc_return_buf(buf, db);
1496 1572 ASSERT(arc_released(buf));
1497 1573
1498 1574 mutex_enter(&db->db_mtx);
1499 1575
1500 1576 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1501 1577 cv_wait(&db->db_changed, &db->db_mtx);
1502 1578
1503 1579 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1504 1580
1505 1581 if (db->db_state == DB_CACHED &&
1506 1582 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1507 1583 mutex_exit(&db->db_mtx);
1508 1584 (void) dbuf_dirty(db, tx);
1509 1585 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1510 1586 VERIFY(arc_buf_remove_ref(buf, db));
1511 1587 xuio_stat_wbuf_copied();
1512 1588 return;
1513 1589 }
1514 1590
1515 1591 xuio_stat_wbuf_nocopy();
1516 1592 if (db->db_state == DB_CACHED) {
1517 1593 dbuf_dirty_record_t *dr = db->db_last_dirty;
1518 1594
1519 1595 ASSERT(db->db_buf != NULL);
1520 1596 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1521 1597 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1522 1598 if (!arc_released(db->db_buf)) {
1523 1599 ASSERT(dr->dt.dl.dr_override_state ==
1524 1600 DR_OVERRIDDEN);
1525 1601 arc_release(db->db_buf, db);
1526 1602 }
1527 1603 dr->dt.dl.dr_data = buf;
1528 1604 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1529 1605 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1530 1606 arc_release(db->db_buf, db);
1531 1607 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1532 1608 }
1533 1609 db->db_buf = NULL;
1534 1610 }
1535 1611 ASSERT(db->db_buf == NULL);
1536 1612 dbuf_set_data(db, buf);
1537 1613 db->db_state = DB_FILL;
1538 1614 mutex_exit(&db->db_mtx);
1539 1615 (void) dbuf_dirty(db, tx);
1540 1616 dmu_buf_fill_done(&db->db, tx);
1541 1617 }
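A hedged sketch of the zero-copy write path that ends up here: the caller borrows an ARC buffer with dmu_request_arcbuf(), fills it, and hands it back with dmu_assign_arcbuf(), which either adopts the buffer directly or falls back to the bcopy path above (the handle, offset, and size names are illustrative):

	arc_buf_t *abuf = dmu_request_arcbuf(bonus_db, size);	/* loaned, anonymous ARC buffer */
	/* ... fill abuf->b_data with the data to write ... */
	dmu_assign_arcbuf(bonus_db, offset, abuf, tx);		/* reaches dbuf_assign_arcbuf() */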
1542 1618
1543 1619 /*
1544 1620 * "Clear" the contents of this dbuf. This will mark the dbuf
1545 1621 * EVICTING and clear *most* of its references. Unfortunately,
1546 1622 * when we are not holding the dn_dbufs_mtx, we can't clear the
1547 1623 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1548 1624 * in this case. For callers from the DMU we will usually see:
1549 1625 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1550 1626 * For the arc callback, we will usually see:
1551 1627 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1552 1628 * Sometimes, though, we will get a mix of these two:
1553 1629 * DMU: dbuf_clear()->arc_clear_callback()
1554 1630 * ARC: dbuf_do_evict()->dbuf_destroy()
1555 1631 *
1556 1632 * This routine will dissociate the dbuf from the arc, by calling
1557 1633 * arc_clear_callback(), but will not evict the data from the ARC.
1558 1634 */
1559 1635 void
1560 1636 dbuf_clear(dmu_buf_impl_t *db)
1561 1637 {
1562 1638 dnode_t *dn;
1563 1639 dmu_buf_impl_t *parent = db->db_parent;
1564 1640 dmu_buf_impl_t *dndb;
1565 1641 boolean_t dbuf_gone = B_FALSE;
1566 1642
1567 1643 ASSERT(MUTEX_HELD(&db->db_mtx));
1568 1644 ASSERT(refcount_is_zero(&db->db_holds));
1569 1645
1570 1646 dbuf_evict_user(db);
1571 1647
1572 1648 if (db->db_state == DB_CACHED) {
1573 1649 ASSERT(db->db.db_data != NULL);
1574 1650 if (db->db_blkid == DMU_BONUS_BLKID) {
1575 1651 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1576 1652 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1577 1653 }
1578 1654 db->db.db_data = NULL;
1579 1655 db->db_state = DB_UNCACHED;
1580 1656 }
1581 1657
1582 1658 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1583 1659 ASSERT(db->db_data_pending == NULL);
1584 1660
1585 1661 db->db_state = DB_EVICTING;
1586 1662 db->db_blkptr = NULL;
1587 1663
1588 1664 DB_DNODE_ENTER(db);
1589 1665 dn = DB_DNODE(db);
1590 1666 dndb = dn->dn_dbuf;
1591 1667 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1592 1668 avl_remove(&dn->dn_dbufs, db);
1593 1669 atomic_dec_32(&dn->dn_dbufs_count);
1594 1670 membar_producer();
1595 1671 DB_DNODE_EXIT(db);
1596 1672 /*
1597 1673 * Decrementing the dbuf count means that the hold corresponding
1598 1674 * to the removed dbuf is no longer discounted in dnode_move(),
1599 1675 * so the dnode cannot be moved until after we release the hold.
1600 1676 * The membar_producer() ensures visibility of the decremented
1601 1677 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1602 1678 * release any lock.
1603 1679 */
1604 1680 dnode_rele(dn, db);
1605 1681 db->db_dnode_handle = NULL;
1606 1682 } else {
1607 1683 DB_DNODE_EXIT(db);
1608 1684 }
1609 1685
1610 1686 if (db->db_buf)
1611 1687 dbuf_gone = arc_clear_callback(db->db_buf);
1612 1688
1613 1689 if (!dbuf_gone)
1614 1690 mutex_exit(&db->db_mtx);
1615 1691
1616 1692 /*
1617 1693 * If this dbuf is referenced from an indirect dbuf,
1618 1694 * decrement the ref count on the indirect dbuf.
1619 1695 */
1620 1696 if (parent && parent != dndb)
1621 1697 dbuf_rele(parent, db);
1622 1698 }
1623 1699
1624 1700 static int
1625 1701 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1626 1702 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1627 1703 {
1628 1704 int nlevels, epbs;
1629 1705
1630 1706 *parentp = NULL;
1631 1707 *bpp = NULL;
1632 1708
1633 1709 ASSERT(blkid != DMU_BONUS_BLKID);
1634 1710
1635 1711 if (blkid == DMU_SPILL_BLKID) {
1636 1712 mutex_enter(&dn->dn_mtx);
1637 1713 if (dn->dn_have_spill &&
1638 1714 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1639 1715 *bpp = &dn->dn_phys->dn_spill;
1640 1716 else
1641 1717 *bpp = NULL;
1642 1718 dbuf_add_ref(dn->dn_dbuf, NULL);
1643 1719 *parentp = dn->dn_dbuf;
1644 1720 mutex_exit(&dn->dn_mtx);
1645 1721 return (0);
1646 1722 }
1647 1723
1648 1724 if (dn->dn_phys->dn_nlevels == 0)
1649 1725 nlevels = 1;
1650 1726 else
1651 1727 nlevels = dn->dn_phys->dn_nlevels;
1652 1728
1653 1729 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1654 1730
1655 1731 ASSERT3U(level * epbs, <, 64);
1656 1732 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1657 1733 if (level >= nlevels ||
1658 1734 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1659 1735 /* the buffer has no parent yet */
1660 1736 return (SET_ERROR(ENOENT));
1661 1737 } else if (level < nlevels-1) {
1662 1738 /* this block is referenced from an indirect block */
1663 1739 int err = dbuf_hold_impl(dn, level+1,
1664 1740 blkid >> epbs, fail_sparse, NULL, parentp);
1665 1741 if (err)
1666 1742 return (err);
1667 1743 err = dbuf_read(*parentp, NULL,
1668 1744 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1669 1745 if (err) {
1670 1746 dbuf_rele(*parentp, NULL);
1671 1747 *parentp = NULL;
1672 1748 return (err);
1673 1749 }
1674 1750 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1675 1751 (blkid & ((1ULL << epbs) - 1));
1676 1752 return (0);
1677 1753 } else {
1678 1754 /* the block is referenced from the dnode */
1679 1755 ASSERT3U(level, ==, nlevels-1);
1680 1756 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1681 1757 blkid < dn->dn_phys->dn_nblkptr);
1682 1758 if (dn->dn_dbuf) {
1683 1759 dbuf_add_ref(dn->dn_dbuf, NULL);
1684 1760 *parentp = dn->dn_dbuf;
1685 1761 }
1686 1762 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1687 1763 return (0);
1688 1764 }
1689 1765 }
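A worked example of the lookup arithmetic above, assuming a dn_indblkshift of 14 (16K indirect blocks) and the 128-byte block pointers implied by SPA_BLKPTRSHIFT = 7:

	/*
	 * epbs = 14 - 7 = 7, so each indirect block maps 2^7 = 128 bps.
	 * The level-1 parent of L0 blkid 5000 is therefore blkid
	 * 5000 >> 7 = 39, and the child's bp lives at index
	 * 5000 & 127 = 8 within that parent.
	 */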
1690 1766
1691 1767 static dmu_buf_impl_t *
1692 1768 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1693 1769 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1694 1770 {
1695 1771 objset_t *os = dn->dn_objset;
1696 1772 dmu_buf_impl_t *db, *odb;
1697 1773
1698 1774 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1699 1775 ASSERT(dn->dn_type != DMU_OT_NONE);
1700 1776
1701 1777 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1702 1778
1703 1779 db->db_objset = os;
1704 1780 db->db.db_object = dn->dn_object;
1705 1781 db->db_level = level;
1706 1782 db->db_blkid = blkid;
1707 1783 db->db_last_dirty = NULL;
1708 1784 db->db_dirtycnt = 0;
1709 1785 db->db_dnode_handle = dn->dn_handle;
1710 1786 db->db_parent = parent;
1711 1787 db->db_blkptr = blkptr;
1712 1788
1713 - db->db_user_ptr = NULL;
1714 - db->db_evict_func = NULL;
1789 + db->db_user = NULL;
1715 1790 db->db_immediate_evict = 0;
1716 1791 db->db_freed_in_flight = 0;
1717 1792
1718 1793 if (blkid == DMU_BONUS_BLKID) {
1719 1794 ASSERT3P(parent, ==, dn->dn_dbuf);
1720 1795 db->db.db_size = DN_MAX_BONUSLEN -
1721 1796 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1722 1797 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1723 1798 db->db.db_offset = DMU_BONUS_BLKID;
1724 1799 db->db_state = DB_UNCACHED;
1725 1800 /* the bonus dbuf is not placed in the hash table */
1726 1801 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1727 1802 return (db);
1728 1803 } else if (blkid == DMU_SPILL_BLKID) {
1729 1804 db->db.db_size = (blkptr != NULL) ?
1730 1805 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1731 1806 db->db.db_offset = 0;
1732 1807 } else {
1733 1808 int blocksize =
1734 1809 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1735 1810 db->db.db_size = blocksize;
1736 1811 db->db.db_offset = db->db_blkid * blocksize;
1737 1812 }
1738 1813
1739 1814 /*
1740 1815 * Hold the dn_dbufs_mtx while we get the new dbuf
1741 1816 * in the hash table *and* added to the dbufs list.
1742 1817 * This prevents a possible deadlock with someone
1743 1818	 * trying to look up this dbuf before it's added to the
1744 1819 * dn_dbufs list.
1745 1820 */
1746 1821 mutex_enter(&dn->dn_dbufs_mtx);
1747 1822 db->db_state = DB_EVICTING;
1748 1823 if ((odb = dbuf_hash_insert(db)) != NULL) {
1749 1824 /* someone else inserted it first */
1750 1825 kmem_cache_free(dbuf_cache, db);
1751 1826 mutex_exit(&dn->dn_dbufs_mtx);
1752 1827 return (odb);
1753 1828 }
1754 1829 avl_add(&dn->dn_dbufs, db);
1755 1830 if (db->db_level == 0 && db->db_blkid >=
1756 1831 dn->dn_unlisted_l0_blkid)
1757 1832 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1758 1833 db->db_state = DB_UNCACHED;
1759 1834 mutex_exit(&dn->dn_dbufs_mtx);
1760 1835 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1761 1836
1762 1837 if (parent && parent != dn->dn_dbuf)
1763 1838 dbuf_add_ref(parent, db);
1764 1839
1765 1840 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1766 1841 refcount_count(&dn->dn_holds) > 0);
1767 1842 (void) refcount_add(&dn->dn_holds, db);
1768 1843 atomic_inc_32(&dn->dn_dbufs_count);
1769 1844
1770 1845 dprintf_dbuf(db, "db=%p\n", db);
1771 1846
1772 1847 return (db);
1773 1848 }
1774 1849
1775 1850 static int
1776 1851 dbuf_do_evict(void *private)
1777 1852 {
1778 1853 dmu_buf_impl_t *db = private;
1779 1854
1780 1855 if (!MUTEX_HELD(&db->db_mtx))
1781 1856 mutex_enter(&db->db_mtx);
1782 1857
1783 1858 ASSERT(refcount_is_zero(&db->db_holds));
1784 1859
1785 1860 if (db->db_state != DB_EVICTING) {
1786 1861 ASSERT(db->db_state == DB_CACHED);
1787 1862 DBUF_VERIFY(db);
1788 1863 db->db_buf = NULL;
1789 1864 dbuf_evict(db);
1790 1865 } else {
1791 1866 mutex_exit(&db->db_mtx);
1792 1867 dbuf_destroy(db);
1793 1868 }
1794 1869 return (0);
1795 1870 }
1796 1871
1797 1872 static void
1798 1873 dbuf_destroy(dmu_buf_impl_t *db)
1799 1874 {
1800 1875 ASSERT(refcount_is_zero(&db->db_holds));
1801 1876
1802 1877 if (db->db_blkid != DMU_BONUS_BLKID) {
1803 1878 /*
1804 1879 * If this dbuf is still on the dn_dbufs list,
1805 1880 * remove it from that list.
1806 1881 */
1807 1882 if (db->db_dnode_handle != NULL) {
1808 1883 dnode_t *dn;
1809 1884
1810 1885 DB_DNODE_ENTER(db);
1811 1886 dn = DB_DNODE(db);
1812 1887 mutex_enter(&dn->dn_dbufs_mtx);
1813 1888 avl_remove(&dn->dn_dbufs, db);
1814 1889 atomic_dec_32(&dn->dn_dbufs_count);
1815 1890 mutex_exit(&dn->dn_dbufs_mtx);
1816 1891 DB_DNODE_EXIT(db);
1817 1892 /*
1818 1893 * Decrementing the dbuf count means that the hold
1819 1894 * corresponding to the removed dbuf is no longer
1820 1895 * discounted in dnode_move(), so the dnode cannot be
1821 1896 * moved until after we release the hold.
1822 1897 */
1823 1898 dnode_rele(dn, db);
1824 1899 db->db_dnode_handle = NULL;
1825 1900 }
1826 1901 dbuf_hash_remove(db);
1827 1902 }
1828 1903 db->db_parent = NULL;
1829 1904 db->db_buf = NULL;
1830 1905
1831 1906 ASSERT(db->db.db_data == NULL);
1832 1907 ASSERT(db->db_hash_next == NULL);
1833 1908 ASSERT(db->db_blkptr == NULL);
1834 1909 ASSERT(db->db_data_pending == NULL);
1835 1910
1836 1911 kmem_cache_free(dbuf_cache, db);
1837 1912 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1838 1913 }
1839 1914
1840 1915 void
1841 1916 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1842 1917 {
1843 1918 dmu_buf_impl_t *db = NULL;
1844 1919 blkptr_t *bp = NULL;
1845 1920
1846 1921 ASSERT(blkid != DMU_BONUS_BLKID);
1847 1922 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1848 1923
1849 1924 if (dnode_block_freed(dn, blkid))
1850 1925 return;
1851 1926
1852 1927 /* dbuf_find() returns with db_mtx held */
1853 1928 if (db = dbuf_find(dn, 0, blkid)) {
1854 1929 /*
1855 1930 * This dbuf is already in the cache. We assume that
1856 1931 * it is already CACHED, or else about to be either
1857 1932 * read or filled.
1858 1933 */
1859 1934 mutex_exit(&db->db_mtx);
1860 1935 return;
1861 1936 }
1862 1937
1863 1938 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1864 1939 if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
1865 1940 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1866 1941 arc_flags_t aflags =
1867 1942 ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
1868 1943 zbookmark_phys_t zb;
1869 1944
1870 1945 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1871 1946 dn->dn_object, 0, blkid);
1872 1947
1873 1948 (void) arc_read(NULL, dn->dn_objset->os_spa,
1874 1949 bp, NULL, NULL, prio,
1875 1950 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1876 1951 &aflags, &zb);
1877 1952 }
1878 1953 if (db)
1879 1954 dbuf_rele(db, NULL);
1880 1955 }
1881 1956 }
1882 1957
1883 1958 /*
1884 1959 * Returns with db_holds incremented, and db_mtx not held.
1885 1960 * Note: dn_struct_rwlock must be held.
1886 1961 */
1887 1962 int
1888 1963 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1889 1964 void *tag, dmu_buf_impl_t **dbp)
1890 1965 {
1891 1966 dmu_buf_impl_t *db, *parent = NULL;
1892 1967
1893 1968 ASSERT(blkid != DMU_BONUS_BLKID);
1894 1969 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1895 1970 ASSERT3U(dn->dn_nlevels, >, level);
1896 1971
1897 1972 *dbp = NULL;
1898 1973 top:
1899 1974 /* dbuf_find() returns with db_mtx held */
1900 1975 db = dbuf_find(dn, level, blkid);
1901 1976
1902 1977 if (db == NULL) {
1903 1978 blkptr_t *bp = NULL;
1904 1979 int err;
1905 1980
1906 1981 ASSERT3P(parent, ==, NULL);
1907 1982 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1908 1983 if (fail_sparse) {
1909 1984 if (err == 0 && bp && BP_IS_HOLE(bp))
1910 1985 err = SET_ERROR(ENOENT);
1911 1986 if (err) {
1912 1987 if (parent)
1913 1988 dbuf_rele(parent, NULL);
1914 1989 return (err);
1915 1990 }
1916 1991 }
1917 1992 if (err && err != ENOENT)
1918 1993 return (err);
1919 1994 db = dbuf_create(dn, level, blkid, parent, bp);
1920 1995 }
1921 1996
1922 1997 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1923 1998 arc_buf_add_ref(db->db_buf, db);
1924 1999 if (db->db_buf->b_data == NULL) {
1925 2000 dbuf_clear(db);
1926 2001 if (parent) {
1927 2002 dbuf_rele(parent, NULL);
1928 2003 parent = NULL;
1929 2004 }
1930 2005 goto top;
1931 2006 }
1932 2007 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1933 2008 }
1934 2009
1935 2010 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1936 2011
1937 2012 /*
1938 2013	 * If this buffer is currently syncing out, and we are
1939 2014 * still referencing it from db_data, we need to make a copy
1940 2015 * of it in case we decide we want to dirty it again in this txg.
1941 2016 */
1942 2017 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1943 2018 dn->dn_object != DMU_META_DNODE_OBJECT &&
1944 2019 db->db_state == DB_CACHED && db->db_data_pending) {
1945 2020 dbuf_dirty_record_t *dr = db->db_data_pending;
1946 2021
1947 2022 if (dr->dt.dl.dr_data == db->db_buf) {
1948 2023 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1949 2024
1950 2025 dbuf_set_data(db,
1951 2026 arc_buf_alloc(dn->dn_objset->os_spa,
1952 2027 db->db.db_size, db, type));
1953 2028 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1954 2029 db->db.db_size);
1955 2030 }
1956 2031 }
1957 2032
1958 2033 (void) refcount_add(&db->db_holds, tag);
1959 2034 DBUF_VERIFY(db);
1960 2035 mutex_exit(&db->db_mtx);
1961 2036
1962 2037 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1963 2038 if (parent)
1964 2039 dbuf_rele(parent, NULL);
1965 2040
1966 2041 ASSERT3P(DB_DNODE(db), ==, dn);
1967 2042 ASSERT3U(db->db_blkid, ==, blkid);
1968 2043 ASSERT3U(db->db_level, ==, level);
1969 2044 *dbp = db;
1970 2045
1971 2046 return (0);
1972 2047 }
1973 2048
1974 2049 dmu_buf_impl_t *
1975 2050 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1976 2051 {
1977 2052 dmu_buf_impl_t *db;
1978 2053 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1979 2054 return (err ? NULL : db);
1980 2055 }
1981 2056
1982 2057 dmu_buf_impl_t *
1983 2058 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1984 2059 {
1985 2060 dmu_buf_impl_t *db;
1986 2061 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1987 2062 return (err ? NULL : db);
1988 2063 }
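A minimal sketch of the hold discipline dbuf_hold_impl() and these wrappers expect: dn_struct_rwlock must be held across the lookup, and every successful hold is balanced by a dbuf_rele() with the same tag (blkid is illustrative):

	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, FTAG);	/* NULL on error */
	rw_exit(&dn->dn_struct_rwlock);
	if (db != NULL) {
		/* ... dbuf_read() and use the buffer ... */
		dbuf_rele(db, FTAG);
	}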
1989 2064
1990 2065 void
1991 2066 dbuf_create_bonus(dnode_t *dn)
1992 2067 {
1993 2068 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1994 2069
1995 2070 ASSERT(dn->dn_bonus == NULL);
1996 2071 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1997 2072 }
1998 2073
1999 2074 int
2000 2075 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2001 2076 {
2002 2077 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2003 2078 dnode_t *dn;
2004 2079
2005 2080 if (db->db_blkid != DMU_SPILL_BLKID)
2006 2081 return (SET_ERROR(ENOTSUP));
2007 2082 if (blksz == 0)
2008 2083 blksz = SPA_MINBLOCKSIZE;
2009 2084 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2010 2085 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2011 2086
2012 2087 DB_DNODE_ENTER(db);
2013 2088 dn = DB_DNODE(db);
2014 2089 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2015 2090 dbuf_new_size(db, blksz, tx);
2016 2091 rw_exit(&dn->dn_struct_rwlock);
2017 2092 DB_DNODE_EXIT(db);
2018 2093
2019 2094 return (0);
2020 2095 }
2021 2096
2022 2097 void
2023 2098 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2024 2099 {
2025 2100 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2026 2101 }
2027 2102
2028 2103 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2029 2104 void
2030 2105 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2031 2106 {
2032 2107 int64_t holds = refcount_add(&db->db_holds, tag);
2033 2108 ASSERT(holds > 1);
2034 2109 }
2035 2110
2036 2111 /*
2037 2112 * If you call dbuf_rele() you had better not be referencing the dnode handle
2038 2113 * unless you have some other direct or indirect hold on the dnode. (An indirect
2039 2114 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2040 2115 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2041 2116 * dnode's parent dbuf evicting its dnode handles.
2042 2117 */
2043 2118 void
2044 2119 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2045 2120 {
2046 2121 mutex_enter(&db->db_mtx);
2047 2122 dbuf_rele_and_unlock(db, tag);
2048 2123 }
2049 2124
2050 2125 void
2051 2126 dmu_buf_rele(dmu_buf_t *db, void *tag)
2052 2127 {
2053 2128 dbuf_rele((dmu_buf_impl_t *)db, tag);
2054 2129 }
2055 2130
2056 2131 /*
2057 2132 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2058 2133 * db_dirtycnt and db_holds to be updated atomically.
2059 2134 */
2060 2135 void
2061 2136 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2062 2137 {
2063 2138 int64_t holds;
2064 2139
2065 2140 ASSERT(MUTEX_HELD(&db->db_mtx));
2066 2141 DBUF_VERIFY(db);
2067 2142
2068 2143 /*
2069 2144 * Remove the reference to the dbuf before removing its hold on the
2070 2145 * dnode so we can guarantee in dnode_move() that a referenced bonus
2071 2146 * buffer has a corresponding dnode hold.
2072 2147 */
2073 2148 holds = refcount_remove(&db->db_holds, tag);
2074 2149 ASSERT(holds >= 0);
2075 2150
2076 2151 /*
2077 2152 * We can't freeze indirects if there is a possibility that they
2078 2153 * may be modified in the current syncing context.
2079 2154 */
2080 2155 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2081 2156 arc_buf_freeze(db->db_buf);
2082 2157
2083 2158 if (holds == db->db_dirtycnt &&
2084 2159 db->db_level == 0 && db->db_immediate_evict)
2085 2160 dbuf_evict_user(db);
2086 2161
2087 2162 if (holds == 0) {
2088 2163 if (db->db_blkid == DMU_BONUS_BLKID) {
2089 2164 mutex_exit(&db->db_mtx);
2090 2165
2091 2166 /*
2092 2167 * If the dnode moves here, we cannot cross this barrier
2093 2168 * until the move completes.
2094 2169 */
2095 2170 DB_DNODE_ENTER(db);
2096 2171 atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count);
2097 2172 DB_DNODE_EXIT(db);
2098 2173 /*
2099 2174 * The bonus buffer's dnode hold is no longer discounted
2100 2175 * in dnode_move(). The dnode cannot move until after
2101 2176 * the dnode_rele().
2102 2177 */
2103 2178 dnode_rele(DB_DNODE(db), db);
2104 2179 } else if (db->db_buf == NULL) {
2105 2180 /*
2106 2181 * This is a special case: we never associated this
2107 2182 * dbuf with any data allocated from the ARC.
2108 2183 */
2109 2184 ASSERT(db->db_state == DB_UNCACHED ||
2110 2185 db->db_state == DB_NOFILL);
2111 2186 dbuf_evict(db);
2112 2187 } else if (arc_released(db->db_buf)) {
2113 2188 arc_buf_t *buf = db->db_buf;
2114 2189 /*
2115 2190 * This dbuf has anonymous data associated with it.
2116 2191 */
2117 - dbuf_set_data(db, NULL);
2192 + dbuf_clear_data(db);
2118 2193 VERIFY(arc_buf_remove_ref(buf, db));
2119 2194 dbuf_evict(db);
2120 2195 } else {
2121 2196 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2122 2197
2123 2198 /*
2124 2199 * A dbuf will be eligible for eviction if either the
2125 2200 * 'primarycache' property is set or a duplicate
2126 2201 * copy of this buffer is already cached in the arc.
2127 2202 *
2128 2203 * In the case of the 'primarycache' a buffer
2129 2204 * is considered for eviction if it matches the
2130 2205 * criteria set in the property.
2131 2206 *
2132 2207 * To decide if our buffer is considered a
2133 2208 * duplicate, we must call into the arc to determine
2134 2209 * if multiple buffers are referencing the same
2135 2210 * block on-disk. If so, then we simply evict
2136 2211 * ourselves.
2137 2212 */
2138 2213 if (!DBUF_IS_CACHEABLE(db)) {
2139 2214 if (db->db_blkptr != NULL &&
2140 2215 !BP_IS_HOLE(db->db_blkptr) &&
2141 2216 !BP_IS_EMBEDDED(db->db_blkptr)) {
2142 2217 spa_t *spa =
2143 2218 dmu_objset_spa(db->db_objset);
2144 2219 blkptr_t bp = *db->db_blkptr;
2145 2220 dbuf_clear(db);
2146 2221 arc_freed(spa, &bp);
2147 2222 } else {
2148 2223 dbuf_clear(db);
2149 2224 }
2150 - } else if (arc_buf_eviction_needed(db->db_buf)) {
2225 + } else if (db->db_objset->os_evicting ||
2226 + arc_buf_eviction_needed(db->db_buf)) {
2151 2227 dbuf_clear(db);
2152 2228 } else {
2153 2229 mutex_exit(&db->db_mtx);
2154 2230 }
2155 2231 }
2156 2232 } else {
2157 2233 mutex_exit(&db->db_mtx);
2158 2234 }
2159 2235 }
2160 2236
2161 2237 #pragma weak dmu_buf_refcount = dbuf_refcount
2162 2238 uint64_t
2163 2239 dbuf_refcount(dmu_buf_impl_t *db)
2164 2240 {
2165 2241 return (refcount_count(&db->db_holds));
2166 2242 }
2167 2243
2168 2244 void *
2169 -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr,
2170 - dmu_buf_evict_func_t *evict_func)
2245 +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2246 + dmu_buf_user_t *new_user)
2171 2247 {
2172 - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
2248 + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2249 +
2250 + mutex_enter(&db->db_mtx);
2251 + dbuf_verify_user(db, DBVU_NOT_EVICTING);
2252 + if (db->db_user == old_user)
2253 + db->db_user = new_user;
2254 + else
2255 + old_user = db->db_user;
2256 + dbuf_verify_user(db, DBVU_NOT_EVICTING);
2257 + mutex_exit(&db->db_mtx);
2258 +
2259 + return (old_user);
2173 2260 }
2174 2261
2175 2262 void *
2176 -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
2177 - dmu_buf_evict_func_t *evict_func)
2263 +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2178 2264 {
2179 - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2180 -
2181 - db->db_immediate_evict = TRUE;
2182 - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
2265 + return (dmu_buf_replace_user(db_fake, NULL, user));
2183 2266 }
2184 2267
2185 2268 void *
2186 -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2187 - dmu_buf_evict_func_t *evict_func)
2269 +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2188 2270 {
2189 2271 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2190 - ASSERT(db->db_level == 0);
2191 2272
2192 - ASSERT((user_ptr == NULL) == (evict_func == NULL));
2273 + db->db_immediate_evict = TRUE;
2274 + return (dmu_buf_set_user(db_fake, user));
2275 +}
2193 2276
2194 - mutex_enter(&db->db_mtx);
2195 -
2196 - if (db->db_user_ptr == old_user_ptr) {
2197 - db->db_user_ptr = user_ptr;
2198 - db->db_evict_func = evict_func;
2199 - } else {
2200 - old_user_ptr = db->db_user_ptr;
2201 - }
2202 -
2203 - mutex_exit(&db->db_mtx);
2204 - return (old_user_ptr);
2277 +void *
2278 +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2279 +{
2280 + return (dmu_buf_replace_user(db_fake, user, NULL));
2205 2281 }
2206 2282
2207 2283 void *
2208 2284 dmu_buf_get_user(dmu_buf_t *db_fake)
2209 2285 {
2210 2286 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2211 - ASSERT(!refcount_is_zero(&db->db_holds));
2212 2287
2213 - return (db->db_user_ptr);
2288 + dbuf_verify_user(db, DBVU_NOT_EVICTING);
2289 + return (db->db_user);
2214 2290 }
2215 2291
2292 +void
2293 +dmu_buf_user_evict_wait()
2294 +{
2295 + taskq_wait(dbu_evict_taskq);
2296 +}
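A hedged sketch of how a client is expected to attach per-dbuf state with this interface; my_state_t and my_evict() are illustrative. The dmu_buf_user_t is embedded in the client structure (here placed first so the void * callback argument doubles as a my_state_t pointer), and the evict callback is expected to run from dbu_evict_taskq, which is why dmu_buf_user_evict_wait() above exists:

	typedef struct my_state {
		dmu_buf_user_t	ms_dbu;		/* embedded user record */
		dmu_buf_t	*ms_db;
		/* ... client-specific fields ... */
	} my_state_t;

	static void
	my_evict(void *arg)			/* no dbuf locks held here */
	{
		my_state_t *ms = arg;

		kmem_free(ms, sizeof (*ms));
	}

	/* in the client's setup path: */
	dmu_buf_init_user(&ms->ms_dbu, my_evict, &ms->ms_db);
	if (dmu_buf_set_user_ie(db, &ms->ms_dbu) != NULL) {
		/* lost the race; another user is already attached */
		kmem_free(ms, sizeof (*ms));
	}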
2297 +
2216 2298 boolean_t
2217 2299 dmu_buf_freeable(dmu_buf_t *dbuf)
2218 2300 {
2219 2301 boolean_t res = B_FALSE;
2220 2302 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2221 2303
2222 2304 if (db->db_blkptr)
2223 2305 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2224 2306 db->db_blkptr, db->db_blkptr->blk_birth);
2225 2307
2226 2308 return (res);
2227 2309 }
2228 2310
2229 2311 blkptr_t *
2230 2312 dmu_buf_get_blkptr(dmu_buf_t *db)
2231 2313 {
2232 2314 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2233 2315 return (dbi->db_blkptr);
2234 2316 }
2235 2317
2236 2318 static void
2237 2319 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2238 2320 {
2239 2321	/* ASSERT(dmu_tx_is_syncing(tx)) */
2240 2322 ASSERT(MUTEX_HELD(&db->db_mtx));
2241 2323
2242 2324 if (db->db_blkptr != NULL)
2243 2325 return;
2244 2326
2245 2327 if (db->db_blkid == DMU_SPILL_BLKID) {
2246 2328 db->db_blkptr = &dn->dn_phys->dn_spill;
2247 2329 BP_ZERO(db->db_blkptr);
2248 2330 return;
2249 2331 }
2250 2332 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2251 2333 /*
2252 2334 * This buffer was allocated at a time when there was
2253 2335 * no available blkptrs from the dnode, or it was
2254 2336	 * inappropriate to hook it in (i.e., nlevels mismatch).
2255 2337 */
2256 2338 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2257 2339 ASSERT(db->db_parent == NULL);
2258 2340 db->db_parent = dn->dn_dbuf;
2259 2341 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2260 2342 DBUF_VERIFY(db);
2261 2343 } else {
2262 2344 dmu_buf_impl_t *parent = db->db_parent;
2263 2345 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2264 2346
2265 2347 ASSERT(dn->dn_phys->dn_nlevels > 1);
2266 2348 if (parent == NULL) {
2267 2349 mutex_exit(&db->db_mtx);
2268 2350 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2269 2351 (void) dbuf_hold_impl(dn, db->db_level+1,
2270 2352 db->db_blkid >> epbs, FALSE, db, &parent);
2271 2353 rw_exit(&dn->dn_struct_rwlock);
2272 2354 mutex_enter(&db->db_mtx);
2273 2355 db->db_parent = parent;
2274 2356 }
2275 2357 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2276 2358 (db->db_blkid & ((1ULL << epbs) - 1));
2277 2359 DBUF_VERIFY(db);
2278 2360 }
2279 2361 }
2280 2362
2281 2363 static void
2282 2364 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2283 2365 {
2284 2366 dmu_buf_impl_t *db = dr->dr_dbuf;
2285 2367 dnode_t *dn;
2286 2368 zio_t *zio;
2287 2369
2288 2370 ASSERT(dmu_tx_is_syncing(tx));
2289 2371
2290 2372 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2291 2373
2292 2374 mutex_enter(&db->db_mtx);
2293 2375
2294 2376 ASSERT(db->db_level > 0);
2295 2377 DBUF_VERIFY(db);
2296 2378
2297 2379 /* Read the block if it hasn't been read yet. */
2298 2380 if (db->db_buf == NULL) {
2299 2381 mutex_exit(&db->db_mtx);
2300 2382 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2301 2383 mutex_enter(&db->db_mtx);
2302 2384 }
2303 2385 ASSERT3U(db->db_state, ==, DB_CACHED);
2304 2386 ASSERT(db->db_buf != NULL);
2305 2387
2306 2388 DB_DNODE_ENTER(db);
2307 2389 dn = DB_DNODE(db);
2308 2390 /* Indirect block size must match what the dnode thinks it is. */
2309 2391 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2310 2392 dbuf_check_blkptr(dn, db);
2311 2393 DB_DNODE_EXIT(db);
2312 2394
2313 2395 /* Provide the pending dirty record to child dbufs */
2314 2396 db->db_data_pending = dr;
2315 2397
2316 2398 mutex_exit(&db->db_mtx);
2317 2399 dbuf_write(dr, db->db_buf, tx);
2318 2400
2319 2401 zio = dr->dr_zio;
2320 2402 mutex_enter(&dr->dt.di.dr_mtx);
2321 2403 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2322 2404 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2323 2405 mutex_exit(&dr->dt.di.dr_mtx);
2324 2406 zio_nowait(zio);
2325 2407 }
2326 2408
2327 2409 static void
2328 2410 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2329 2411 {
2330 2412 arc_buf_t **datap = &dr->dt.dl.dr_data;
2331 2413 dmu_buf_impl_t *db = dr->dr_dbuf;
2332 2414 dnode_t *dn;
2333 2415 objset_t *os;
2334 2416 uint64_t txg = tx->tx_txg;
2335 2417
2336 2418 ASSERT(dmu_tx_is_syncing(tx));
2337 2419
2338 2420 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2339 2421
2340 2422 mutex_enter(&db->db_mtx);
2341 2423 /*
2342 2424 * To be synced, we must be dirtied. But we
2343 2425 * might have been freed after the dirty.
2344 2426 */
2345 2427 if (db->db_state == DB_UNCACHED) {
2346 2428 /* This buffer has been freed since it was dirtied */
2347 2429 ASSERT(db->db.db_data == NULL);
2348 2430 } else if (db->db_state == DB_FILL) {
2349 2431 /* This buffer was freed and is now being re-filled */
2350 2432 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2351 2433 } else {
2352 2434 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2353 2435 }
2354 2436 DBUF_VERIFY(db);
2355 2437
2356 2438 DB_DNODE_ENTER(db);
2357 2439 dn = DB_DNODE(db);
2358 2440
2359 2441 if (db->db_blkid == DMU_SPILL_BLKID) {
2360 2442 mutex_enter(&dn->dn_mtx);
2361 2443 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2362 2444 mutex_exit(&dn->dn_mtx);
2363 2445 }
2364 2446
2365 2447 /*
2366 2448 * If this is a bonus buffer, simply copy the bonus data into the
2367 2449 * dnode. It will be written out when the dnode is synced (and it
2368 2450 * will be synced, since it must have been dirty for dbuf_sync to
2369 2451 * be called).
2370 2452 */
2371 2453 if (db->db_blkid == DMU_BONUS_BLKID) {
2372 2454 dbuf_dirty_record_t **drp;
2373 2455
2374 2456 ASSERT(*datap != NULL);
2375 2457 ASSERT0(db->db_level);
2376 2458 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2377 2459 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2378 2460 DB_DNODE_EXIT(db);
2379 2461
2380 2462 if (*datap != db->db.db_data) {
2381 2463 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2382 2464 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2383 2465 }
2384 2466 db->db_data_pending = NULL;
2385 2467 drp = &db->db_last_dirty;
2386 2468 while (*drp != dr)
2387 2469 drp = &(*drp)->dr_next;
2388 2470 ASSERT(dr->dr_next == NULL);
2389 2471 ASSERT(dr->dr_dbuf == db);
2390 2472 *drp = dr->dr_next;
2391 2473 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2392 2474 ASSERT(db->db_dirtycnt > 0);
2393 2475 db->db_dirtycnt -= 1;
2394 2476 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2395 2477 return;
2396 2478 }
2397 2479
2398 2480 os = dn->dn_objset;
2399 2481
2400 2482 /*
2401 2483 * This function may have dropped the db_mtx lock allowing a dmu_sync
2402 2484 * operation to sneak in. As a result, we need to ensure that we
2403 2485 * don't check the dr_override_state until we have returned from
2404 2486 * dbuf_check_blkptr.
2405 2487 */
2406 2488 dbuf_check_blkptr(dn, db);
2407 2489
2408 2490 /*
2409 2491 * If this buffer is in the middle of an immediate write,
2410 2492 * wait for the synchronous IO to complete.
2411 2493 */
2412 2494 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2413 2495 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2414 2496 cv_wait(&db->db_changed, &db->db_mtx);
2415 2497 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2416 2498 }
2417 2499
2418 2500 if (db->db_state != DB_NOFILL &&
2419 2501 dn->dn_object != DMU_META_DNODE_OBJECT &&
2420 2502 refcount_count(&db->db_holds) > 1 &&
2421 2503 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2422 2504 *datap == db->db_buf) {
2423 2505 /*
2424 2506 * If this buffer is currently "in use" (i.e., there
2425 2507 * are active holds and db_data still references it),
2426 2508 * then make a copy before we start the write so that
2427 2509 * any modifications from the open txg will not leak
2428 2510 * into this write.
2429 2511 *
2430 2512 * NOTE: this copy does not need to be made for
2431 2513 * objects only modified in the syncing context (e.g.
2432 2514	 * DNODE blocks).
2433 2515 */
2434 2516 int blksz = arc_buf_size(*datap);
2435 2517 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2436 2518 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2437 2519 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2438 2520 }
2439 2521 db->db_data_pending = dr;
2440 2522
2441 2523 mutex_exit(&db->db_mtx);
2442 2524
2443 2525 dbuf_write(dr, *datap, tx);
2444 2526
2445 2527 ASSERT(!list_link_active(&dr->dr_dirty_node));
2446 2528 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2447 2529 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2448 2530 DB_DNODE_EXIT(db);
2449 2531 } else {
2450 2532 /*
2451 2533 * Although zio_nowait() does not "wait for an IO", it does
2452 2534 * initiate the IO. If this is an empty write it seems plausible
2453 2535 * that the IO could actually be completed before the nowait
2454 2536 * returns. We need to DB_DNODE_EXIT() first in case
2455 2537 * zio_nowait() invalidates the dbuf.
2456 2538 */
2457 2539 DB_DNODE_EXIT(db);
2458 2540 zio_nowait(dr->dr_zio);
2459 2541 }
2460 2542 }
2461 2543
2462 2544 void
2463 2545 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2464 2546 {
2465 2547 dbuf_dirty_record_t *dr;
2466 2548
2467 2549 while (dr = list_head(list)) {
2468 2550 if (dr->dr_zio != NULL) {
2469 2551 /*
2470 2552 * If we find an already initialized zio then we
2471 2553 * are processing the meta-dnode, and we have finished.
2472 2554 * The dbufs for all dnodes are put back on the list
2473 2555 * during processing, so that we can zio_wait()
2474 2556 * these IOs after initiating all child IOs.
2475 2557 */
2476 2558 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2477 2559 DMU_META_DNODE_OBJECT);
2478 2560 break;
2479 2561 }
2480 2562 list_remove(list, dr);
2481 2563 if (dr->dr_dbuf->db_level > 0)
2482 2564 dbuf_sync_indirect(dr, tx);
2483 2565 else
2484 2566 dbuf_sync_leaf(dr, tx);
2485 2567 }
2486 2568 }
2487 2569
2488 2570 /* ARGSUSED */
2489 2571 static void
2490 2572 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2491 2573 {
2492 2574 dmu_buf_impl_t *db = vdb;
2493 2575 dnode_t *dn;
2494 2576 blkptr_t *bp = zio->io_bp;
2495 2577 blkptr_t *bp_orig = &zio->io_bp_orig;
2496 2578 spa_t *spa = zio->io_spa;
2497 2579 int64_t delta;
2498 2580 uint64_t fill = 0;
2499 2581 int i;
2500 2582
2501 2583 ASSERT3P(db->db_blkptr, ==, bp);
2502 2584
2503 2585 DB_DNODE_ENTER(db);
2504 2586 dn = DB_DNODE(db);
2505 2587 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2506 2588 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2507 2589 zio->io_prev_space_delta = delta;
2508 2590
2509 2591 if (bp->blk_birth != 0) {
2510 2592 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2511 2593 BP_GET_TYPE(bp) == dn->dn_type) ||
2512 2594 (db->db_blkid == DMU_SPILL_BLKID &&
2513 2595 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2514 2596 BP_IS_EMBEDDED(bp));
2515 2597 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2516 2598 }
2517 2599
2518 2600 mutex_enter(&db->db_mtx);
2519 2601
2520 2602 #ifdef ZFS_DEBUG
2521 2603 if (db->db_blkid == DMU_SPILL_BLKID) {
2522 2604 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2523 2605 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2524 2606 db->db_blkptr == &dn->dn_phys->dn_spill);
2525 2607 }
2526 2608 #endif
2527 2609
2528 2610 if (db->db_level == 0) {
2529 2611 mutex_enter(&dn->dn_mtx);
2530 2612 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2531 2613 db->db_blkid != DMU_SPILL_BLKID)
2532 2614 dn->dn_phys->dn_maxblkid = db->db_blkid;
2533 2615 mutex_exit(&dn->dn_mtx);
2534 2616
2535 2617 if (dn->dn_type == DMU_OT_DNODE) {
2536 2618 dnode_phys_t *dnp = db->db.db_data;
2537 2619 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2538 2620 i--, dnp++) {
2539 2621 if (dnp->dn_type != DMU_OT_NONE)
2540 2622 fill++;
2541 2623 }
2542 2624 } else {
2543 2625 if (BP_IS_HOLE(bp)) {
2544 2626 fill = 0;
2545 2627 } else {
2546 2628 fill = 1;
2547 2629 }
2548 2630 }
2549 2631 } else {
2550 2632 blkptr_t *ibp = db->db.db_data;
2551 2633 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2552 2634 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2553 2635 if (BP_IS_HOLE(ibp))
2554 2636 continue;
2555 2637 fill += BP_GET_FILL(ibp);
2556 2638 }
2557 2639 }
2558 2640 DB_DNODE_EXIT(db);
2559 2641
2560 2642 if (!BP_IS_EMBEDDED(bp))
2561 2643 bp->blk_fill = fill;
2562 2644
2563 2645 mutex_exit(&db->db_mtx);
2564 2646 }
2565 2647
2566 2648 /*
2567 2649 * The SPA will call this callback several times for each zio - once
2568 2650 * for every physical child i/o (zio->io_phys_children times). This
2569 2651 * allows the DMU to monitor the progress of each logical i/o. For example,
2570 2652 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2571 2653 * block. There may be a long delay before all copies/fragments are completed,
2572 2654 * so this callback allows us to retire dirty space gradually, as the physical
2573 2655 * i/os complete.
2574 2656 */
2575 2657 /* ARGSUSED */
2576 2658 static void
2577 2659 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2578 2660 {
2579 2661 dmu_buf_impl_t *db = arg;
2580 2662 objset_t *os = db->db_objset;
2581 2663 dsl_pool_t *dp = dmu_objset_pool(os);
2582 2664 dbuf_dirty_record_t *dr;
2583 2665 int delta = 0;
2584 2666
2585 2667 dr = db->db_data_pending;
2586 2668 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2587 2669
2588 2670 /*
2589 2671 * The callback will be called io_phys_children times. Retire one
2590 2672 * portion of our dirty space each time we are called. Any rounding
2591 2673 * error will be cleaned up by dsl_pool_sync()'s call to
2592 2674 * dsl_pool_undirty_space().
2593 2675 */
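	/*
	 * Illustrative numbers: with dr_accounted = 192K written as 3
	 * physical child i/os, each callback undirties 64K, so the full
	 * 192K is retired once the last child completes.
	 */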
2594 2676 delta = dr->dr_accounted / zio->io_phys_children;
2595 2677 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2596 2678 }
2597 2679
2598 2680 /* ARGSUSED */
2599 2681 static void
2600 2682 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2601 2683 {
2602 2684 dmu_buf_impl_t *db = vdb;
2603 2685 blkptr_t *bp_orig = &zio->io_bp_orig;
2604 2686 blkptr_t *bp = db->db_blkptr;
2605 2687 objset_t *os = db->db_objset;
2606 2688 dmu_tx_t *tx = os->os_synctx;
2607 2689 dbuf_dirty_record_t **drp, *dr;
2608 2690
2609 2691 ASSERT0(zio->io_error);
2610 2692 ASSERT(db->db_blkptr == bp);
2611 2693
2612 2694 /*
2613 2695 * For nopwrites and rewrites we ensure that the bp matches our
2614 2696 * original and bypass all the accounting.
2615 2697 */
2616 2698 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2617 2699 ASSERT(BP_EQUAL(bp, bp_orig));
2618 2700 } else {
2619 2701 dsl_dataset_t *ds = os->os_dsl_dataset;
2620 2702 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2621 2703 dsl_dataset_block_born(ds, bp, tx);
2622 2704 }
2623 2705
2624 2706 mutex_enter(&db->db_mtx);
2625 2707
2626 2708 DBUF_VERIFY(db);
2627 2709
2628 2710 drp = &db->db_last_dirty;
2629 2711 while ((dr = *drp) != db->db_data_pending)
2630 2712 drp = &dr->dr_next;
2631 2713 ASSERT(!list_link_active(&dr->dr_dirty_node));
2632 2714 ASSERT(dr->dr_dbuf == db);
2633 2715 ASSERT(dr->dr_next == NULL);
2634 2716 *drp = dr->dr_next;
2635 2717
2636 2718 #ifdef ZFS_DEBUG
2637 2719 if (db->db_blkid == DMU_SPILL_BLKID) {
2638 2720 dnode_t *dn;
2639 2721
2640 2722 DB_DNODE_ENTER(db);
2641 2723 dn = DB_DNODE(db);
2642 2724 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2643 2725 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2644 2726 db->db_blkptr == &dn->dn_phys->dn_spill);
2645 2727 DB_DNODE_EXIT(db);
2646 2728 }
2647 2729 #endif
2648 2730
2649 2731 if (db->db_level == 0) {
2650 2732 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2651 2733 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2652 2734 if (db->db_state != DB_NOFILL) {
2653 2735 if (dr->dt.dl.dr_data != db->db_buf)
2654 2736 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2655 2737 db));
2656 2738 else if (!arc_released(db->db_buf))
2657 2739 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2658 2740 }
2659 2741 } else {
2660 2742 dnode_t *dn;
2661 2743
2662 2744 DB_DNODE_ENTER(db);
2663 2745 dn = DB_DNODE(db);
2664 2746 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2665 2747 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2666 2748 if (!BP_IS_HOLE(db->db_blkptr)) {
2667 2749 int epbs =
2668 2750 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2669 2751 ASSERT3U(db->db_blkid, <=,
2670 2752 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2671 2753 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2672 2754 db->db.db_size);
2673 2755 if (!arc_released(db->db_buf))
2674 2756 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2675 2757 }
2676 2758 DB_DNODE_EXIT(db);
2677 2759 mutex_destroy(&dr->dt.di.dr_mtx);
2678 2760 list_destroy(&dr->dt.di.dr_children);
2679 2761 }
2680 2762 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2681 2763
2682 2764 cv_broadcast(&db->db_changed);
2683 2765 ASSERT(db->db_dirtycnt > 0);
2684 2766 db->db_dirtycnt -= 1;
2685 2767 db->db_data_pending = NULL;
2686 2768 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2687 2769 }
2688 2770
2689 2771 static void
2690 2772 dbuf_write_nofill_ready(zio_t *zio)
2691 2773 {
2692 2774 dbuf_write_ready(zio, NULL, zio->io_private);
2693 2775 }
2694 2776
2695 2777 static void
2696 2778 dbuf_write_nofill_done(zio_t *zio)
2697 2779 {
2698 2780 dbuf_write_done(zio, NULL, zio->io_private);
2699 2781 }
2700 2782
2701 2783 static void
2702 2784 dbuf_write_override_ready(zio_t *zio)
2703 2785 {
2704 2786 dbuf_dirty_record_t *dr = zio->io_private;
2705 2787 dmu_buf_impl_t *db = dr->dr_dbuf;
2706 2788
2707 2789 dbuf_write_ready(zio, NULL, db);
2708 2790 }
2709 2791
2710 2792 static void
2711 2793 dbuf_write_override_done(zio_t *zio)
2712 2794 {
2713 2795 dbuf_dirty_record_t *dr = zio->io_private;
2714 2796 dmu_buf_impl_t *db = dr->dr_dbuf;
2715 2797 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2716 2798
2717 2799 mutex_enter(&db->db_mtx);
2718 2800 if (!BP_EQUAL(zio->io_bp, obp)) {
2719 2801 if (!BP_IS_HOLE(obp))
2720 2802 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2721 2803 arc_release(dr->dt.dl.dr_data, db);
2722 2804 }
2723 2805 mutex_exit(&db->db_mtx);
2724 2806
2725 2807 dbuf_write_done(zio, NULL, db);
2726 2808 }
2727 2809
2728 2810 /* Issue I/O to commit a dirty buffer to disk. */
2729 2811 static void
2730 2812 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2731 2813 {
2732 2814 dmu_buf_impl_t *db = dr->dr_dbuf;
2733 2815 dnode_t *dn;
2734 2816 objset_t *os;
2735 2817 dmu_buf_impl_t *parent = db->db_parent;
2736 2818 uint64_t txg = tx->tx_txg;
2737 2819 zbookmark_phys_t zb;
2738 2820 zio_prop_t zp;
2739 2821 zio_t *zio;
2740 2822 int wp_flag = 0;
2741 2823
2742 2824 DB_DNODE_ENTER(db);
2743 2825 dn = DB_DNODE(db);
2744 2826 os = dn->dn_objset;
2745 2827
2746 2828 if (db->db_state != DB_NOFILL) {
2747 2829 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2748 2830 /*
2749 2831 * Private object buffers are released here rather
2750 2832 * than in dbuf_dirty() since they are only modified
2751 2833 * in the syncing context and we don't want the
2752 2834 * overhead of making multiple copies of the data.
2753 2835 */
2754 2836 if (BP_IS_HOLE(db->db_blkptr)) {
2755 2837 arc_buf_thaw(data);
2756 2838 } else {
2757 2839 dbuf_release_bp(db);
2758 2840 }
2759 2841 }
2760 2842 }
2761 2843
2762 2844 if (parent != dn->dn_dbuf) {
2763 2845 /* Our parent is an indirect block. */
2764 2846 /* We have a dirty parent that has been scheduled for write. */
2765 2847 ASSERT(parent && parent->db_data_pending);
2766 2848 /* Our parent's buffer is one level closer to the dnode. */
2767 2849 ASSERT(db->db_level == parent->db_level-1);
2768 2850 /*
2769 2851 * We're about to modify our parent's db_data by modifying
2770 2852 * our block pointer, so the parent must be released.
2771 2853 */
2772 2854 ASSERT(arc_released(parent->db_buf));
2773 2855 zio = parent->db_data_pending->dr_zio;
2774 2856 } else {
2775 2857 /* Our parent is the dnode itself. */
2776 2858 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2777 2859 db->db_blkid != DMU_SPILL_BLKID) ||
2778 2860 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2779 2861 if (db->db_blkid != DMU_SPILL_BLKID)
2780 2862 ASSERT3P(db->db_blkptr, ==,
2781 2863 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2782 2864 zio = dn->dn_zio;
2783 2865 }
2784 2866
2785 2867 ASSERT(db->db_level == 0 || data == db->db_buf);
2786 2868 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2787 2869 ASSERT(zio);
2788 2870
2789 2871 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2790 2872 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2791 2873 db->db.db_object, db->db_level, db->db_blkid);
2792 2874
2793 2875 if (db->db_blkid == DMU_SPILL_BLKID)
2794 2876 wp_flag = WP_SPILL;
2795 2877 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2796 2878
2797 2879 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2798 2880 DB_DNODE_EXIT(db);
2799 2881
2800 2882 if (db->db_level == 0 &&
2801 2883 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2802 2884 /*
2803 2885 * The BP for this block has been provided by open context
2804 2886 * (by dmu_sync() or dmu_buf_write_embedded()).
2805 2887 */
2806 2888 void *contents = (data != NULL) ? data->b_data : NULL;
2807 2889
2808 2890 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2809 2891 db->db_blkptr, contents, db->db.db_size, &zp,
2810 2892 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2811 2893 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2812 2894 mutex_enter(&db->db_mtx);
2813 2895 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2814 2896 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2815 2897 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2816 2898 mutex_exit(&db->db_mtx);
2817 2899 } else if (db->db_state == DB_NOFILL) {
2818 2900 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2819 2901 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2820 2902 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2821 2903 db->db_blkptr, NULL, db->db.db_size, &zp,
2822 2904 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2823 2905 ZIO_PRIORITY_ASYNC_WRITE,
2824 2906 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2825 2907 } else {
2826 2908 ASSERT(arc_released(data));
2827 2909 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2828 2910 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2829 2911 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2830 2912 dbuf_write_physdone, dbuf_write_done, db,
2831 2913 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2832 2914 }
2833 2915 }