5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
--- old/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ new/usr/src/uts/common/fs/zfs/dnode_sync.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 26 */
26 27
27 28 #include <sys/zfs_context.h>
28 29 #include <sys/dbuf.h>
29 30 #include <sys/dnode.h>
30 31 #include <sys/dmu.h>
31 32 #include <sys/dmu_tx.h>
32 33 #include <sys/dmu_objset.h>
33 34 #include <sys/dsl_dataset.h>
34 35 #include <sys/spa.h>
35 36 #include <sys/range_tree.h>
36 37 #include <sys/zfeature.h>
37 38
38 39 static void
39 40 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
40 41 {
41 42 dmu_buf_impl_t *db;
42 43 int txgoff = tx->tx_txg & TXG_MASK;
43 44 int nblkptr = dn->dn_phys->dn_nblkptr;
44 45 int old_toplvl = dn->dn_phys->dn_nlevels - 1;
45 46 int new_level = dn->dn_next_nlevels[txgoff];
46 47 int i;
47 48
48 49 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
49 50
50 51 /* this dnode can't be paged out because it's dirty */
51 52 ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
52 53 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
53 54 ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
54 55
55 56 db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
56 57 ASSERT(db != NULL);
57 58
58 59 dn->dn_phys->dn_nlevels = new_level;
59 60 dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
60 61 dn->dn_object, dn->dn_phys->dn_nlevels);
61 62
62 63 /* check for existing blkptrs in the dnode */
63 64 for (i = 0; i < nblkptr; i++)
64 65 if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
65 66 break;
66 67 if (i != nblkptr) {
67 68 /* transfer dnode's block pointers to new indirect block */
68 69 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
69 70 ASSERT(db->db.db_data);
70 71 ASSERT(arc_released(db->db_buf));
71 72 ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
72 73 bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
73 74 sizeof (blkptr_t) * nblkptr);
74 75 arc_buf_freeze(db->db_buf);
75 76 }
76 77
77 78 /* set dbuf's parent pointers to new indirect buf */
78 79 for (i = 0; i < nblkptr; i++) {
79 80 dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
80 81
81 82 if (child == NULL)
82 83 continue;
83 84 #ifdef DEBUG
84 85 DB_DNODE_ENTER(child);
85 86 ASSERT3P(DB_DNODE(child), ==, dn);
86 87 DB_DNODE_EXIT(child);
87 88 #endif /* DEBUG */
88 89 if (child->db_parent && child->db_parent != dn->dn_dbuf) {
89 90 ASSERT(child->db_parent->db_level == db->db_level);
90 91 ASSERT(child->db_blkptr !=
91 92 &dn->dn_phys->dn_blkptr[child->db_blkid]);
92 93 mutex_exit(&child->db_mtx);
93 94 continue;
94 95 }
95 96 ASSERT(child->db_parent == NULL ||
96 97 child->db_parent == dn->dn_dbuf);
97 98
98 99 child->db_parent = db;
99 100 dbuf_add_ref(db, child);
100 101 if (db->db.db_data)
101 102 child->db_blkptr = (blkptr_t *)db->db.db_data + i;
102 103 else
103 104 child->db_blkptr = NULL;
104 105 dprintf_dbuf_bp(child, child->db_blkptr,
105 106 "changed db_blkptr to new indirect %s", "");
106 107
107 108 mutex_exit(&child->db_mtx);
108 109 }
109 110
110 111 bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
111 112
112 113 dbuf_rele(db, FTAG);
113 114
114 115 rw_exit(&dn->dn_struct_rwlock);
115 116 }
116 117
117 118 static void
118 119 free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
119 120 {
120 121 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
121 122 uint64_t bytesfreed = 0;
122 123
123 124 dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
124 125
125 126 for (int i = 0; i < num; i++, bp++) {
126 127 if (BP_IS_HOLE(bp))
127 128 continue;
128 129
129 130 bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
130 131 ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
131 132
132 133 /*
133 134 * Save some useful information on the holes being
134 135 * punched, including logical size, type, and indirection
135 136 * level. Retaining birth time enables detection of when
136 137 * holes are punched for reducing the number of free
137 138 * records transmitted during a zfs send.
138 139 */
139 140
140 141 uint64_t lsize = BP_GET_LSIZE(bp);
141 142 dmu_object_type_t type = BP_GET_TYPE(bp);
142 143 uint64_t lvl = BP_GET_LEVEL(bp);
143 144
144 145 bzero(bp, sizeof (blkptr_t));
145 146
146 147 if (spa_feature_is_active(dn->dn_objset->os_spa,
147 148 SPA_FEATURE_HOLE_BIRTH)) {
148 149 BP_SET_LSIZE(bp, lsize);
149 150 BP_SET_TYPE(bp, type);
150 151 BP_SET_LEVEL(bp, lvl);
151 152 BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
152 153 }
153 154 }
154 155 dnode_diduse_space(dn, -bytesfreed);
155 156 }
156 157
157 158 #ifdef ZFS_DEBUG
158 159 static void
159 160 free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
160 161 {
161 162 int off, num;
162 163 int i, err, epbs;
163 164 uint64_t txg = tx->tx_txg;
164 165 dnode_t *dn;
165 166
166 167 DB_DNODE_ENTER(db);
167 168 dn = DB_DNODE(db);
168 169 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
169 170 off = start - (db->db_blkid * 1<<epbs);
170 171 num = end - start + 1;
171 172
172 173 ASSERT3U(off, >=, 0);
173 174 ASSERT3U(num, >=, 0);
174 175 ASSERT3U(db->db_level, >, 0);
175 176 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
176 177 ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
177 178 ASSERT(db->db_blkptr != NULL);
178 179
179 180 for (i = off; i < off+num; i++) {
180 181 uint64_t *buf;
181 182 dmu_buf_impl_t *child;
182 183 dbuf_dirty_record_t *dr;
183 184 int j;
184 185
185 186 ASSERT(db->db_level == 1);
186 187
187 188 rw_enter(&dn->dn_struct_rwlock, RW_READER);
188 189 err = dbuf_hold_impl(dn, db->db_level-1,
189 190 (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
190 191 rw_exit(&dn->dn_struct_rwlock);
191 192 if (err == ENOENT)
192 193 continue;
193 194 ASSERT(err == 0);
194 195 ASSERT(child->db_level == 0);
195 196 dr = child->db_last_dirty;
196 197 while (dr && dr->dr_txg > txg)
197 198 dr = dr->dr_next;
198 199 ASSERT(dr == NULL || dr->dr_txg == txg);
199 200
200 201 /* data_old better be zeroed */
201 202 if (dr) {
202 203 buf = dr->dt.dl.dr_data->b_data;
203 204 for (j = 0; j < child->db.db_size >> 3; j++) {
204 205 if (buf[j] != 0) {
205 206 panic("freed data not zero: "
206 207 "child=%p i=%d off=%d num=%d\n",
207 208 (void *)child, i, off, num);
208 209 }
209 210 }
210 211 }
211 212
212 213 /*
213 214 * db_data better be zeroed unless it's dirty in a
214 215 * future txg.
215 216 */
216 217 mutex_enter(&child->db_mtx);
217 218 buf = child->db.db_data;
218 219 if (buf != NULL && child->db_state != DB_FILL &&
219 220 child->db_last_dirty == NULL) {
220 221 for (j = 0; j < child->db.db_size >> 3; j++) {
221 222 if (buf[j] != 0) {
222 223 panic("freed data not zero: "
223 224 "child=%p i=%d off=%d num=%d\n",
224 225 (void *)child, i, off, num);
225 226 }
226 227 }
227 228 }
228 229 mutex_exit(&child->db_mtx);
229 230
230 231 dbuf_rele(child, FTAG);
231 232 }
232 233 DB_DNODE_EXIT(db);
233 234 }
234 235 #endif
235 236
236 237 static void
237 238 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
238 239 dmu_tx_t *tx)
239 240 {
240 241 dnode_t *dn;
241 242 blkptr_t *bp;
242 243 dmu_buf_impl_t *subdb;
243 244 uint64_t start, end, dbstart, dbend, i;
244 245 int epbs, shift;
245 246
246 247 /*
247 248 * There is a small possibility that this block will not be cached:
248 249 * 1 - if level > 1 and there are no children with level <= 1
249 250 * 2 - if this block was evicted since we read it from
250 251 * dmu_tx_hold_free().
251 252 */
252 253 if (db->db_state != DB_CACHED)
253 254 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
254 255
255 256 dbuf_release_bp(db);
256 257 bp = db->db.db_data;
257 258
258 259 DB_DNODE_ENTER(db);
259 260 dn = DB_DNODE(db);
260 261 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
261 262 shift = (db->db_level - 1) * epbs;
262 263 dbstart = db->db_blkid << epbs;
263 264 start = blkid >> shift;
264 265 if (dbstart < start) {
265 266 bp += start - dbstart;
266 267 } else {
267 268 start = dbstart;
268 269 }
269 270 dbend = ((db->db_blkid + 1) << epbs) - 1;
270 271 end = (blkid + nblks - 1) >> shift;
271 272 if (dbend <= end)
272 273 end = dbend;
273 274
274 275 ASSERT3U(start, <=, end);
275 276
276 277 if (db->db_level == 1) {
277 278 FREE_VERIFY(db, start, end, tx);
278 279 free_blocks(dn, bp, end-start+1, tx);
279 280 } else {
280 281 for (i = start; i <= end; i++, bp++) {
281 282 if (BP_IS_HOLE(bp))
282 283 continue;
283 284 rw_enter(&dn->dn_struct_rwlock, RW_READER);
284 285 VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
285 286 i, B_TRUE, FTAG, &subdb));
286 287 rw_exit(&dn->dn_struct_rwlock);
287 288 ASSERT3P(bp, ==, subdb->db_blkptr);
288 289
289 290 free_children(subdb, blkid, nblks, tx);
290 291 dbuf_rele(subdb, FTAG);
291 292 }
292 293 }
293 294
294 295 /* If this whole block is free, free ourself too. */
295 296 for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
296 297 if (!BP_IS_HOLE(bp))
297 298 break;
298 299 }
299 300 if (i == 1 << epbs) {
300 301 /* didn't find any non-holes */
301 302 bzero(db->db.db_data, db->db.db_size);
302 303 free_blocks(dn, db->db_blkptr, 1, tx);
303 304 } else {
304 305 /*
305 306 * Partial block free; must be marked dirty so that it
306 307 * will be written out.
307 308 */
308 309 ASSERT(db->db_dirtycnt > 0);
309 310 }
310 311
311 312 DB_DNODE_EXIT(db);
312 313 arc_buf_freeze(db->db_buf);
313 314 }
314 315
315 316 /*
316 317 * Traverse the indicated range of the provided file
317 318 * and "free" all the blocks contained there.
318 319 */
319 320 static void
320 321 dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
321 322 dmu_tx_t *tx)
322 323 {
323 324 blkptr_t *bp = dn->dn_phys->dn_blkptr;
324 325 int dnlevel = dn->dn_phys->dn_nlevels;
325 326 boolean_t trunc = B_FALSE;
326 327
327 328 if (blkid > dn->dn_phys->dn_maxblkid)
328 329 return;
329 330
330 331 ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
331 332 if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
332 333 nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
333 334 trunc = B_TRUE;
334 335 }
335 336
336 337 /* There are no indirect blocks in the object */
337 338 if (dnlevel == 1) {
338 339 if (blkid >= dn->dn_phys->dn_nblkptr) {
339 340 /* this range was never made persistent */
340 341 return;
341 342 }
342 343 ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
343 344 free_blocks(dn, bp + blkid, nblks, tx);
344 345 } else {
345 346 int shift = (dnlevel - 1) *
346 347 (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
347 348 int start = blkid >> shift;
348 349 int end = (blkid + nblks - 1) >> shift;
349 350 dmu_buf_impl_t *db;
350 351
351 352 ASSERT(start < dn->dn_phys->dn_nblkptr);
352 353 bp += start;
353 354 for (int i = start; i <= end; i++, bp++) {
354 355 if (BP_IS_HOLE(bp))
355 356 continue;
356 357 rw_enter(&dn->dn_struct_rwlock, RW_READER);
357 358 VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
358 359 TRUE, FTAG, &db));
359 360 rw_exit(&dn->dn_struct_rwlock);
360 361
361 362 free_children(db, blkid, nblks, tx);
362 363 dbuf_rele(db, FTAG);
363 364 }
364 365 }
365 366
366 367 if (trunc) {
367 368 dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
368 369
369 370 uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
370 371 (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
371 372 ASSERT(off < dn->dn_phys->dn_maxblkid ||
372 373 dn->dn_phys->dn_maxblkid == 0 ||
373 374 dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
374 375 }
375 376 }
376 377
377 378 typedef struct dnode_sync_free_range_arg {
378 379 dnode_t *dsfra_dnode;
379 380 dmu_tx_t *dsfra_tx;
380 381 } dnode_sync_free_range_arg_t;
381 382
382 383 static void
383 384 dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
384 385 {
385 386 dnode_sync_free_range_arg_t *dsfra = arg;
386 387 dnode_t *dn = dsfra->dsfra_dnode;
387 388
388 389 mutex_exit(&dn->dn_mtx);
389 390 dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
390 391 mutex_enter(&dn->dn_mtx);
391 392 }
392 393
393 394 /*
394 395 * Try to kick all the dnode's dbufs out of the cache...
395 396 */
396 397 void
397 398 dnode_evict_dbufs(dnode_t *dn)
398 399 {
399 - int progress;
400 - int pass = 0;
400 + dmu_buf_impl_t db_marker;
401 + dmu_buf_impl_t *db, *db_next;
401 402
402 - do {
403 - dmu_buf_impl_t *db, *db_next;
404 - int evicting = FALSE;
403 + mutex_enter(&dn->dn_dbufs_mtx);
404 + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
405 405
406 - progress = FALSE;
407 - mutex_enter(&dn->dn_dbufs_mtx);
408 - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
409 - db_next = AVL_NEXT(&dn->dn_dbufs, db);
410 406 #ifdef DEBUG
411 - DB_DNODE_ENTER(db);
412 - ASSERT3P(DB_DNODE(db), ==, dn);
413 - DB_DNODE_EXIT(db);
407 + DB_DNODE_ENTER(db);
408 + ASSERT3P(DB_DNODE(db), ==, dn);
409 + DB_DNODE_EXIT(db);
414 410 #endif /* DEBUG */
415 411
416 - mutex_enter(&db->db_mtx);
417 - if (db->db_state == DB_EVICTING) {
418 - progress = TRUE;
419 - evicting = TRUE;
420 - mutex_exit(&db->db_mtx);
421 - } else if (refcount_is_zero(&db->db_holds)) {
422 - progress = TRUE;
423 - dbuf_clear(db); /* exits db_mtx for us */
424 - } else {
425 - mutex_exit(&db->db_mtx);
426 - }
412 + mutex_enter(&db->db_mtx);
413 + if (db->db_state != DB_EVICTING &&
414 + refcount_is_zero(&db->db_holds)) {
415 + db_marker.db_level = db->db_level;
416 + db_marker.db_blkid = db->db_blkid;
417 + db_marker.db_state = DB_SEARCH;
418 + avl_insert_here(&dn->dn_dbufs, &db_marker, db,
419 + AVL_BEFORE);
427 420
421 + dbuf_clear(db);
422 +
423 + db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
424 + avl_remove(&dn->dn_dbufs, &db_marker);
425 + } else {
426 + mutex_exit(&db->db_mtx);
427 + db_next = AVL_NEXT(&dn->dn_dbufs, db);
428 428 }
429 - /*
430 - * NB: we need to drop dn_dbufs_mtx between passes so
431 - * that any DB_EVICTING dbufs can make progress.
432 - * Ideally, we would have some cv we could wait on, but
433 - * since we don't, just wait a bit to give the other
434 - * thread a chance to run.
435 - */
436 - mutex_exit(&dn->dn_dbufs_mtx);
437 - if (evicting)
438 - delay(1);
439 - pass++;
440 - ASSERT(pass < 100); /* sanity check */
441 - } while (progress);
429 + }
430 + mutex_exit(&dn->dn_dbufs_mtx);
442 431
443 432 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
444 433 if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
445 434 mutex_enter(&dn->dn_bonus->db_mtx);
446 435 dbuf_evict(dn->dn_bonus);
447 436 dn->dn_bonus = NULL;
448 437 }
449 438 rw_exit(&dn->dn_struct_rwlock);
450 439 }
451 440
452 441 static void
453 442 dnode_undirty_dbufs(list_t *list)
454 443 {
455 444 dbuf_dirty_record_t *dr;
456 445
457 446 while (dr = list_head(list)) {
458 447 dmu_buf_impl_t *db = dr->dr_dbuf;
459 448 uint64_t txg = dr->dr_txg;
460 449
461 450 if (db->db_level != 0)
462 451 dnode_undirty_dbufs(&dr->dt.di.dr_children);
463 452
464 453 mutex_enter(&db->db_mtx);
465 454 /* XXX - use dbuf_undirty()? */
466 455 list_remove(list, dr);
467 456 ASSERT(db->db_last_dirty == dr);
468 457 db->db_last_dirty = NULL;
469 458 db->db_dirtycnt -= 1;
470 459 if (db->db_level == 0) {
471 460 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
472 461 dr->dt.dl.dr_data == db->db_buf);
473 462 dbuf_unoverride(dr);
474 463 } else {
475 464 mutex_destroy(&dr->dt.di.dr_mtx);
476 465 list_destroy(&dr->dt.di.dr_children);
477 466 }
478 467 kmem_free(dr, sizeof (dbuf_dirty_record_t));
479 468 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
480 469 }
481 470 }
482 471
483 472 static void
484 473 dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
485 474 {
486 475 int txgoff = tx->tx_txg & TXG_MASK;
487 476
488 477 ASSERT(dmu_tx_is_syncing(tx));
489 478
490 479 /*
491 480 * Our contents should have been freed in dnode_sync() by the
492 481 * free range record inserted by the caller of dnode_free().
493 482 */
494 483 ASSERT0(DN_USED_BYTES(dn->dn_phys));
495 484 ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
496 485
497 486 dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
498 487 dnode_evict_dbufs(dn);
499 488 ASSERT(avl_is_empty(&dn->dn_dbufs));
500 - ASSERT3P(dn->dn_bonus, ==, NULL);
501 489
502 490 /*
503 491 * XXX - It would be nice to assert this, but we may still
504 492 * have residual holds from async evictions from the arc...
505 493 *
506 494 * zfs_obj_to_path() also depends on this being
507 495 * commented out.
508 496 *
509 497 * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
510 498 */
511 499
512 500 /* Undirty next bits */
513 501 dn->dn_next_nlevels[txgoff] = 0;
514 502 dn->dn_next_indblkshift[txgoff] = 0;
515 503 dn->dn_next_blksz[txgoff] = 0;
516 504
517 505 /* ASSERT(blkptrs are zero); */
518 506 ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
519 507 ASSERT(dn->dn_type != DMU_OT_NONE);
520 508
521 509 ASSERT(dn->dn_free_txg > 0);
522 510 if (dn->dn_allocated_txg != dn->dn_free_txg)
523 511 dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
524 512 bzero(dn->dn_phys, sizeof (dnode_phys_t));
525 513
526 514 mutex_enter(&dn->dn_mtx);
527 515 dn->dn_type = DMU_OT_NONE;
528 516 dn->dn_maxblkid = 0;
529 517 dn->dn_allocated_txg = 0;
530 518 dn->dn_free_txg = 0;
531 519 dn->dn_have_spill = B_FALSE;
532 520 mutex_exit(&dn->dn_mtx);
533 521
534 522 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
535 523
536 524 dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
537 525 /*
538 526 * Now that we've released our hold, the dnode may
539 527 * be evicted, so we mustn't access it.
540 528 */
541 529 }
542 530
543 531 /*
544 532 * Write out the dnode's dirty buffers.
545 533 */
546 534 void
547 535 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
548 536 {
549 537 dnode_phys_t *dnp = dn->dn_phys;
550 538 int txgoff = tx->tx_txg & TXG_MASK;
551 539 list_t *list = &dn->dn_dirty_records[txgoff];
552 540 static const dnode_phys_t zerodn = { 0 };
553 541 boolean_t kill_spill = B_FALSE;
554 542
555 543 ASSERT(dmu_tx_is_syncing(tx));
556 544 ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
557 545 ASSERT(dnp->dn_type != DMU_OT_NONE ||
558 546 bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
559 547 DNODE_VERIFY(dn);
560 548
561 549 ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
562 550
563 551 if (dmu_objset_userused_enabled(dn->dn_objset) &&
564 552 !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
565 553 mutex_enter(&dn->dn_mtx);
566 554 dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
567 555 dn->dn_oldflags = dn->dn_phys->dn_flags;
568 556 dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
569 557 mutex_exit(&dn->dn_mtx);
570 558 dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
571 559 } else {
572 560 /* Once we account for it, we should always account for it. */
573 561 ASSERT(!(dn->dn_phys->dn_flags &
574 562 DNODE_FLAG_USERUSED_ACCOUNTED));
575 563 }
576 564
577 565 mutex_enter(&dn->dn_mtx);
578 566 if (dn->dn_allocated_txg == tx->tx_txg) {
579 567 /* The dnode is newly allocated or reallocated */
580 568 if (dnp->dn_type == DMU_OT_NONE) {
581 569 /* this is a first alloc, not a realloc */
582 570 dnp->dn_nlevels = 1;
583 571 dnp->dn_nblkptr = dn->dn_nblkptr;
584 572 }
585 573
586 574 dnp->dn_type = dn->dn_type;
587 575 dnp->dn_bonustype = dn->dn_bonustype;
588 576 dnp->dn_bonuslen = dn->dn_bonuslen;
589 577 }
590 578 ASSERT(dnp->dn_nlevels > 1 ||
591 579 BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
592 580 BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
593 581 BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
594 582 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
595 583 ASSERT(dnp->dn_nlevels < 2 ||
596 584 BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
597 585 BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
598 586
599 587 if (dn->dn_next_type[txgoff] != 0) {
600 588 dnp->dn_type = dn->dn_type;
601 589 dn->dn_next_type[txgoff] = 0;
602 590 }
603 591
604 592 if (dn->dn_next_blksz[txgoff] != 0) {
605 593 ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
606 594 SPA_MINBLOCKSIZE) == 0);
607 595 ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
608 596 dn->dn_maxblkid == 0 || list_head(list) != NULL ||
609 597 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
610 598 dnp->dn_datablkszsec ||
611 599 range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
612 600 dnp->dn_datablkszsec =
613 601 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
614 602 dn->dn_next_blksz[txgoff] = 0;
615 603 }
616 604
617 605 if (dn->dn_next_bonuslen[txgoff] != 0) {
618 606 if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
619 607 dnp->dn_bonuslen = 0;
620 608 else
621 609 dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
622 610 ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
623 611 dn->dn_next_bonuslen[txgoff] = 0;
624 612 }
625 613
626 614 if (dn->dn_next_bonustype[txgoff] != 0) {
627 615 ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
628 616 dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
629 617 dn->dn_next_bonustype[txgoff] = 0;
630 618 }
631 619
632 620 boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
633 621 dn->dn_free_txg <= tx->tx_txg;
634 622
635 623 /*
636 624 * Remove the spill block if we have been explicitly asked to
637 625 * remove it, or if the object is being removed.
638 626 */
639 627 if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
640 628 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
641 629 kill_spill = B_TRUE;
642 630 dn->dn_rm_spillblk[txgoff] = 0;
643 631 }
644 632
645 633 if (dn->dn_next_indblkshift[txgoff] != 0) {
646 634 ASSERT(dnp->dn_nlevels == 1);
647 635 dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
648 636 dn->dn_next_indblkshift[txgoff] = 0;
649 637 }
650 638
651 639 /*
652 640 * Just take the live (open-context) values for checksum and compress.
653 641 * Strictly speaking it's a future leak, but nothing bad happens if we
654 642 * start using the new checksum or compress algorithm a little early.
655 643 */
656 644 dnp->dn_checksum = dn->dn_checksum;
657 645 dnp->dn_compress = dn->dn_compress;
658 646
659 647 mutex_exit(&dn->dn_mtx);
660 648
661 649 if (kill_spill) {
662 650 free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
663 651 mutex_enter(&dn->dn_mtx);
664 652 dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
665 653 mutex_exit(&dn->dn_mtx);
666 654 }
667 655
668 656 /* process all the "freed" ranges in the file */
669 657 if (dn->dn_free_ranges[txgoff] != NULL) {
670 658 dnode_sync_free_range_arg_t dsfra;
671 659 dsfra.dsfra_dnode = dn;
672 660 dsfra.dsfra_tx = tx;
673 661 mutex_enter(&dn->dn_mtx);
674 662 range_tree_vacate(dn->dn_free_ranges[txgoff],
675 663 dnode_sync_free_range, &dsfra);
676 664 range_tree_destroy(dn->dn_free_ranges[txgoff]);
677 665 dn->dn_free_ranges[txgoff] = NULL;
678 666 mutex_exit(&dn->dn_mtx);
679 667 }
680 668
681 669 if (freeing_dnode) {
682 670 dnode_sync_free(dn, tx);
683 671 return;
684 672 }
685 673
686 674 if (dn->dn_next_nlevels[txgoff]) {
687 675 dnode_increase_indirection(dn, tx);
688 676 dn->dn_next_nlevels[txgoff] = 0;
689 677 }
690 678
691 679 if (dn->dn_next_nblkptr[txgoff]) {
692 680 /* this should only happen on a realloc */
693 681 ASSERT(dn->dn_allocated_txg == tx->tx_txg);
694 682 if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
695 683 /* zero the new blkptrs we are gaining */
696 684 bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
697 685 sizeof (blkptr_t) *
698 686 (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
699 687 #ifdef ZFS_DEBUG
700 688 } else {
701 689 int i;
702 690 ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
703 691 /* the blkptrs we are losing better be unallocated */
704 692 for (i = dn->dn_next_nblkptr[txgoff];
705 693 i < dnp->dn_nblkptr; i++)
706 694 ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
707 695 #endif
708 696 }
709 697 mutex_enter(&dn->dn_mtx);
710 698 dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
711 699 dn->dn_next_nblkptr[txgoff] = 0;
712 700 mutex_exit(&dn->dn_mtx);
713 701 }
714 702
715 703 dbuf_sync_list(list, tx);
716 704
717 705 if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
718 706 ASSERT3P(list_head(list), ==, NULL);
719 707 dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
720 708 }
721 709
722 710 /*
723 711 * Although we have dropped our reference to the dnode, it
724 712 * can't be evicted until its written, and we haven't yet
725 713 * initiated the IO for the dnode's dbuf.
726 714 */
727 715 }