4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dbuf.h>
29 #include <sys/dnode.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/spa.h>
35 #include <sys/zfeature.h>
36
37 static void
38 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
39 {
40 dmu_buf_impl_t *db;
41 int txgoff = tx->tx_txg & TXG_MASK;
42 int nblkptr = dn->dn_phys->dn_nblkptr;
43 int old_toplvl = dn->dn_phys->dn_nlevels - 1;
44 int new_level = dn->dn_next_nlevels[txgoff];
45 int i;
46
47 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
48
49 /* this dnode can't be paged out because it's dirty */
50 ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
51 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
52 ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
53
54 db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
301 /* didn't find any non-holes */
302 bzero(db->db.db_data, db->db.db_size);
303 free_blocks(dn, db->db_blkptr, 1, tx);
304 } else {
305 /*
306 * Partial block free; must be marked dirty so that it
307 * will be written out.
308 */
309 ASSERT(db->db_dirtycnt > 0);
310 }
311
312 DB_DNODE_EXIT(db);
313 arc_buf_freeze(db->db_buf);
314 }
315
316 /*
317 * Traverse the indicated range of the provided file
318 * and "free" all the blocks contained there.
319 */
320 static void
321 dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks,
322 dmu_tx_t *tx)
323 {
324 blkptr_t *bp = dn->dn_phys->dn_blkptr;
325 int dnlevel = dn->dn_phys->dn_nlevels;
326 boolean_t trunc = B_FALSE;
327
328 if (blkid > dn->dn_phys->dn_maxblkid)
329 return;
330
331 ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
332 if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
333 nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
334 trunc = B_TRUE;
335 }
336
337 /* There are no indirect blocks in the object */
338 if (dnlevel == 1) {
339 if (blkid >= dn->dn_phys->dn_nblkptr) {
340 /* this range was never made persistent */
341 return;
359 TRUE, FTAG, &db));
360 rw_exit(&dn->dn_struct_rwlock);
361
362 free_children(db, blkid, nblks, tx);
363 dbuf_rele(db, FTAG);
364
365 }
366 }
367
368 if (trunc) {
369 dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
370
371 uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
372 (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
373 ASSERT(off < dn->dn_phys->dn_maxblkid ||
374 dn->dn_phys->dn_maxblkid == 0 ||
375 dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
376 }
377 }
378
379 /*
380 * Try to kick all the dnode's dbufs out of the cache...
381 */
382 void
383 dnode_evict_dbufs(dnode_t *dn)
384 {
385 int progress;
386 int pass = 0;
387
388 do {
389 dmu_buf_impl_t *db, marker;
390 int evicting = FALSE;
391
392 progress = FALSE;
393 mutex_enter(&dn->dn_dbufs_mtx);
394 list_insert_tail(&dn->dn_dbufs, &marker);
395 db = list_head(&dn->dn_dbufs);
396 for (; db != ▮ db = list_head(&dn->dn_dbufs)) {
397 list_remove(&dn->dn_dbufs, db);
398 list_insert_tail(&dn->dn_dbufs, db);
516 dn->dn_allocated_txg = 0;
517 dn->dn_free_txg = 0;
518 dn->dn_have_spill = B_FALSE;
519 mutex_exit(&dn->dn_mtx);
520
521 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
522
523 dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
524 /*
525 * Now that we've released our hold, the dnode may
526 * be evicted, so we mustn't access it.
527 */
528 }
529
530 /*
531 * Write out the dnode's dirty buffers.
532 */
533 void
534 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
535 {
536 free_range_t *rp;
537 dnode_phys_t *dnp = dn->dn_phys;
538 int txgoff = tx->tx_txg & TXG_MASK;
539 list_t *list = &dn->dn_dirty_records[txgoff];
540 static const dnode_phys_t zerodn = { 0 };
541 boolean_t kill_spill = B_FALSE;
542
543 ASSERT(dmu_tx_is_syncing(tx));
544 ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
545 ASSERT(dnp->dn_type != DMU_OT_NONE ||
546 bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
547 DNODE_VERIFY(dn);
548
549 ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
550
551 if (dmu_objset_userused_enabled(dn->dn_objset) &&
552 !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
553 mutex_enter(&dn->dn_mtx);
554 dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
555 dn->dn_oldflags = dn->dn_phys->dn_flags;
556 dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
574 dnp->dn_type = dn->dn_type;
575 dnp->dn_bonustype = dn->dn_bonustype;
576 dnp->dn_bonuslen = dn->dn_bonuslen;
577 }
578
579 ASSERT(dnp->dn_nlevels > 1 ||
580 BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
581 BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
582 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
583
584 if (dn->dn_next_type[txgoff] != 0) {
585 dnp->dn_type = dn->dn_type;
586 dn->dn_next_type[txgoff] = 0;
587 }
588
589 if (dn->dn_next_blksz[txgoff] != 0) {
590 ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
591 SPA_MINBLOCKSIZE) == 0);
592 ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
593 dn->dn_maxblkid == 0 || list_head(list) != NULL ||
594 avl_last(&dn->dn_ranges[txgoff]) ||
595 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
596 dnp->dn_datablkszsec);
597 dnp->dn_datablkszsec =
598 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
599 dn->dn_next_blksz[txgoff] = 0;
600 }
601
602 if (dn->dn_next_bonuslen[txgoff] != 0) {
603 if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
604 dnp->dn_bonuslen = 0;
605 else
606 dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
607 ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
608 dn->dn_next_bonuslen[txgoff] = 0;
609 }
610
611 if (dn->dn_next_bonustype[txgoff] != 0) {
612 ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
613 dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
614 dn->dn_next_bonustype[txgoff] = 0;
615 }
616
635 }
636
637 /*
638 * Just take the live (open-context) values for checksum and compress.
639 * Strictly speaking it's a future leak, but nothing bad happens if we
640 * start using the new checksum or compress algorithm a little early.
641 */
642 dnp->dn_checksum = dn->dn_checksum;
643 dnp->dn_compress = dn->dn_compress;
644
645 mutex_exit(&dn->dn_mtx);
646
647 if (kill_spill) {
648 free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
649 mutex_enter(&dn->dn_mtx);
650 dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
651 mutex_exit(&dn->dn_mtx);
652 }
653
654 /* process all the "freed" ranges in the file */
655 while (rp = avl_last(&dn->dn_ranges[txgoff])) {
656 dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
657 /* grab the mutex so we don't race with dnode_block_freed() */
658 mutex_enter(&dn->dn_mtx);
659 avl_remove(&dn->dn_ranges[txgoff], rp);
660 mutex_exit(&dn->dn_mtx);
661 kmem_free(rp, sizeof (free_range_t));
662 }
663
664 if (freeing_dnode) {
665 dnode_sync_free(dn, tx);
666 return;
667 }
668
669 if (dn->dn_next_nblkptr[txgoff]) {
670 /* this should only happen on a realloc */
671 ASSERT(dn->dn_allocated_txg == tx->tx_txg);
672 if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
673 /* zero the new blkptrs we are gaining */
674 bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
675 sizeof (blkptr_t) *
676 (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
677 #ifdef ZFS_DEBUG
678 } else {
679 int i;
680 ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
681 /* the blkptrs we are losing better be unallocated */
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dbuf.h>
29 #include <sys/dnode.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/spa.h>
35 #include <sys/range_tree.h>
36 #include <sys/zfeature.h>
37
38 static void
39 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
40 {
41 dmu_buf_impl_t *db;
42 int txgoff = tx->tx_txg & TXG_MASK;
43 int nblkptr = dn->dn_phys->dn_nblkptr;
44 int old_toplvl = dn->dn_phys->dn_nlevels - 1;
45 int new_level = dn->dn_next_nlevels[txgoff];
46 int i;
47
48 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
49
50 /* this dnode can't be paged out because it's dirty */
51 ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
52 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
53 ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
54
55 db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
302 /* didn't find any non-holes */
303 bzero(db->db.db_data, db->db.db_size);
304 free_blocks(dn, db->db_blkptr, 1, tx);
305 } else {
306 /*
307 * Partial block free; must be marked dirty so that it
308 * will be written out.
309 */
310 ASSERT(db->db_dirtycnt > 0);
311 }
312
313 DB_DNODE_EXIT(db);
314 arc_buf_freeze(db->db_buf);
315 }
316
317 /*
318 * Traverse the indicated range of the provided file
319 * and "free" all the blocks contained there.
320 */
321 static void
322 dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
323 dmu_tx_t *tx)
324 {
325 blkptr_t *bp = dn->dn_phys->dn_blkptr;
326 int dnlevel = dn->dn_phys->dn_nlevels;
327 boolean_t trunc = B_FALSE;
328
329 if (blkid > dn->dn_phys->dn_maxblkid)
330 return;
331
332 ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
333 if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
334 nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
335 trunc = B_TRUE;
336 }
337
338 /* There are no indirect blocks in the object */
339 if (dnlevel == 1) {
340 if (blkid >= dn->dn_phys->dn_nblkptr) {
341 /* this range was never made persistent */
342 return;
360 TRUE, FTAG, &db));
361 rw_exit(&dn->dn_struct_rwlock);
362
363 free_children(db, blkid, nblks, tx);
364 dbuf_rele(db, FTAG);
365
366 }
367 }
368
369 if (trunc) {
370 dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
371
372 uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
373 (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
374 ASSERT(off < dn->dn_phys->dn_maxblkid ||
375 dn->dn_phys->dn_maxblkid == 0 ||
376 dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
377 }
378 }
379
380 typedef struct dnode_sync_free_range_arg {
381 dnode_t *dsfra_dnode;
382 dmu_tx_t *dsfra_tx;
383 } dnode_sync_free_range_arg_t;
384
385 static void
386 dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
387 {
388 dnode_sync_free_range_arg_t *dsfra = arg;
389 dnode_t *dn = dsfra->dsfra_dnode;
390
391 mutex_exit(&dn->dn_mtx);
392 dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
393 mutex_enter(&dn->dn_mtx);
394 }
395
396 /*
397 * Try to kick all the dnode's dbufs out of the cache...
398 */
399 void
400 dnode_evict_dbufs(dnode_t *dn)
401 {
402 int progress;
403 int pass = 0;
404
405 do {
406 dmu_buf_impl_t *db, marker;
407 int evicting = FALSE;
408
409 progress = FALSE;
410 mutex_enter(&dn->dn_dbufs_mtx);
411 list_insert_tail(&dn->dn_dbufs, &marker);
412 db = list_head(&dn->dn_dbufs);
413 for (; db != ▮ db = list_head(&dn->dn_dbufs)) {
414 list_remove(&dn->dn_dbufs, db);
415 list_insert_tail(&dn->dn_dbufs, db);
533 dn->dn_allocated_txg = 0;
534 dn->dn_free_txg = 0;
535 dn->dn_have_spill = B_FALSE;
536 mutex_exit(&dn->dn_mtx);
537
538 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
539
540 dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
541 /*
542 * Now that we've released our hold, the dnode may
543 * be evicted, so we mustn't access it.
544 */
545 }
546
547 /*
548 * Write out the dnode's dirty buffers.
549 */
550 void
551 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
552 {
553 dnode_phys_t *dnp = dn->dn_phys;
554 int txgoff = tx->tx_txg & TXG_MASK;
555 list_t *list = &dn->dn_dirty_records[txgoff];
556 static const dnode_phys_t zerodn = { 0 };
557 boolean_t kill_spill = B_FALSE;
558
559 ASSERT(dmu_tx_is_syncing(tx));
560 ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
561 ASSERT(dnp->dn_type != DMU_OT_NONE ||
562 bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
563 DNODE_VERIFY(dn);
564
565 ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
566
567 if (dmu_objset_userused_enabled(dn->dn_objset) &&
568 !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
569 mutex_enter(&dn->dn_mtx);
570 dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
571 dn->dn_oldflags = dn->dn_phys->dn_flags;
572 dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
590 dnp->dn_type = dn->dn_type;
591 dnp->dn_bonustype = dn->dn_bonustype;
592 dnp->dn_bonuslen = dn->dn_bonuslen;
593 }
594
595 ASSERT(dnp->dn_nlevels > 1 ||
596 BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
597 BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
598 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
599
600 if (dn->dn_next_type[txgoff] != 0) {
601 dnp->dn_type = dn->dn_type;
602 dn->dn_next_type[txgoff] = 0;
603 }
604
605 if (dn->dn_next_blksz[txgoff] != 0) {
606 ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
607 SPA_MINBLOCKSIZE) == 0);
608 ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
609 dn->dn_maxblkid == 0 || list_head(list) != NULL ||
610 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
611 dnp->dn_datablkszsec ||
612 range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
613 dnp->dn_datablkszsec =
614 dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
615 dn->dn_next_blksz[txgoff] = 0;
616 }
617
618 if (dn->dn_next_bonuslen[txgoff] != 0) {
619 if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
620 dnp->dn_bonuslen = 0;
621 else
622 dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
623 ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
624 dn->dn_next_bonuslen[txgoff] = 0;
625 }
626
627 if (dn->dn_next_bonustype[txgoff] != 0) {
628 ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
629 dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
630 dn->dn_next_bonustype[txgoff] = 0;
631 }
632
651 }
652
653 /*
654 * Just take the live (open-context) values for checksum and compress.
655 * Strictly speaking it's a future leak, but nothing bad happens if we
656 * start using the new checksum or compress algorithm a little early.
657 */
658 dnp->dn_checksum = dn->dn_checksum;
659 dnp->dn_compress = dn->dn_compress;
660
661 mutex_exit(&dn->dn_mtx);
662
663 if (kill_spill) {
664 free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
665 mutex_enter(&dn->dn_mtx);
666 dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
667 mutex_exit(&dn->dn_mtx);
668 }
669
670 /* process all the "freed" ranges in the file */
671 if (dn->dn_free_ranges[txgoff] != NULL) {
672 dnode_sync_free_range_arg_t dsfra;
673 dsfra.dsfra_dnode = dn;
674 dsfra.dsfra_tx = tx;
675 mutex_enter(&dn->dn_mtx);
676 range_tree_vacate(dn->dn_free_ranges[txgoff],
677 dnode_sync_free_range, &dsfra);
678 range_tree_destroy(dn->dn_free_ranges[txgoff]);
679 dn->dn_free_ranges[txgoff] = NULL;
680 mutex_exit(&dn->dn_mtx);
681 }
682
683 if (freeing_dnode) {
684 dnode_sync_free(dn, tx);
685 return;
686 }
687
688 if (dn->dn_next_nblkptr[txgoff]) {
689 /* this should only happen on a realloc */
690 ASSERT(dn->dn_allocated_txg == tx->tx_txg);
691 if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
692 /* zero the new blkptrs we are gaining */
693 bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
694 sizeof (blkptr_t) *
695 (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
696 #ifdef ZFS_DEBUG
697 } else {
698 int i;
699 ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
700 /* the blkptrs we are losing better be unallocated */
|