NEX-5856 ddt_capped isn't reset when deduped dataset is destroyed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (fix studio build)
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-3165 need some dedup improvements
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-3211 mismerge ddt_repair_start()
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Issue #2: optimize DDE lookup in DDT objects
Added an option to control the number of DDE classes in the DDT.
The new default is one, i.e. all DDEs are stored together
regardless of refcount.
re #12611 rb4105 zpool import panic in ddt_zap_count()
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint - courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
@@ -20,10 +20,11 @@
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
@@ -37,10 +38,20 @@
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
/*
+ * Almost all iteration over the ZAPs that hold DDT entries is restricted
+ * to the range spa->spa_ddt_class_{min,max}, which makes it possible to
+ * store all entries in a single ZAP. However, a few places still iterate
+ * over every ZAP unconditionally: table creation, deletion, loading, DDE
+ * prefetching, and lookup. This preserves compatibility with old pools
+ * and converts the old pool format to the new one on the fly.
+ */
+
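To illustrate the two patterns described above (a sketch only; both loops mirror code that appears later in this diff):

	/* restricted walk: stats, histograms, sync */
	for (enum ddt_class class = spa->spa_ddt_class_min;
	    class <= spa->spa_ddt_class_max; class++)
		ddt_histogram_add(ddh, &ddt->ddt_histogram_cache[type][class]);

	/* forced full walk: create, destroy, load, prefetch, lookup */
	for (enum ddt_class class = 0; class < DDT_CLASSES; class++)
		ddt_object_prefetch(ddt, type, class, &dde);

The full walk keeps entries written by older pool formats reachable, so they migrate into the configured class range the next time they are rewritten at sync.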
+/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
*/
int zfs_dedup_prefetch = 1;
static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
@@ -51,10 +62,13 @@
"ditto",
"duplicate",
"unique",
};
+/* Potential in-core size of all DDTs, summed across pools */
+uint64_t zfs_ddts_msize = 0;
+
static void
ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_tx_t *tx)
{
spa_t *spa = ddt->ddt_spa;
@@ -84,15 +98,18 @@
{
spa_t *spa = ddt->ddt_spa;
objset_t *os = ddt->ddt_os;
uint64_t *objectp = &ddt->ddt_object[type][class];
char name[DDT_NAMELEN];
-
+#ifdef DEBUG
+ uint64_t count;
+#endif
ddt_object_name(ddt, type, class, name);
ASSERT(*objectp != 0);
- ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
+ (count == 0));
ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
@@ -110,24 +127,28 @@
ddt_object_name(ddt, type, class, name);
error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
-
- if (error != 0)
+ if (error)
return (error);
VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
&ddt->ddt_histogram[type][class]));
/*
* Seed the cached statistics.
*/
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
-
- ddo->ddo_count = ddt_object_count(ddt, type, class);
+ error = ddt_object_info(ddt, type, class, &doi);
+ /* Panic in debug mode */
+ ASSERT(error == 0);
+ if (error)
+ return (error);
+ error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
+ if (error)
+ return (error);
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
return (0);
}
@@ -149,11 +170,11 @@
/*
* Cache DDT statistics; this is the only time they'll change.
*/
VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
- ddo->ddo_count = ddt_object_count(ddt, type, class);
+ (void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}
static int
@@ -206,17 +227,18 @@
return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], dde, walk));
}
-uint64_t
-ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+int
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *count)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
- ddt->ddt_object[type][class]));
+ ddt->ddt_object[type][class], count));
}
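With the new signature, callers receive the count through an out parameter and can propagate errors instead of panicking; the pattern used in ddt_object_load() above, in brief:

	uint64_t count;
	int err = ddt_object_count(ddt, type, class, &count);

	if (err != 0)
		return (err);	/* previously an unchecked uint64_t return */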
int
ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
dmu_object_info_t *doi)
@@ -348,13 +370,12 @@
return (refcnt);
}
static void
-ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
{
- spa_t *spa = ddt->ddt_spa;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
uint64_t lsize = DDK_GET_LSIZE(ddk);
uint64_t psize = DDK_GET_PSIZE(ddk);
@@ -394,24 +415,29 @@
while (d < d_end)
*d++ += (*s++ ^ neg) - neg;
}
static void
-ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_stat_t *dds, uint64_t neg)
{
- ddt_stat_t dds;
ddt_histogram_t *ddh;
- int bucket;
-
- ddt_stat_generate(ddt, dde, &dds);
-
- bucket = highbit64(dds.dds_ref_blocks) - 1;
+ int bucket = highbit64(dds->dds_ref_blocks) - 1;
ASSERT(bucket >= 0);
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+ ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
+}
- ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+
+ ddt_stat_generate(ddt->ddt_spa, dde, &dds);
+
+ ddt_stat_update_by_dds(ddt, dde, &dds, neg);
}
void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
@@ -446,12 +472,12 @@
{
/* Sum the statistics we cached in ddt_object_sync(). */
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
+ for (enum ddt_class class = spa->spa_ddt_class_min;
+ class <= spa->spa_ddt_class_max; class++) {
ddt_object_t *ddo =
&ddt->ddt_object_stats[type][class];
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
@@ -470,12 +496,12 @@
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
+ for (enum ddt_class class = spa->spa_ddt_class_min;
+ class <= spa->spa_ddt_class_max; class++) {
ddt_histogram_add(ddh,
&ddt->ddt_histogram_cache[type][class]);
}
}
}
@@ -482,16 +508,31 @@
}
void
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
{
- ddt_histogram_t *ddh_total;
-
- ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
- ddt_get_dedup_histogram(spa, ddh_total);
- ddt_histogram_stat(dds_total, ddh_total);
- kmem_free(ddh_total, sizeof (ddt_histogram_t));
+ /*
+ * Avoid a temporary ddt_histogram_t: a heap allocation is wasteful
+ * and a stack allocation is probably too large, so unroll
+ * ddt_histogram_add() and sum directly into dds_total.
+ */
+ bzero(dds_total, sizeof (ddt_stat_t));
+ /* sum up the stats across all the histograms */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = spa->spa_ddt_class_min;
+ class <= spa->spa_ddt_class_max; class++) {
+ /* unroll the ddt_histogram_add() */
+ ddt_histogram_t *src =
+ &ddt->ddt_histogram_cache[type][class];
+ for (int h = 0; h < 64; h++) {
+ ddt_stat_t *st = &src->ddh_stat[h];
+ ddt_stat_add(dds_total, st, 0);
+ }
+ }
+ }
+ }
}
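For scale, the temporary this avoids (the 64 buckets come from the loop above; the eight uint64_t counters per ddt_stat_t are an assumption about the usual layout, not taken from this diff):

	/*
	 * sizeof (ddt_stat_t)      = 8 * sizeof (uint64_t)  =   64 bytes
	 * sizeof (ddt_histogram_t) = 64 buckets * 64 bytes  = 4096 bytes
	 * ~4 KB is a poor fit for a kernel stack frame, and a kmem_zalloc()
	 * per stats query is avoidable, hence the unrolled summation.
	 */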
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
@@ -618,55 +659,110 @@
{
return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}
void
-ddt_enter(ddt_t *ddt)
+ddt_enter(ddt_t *ddt, uint8_t hash)
{
- mutex_enter(&ddt->ddt_lock);
+ mutex_enter(&ddt->ddt_lock[hash]);
}
void
-ddt_exit(ddt_t *ddt)
+ddt_exit(ddt_t *ddt, uint8_t hash)
{
- mutex_exit(&ddt->ddt_lock);
+ mutex_exit(&ddt->ddt_lock[hash]);
}
+void
+dde_enter(ddt_entry_t *dde)
+{
+ mutex_enter(&dde->dde_lock);
+}
+
+void
+dde_exit(ddt_entry_t *dde)
+{
+ mutex_exit(&dde->dde_lock);
+}
+
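Taken together with DDT_HASHFN (introduced elsewhere in this patch), the intended locking pattern is roughly the following sketch, mirroring ddt_lookup() below:

	/* search: a ddt_entry_t whose key was filled via ddt_key_fill() */
	uint8_t hash = DDT_HASHFN(bp->blk_cksum);

	ddt_enter(ddt, hash);			/* per-bucket tree lock */
	dde = avl_find(&ddt->ddt_tree[hash], &search, &where);
	ddt_exit(ddt, hash);

	dde_enter(dde);				/* per-entry lock */
	while (dde->dde_state & DDE_LOADING)
		cv_wait(&dde->dde_cv, &dde->dde_lock);
	dde_exit(dde);

Splitting the single ddt_lock into DDT_HASHSZ bucket locks plus a per-entry mutex lets lookups of unrelated blocks proceed concurrently; only threads interested in the same entry serialize on dde_lock.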
+/* cache for ddt_entry_t structures */
+static kmem_cache_t *dde_cache;
+
+/* ARGSUSED */
+static int
+dde_cache_constr(void *buf, void *arg, int flags)
+{
+ ddt_entry_t *dde = (ddt_entry_t *)buf;
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dde_cache_destr(void *buf, void *arg)
+{
+ ddt_entry_t *dde = (ddt_entry_t *)buf;
+ cv_destroy(&dde->dde_cv);
+ mutex_destroy(&dde->dde_lock);
+}
+
+void
+ddt_init(void)
+{
+ dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
+ 0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
+ VERIFY(dde_cache != NULL);
+}
+
+void
+ddt_fini(void)
+{
+ if (dde_cache) {
+ kmem_cache_destroy(dde_cache);
+ dde_cache = NULL;
+ }
+}
+
static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
ddt_entry_t *dde;
- dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
- cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+ dde = kmem_cache_alloc(dde_cache, KM_SLEEP);
+ /* Init everything but the condvar and the mutex */
dde->dde_key = *ddk;
+ bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_phys)),
+     offsetof(ddt_entry_t, dde_cv) - offsetof(ddt_entry_t, dde_phys));
+ bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_node)),
+     sizeof (avl_node_t));
return (dde);
}
static void
ddt_free(ddt_entry_t *dde)
{
- ASSERT(!dde->dde_loading);
+ ASSERT(!(dde->dde_state & DDE_LOADING));
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL);
if (dde->dde_repair_abd != NULL)
abd_free(dde->dde_repair_abd);
- cv_destroy(&dde->dde_cv);
- kmem_free(dde, sizeof (*dde));
+ kmem_cache_free(dde_cache, dde);
}
+/* for zdb usage */
void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+ uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);
- avl_remove(&ddt->ddt_tree, dde);
+ avl_remove(&ddt->ddt_tree[hash], dde);
ddt_free(dde);
}
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
@@ -673,60 +769,84 @@
{
ddt_entry_t *dde, dde_search;
enum ddt_type type;
enum ddt_class class;
avl_index_t where;
+ uint8_t hash = DDT_HASHFN(bp->blk_cksum);
int error;
- ASSERT(MUTEX_HELD(&ddt->ddt_lock));
-
ddt_key_fill(&dde_search.dde_key, bp);
- dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ ddt_enter(ddt, hash);
+ /*
+ * Do we already have the dirty DDE in memory?
+ */
+ dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
if (dde == NULL) {
- if (!add)
+ /* This DDE doesn't exist in the dirty tree */
+ if (!add) {
+ ddt_exit(ddt, hash);
return (NULL);
+ }
+ /* Since a dirty DDE didn't exist, create it */
dde = ddt_alloc(&dde_search.dde_key);
- avl_insert(&ddt->ddt_tree, dde, where);
+ avl_insert(&ddt->ddt_tree[hash], dde, where);
}
- while (dde->dde_loading)
- cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+ ddt_exit(ddt, hash);
- if (dde->dde_loaded)
+ /*
+ * If another thread is already looking up this DDE,
+ * wait until its result is available.
+ */
+ dde_enter(dde);
+ while (dde->dde_state & DDE_LOADING)
+ cv_wait(&dde->dde_cv, &dde->dde_lock);
+
+ /*
+ * If the DDE has been loaded from disk, return it.
+ */
+ if (dde->dde_state & DDE_LOADED)
return (dde);
- dde->dde_loading = B_TRUE;
+ /*
+ * Otherwise, start looking the DDE up in the on-disk ZAPs.
+ */
+ dde->dde_state |= DDE_LOADING;
+ dde_exit(dde);
- ddt_exit(ddt);
-
error = ENOENT;
+ DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
for (type = 0; type < DDT_TYPES; type++) {
for (class = 0; class < DDT_CLASSES; class++) {
error = ddt_object_lookup(ddt, type, class, dde);
- if (error != ENOENT) {
- ASSERT0(error);
+ if (error != ENOENT)
break;
}
- }
if (error != ENOENT)
break;
}
- ddt_enter(ddt);
+ ASSERT(error == 0 || error == ENOENT);
- ASSERT(dde->dde_loaded == B_FALSE);
- ASSERT(dde->dde_loading == B_TRUE);
+ dde_enter(dde);
+ ASSERT(!(dde->dde_state & DDE_LOADED));
+ ASSERT(dde->dde_state & DDE_LOADING);
+
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
- dde->dde_loaded = B_TRUE;
- dde->dde_loading = B_FALSE;
+ if (type == DDT_TYPES && class == DDT_CLASSES)
+ dde->dde_state |= DDE_NEW;
+ dde->dde_state |= DDE_LOADED;
+ dde->dde_state &= ~DDE_LOADING;
+ DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
+ enum ddt_class, dde->dde_class);
if (error == 0)
- ddt_stat_update(ddt, dde, -1ULL);
+ ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);
cv_broadcast(&dde->dde_cv);
return (dde);
}
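The dde_state bits used above replace the old dde_loaded/dde_loading booleans. Their definitions live in the header and are not shown in this diff; based on their uses here, the life cycle is roughly:

	/*
	 * DDE_LOADING    on-disk lookup in progress; waiters block on dde_cv
	 * DDE_LOADED     lookup complete; the in-memory entry is valid
	 * DDE_NEW        no entry was found in any ZAP (type == DDT_TYPES
	 *                and class == DDT_CLASSES after the search)
	 * DDE_DONT_SYNC  entry must not be written out at sync time; only
	 *                expected on DDE_NEW entries (see ddt_sync_avl())
	 */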
@@ -747,11 +867,12 @@
*/
ddt = ddt_select(spa, bp);
ddt_key_fill(&dde.dde_key, bp);
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (enum ddt_class class = 0;
+ class < DDT_CLASSES; class++) {
ddt_object_prefetch(ddt, type, class, &dde);
}
}
}
@@ -775,16 +896,21 @@
static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
ddt_t *ddt;
+ uint_t i;
ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
- mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ for (i = 0; i < DDT_HASHSZ; i++) {
+ mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ }
+ mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
+
avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
ddt->ddt_checksum = c;
ddt->ddt_spa = spa;
ddt->ddt_os = spa->spa_meta_objset;
@@ -793,15 +919,21 @@
}
static void
ddt_table_free(ddt_t *ddt)
{
- ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ uint_t i;
+
ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
- avl_destroy(&ddt->ddt_tree);
+
+ for (i = 0; i < DDT_HASHSZ; i++) {
+ ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
+ avl_destroy(&ddt->ddt_tree[i]);
+ mutex_destroy(&ddt->ddt_lock[i]);
+ }
avl_destroy(&ddt->ddt_repair_tree);
- mutex_destroy(&ddt->ddt_lock);
+ mutex_destroy(&ddt->ddt_repair_lock);
kmem_free(ddt, sizeof (*ddt));
}
void
ddt_create(spa_t *spa)
@@ -810,14 +942,31 @@
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
spa->spa_ddt[c] = ddt_table_alloc(spa, c);
}
+/*
+ * Get the combined size of the DDTs across all pools.
+ * Returns the on-disk size if phys == B_TRUE, otherwise the in-core size.
+ */
+uint64_t
+ddt_get_ddts_size(boolean_t phys)
+{
+ uint64_t ddts_size = 0;
+ spa_t *spa = NULL;
+
+ while ((spa = spa_next(spa)) != NULL)
+ ddts_size += spa_get_ddts_size(spa, phys);
+
+ return (ddts_size);
+}
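Later in this diff the helper refreshes the global zfs_ddts_msize after ddt_load() and ddt_unload(); usage in brief:

	zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);	/* in-core total */

spa_get_ddts_size() itself is not shown in this hunk; presumably it reads back the per-pool spa_ddt_msize/spa_ddt_dsize counters that ddt_load() and ddt_sync_table() maintain.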
+
int
ddt_load(spa_t *spa)
{
int error;
+ ddt_object_t *ddo;
ddt_create(spa);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
@@ -827,25 +976,39 @@
return (error == ENOENT ? 0 : error);
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
+ for (enum ddt_class class = 0;
+ class < DDT_CLASSES; class++) {
error = ddt_object_load(ddt, type, class);
- if (error != 0 && error != ENOENT)
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
return (error);
+ ddo = &ddt->ddt_object_stats[type][class];
+ atomic_add_64(&spa->spa_ddt_dsize,
+ ddo->ddo_dspace);
+ atomic_add_64(&spa->spa_ddt_msize,
+ ddo->ddo_mspace);
}
}
/*
* Seed the cached histograms.
*/
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
}
+ zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
+ if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
+ /* notify that dedup cap is now active */
+ spa->spa_ddt_capped = 1;
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
+ }
+
return (0);
}
void
ddt_unload(spa_t *spa)
@@ -854,10 +1017,13 @@
if (spa->spa_ddt[c]) {
ddt_table_free(spa->spa_ddt[c]);
spa->spa_ddt[c] = NULL;
}
}
+ spa->spa_ddt_dsize = 0;
+ spa->spa_ddt_msize = 0;
+ zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
}
boolean_t
ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
{
@@ -865,19 +1031,20 @@
ddt_entry_t dde;
if (!BP_GET_DEDUP(bp))
return (B_FALSE);
- if (max_class == DDT_CLASS_UNIQUE)
- return (B_TRUE);
+ if (max_class > spa->spa_ddt_class_max)
+ max_class = spa->spa_ddt_class_max;
ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
ddt_key_fill(&dde.dde_key, bp);
for (enum ddt_type type = 0; type < DDT_TYPES; type++)
- for (enum ddt_class class = 0; class <= max_class; class++)
+ for (enum ddt_class class = spa->spa_ddt_class_min;
+ class <= max_class; class++)
if (ddt_object_lookup(ddt, type, class, &dde) == 0)
return (B_TRUE);
return (B_FALSE);
}
@@ -891,11 +1058,12 @@
ddt_key_fill(&ddk, bp);
dde = ddt_alloc(&ddk);
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (enum ddt_class class = 0;
+ class < DDT_CLASSES; class++) {
/*
* We can only do repair if there are multiple copies
* of the block. For anything in the UNIQUE class,
* there's definitely only one copy, so don't even try.
*/
@@ -913,19 +1081,19 @@
void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
avl_index_t where;
- ddt_enter(ddt);
+ mutex_enter(&ddt->ddt_repair_lock);
if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
ddt_free(dde);
- ddt_exit(ddt);
+ mutex_exit(&ddt->ddt_repair_lock);
}
static void
ddt_repair_entry_done(zio_t *zio)
{
@@ -970,39 +1138,50 @@
blkptr_t blk;
if (spa_sync_pass(spa) > 1)
return;
- ddt_enter(ddt);
+ mutex_enter(&ddt->ddt_repair_lock);
for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
rdde_next = AVL_NEXT(t, rdde);
avl_remove(&ddt->ddt_repair_tree, rdde);
- ddt_exit(ddt);
+ mutex_exit(&ddt->ddt_repair_lock);
+
ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
dde = ddt_repair_start(ddt, &blk);
ddt_repair_entry(ddt, dde, rdde, rio);
ddt_repair_done(ddt, dde);
- ddt_enter(ddt);
+
+ mutex_enter(&ddt->ddt_repair_lock);
}
- ddt_exit(ddt);
+ mutex_exit(&ddt->ddt_repair_lock);
}
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
+ spa_t *spa = ddt->ddt_spa;
enum ddt_type otype = dde->dde_type;
enum ddt_type ntype = DDT_TYPE_CURRENT;
enum ddt_class oclass = dde->dde_class;
enum ddt_class nclass;
uint64_t total_refcnt = 0;
- ASSERT(dde->dde_loaded);
- ASSERT(!dde->dde_loading);
+ ASSERT(dde->dde_state & DDE_LOADED);
+ ASSERT(!(dde->dde_state & DDE_LOADING));
+ /*
+ * Propagate the stats generated at lookup time. This is
+ * deferred to sync context so that the lookup path does not
+ * need to take locks to protect ddt->ddt_histogram.
+ */
+ if (dde->dde_lkstat.dds_ref_blocks != 0)
+ ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);
+
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
ASSERT(dde->dde_lead_zio[p] == NULL);
ASSERT((int64_t)ddp->ddp_refcnt >= 0);
if (ddp->ddp_phys_birth == 0) {
ASSERT(ddp->ddp_refcnt == 0);
@@ -1023,10 +1202,18 @@
else if (total_refcnt > 1)
nclass = DDT_CLASS_DUPLICATE;
else
nclass = DDT_CLASS_UNIQUE;
+ if (nclass > spa->spa_ddt_class_max)
+ nclass = spa->spa_ddt_class_max;
+
+ if (nclass < spa->spa_ddt_class_min)
+ nclass = spa->spa_ddt_class_min;
+
+ DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);
+
if (otype != DDT_TYPES &&
(otype != ntype || oclass != nclass || total_refcnt == 0)) {
VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
}
@@ -1049,20 +1236,42 @@
if (nclass < oclass) {
dsl_scan_ddt_entry(dp->dp_scan,
ddt->ddt_checksum, dde, tx);
}
}
+ DTRACE_PROBE(ddt__stored__entry);
}
static void
+ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
+{
+ void *cookie = NULL;
+ ddt_entry_t *dde;
+
+ while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
+ if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ } else { /* if we're not syncing this DDE, it must be new */
+ ASSERT(dde->dde_state & DDE_NEW);
+ }
+ ddt_free(dde);
+ }
+}
+
+static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
+ uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
+ int64_t old_mbytes = 0;
spa_t *spa = ddt->ddt_spa;
- ddt_entry_t *dde;
- void *cookie = NULL;
+ uint_t i, numnodes = 0;
+ ddt_object_t *ddo;
- if (avl_numnodes(&ddt->ddt_tree) == 0)
+ for (i = 0; i < DDT_HASHSZ; i++)
+ numnodes += avl_numnodes(&ddt->ddt_tree[i]);
+
+ if (numnodes == 0)
return;
ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
if (spa->spa_ddt_stat_object == 0) {
@@ -1069,39 +1278,65 @@
spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DDT_STATS, tx);
}
- while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
- ddt_sync_entry(ddt, dde, tx, txg);
- ddt_free(dde);
- }
+ DTRACE_PROBE(ddt__syncing__avl);
+ for (i = 0; i < DDT_HASHSZ; i++)
+ ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
+ DTRACE_PROBE(ddt__synced__avl);
+
+ DTRACE_PROBE(ddt__syncing__obj);
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- uint64_t count = 0;
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (enum ddt_class class = spa->spa_ddt_class_min;
+ class <= spa->spa_ddt_class_max; class++) {
if (ddt_object_exists(ddt, type, class)) {
+ ddo = &ddt->ddt_object_stats[type][class];
+ old_mbytes += ddo->ddo_mspace;
+
ddt_object_sync(ddt, type, class, tx);
- count += ddt_object_count(ddt, type, class);
+ (void) ddt_object_count(ddt, type, class, &cnt);
+ if (cnt == 0) {
+ ddt_object_destroy(ddt, type, class,
+ tx);
+ continue;
}
+
+ num_dbytes += ddo->ddo_dspace;
+ num_mbytes += ddo->ddo_mspace;
}
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- if (count == 0 && ddt_object_exists(ddt, type, class))
- ddt_object_destroy(ddt, type, class, tx);
}
}
+ spa->spa_ddt_dsize = num_dbytes;
+ spa->spa_ddt_msize = num_mbytes;
+ atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
+ DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
+ uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
+ zfs_ddts_msize);
+ if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
+ /* notify that dedup cap is now active */
+ spa->spa_ddt_capped = 1;
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
+ } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
+ /* notify that dedup cap is now inactive */
+ spa->spa_ddt_capped = 0;
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
+ }
+
+ /* update the cached stats with the values calculated above */
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
}
void
ddt_sync(spa_t *spa, uint64_t txg)
{
dmu_tx_t *tx;
zio_t *rio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
ASSERT(spa_syncing_txg(spa) == txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);