1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/ddt.h>
  33 #include <sys/zap.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/dsl_pool.h>
  37 #include <sys/zio_checksum.h>
  38 #include <sys/zio_compress.h>
  39 #include <sys/dsl_scan.h>
  40 #include <sys/abd.h>
  41 
  42 /*
  43  * Almost all of the cases of iteration through zap containing entries are
  44  * restricted by spa->spa_ddt_class_{min,max}. It allows one to introduce new
  45  * behavior: storing all entries into the single zap. However, there are
  46  * some places where all zaps are iterated through forcibly: table creation,
  47  * deletion, loading, dde prefetching, and looking up. It allows one to maintain
  48  * compatibility with old pools and be able to convert the old pool format
  49  * into the new one on-the-fly.
  50  */
  51 
  52 /*
  53  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  54  */
  55 int zfs_dedup_prefetch = 1;
  56 
  57 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  58         &ddt_zap_ops,
  59 };
  60 
  61 static const char *ddt_class_name[DDT_CLASSES] = {
  62         "ditto",
  63         "duplicate",
  64         "unique",
  65 };
  66 
  67 /* Possible in core size of all DDTs */
  68 uint64_t zfs_ddts_msize = 0;
  69 
  70 static void
  71 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  72     dmu_tx_t *tx)
  73 {
  74         spa_t *spa = ddt->ddt_spa;
  75         objset_t *os = ddt->ddt_os;
  76         uint64_t *objectp = &ddt->ddt_object[type][class];
  77         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  78             ZCHECKSUM_FLAG_DEDUP;
  79         char name[DDT_NAMELEN];
  80 
  81         ddt_object_name(ddt, type, class, name);
  82 
  83         ASSERT(*objectp == 0);
  84         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  85         ASSERT(*objectp != 0);
  86 
  87         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  88             sizeof (uint64_t), 1, objectp, tx) == 0);
  89 
  90         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  91             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  92             &ddt->ddt_histogram[type][class], tx) == 0);
  93 }
  94 
  95 static void
  96 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  97     dmu_tx_t *tx)
  98 {
  99         spa_t *spa = ddt->ddt_spa;
 100         objset_t *os = ddt->ddt_os;
 101         uint64_t *objectp = &ddt->ddt_object[type][class];
 102         char name[DDT_NAMELEN];
 103 #if DEBUG
 104         uint64_t count;
 105 #endif
 106         ddt_object_name(ddt, type, class, name);
 107 
 108         ASSERT(*objectp != 0);
 109         ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
 110             (count == 0));
 111         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 112         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
 113         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
 114         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
 115         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
 116 
 117         *objectp = 0;
 118 }
 119 
 120 static int
 121 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 122 {
 123         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 124         dmu_object_info_t doi;
 125         char name[DDT_NAMELEN];
 126         int error;
 127 
 128         ddt_object_name(ddt, type, class, name);
 129 
 130         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 131             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 132         if (error)
 133                 return (error);
 134 
 135         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 136             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 137             &ddt->ddt_histogram[type][class]));
 138 
 139         /*
 140          * Seed the cached statistics.
 141          */
 142         error = ddt_object_info(ddt, type, class, &doi);
 143         /* Panic in debug mode */
 144         ASSERT(error == 0);
 145         if (error)
 146                 return (error);
 147         error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
 148         if (error)
 149                 return (error);
 150         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 151         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 152 
 153         return (0);
 154 }
 155 
 156 static void
 157 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 158     dmu_tx_t *tx)
 159 {
 160         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 161         dmu_object_info_t doi;
 162         char name[DDT_NAMELEN];
 163 
 164         ddt_object_name(ddt, type, class, name);
 165 
 166         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 167             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 168             &ddt->ddt_histogram[type][class], tx) == 0);
 169 
 170         /*
 171          * Cache DDT statistics; this is the only time they'll change.
 172          */
 173         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 174 
 175         (void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
 176         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 177         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 178 }
 179 
 180 static int
 181 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 182     ddt_entry_t *dde)
 183 {
 184         if (!ddt_object_exists(ddt, type, class))
 185                 return (SET_ERROR(ENOENT));
 186 
 187         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 188             ddt->ddt_object[type][class], dde));
 189 }
 190 
 191 static void
 192 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde)
 194 {
 195         if (!ddt_object_exists(ddt, type, class))
 196                 return;
 197 
 198         ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
 199             ddt->ddt_object[type][class], dde);
 200 }
 201 
 202 int
 203 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 204     ddt_entry_t *dde, dmu_tx_t *tx)
 205 {
 206         ASSERT(ddt_object_exists(ddt, type, class));
 207 
 208         return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 209             ddt->ddt_object[type][class], dde, tx));
 210 }
 211 
 212 static int
 213 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 214     ddt_entry_t *dde, dmu_tx_t *tx)
 215 {
 216         ASSERT(ddt_object_exists(ddt, type, class));
 217 
 218         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 219             ddt->ddt_object[type][class], dde, tx));
 220 }
 221 
 222 int
 223 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 224     uint64_t *walk, ddt_entry_t *dde)
 225 {
 226         ASSERT(ddt_object_exists(ddt, type, class));
 227 
 228         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 229             ddt->ddt_object[type][class], dde, walk));
 230 }
 231 
 232 int
 233 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 234         uint64_t *count)
 235 {
 236         ASSERT(ddt_object_exists(ddt, type, class));
 237 
 238         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 239             ddt->ddt_object[type][class], count));
 240 }
 241 
 242 int
 243 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 244     dmu_object_info_t *doi)
 245 {
 246         if (!ddt_object_exists(ddt, type, class))
 247                 return (SET_ERROR(ENOENT));
 248 
 249         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 250             doi));
 251 }
 252 
 253 boolean_t
 254 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 255 {
 256         return (!!ddt->ddt_object[type][class]);
 257 }
 258 
 259 void
 260 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 261     char *name)
 262 {
 263         (void) sprintf(name, DMU_POOL_DDT,
 264             zio_checksum_table[ddt->ddt_checksum].ci_name,
 265             ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
 266 }
 267 
 268 void
 269 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
 270 {
 271         ASSERT(txg != 0);
 272 
 273         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 274                 bp->blk_dva[d] = ddp->ddp_dva[d];
 275         BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
 276 }
 277 
 278 void
 279 ddt_bp_create(enum zio_checksum checksum,
 280     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 281 {
 282         BP_ZERO(bp);
 283 
 284         if (ddp != NULL)
 285                 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 286 
 287         bp->blk_cksum = ddk->ddk_cksum;
 288         bp->blk_fill = 1;
 289 
 290         BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 291         BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 292         BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 293         BP_SET_CHECKSUM(bp, checksum);
 294         BP_SET_TYPE(bp, DMU_OT_DEDUP);
 295         BP_SET_LEVEL(bp, 0);
 296         BP_SET_DEDUP(bp, 0);
 297         BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 298 }
 299 
 300 void
 301 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 302 {
 303         ddk->ddk_cksum = bp->blk_cksum;
 304         ddk->ddk_prop = 0;
 305 
 306         DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
 307         DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
 308         DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
 309 }
 310 
 311 void
 312 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 313 {
 314         ASSERT(ddp->ddp_phys_birth == 0);
 315 
 316         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 317                 ddp->ddp_dva[d] = bp->blk_dva[d];
 318         ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
 319 }
 320 
 321 void
 322 ddt_phys_clear(ddt_phys_t *ddp)
 323 {
 324         bzero(ddp, sizeof (*ddp));
 325 }
 326 
 327 void
 328 ddt_phys_addref(ddt_phys_t *ddp)
 329 {
 330         ddp->ddp_refcnt++;
 331 }
 332 
 333 void
 334 ddt_phys_decref(ddt_phys_t *ddp)
 335 {
 336         ASSERT((int64_t)ddp->ddp_refcnt > 0);
 337         ddp->ddp_refcnt--;
 338 }
 339 
 340 void
 341 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 342 {
 343         blkptr_t blk;
 344 
 345         ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 346         ddt_phys_clear(ddp);
 347         zio_free(ddt->ddt_spa, txg, &blk);
 348 }
 349 
 350 ddt_phys_t *
 351 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 352 {
 353         ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
 354 
 355         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 356                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 357                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 358                         return (ddp);
 359         }
 360         return (NULL);
 361 }
 362 
 363 uint64_t
 364 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 365 {
 366         uint64_t refcnt = 0;
 367 
 368         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 369                 refcnt += dde->dde_phys[p].ddp_refcnt;
 370 
 371         return (refcnt);
 372 }
 373 
 374 static void
 375 ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
 376 {
 377         ddt_phys_t *ddp = dde->dde_phys;
 378         ddt_key_t *ddk = &dde->dde_key;
 379         uint64_t lsize = DDK_GET_LSIZE(ddk);
 380         uint64_t psize = DDK_GET_PSIZE(ddk);
 381 
 382         bzero(dds, sizeof (*dds));
 383 
 384         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 385                 uint64_t dsize = 0;
 386                 uint64_t refcnt = ddp->ddp_refcnt;
 387 
 388                 if (ddp->ddp_phys_birth == 0)
 389                         continue;
 390 
 391                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 392                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 393 
 394                 dds->dds_blocks += 1;
 395                 dds->dds_lsize += lsize;
 396                 dds->dds_psize += psize;
 397                 dds->dds_dsize += dsize;
 398 
 399                 dds->dds_ref_blocks += refcnt;
 400                 dds->dds_ref_lsize += lsize * refcnt;
 401                 dds->dds_ref_psize += psize * refcnt;
 402                 dds->dds_ref_dsize += dsize * refcnt;
 403         }
 404 }
 405 
 406 void
 407 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 408 {
 409         const uint64_t *s = (const uint64_t *)src;
 410         uint64_t *d = (uint64_t *)dst;
 411         uint64_t *d_end = (uint64_t *)(dst + 1);
 412 
 413         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 414 
 415         while (d < d_end)
 416                 *d++ += (*s++ ^ neg) - neg;
 417 }
 418 
 419 static void
 420 ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
 421     ddt_stat_t *dds, uint64_t neg)
 422 {
 423         ddt_histogram_t *ddh;
 424         int bucket = highbit64(dds->dds_ref_blocks) - 1;
 425         ASSERT(bucket >= 0);
 426 
 427         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 428         ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
 429 }
 430 
 431 static void
 432 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 433 {
 434         ddt_stat_t dds;
 435 
 436         ddt_stat_generate(ddt->ddt_spa, dde, &dds);
 437 
 438         ddt_stat_update_by_dds(ddt, dde, &dds, neg);
 439 }
 440 
 441 void
 442 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 443 {
 444         for (int h = 0; h < 64; h++)
 445                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 446 }
 447 
 448 void
 449 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 450 {
 451         bzero(dds, sizeof (*dds));
 452 
 453         for (int h = 0; h < 64; h++)
 454                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 455 }
 456 
 457 boolean_t
 458 ddt_histogram_empty(const ddt_histogram_t *ddh)
 459 {
 460         const uint64_t *s = (const uint64_t *)ddh;
 461         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 462 
 463         while (s < s_end)
 464                 if (*s++ != 0)
 465                         return (B_FALSE);
 466 
 467         return (B_TRUE);
 468 }
 469 
 470 void
 471 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 472 {
 473         /* Sum the statistics we cached in ddt_object_sync(). */
 474         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 475                 ddt_t *ddt = spa->spa_ddt[c];
 476                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 477                         for (enum ddt_class class = spa->spa_ddt_class_min;
 478                             class <= spa->spa_ddt_class_max; class++) {
 479                                 ddt_object_t *ddo =
 480                                     &ddt->ddt_object_stats[type][class];
 481                                 ddo_total->ddo_count += ddo->ddo_count;
 482                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 483                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 484                         }
 485                 }
 486         }
 487 
 488         /* ... and compute the averages. */
 489         if (ddo_total->ddo_count != 0) {
 490                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 491                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 492         }
 493 }
 494 
 495 void
 496 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 497 {
 498         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 499                 ddt_t *ddt = spa->spa_ddt[c];
 500                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 501                         for (enum ddt_class class = spa->spa_ddt_class_min;
 502                             class <= spa->spa_ddt_class_max; class++) {
 503                                 ddt_histogram_add(ddh,
 504                                     &ddt->ddt_histogram_cache[type][class]);
 505                         }
 506                 }
 507         }
 508 }
 509 
 510 void
 511 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 512 {
 513         /*
 514          * Avoid temporary allocation of ddt_histogram_t from heap
 515          * or on stack (probably too large) by unrolling ddt_histogram_add()
 516          */
 517         bzero(dds_total, sizeof (ddt_stat_t));
 518         /* sum up the stats across all the histograms */
 519         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 520                 ddt_t *ddt = spa->spa_ddt[c];
 521                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 522                         for (enum ddt_class class = spa->spa_ddt_class_min;
 523                             class <= spa->spa_ddt_class_max; class++) {
 524                                 /* unroll the ddt_histogram_add() */
 525                                 ddt_histogram_t *src =
 526                                     &ddt->ddt_histogram_cache[type][class];
 527                                 for (int h = 0; h < 64; h++) {
 528                                         ddt_stat_t *st = &src->ddh_stat[h];
 529                                         ddt_stat_add(dds_total, st, 0);
 530                                 }
 531                         }
 532                 }
 533         }
 534 }
 535 
 536 uint64_t
 537 ddt_get_dedup_dspace(spa_t *spa)
 538 {
 539         ddt_stat_t dds_total = { 0 };
 540 
 541         ddt_get_dedup_stats(spa, &dds_total);
 542         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 543 }
 544 
 545 uint64_t
 546 ddt_get_pool_dedup_ratio(spa_t *spa)
 547 {
 548         ddt_stat_t dds_total = { 0 };
 549 
 550         ddt_get_dedup_stats(spa, &dds_total);
 551         if (dds_total.dds_dsize == 0)
 552                 return (100);
 553 
 554         return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
 555 }
 556 
 557 int
 558 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
 559 {
 560         spa_t *spa = ddt->ddt_spa;
 561         uint64_t total_refcnt = 0;
 562         uint64_t ditto = spa->spa_dedup_ditto;
 563         int total_copies = 0;
 564         int desired_copies = 0;
 565 
 566         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 567                 ddt_phys_t *ddp = &dde->dde_phys[p];
 568                 zio_t *zio = dde->dde_lead_zio[p];
 569                 uint64_t refcnt = ddp->ddp_refcnt;   /* committed refs */
 570                 if (zio != NULL)
 571                         refcnt += zio->io_parent_count;      /* pending refs */
 572                 if (ddp == ddp_willref)
 573                         refcnt++;                       /* caller's ref */
 574                 if (refcnt != 0) {
 575                         total_refcnt += refcnt;
 576                         total_copies += p;
 577                 }
 578         }
 579 
 580         if (ditto == 0 || ditto > UINT32_MAX)
 581                 ditto = UINT32_MAX;
 582 
 583         if (total_refcnt >= 1)
 584                 desired_copies++;
 585         if (total_refcnt >= ditto)
 586                 desired_copies++;
 587         if (total_refcnt >= ditto * ditto)
 588                 desired_copies++;
 589 
 590         return (MAX(desired_copies, total_copies) - total_copies);
 591 }
 592 
 593 int
 594 ddt_ditto_copies_present(ddt_entry_t *dde)
 595 {
 596         ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
 597         dva_t *dva = ddp->ddp_dva;
 598         int copies = 0 - DVA_GET_GANG(dva);
 599 
 600         for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
 601                 if (DVA_IS_VALID(dva))
 602                         copies++;
 603 
 604         ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
 605 
 606         return (copies);
 607 }
 608 
 609 size_t
 610 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
 611 {
 612         uchar_t *version = dst++;
 613         int cpfunc = ZIO_COMPRESS_ZLE;
 614         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 615         size_t c_len;
 616 
 617         ASSERT(d_len >= s_len + 1);  /* no compression plus version byte */
 618 
 619         c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
 620 
 621         if (c_len == s_len) {
 622                 cpfunc = ZIO_COMPRESS_OFF;
 623                 bcopy(src, dst, s_len);
 624         }
 625 
 626         *version = cpfunc;
 627         /* CONSTCOND */
 628         if (ZFS_HOST_BYTEORDER)
 629                 *version |= DDT_COMPRESS_BYTEORDER_MASK;
 630 
 631         return (c_len + 1);
 632 }
 633 
 634 void
 635 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 636 {
 637         uchar_t version = *src++;
 638         int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
 639         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 640 
 641         if (ci->ci_decompress != NULL)
 642                 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 643         else
 644                 bcopy(src, dst, d_len);
 645 
 646         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 647             (ZFS_HOST_BYTEORDER != 0))
 648                 byteswap_uint64_array(dst, d_len);
 649 }
 650 
 651 ddt_t *
 652 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 653 {
 654         return (spa->spa_ddt[c]);
 655 }
 656 
 657 ddt_t *
 658 ddt_select(spa_t *spa, const blkptr_t *bp)
 659 {
 660         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 661 }
 662 
 663 void
 664 ddt_enter(ddt_t *ddt, uint8_t hash)
 665 {
 666         mutex_enter(&ddt->ddt_lock[hash]);
 667 }
 668 
 669 void
 670 ddt_exit(ddt_t *ddt, uint8_t hash)
 671 {
 672         mutex_exit(&ddt->ddt_lock[hash]);
 673 }
 674 
 675 void
 676 dde_enter(ddt_entry_t *dde)
 677 {
 678         mutex_enter(&dde->dde_lock);
 679 }
 680 
 681 void
 682 dde_exit(ddt_entry_t *dde)
 683 {
 684         mutex_exit(&dde->dde_lock);
 685 }
 686 
 687 /* cache for ddt_entry_t structures */
 688 static kmem_cache_t *dde_cache;
 689 
 690 /* ARGSUSED */
 691 static int
 692 dde_cache_constr(void *buf, void *arg, int flags)
 693 {
 694         ddt_entry_t *dde = (ddt_entry_t *)buf;
 695         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 696         mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
 697         return (0);
 698 }
 699 
 700 /* ARGSUSED */
 701 static void
 702 dde_cache_destr(void *buf, void *arg)
 703 {
 704         ddt_entry_t *dde = (ddt_entry_t *)buf;
 705         cv_destroy(&dde->dde_cv);
 706         mutex_destroy(&dde->dde_lock);
 707 }
 708 
 709 void
 710 ddt_init(void)
 711 {
 712         dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
 713             0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
 714         VERIFY(dde_cache != NULL);
 715 }
 716 
 717 void
 718 ddt_fini(void)
 719 {
 720         if (dde_cache) {
 721                 kmem_cache_destroy(dde_cache);
 722                 dde_cache = NULL;
 723         }
 724 }
 725 
 726 static ddt_entry_t *
 727 ddt_alloc(const ddt_key_t *ddk)
 728 {
 729         ddt_entry_t *dde;
 730 
 731         dde = kmem_cache_alloc(dde_cache, KM_SLEEP);
 732 
 733         /* Init everything but the condvar and the mutex */
 734         dde->dde_key = *ddk;
 735         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_phys)),
 736             offsetof(ddt_entry_t, dde_cv)-offsetof(ddt_entry_t, dde_phys));
 737         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_node)),
 738             sizeof (avl_node_t));
 739 
 740         return (dde);
 741 }
 742 
 743 static void
 744 ddt_free(ddt_entry_t *dde)
 745 {
 746         ASSERT(!(dde->dde_state & DDE_LOADING));
 747 
 748         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 749                 ASSERT(dde->dde_lead_zio[p] == NULL);
 750 
 751         if (dde->dde_repair_abd != NULL)
 752                 abd_free(dde->dde_repair_abd);
 753 
 754         kmem_cache_free(dde_cache, dde);
 755 }
 756 
 757 /* for zdb usage */
 758 void
 759 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 760 {
 761         uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);
 762 
 763         avl_remove(&ddt->ddt_tree[hash], dde);
 764         ddt_free(dde);
 765 }
 766 
 767 ddt_entry_t *
 768 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 769 {
 770         ddt_entry_t *dde, dde_search;
 771         enum ddt_type type;
 772         enum ddt_class class;
 773         avl_index_t where;
 774         uint8_t hash = DDT_HASHFN(bp->blk_cksum);
 775         int error;
 776 
 777         ddt_key_fill(&dde_search.dde_key, bp);
 778 
 779         ddt_enter(ddt, hash);
 780         /*
 781          * Do we have the dirty DDE in mem already?
 782          */
 783         dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
 784         if (dde == NULL) {
 785                 /* This DDE doesn't exists in dirty tree */
 786                 if (!add) {
 787                         ddt_exit(ddt, hash);
 788                         return (NULL);
 789                 }
 790                 /* Since a dirty DDE didn't exist, create it */
 791                 dde = ddt_alloc(&dde_search.dde_key);
 792                 avl_insert(&ddt->ddt_tree[hash], dde, where);
 793         }
 794 
 795         ddt_exit(ddt, hash);
 796 
 797         /*
 798          * If we're already looking up this DDE
 799          * wait until we have the result
 800          */
 801         dde_enter(dde);
 802         while (dde->dde_state & DDE_LOADING)
 803                 cv_wait(&dde->dde_cv, &dde->dde_lock);
 804 
 805         /*
 806          * If we have loaded the DDE from disk return it
 807          */
 808         if (dde->dde_state & DDE_LOADED)
 809                 return (dde);
 810 
 811         /*
 812          * If we didn't find this DDE, start looking up the DDE in ZAP
 813          */
 814         dde->dde_state |= DDE_LOADING;
 815         dde_exit(dde);
 816 
 817         error = ENOENT;
 818 
 819         DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
 820         for (type = 0; type < DDT_TYPES; type++) {
 821                 for (class = 0; class < DDT_CLASSES; class++) {
 822                         error = ddt_object_lookup(ddt, type, class, dde);
 823                         if (error != ENOENT)
 824                                 break;
 825                 }
 826                 if (error != ENOENT)
 827                         break;
 828         }
 829 
 830         ASSERT(error == 0 || error == ENOENT);
 831 
 832         dde_enter(dde);
 833 
 834         ASSERT(!(dde->dde_state & DDE_LOADED));
 835         ASSERT(dde->dde_state & DDE_LOADING);
 836 
 837         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 838         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 839         if (type == DDT_TYPES && class == DDT_CLASSES)
 840                 dde->dde_state |= DDE_NEW;
 841         dde->dde_state |= DDE_LOADED;
 842         dde->dde_state &= ~DDE_LOADING;
 843 
 844         DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
 845             enum ddt_class, dde->dde_class);
 846         if (error == 0)
 847                 ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);
 848 
 849         cv_broadcast(&dde->dde_cv);
 850 
 851         return (dde);
 852 }
 853 
 854 void
 855 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 856 {
 857         ddt_t *ddt;
 858         ddt_entry_t dde;
 859 
 860         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 861                 return;
 862 
 863         /*
 864          * We only remove the DDT once all tables are empty and only
 865          * prefetch dedup blocks when there are entries in the DDT.
 866          * Thus no locking is required as the DDT can't disappear on us.
 867          */
 868         ddt = ddt_select(spa, bp);
 869         ddt_key_fill(&dde.dde_key, bp);
 870 
 871         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 872                 for (enum ddt_class class = 0;
 873                     class < DDT_CLASSES; class++) {
 874                         ddt_object_prefetch(ddt, type, class, &dde);
 875                 }
 876         }
 877 }
 878 
 879 int
 880 ddt_entry_compare(const void *x1, const void *x2)
 881 {
 882         const ddt_entry_t *dde1 = x1;
 883         const ddt_entry_t *dde2 = x2;
 884         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 885         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 886 
 887         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 888                 if (u1[i] < u2[i])
 889                         return (-1);
 890                 if (u1[i] > u2[i])
 891                         return (1);
 892         }
 893 
 894         return (0);
 895 }
 896 
 897 static ddt_t *
 898 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 899 {
 900         ddt_t *ddt;
 901         uint_t i;
 902 
 903         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 904 
 905         for (i = 0; i < DDT_HASHSZ; i++) {
 906                 mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
 907                 avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
 908                     sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 909         }
 910         mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
 911 
 912         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 913             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 914         ddt->ddt_checksum = c;
 915         ddt->ddt_spa = spa;
 916         ddt->ddt_os = spa->spa_meta_objset;
 917 
 918         return (ddt);
 919 }
 920 
 921 static void
 922 ddt_table_free(ddt_t *ddt)
 923 {
 924         uint_t i;
 925 
 926         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 927 
 928         for (i = 0; i < DDT_HASHSZ; i++) {
 929                 ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
 930                 avl_destroy(&ddt->ddt_tree[i]);
 931                 mutex_destroy(&ddt->ddt_lock[i]);
 932         }
 933         avl_destroy(&ddt->ddt_repair_tree);
 934         mutex_destroy(&ddt->ddt_repair_lock);
 935         kmem_free(ddt, sizeof (*ddt));
 936 }
 937 
 938 void
 939 ddt_create(spa_t *spa)
 940 {
 941         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 942 
 943         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 944                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 945 }
 946 
 947 /*
 948  * Get the combined size of DDTs on all pools.
 949  * Returns either on disk (phys == B_TRUE) or in core combined DDTs size
 950  */
 951 uint64_t
 952 ddt_get_ddts_size(boolean_t phys)
 953 {
 954         uint64_t ddts_size = 0;
 955         spa_t *spa = NULL;
 956 
 957         while ((spa = spa_next(spa)) != NULL)
 958                 ddts_size += spa_get_ddts_size(spa, phys);
 959 
 960         return (ddts_size);
 961 }
 962 
 963 int
 964 ddt_load(spa_t *spa)
 965 {
 966         int error;
 967         ddt_object_t *ddo;
 968 
 969         ddt_create(spa);
 970 
 971         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 972             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 973             &spa->spa_ddt_stat_object);
 974 
 975         if (error)
 976                 return (error == ENOENT ? 0 : error);
 977 
 978         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 979                 ddt_t *ddt = spa->spa_ddt[c];
 980                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 981                         for (enum ddt_class class = 0;
 982                             class < DDT_CLASSES; class++) {
 983                                 error = ddt_object_load(ddt, type, class);
 984                                 if (error == ENOENT)
 985                                         continue;
 986                                 if (error != 0)
 987                                         return (error);
 988                                 ddo = &ddt->ddt_object_stats[type][class];
 989                                 atomic_add_64(&spa->spa_ddt_dsize,
 990                                     ddo->ddo_dspace);
 991                                 atomic_add_64(&spa->spa_ddt_msize,
 992                                     ddo->ddo_mspace);
 993                         }
 994                 }
 995 
 996                 /*
 997                  * Seed the cached histograms.
 998                  */
 999                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1000                     sizeof (ddt->ddt_histogram));
1001         }
1002         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1003 
1004         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1005                 /* notify that dedup cap is now active */
1006                 spa->spa_ddt_capped = 1;
1007                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1008         }
1009 
1010         return (0);
1011 }
1012 
1013 void
1014 ddt_unload(spa_t *spa)
1015 {
1016         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1017                 if (spa->spa_ddt[c]) {
1018                         ddt_table_free(spa->spa_ddt[c]);
1019                         spa->spa_ddt[c] = NULL;
1020                 }
1021         }
1022         spa->spa_ddt_dsize = 0;
1023         spa->spa_ddt_msize = 0;
1024         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1025 }
1026 
1027 boolean_t
1028 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
1029 {
1030         ddt_t *ddt;
1031         ddt_entry_t dde;
1032 
1033         if (!BP_GET_DEDUP(bp))
1034                 return (B_FALSE);
1035 
1036         if (max_class > spa->spa_ddt_class_max)
1037                 max_class = spa->spa_ddt_class_max;
1038 
1039         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1040 
1041         ddt_key_fill(&dde.dde_key, bp);
1042 
1043         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
1044                 for (enum ddt_class class = spa->spa_ddt_class_min;
1045                     class <= max_class; class++)
1046                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
1047                                 return (B_TRUE);
1048 
1049         return (B_FALSE);
1050 }
1051 
1052 ddt_entry_t *
1053 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1054 {
1055         ddt_key_t ddk;
1056         ddt_entry_t *dde;
1057 
1058         ddt_key_fill(&ddk, bp);
1059 
1060         dde = ddt_alloc(&ddk);
1061 
1062         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1063                 for (enum ddt_class class = 0;
1064                     class < DDT_CLASSES; class++) {
1065                         /*
1066                          * We can only do repair if there are multiple copies
1067                          * of the block.  For anything in the UNIQUE class,
1068                          * there's definitely only one copy, so don't even try.
1069                          */
1070                         if (class != DDT_CLASS_UNIQUE &&
1071                             ddt_object_lookup(ddt, type, class, dde) == 0)
1072                                 return (dde);
1073                 }
1074         }
1075 
1076         bzero(dde->dde_phys, sizeof (dde->dde_phys));
1077 
1078         return (dde);
1079 }
1080 
1081 void
1082 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1083 {
1084         avl_index_t where;
1085 
1086         mutex_enter(&ddt->ddt_repair_lock);
1087 
1088         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
1089             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1090                 avl_insert(&ddt->ddt_repair_tree, dde, where);
1091         else
1092                 ddt_free(dde);
1093 
1094         mutex_exit(&ddt->ddt_repair_lock);;
1095 }
1096 
1097 static void
1098 ddt_repair_entry_done(zio_t *zio)
1099 {
1100         ddt_entry_t *rdde = zio->io_private;
1101 
1102         ddt_free(rdde);
1103 }
1104 
1105 static void
1106 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1107 {
1108         ddt_phys_t *ddp = dde->dde_phys;
1109         ddt_phys_t *rddp = rdde->dde_phys;
1110         ddt_key_t *ddk = &dde->dde_key;
1111         ddt_key_t *rddk = &rdde->dde_key;
1112         zio_t *zio;
1113         blkptr_t blk;
1114 
1115         zio = zio_null(rio, rio->io_spa, NULL,
1116             ddt_repair_entry_done, rdde, rio->io_flags);
1117 
1118         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
1119                 if (ddp->ddp_phys_birth == 0 ||
1120                     ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
1121                     bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
1122                         continue;
1123                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1124                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1125                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
1126                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
1127         }
1128 
1129         zio_nowait(zio);
1130 }
1131 
1132 static void
1133 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1134 {
1135         spa_t *spa = ddt->ddt_spa;
1136         ddt_entry_t *dde, *rdde_next, *rdde;
1137         avl_tree_t *t = &ddt->ddt_repair_tree;
1138         blkptr_t blk;
1139 
1140         if (spa_sync_pass(spa) > 1)
1141                 return;
1142 
1143         mutex_enter(&ddt->ddt_repair_lock);
1144         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1145                 rdde_next = AVL_NEXT(t, rdde);
1146                 avl_remove(&ddt->ddt_repair_tree, rdde);
1147                 mutex_exit(&ddt->ddt_repair_lock);
1148 
1149                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
1150                 dde = ddt_repair_start(ddt, &blk);
1151                 ddt_repair_entry(ddt, dde, rdde, rio);
1152                 ddt_repair_done(ddt, dde);
1153 
1154                 mutex_enter(&ddt->ddt_repair_lock);
1155         }
1156         mutex_exit(&ddt->ddt_repair_lock);
1157 }
1158 
/*
 * Write one in-core entry back to the on-disk DDT objects in transaction
 * 'tx' (txg 'txg').
 *
 * Steps, in order:
 *   1. apply the lookup-time stats that were deferred on the entry;
 *   2. walk the phys slots, freeing those that are no longer needed and
 *      summing the reference counts of the non-ditto slots;
 *   3. pick the new class from the ditto slot and the total refcount,
 *      clamped to [spa_ddt_class_min, spa_ddt_class_max];
 *   4. remove the entry from its old object if its type or class changed
 *      or nothing references it any more;
 *   5. if references remain, store the entry in the new object (creating
 *      the object on demand) and, when the class got lower, pass the entry
 *      to the scanner immediately.
 *
 * The entry must be DDE_LOADED and not DDE_LOADING (checked by ASSERT).
 */
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
        dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
        ddt_phys_t *ddp = dde->dde_phys;
        ddt_key_t *ddk = &dde->dde_key;
        spa_t *spa = ddt->ddt_spa;
        enum ddt_type otype = dde->dde_type;
        enum ddt_type ntype = DDT_TYPE_CURRENT;
        enum ddt_class oclass = dde->dde_class;
        enum ddt_class nclass;
        uint64_t total_refcnt = 0;

        ASSERT(dde->dde_state & DDE_LOADED);
        ASSERT(!(dde->dde_state & DDE_LOADING));

        /*
         * Propagate the stats generated at lookup time.  This was
         * delayed to avoid having to take locks to protect
         * ddt->ddt_histogram.
         */
        if (dde->dde_lkstat.dds_ref_blocks != 0)
                ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);

        for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
                ASSERT(dde->dde_lead_zio[p] == NULL);
                ASSERT((int64_t)ddp->ddp_refcnt >= 0);
                /* a slot with no birth txg is unused */
                if (ddp->ddp_phys_birth == 0) {
                        ASSERT(ddp->ddp_refcnt == 0);
                        continue;
                }
                /*
                 * The ditto slot is freed once no ditto copies are needed
                 * and it never contributes to total_refcnt.
                 */
                if (p == DDT_PHYS_DITTO) {
                        if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
                                ddt_phys_free(ddt, ddk, ddp, txg);
                        continue;
                }
                if (ddp->ddp_refcnt == 0)
                        ddt_phys_free(ddt, ddk, ddp, txg);
                total_refcnt += ddp->ddp_refcnt;
        }

        /* choose the natural class of the entry ... */
        if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
                nclass = DDT_CLASS_DITTO;
        else if (total_refcnt > 1)
                nclass = DDT_CLASS_DUPLICATE;
        else
                nclass = DDT_CLASS_UNIQUE;

        /* ... and clamp it to the range of classes this pool stores into */
        if (nclass > spa->spa_ddt_class_max)
                nclass = spa->spa_ddt_class_max;

        if (nclass < spa->spa_ddt_class_min)
                nclass = spa->spa_ddt_class_min;

        DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);

        /*
         * Drop the entry from its old object when it moves to another
         * type/class or is no longer referenced.
         * NOTE(review): otype == DDT_TYPES presumably marks an entry that
         * was never stored in any object -- confirm against ddt_alloc().
         */
        if (otype != DDT_TYPES &&
            (otype != ntype || oclass != nclass || total_refcnt == 0)) {
                VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
                ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
        }

        if (total_refcnt != 0) {
                dde->dde_type = ntype;
                dde->dde_class = nclass;
                ddt_stat_update(ddt, dde, 0);
                /* the destination object is created on first use */
                if (!ddt_object_exists(ddt, ntype, nclass))
                        ddt_object_create(ddt, ntype, nclass, tx);
                VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);

                /*
                 * If the class changes, the order that we scan this bp
                 * changes.  If it decreases, we could miss it, so
                 * scan it right now.  (This covers both class changing
                 * while we are doing ddt_walk(), and when we are
                 * traversing.)
                 */
                if (nclass < oclass) {
                        dsl_scan_ddt_entry(dp->dp_scan,
                            ddt->ddt_checksum, dde, tx);
                }
        }
        DTRACE_PROBE(ddt__stored__entry);
}
1243 
1244 static void
1245 ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
1246 {
1247         void *cookie = NULL;
1248         ddt_entry_t *dde;
1249 
1250         while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
1251                 if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
1252                         ddt_sync_entry(ddt, dde, tx, txg);
1253                 } else { /* if we're not syncing this DDE it must be new */
1254                         ASSERT(dde->dde_state & DDE_NEW);
1255                 }
1256                 ddt_free(dde);
1257         }
1258 }
1259 
1260 static void
1261 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1262 {
1263         uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
1264         int64_t old_mbytes = 0;
1265         spa_t *spa = ddt->ddt_spa;
1266         uint_t i, numnodes = 0;
1267         ddt_object_t *ddo;
1268 
1269         for (i = 0; i < DDT_HASHSZ; i++)
1270                 numnodes += avl_numnodes(&ddt->ddt_tree[i]);
1271 
1272         if (numnodes == 0)
1273                 return;
1274 
1275         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1276 
1277         if (spa->spa_ddt_stat_object == 0) {
1278                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1279                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1280                     DMU_POOL_DDT_STATS, tx);
1281         }
1282 
1283 
1284         DTRACE_PROBE(ddt__syncing__avl);
1285         for (i = 0; i < DDT_HASHSZ; i++)
1286                 ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
1287         DTRACE_PROBE(ddt__synced__avl);
1288 
1289         DTRACE_PROBE(ddt__syncing__obj);
1290         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1291                 for (enum ddt_class class = spa->spa_ddt_class_min;
1292                     class <= spa->spa_ddt_class_max; class++) {
1293                         if (ddt_object_exists(ddt, type, class)) {
1294                                 ddo = &ddt->ddt_object_stats[type][class];
1295                                 old_mbytes += ddo->ddo_mspace;
1296 
1297                                 ddt_object_sync(ddt, type, class, tx);
1298                                 (void) ddt_object_count(ddt, type, class, &cnt);
1299                                 if (cnt == 0) {
1300                                         ddt_object_destroy(ddt, type, class,
1301                                             tx);
1302                                         continue;
1303                                 }
1304 
1305                                 num_dbytes += ddo->ddo_dspace;
1306                                 num_mbytes += ddo->ddo_mspace;
1307                         }
1308                 }
1309         }
1310         spa->spa_ddt_dsize = num_dbytes;
1311         spa->spa_ddt_msize = num_mbytes;
1312         atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
1313         DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
1314             uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
1315             zfs_ddts_msize);
1316 
1317         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1318                 /* notify that dedup cap is now active */
1319                 spa->spa_ddt_capped = 1;
1320                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1321         } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
1322                 /* notify that dedup cap is now inactive */
1323                 spa->spa_ddt_capped = 0;
1324                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
1325         }
1326 
1327         /* update the cached stats with the values calculated above */
1328         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1329             sizeof (ddt->ddt_histogram));
1330 }
1331 
1332 void
1333 ddt_sync(spa_t *spa, uint64_t txg)
1334 {
1335         dmu_tx_t *tx;
1336         zio_t *rio = zio_root(spa, NULL, NULL,
1337             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1338 
1339         ASSERT(spa_syncing_txg(spa) == txg);
1340 
1341         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1342 
1343         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1344                 ddt_t *ddt = spa->spa_ddt[c];
1345                 if (ddt == NULL)
1346                         continue;
1347                 ddt_sync_table(ddt, tx, txg);
1348                 ddt_repair_table(ddt, rio);
1349         }
1350 
1351         (void) zio_wait(rio);
1352 
1353         dmu_tx_commit(tx);
1354 }
1355 
/*
 * Fetch the next DDT entry at or after the position recorded in the
 * bookmark 'ddb', advancing the bookmark as it goes.
 *
 * The position is (class, type, checksum, cursor).  The checksum index
 * varies fastest, then the type, and the class varies slowest; all
 * DDT_CLASSES classes are visited.  Objects that do not exist are treated
 * as exhausted (ENOENT) and skipped.
 *
 * Returns 0 when ddt_object_walk() succeeded for the current position,
 * any error other than ENOENT reported by ddt_object_walk(), or ENOENT
 * once every class has been exhausted.  dde->dde_type and dde->dde_class
 * are set to the bookmark's type and class on every attempt, including
 * failed ones.
 */
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
{
        do {
                do {
                        do {
                                ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
                                /* a missing object counts as exhausted */
                                int error = ENOENT;
                                if (ddt_object_exists(ddt, ddb->ddb_type,
                                    ddb->ddb_class)) {
                                        error = ddt_object_walk(ddt,
                                            ddb->ddb_type, ddb->ddb_class,
                                            &ddb->ddb_cursor, dde);
                                }
                                dde->dde_type = ddb->ddb_type;
                                dde->dde_class = ddb->ddb_class;
                                if (error == 0)
                                        return (0);
                                if (error != ENOENT)
                                        return (error);
                                /* this object is done; restart the cursor */
                                ddb->ddb_cursor = 0;
                        } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
                        /* all checksums tried for this type: next type */
                        ddb->ddb_checksum = 0;
                } while (++ddb->ddb_type < DDT_TYPES);
                /* all types tried for this class: next class */
                ddb->ddb_type = 0;
        } while (++ddb->ddb_class < DDT_CLASSES);

        return (SET_ERROR(ENOENT));
}