5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/ddt.h>
  32 #include <sys/zap.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/arc.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/zio_compress.h>
  38 #include <sys/dsl_scan.h>
  39 #include <sys/abd.h>
  40 
  41 /*
  42  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  43  */
  44 int zfs_dedup_prefetch = 1;
  45 
  46 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  47         &ddt_zap_ops,
  48 };
  49 
  50 static const char *ddt_class_name[DDT_CLASSES] = {
  51         "ditto",
  52         "duplicate",
  53         "unique",
  54 };
  55 
  56 static void
  57 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  58     dmu_tx_t *tx)
  59 {
  60         spa_t *spa = ddt->ddt_spa;
  61         objset_t *os = ddt->ddt_os;
  62         uint64_t *objectp = &ddt->ddt_object[type][class];
  63         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  64             ZCHECKSUM_FLAG_DEDUP;
  65         char name[DDT_NAMELEN];
  66 
  67         ddt_object_name(ddt, type, class, name);
  68 
  69         ASSERT(*objectp == 0);
  70         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  71         ASSERT(*objectp != 0);
  72 
  73         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  74             sizeof (uint64_t), 1, objectp, tx) == 0);
  75 
  76         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  77             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  78             &ddt->ddt_histogram[type][class], tx) == 0);
  79 }
  80 
  81 static void
  82 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  83     dmu_tx_t *tx)
  84 {
  85         spa_t *spa = ddt->ddt_spa;
  86         objset_t *os = ddt->ddt_os;
  87         uint64_t *objectp = &ddt->ddt_object[type][class];
  88         char name[DDT_NAMELEN];
  89 
  90         ddt_object_name(ddt, type, class, name);
  91 
  92         ASSERT(*objectp != 0);
  93         ASSERT(ddt_object_count(ddt, type, class) == 0);
  94         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  95         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  96         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  97         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  98         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  99 
 100         *objectp = 0;
 101 }
 102 
 103 static int
 104 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 105 {
 106         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 107         dmu_object_info_t doi;
 108         char name[DDT_NAMELEN];
 109         int error;
 110 
 111         ddt_object_name(ddt, type, class, name);
 112 
 113         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 114             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 115 
 116         if (error != 0)
 117                 return (error);
 118 
 119         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 120             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 121             &ddt->ddt_histogram[type][class]));
 122 
 123         /*
 124          * Seed the cached statistics.
 125          */
 126         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 127 
 128         ddo->ddo_count = ddt_object_count(ddt, type, class);
 129         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 130         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 131 
 132         return (0);
 133 }
 134 
 135 static void
 136 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 137     dmu_tx_t *tx)
 138 {
 139         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 140         dmu_object_info_t doi;
 141         char name[DDT_NAMELEN];
 142 
 143         ddt_object_name(ddt, type, class, name);
 144 
 145         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 146             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 147             &ddt->ddt_histogram[type][class], tx) == 0);
 148 
 149         /*
 150          * Cache DDT statistics; this is the only time they'll change.
 151          */
 152         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 153 
 154         ddo->ddo_count = ddt_object_count(ddt, type, class);
 155         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 156         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 157 }
 158 
 159 static int
 160 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 161     ddt_entry_t *dde)
 162 {
 163         if (!ddt_object_exists(ddt, type, class))
 164                 return (SET_ERROR(ENOENT));
 165 
 166         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 167             ddt->ddt_object[type][class], dde));
 168 }
 169 
 170 static void
 171 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 172     ddt_entry_t *dde)
 173 {
 174         if (!ddt_object_exists(ddt, type, class))
 
 191 static int
 192 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde, dmu_tx_t *tx)
 194 {
 195         ASSERT(ddt_object_exists(ddt, type, class));
 196 
 197         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 198             ddt->ddt_object[type][class], dde, tx));
 199 }
 200 
 201 int
 202 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 203     uint64_t *walk, ddt_entry_t *dde)
 204 {
 205         ASSERT(ddt_object_exists(ddt, type, class));
 206 
 207         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 208             ddt->ddt_object[type][class], dde, walk));
 209 }
 210 
 211 uint64_t
 212 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 213 {
 214         ASSERT(ddt_object_exists(ddt, type, class));
 215 
 216         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 217             ddt->ddt_object[type][class]));
 218 }
 219 
 220 int
 221 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 222     dmu_object_info_t *doi)
 223 {
 224         if (!ddt_object_exists(ddt, type, class))
 225                 return (SET_ERROR(ENOENT));
 226 
 227         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 228             doi));
 229 }
 230 
 231 boolean_t
 232 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 233 {
 234         return (!!ddt->ddt_object[type][class]);
 235 }
 236 
 237 void
 
 333         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 334                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 335                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 336                         return (ddp);
 337         }
 338         return (NULL);
 339 }
 340 
 341 uint64_t
 342 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 343 {
 344         uint64_t refcnt = 0;
 345 
 346         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 347                 refcnt += dde->dde_phys[p].ddp_refcnt;
 348 
 349         return (refcnt);
 350 }
 351 
 352 static void
 353 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 354 {
 355         spa_t *spa = ddt->ddt_spa;
 356         ddt_phys_t *ddp = dde->dde_phys;
 357         ddt_key_t *ddk = &dde->dde_key;
 358         uint64_t lsize = DDK_GET_LSIZE(ddk);
 359         uint64_t psize = DDK_GET_PSIZE(ddk);
 360 
 361         bzero(dds, sizeof (*dds));
 362 
 363         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 364                 uint64_t dsize = 0;
 365                 uint64_t refcnt = ddp->ddp_refcnt;
 366 
 367                 if (ddp->ddp_phys_birth == 0)
 368                         continue;
 369 
 370                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 371                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 372 
 373                 dds->dds_blocks += 1;
 374                 dds->dds_lsize += lsize;
 375                 dds->dds_psize += psize;
 
 379                 dds->dds_ref_lsize += lsize * refcnt;
 380                 dds->dds_ref_psize += psize * refcnt;
 381                 dds->dds_ref_dsize += dsize * refcnt;
 382         }
 383 }
 384 
 385 void
 386 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 387 {
 388         const uint64_t *s = (const uint64_t *)src;
 389         uint64_t *d = (uint64_t *)dst;
 390         uint64_t *d_end = (uint64_t *)(dst + 1);
 391 
 392         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 393 
 394         while (d < d_end)
 395                 *d++ += (*s++ ^ neg) - neg;
 396 }
 397 
 398 static void
 399 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 400 {
 401         ddt_stat_t dds;
 402         ddt_histogram_t *ddh;
 403         int bucket;
 404 
 405         ddt_stat_generate(ddt, dde, &dds);
 406 
 407         bucket = highbit64(dds.dds_ref_blocks) - 1;
 408         ASSERT(bucket >= 0);
 409 
 410         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 411 
 412         ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
 413 }
 414 
 415 void
 416 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 417 {
 418         for (int h = 0; h < 64; h++)
 419                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 420 }
 421 
 422 void
 423 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 424 {
 425         bzero(dds, sizeof (*dds));
 426 
 427         for (int h = 0; h < 64; h++)
 428                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 429 }
 430 
 431 boolean_t
 432 ddt_histogram_empty(const ddt_histogram_t *ddh)
 433 {
 434         const uint64_t *s = (const uint64_t *)ddh;
 435         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 436 
 437         while (s < s_end)
 438                 if (*s++ != 0)
 439                         return (B_FALSE);
 440 
 441         return (B_TRUE);
 442 }
 443 
 444 void
 445 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 446 {
 447         /* Sum the statistics we cached in ddt_object_sync(). */
 448         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 449                 ddt_t *ddt = spa->spa_ddt[c];
 450                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 451                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 452                             class++) {
 453                                 ddt_object_t *ddo =
 454                                     &ddt->ddt_object_stats[type][class];
 455                                 ddo_total->ddo_count += ddo->ddo_count;
 456                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 457                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 458                         }
 459                 }
 460         }
 461 
 462         /* ... and compute the averages. */
 463         if (ddo_total->ddo_count != 0) {
 464                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 465                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 466         }
 467 }
 468 
 469 void
 470 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 471 {
 472         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 473                 ddt_t *ddt = spa->spa_ddt[c];
 474                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 475                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 476                             class++) {
 477                                 ddt_histogram_add(ddh,
 478                                     &ddt->ddt_histogram_cache[type][class]);
 479                         }
 480                 }
 481         }
 482 }
 483 
 484 void
 485 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 486 {
 487         ddt_histogram_t *ddh_total;
 488 
 489         ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 490         ddt_get_dedup_histogram(spa, ddh_total);
 491         ddt_histogram_stat(dds_total, ddh_total);
 492         kmem_free(ddh_total, sizeof (ddt_histogram_t));
 493 }
 494 
 495 uint64_t
 496 ddt_get_dedup_dspace(spa_t *spa)
 497 {
 498         ddt_stat_t dds_total = { 0 };
 499 
 500         ddt_get_dedup_stats(spa, &dds_total);
 501         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 502 }
 503 
 504 uint64_t
 505 ddt_get_pool_dedup_ratio(spa_t *spa)
 506 {
 507         ddt_stat_t dds_total = { 0 };
 508 
 509         ddt_get_dedup_stats(spa, &dds_total);
 510         if (dds_total.dds_dsize == 0)
 511                 return (100);
 512 
 
 603                 bcopy(src, dst, d_len);
 604 
 605         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 606             (ZFS_HOST_BYTEORDER != 0))
 607                 byteswap_uint64_array(dst, d_len);
 608 }
 609 
 610 ddt_t *
 611 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 612 {
 613         return (spa->spa_ddt[c]);
 614 }
 615 
 616 ddt_t *
 617 ddt_select(spa_t *spa, const blkptr_t *bp)
 618 {
 619         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 620 }
 621 
 622 void
 623 ddt_enter(ddt_t *ddt)
 624 {
 625         mutex_enter(&ddt->ddt_lock);
 626 }
 627 
 628 void
 629 ddt_exit(ddt_t *ddt)
 630 {
 631         mutex_exit(&ddt->ddt_lock);
 632 }
 633 
 634 static ddt_entry_t *
 635 ddt_alloc(const ddt_key_t *ddk)
 636 {
 637         ddt_entry_t *dde;
 638 
 639         dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 640         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 641 
 642         dde->dde_key = *ddk;
 643 
 644         return (dde);
 645 }
 646 
 647 static void
 648 ddt_free(ddt_entry_t *dde)
 649 {
 650         ASSERT(!dde->dde_loading);
 651 
 652         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 653                 ASSERT(dde->dde_lead_zio[p] == NULL);
 654 
 655         if (dde->dde_repair_abd != NULL)
 656                 abd_free(dde->dde_repair_abd);
 657 
 658         cv_destroy(&dde->dde_cv);
 659         kmem_free(dde, sizeof (*dde));
 660 }
 661 
 662 void
 663 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 664 {
 665         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 666 
 667         avl_remove(&ddt->ddt_tree, dde);
 668         ddt_free(dde);
 669 }
 670 
 671 ddt_entry_t *
 672 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 673 {
 674         ddt_entry_t *dde, dde_search;
 675         enum ddt_type type;
 676         enum ddt_class class;
 677         avl_index_t where;
 678         int error;
 679 
 680         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 681 
 682         ddt_key_fill(&dde_search.dde_key, bp);
 683 
 684         dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
 685         if (dde == NULL) {
 686                 if (!add)
 687                         return (NULL);
 688                 dde = ddt_alloc(&dde_search.dde_key);
 689                 avl_insert(&ddt->ddt_tree, dde, where);
 690         }
 691 
 692         while (dde->dde_loading)
 693                 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 694 
 695         if (dde->dde_loaded)
 696                 return (dde);
 697 
 698         dde->dde_loading = B_TRUE;
 699 
 700         ddt_exit(ddt);
 701 
 702         error = ENOENT;
 703 
 704         for (type = 0; type < DDT_TYPES; type++) {
 705                 for (class = 0; class < DDT_CLASSES; class++) {
 706                         error = ddt_object_lookup(ddt, type, class, dde);
 707                         if (error != ENOENT) {
 708                                 ASSERT0(error);
 709                                 break;
 710                         }
 711                 }
 712                 if (error != ENOENT)
 713                         break;
 714         }
 715 
 716         ddt_enter(ddt);
 717 
 718         ASSERT(dde->dde_loaded == B_FALSE);
 719         ASSERT(dde->dde_loading == B_TRUE);
 720 
 721         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 722         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 723         dde->dde_loaded = B_TRUE;
 724         dde->dde_loading = B_FALSE;
 725 
 726         if (error == 0)
 727                 ddt_stat_update(ddt, dde, -1ULL);
 728 
 729         cv_broadcast(&dde->dde_cv);
 730 
 731         return (dde);
 732 }
 733 
 734 void
 735 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 736 {
 737         ddt_t *ddt;
 738         ddt_entry_t dde;
 739 
 740         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 741                 return;
 742 
 743         /*
 744          * We only remove the DDT once all tables are empty and only
 745          * prefetch dedup blocks when there are entries in the DDT.
 746          * Thus no locking is required as the DDT can't disappear on us.
 747          */
 748         ddt = ddt_select(spa, bp);
 749         ddt_key_fill(&dde.dde_key, bp);
 750 
 751         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 752                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 753                         ddt_object_prefetch(ddt, type, class, &dde);
 754                 }
 755         }
 756 }
 757 
 758 int
 759 ddt_entry_compare(const void *x1, const void *x2)
 760 {
 761         const ddt_entry_t *dde1 = x1;
 762         const ddt_entry_t *dde2 = x2;
 763         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 764         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 765 
 766         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 767                 if (u1[i] < u2[i])
 768                         return (-1);
 769                 if (u1[i] > u2[i])
 770                         return (1);
 771         }
 772 
 773         return (0);
 774 }
 775 
 776 static ddt_t *
 777 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 778 {
 779         ddt_t *ddt;
 780 
 781         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 782 
 783         mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 784         avl_create(&ddt->ddt_tree, ddt_entry_compare,
 785             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 786         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 787             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 788         ddt->ddt_checksum = c;
 789         ddt->ddt_spa = spa;
 790         ddt->ddt_os = spa->spa_meta_objset;
 791 
 792         return (ddt);
 793 }
 794 
 795 static void
 796 ddt_table_free(ddt_t *ddt)
 797 {
 798         ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
 799         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 800         avl_destroy(&ddt->ddt_tree);
 801         avl_destroy(&ddt->ddt_repair_tree);
 802         mutex_destroy(&ddt->ddt_lock);
 803         kmem_free(ddt, sizeof (*ddt));
 804 }
 805 
 806 void
 807 ddt_create(spa_t *spa)
 808 {
 809         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 810 
 811         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 812                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 813 }
 814 
 815 int
 816 ddt_load(spa_t *spa)
 817 {
 818         int error;
 819 
 820         ddt_create(spa);
 821 
 822         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 823             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 824             &spa->spa_ddt_stat_object);
 825 
 826         if (error)
 827                 return (error == ENOENT ? 0 : error);
 828 
 829         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 830                 ddt_t *ddt = spa->spa_ddt[c];
 831                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 832                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 833                             class++) {
 834                                 error = ddt_object_load(ddt, type, class);
 835                                 if (error != 0 && error != ENOENT)
 836                                         return (error);
 837                         }
 838                 }
 839 
 840                 /*
 841                  * Seed the cached histograms.
 842                  */
 843                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 844                     sizeof (ddt->ddt_histogram));
 845         }
 846 
 847         return (0);
 848 }
 849 
 850 void
 851 ddt_unload(spa_t *spa)
 852 {
 853         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 854                 if (spa->spa_ddt[c]) {
 855                         ddt_table_free(spa->spa_ddt[c]);
 856                         spa->spa_ddt[c] = NULL;
 857                 }
 858         }
 859 }
 860 
 861 boolean_t
 862 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 863 {
 864         ddt_t *ddt;
 865         ddt_entry_t dde;
 866 
 867         if (!BP_GET_DEDUP(bp))
 868                 return (B_FALSE);
 869 
 870         if (max_class == DDT_CLASS_UNIQUE)
 871                 return (B_TRUE);
 872 
 873         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 874 
 875         ddt_key_fill(&dde.dde_key, bp);
 876 
 877         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 878                 for (enum ddt_class class = 0; class <= max_class; class++)
 879                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 880                                 return (B_TRUE);
 881 
 882         return (B_FALSE);
 883 }
 884 
 885 ddt_entry_t *
 886 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 887 {
 888         ddt_key_t ddk;
 889         ddt_entry_t *dde;
 890 
 891         ddt_key_fill(&ddk, bp);
 892 
 893         dde = ddt_alloc(&ddk);
 894 
 895         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 896                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 897                         /*
 898                          * We can only do repair if there are multiple copies
 899                          * of the block.  For anything in the UNIQUE class,
 900                          * there's definitely only one copy, so don't even try.
 901                          */
 902                         if (class != DDT_CLASS_UNIQUE &&
 903                             ddt_object_lookup(ddt, type, class, dde) == 0)
 904                                 return (dde);
 905                 }
 906         }
 907 
 908         bzero(dde->dde_phys, sizeof (dde->dde_phys));
 909 
 910         return (dde);
 911 }
 912 
 913 void
 914 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 915 {
 916         avl_index_t where;
 917 
 918         ddt_enter(ddt);
 919 
 920         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 921             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 922                 avl_insert(&ddt->ddt_repair_tree, dde, where);
 923         else
 924                 ddt_free(dde);
 925 
 926         ddt_exit(ddt);
 927 }
 928 
 929 static void
 930 ddt_repair_entry_done(zio_t *zio)
 931 {
 932         ddt_entry_t *rdde = zio->io_private;
 933 
 934         ddt_free(rdde);
 935 }
 936 
 937 static void
 938 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 939 {
 940         ddt_phys_t *ddp = dde->dde_phys;
 941         ddt_phys_t *rddp = rdde->dde_phys;
 942         ddt_key_t *ddk = &dde->dde_key;
 943         ddt_key_t *rddk = &rdde->dde_key;
 944         zio_t *zio;
 945         blkptr_t blk;
 946 
 
 955                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 956                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 957                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
 958                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 959         }
 960 
 961         zio_nowait(zio);
 962 }
 963 
 964 static void
 965 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 966 {
 967         spa_t *spa = ddt->ddt_spa;
 968         ddt_entry_t *dde, *rdde_next, *rdde;
 969         avl_tree_t *t = &ddt->ddt_repair_tree;
 970         blkptr_t blk;
 971 
 972         if (spa_sync_pass(spa) > 1)
 973                 return;
 974 
 975         ddt_enter(ddt);
 976         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 977                 rdde_next = AVL_NEXT(t, rdde);
 978                 avl_remove(&ddt->ddt_repair_tree, rdde);
 979                 ddt_exit(ddt);
 980                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 981                 dde = ddt_repair_start(ddt, &blk);
 982                 ddt_repair_entry(ddt, dde, rdde, rio);
 983                 ddt_repair_done(ddt, dde);
 984                 ddt_enter(ddt);
 985         }
 986         ddt_exit(ddt);
 987 }
 988 
 989 static void
 990 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 991 {
 992         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 993         ddt_phys_t *ddp = dde->dde_phys;
 994         ddt_key_t *ddk = &dde->dde_key;
 995         enum ddt_type otype = dde->dde_type;
 996         enum ddt_type ntype = DDT_TYPE_CURRENT;
 997         enum ddt_class oclass = dde->dde_class;
 998         enum ddt_class nclass;
 999         uint64_t total_refcnt = 0;
1000 
1001         ASSERT(dde->dde_loaded);
1002         ASSERT(!dde->dde_loading);
1003 
1004         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1005                 ASSERT(dde->dde_lead_zio[p] == NULL);
1006                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1007                 if (ddp->ddp_phys_birth == 0) {
1008                         ASSERT(ddp->ddp_refcnt == 0);
1009                         continue;
1010                 }
1011                 if (p == DDT_PHYS_DITTO) {
1012                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1013                                 ddt_phys_free(ddt, ddk, ddp, txg);
1014                         continue;
1015                 }
1016                 if (ddp->ddp_refcnt == 0)
1017                         ddt_phys_free(ddt, ddk, ddp, txg);
1018                 total_refcnt += ddp->ddp_refcnt;
1019         }
1020 
1021         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1022                 nclass = DDT_CLASS_DITTO;
1023         else if (total_refcnt > 1)
1024                 nclass = DDT_CLASS_DUPLICATE;
1025         else
1026                 nclass = DDT_CLASS_UNIQUE;
1027 
1028         if (otype != DDT_TYPES &&
1029             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1030                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1031                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1032         }
1033 
1034         if (total_refcnt != 0) {
1035                 dde->dde_type = ntype;
1036                 dde->dde_class = nclass;
1037                 ddt_stat_update(ddt, dde, 0);
1038                 if (!ddt_object_exists(ddt, ntype, nclass))
1039                         ddt_object_create(ddt, ntype, nclass, tx);
1040                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1041 
1042                 /*
1043                  * If the class changes, the order that we scan this bp
1044                  * changes.  If it decreases, we could miss it, so
1045                  * scan it right now.  (This covers both class changing
1046                  * while we are doing ddt_walk(), and when we are
1047                  * traversing.)
1048                  */
1049                 if (nclass < oclass) {
1050                         dsl_scan_ddt_entry(dp->dp_scan,
1051                             ddt->ddt_checksum, dde, tx);
1052                 }
1053         }
1054 }
1055 
1056 static void
1057 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1058 {
1059         spa_t *spa = ddt->ddt_spa;
1060         ddt_entry_t *dde;
1061         void *cookie = NULL;
1062 
1063         if (avl_numnodes(&ddt->ddt_tree) == 0)
1064                 return;
1065 
1066         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1067 
1068         if (spa->spa_ddt_stat_object == 0) {
1069                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1070                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1071                     DMU_POOL_DDT_STATS, tx);
1072         }
1073 
1074         while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1075                 ddt_sync_entry(ddt, dde, tx, txg);
1076                 ddt_free(dde);
1077         }
1078 
1079         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1080                 uint64_t count = 0;
1081                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1082                         if (ddt_object_exists(ddt, type, class)) {
1083                                 ddt_object_sync(ddt, type, class, tx);
1084                                 count += ddt_object_count(ddt, type, class);
1085                         }
1086                 }
1087                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1088                         if (count == 0 && ddt_object_exists(ddt, type, class))
1089                                 ddt_object_destroy(ddt, type, class, tx);
1090                 }
1091         }
1092 
1093         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1094             sizeof (ddt->ddt_histogram));
1095 }
1096 
1097 void
1098 ddt_sync(spa_t *spa, uint64_t txg)
1099 {
1100         dmu_tx_t *tx;
1101         zio_t *rio = zio_root(spa, NULL, NULL,
1102             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
1103 
1104         ASSERT(spa_syncing_txg(spa) == txg);
1105 
1106         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1107 
1108         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1109                 ddt_t *ddt = spa->spa_ddt[c];
1110                 if (ddt == NULL)
1111                         continue;
1112                 ddt_sync_table(ddt, tx, txg);
1113                 ddt_repair_table(ddt, rio);
1114         }
1115 
1116         (void) zio_wait(rio);
1117 
1118         dmu_tx_commit(tx);
1119 }
1120 
1121 int
1122 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 
 | 
 
 
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/ddt.h>
  33 #include <sys/zap.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/dsl_pool.h>
  37 #include <sys/zio_checksum.h>
  38 #include <sys/zio_compress.h>
  39 #include <sys/dsl_scan.h>
  40 #include <sys/abd.h>
  41 
  42 /*
  43  * Almost every iteration over the ZAP objects that hold DDT entries is
  44  * bounded by spa->spa_ddt_class_{min,max}. This makes a new behavior
  45  * possible: storing all entries in a single ZAP. A few places, however,
  46  * always iterate over every ZAP: table creation, deletion, loading, dde
  47  * prefetching, and lookup. Doing so maintains compatibility with old
  48  * pools and allows the old pool format to be converted into the new one
  49  * on-the-fly.
  50  */
  51 
  52 /*
  53  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
      * ddt_prefetch() returns without doing anything when this is zero.
  54  */
  55 int zfs_dedup_prefetch = 1;
  56 
     /*
      * Operations vector for each DDT storage type; the ddt_object_*()
      * routines in this file dispatch through ddt_ops[type].
      */
  57 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  58         &ddt_zap_ops,
  59 };
  60 
     /* One name string per DDT class (DDT_CLASSES entries). */
  61 static const char *ddt_class_name[DDT_CLASSES] = {
  62         "ditto",
  63         "duplicate",
  64         "unique",
  65 };
  66 
  67 /*
      * Possible in core size of all DDTs; recomputed by ddt_load() and
      * ddt_unload() via ddt_get_ddts_size(B_FALSE).
      */
  68 uint64_t zfs_ddts_msize = 0;
  69 
  70 static void
  71 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  72     dmu_tx_t *tx)
  73 {
  74         spa_t *spa = ddt->ddt_spa;
  75         objset_t *os = ddt->ddt_os;
  76         uint64_t *objectp = &ddt->ddt_object[type][class];
  77         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  78             ZCHECKSUM_FLAG_DEDUP;
  79         char name[DDT_NAMELEN];
  80 
  81         ddt_object_name(ddt, type, class, name);
  82 
  83         ASSERT(*objectp == 0);
  84         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  85         ASSERT(*objectp != 0);
  86 
  87         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  88             sizeof (uint64_t), 1, objectp, tx) == 0);
  89 
  90         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  91             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  92             &ddt->ddt_histogram[type][class], tx) == 0);
  93 }
  94 
  95 static void
  96 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  97     dmu_tx_t *tx)
  98 {
  99         spa_t *spa = ddt->ddt_spa;
 100         objset_t *os = ddt->ddt_os;
 101         uint64_t *objectp = &ddt->ddt_object[type][class];
 102         char name[DDT_NAMELEN];
 103 #if DEBUG
 104         uint64_t count;
 105 #endif
 106         ddt_object_name(ddt, type, class, name);
 107 
 108         ASSERT(*objectp != 0);
 109         ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
 110             (count == 0));
 111         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 112         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
 113         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
 114         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
 115         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
 116 
 117         *objectp = 0;
 118 }
 119 
 120 static int
 121 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 122 {
 123         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 124         dmu_object_info_t doi;
 125         char name[DDT_NAMELEN];
 126         int error;
 127 
 128         ddt_object_name(ddt, type, class, name);
 129 
 130         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 131             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 132         if (error)
 133                 return (error);
 134 
 135         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 136             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 137             &ddt->ddt_histogram[type][class]));
 138 
 139         /*
 140          * Seed the cached statistics.
 141          */
 142         error = ddt_object_info(ddt, type, class, &doi);
 143         /* Panic in debug mode */
 144         ASSERT(error == 0);
 145         if (error)
 146                 return (error);
 147         error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
 148         if (error)
 149                 return (error);
 150         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 151         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 152 
 153         return (0);
 154 }
 155 
 156 static void
 157 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 158     dmu_tx_t *tx)
 159 {
 160         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 161         dmu_object_info_t doi;
 162         char name[DDT_NAMELEN];
 163 
 164         ddt_object_name(ddt, type, class, name);
 165 
 166         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 167             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 168             &ddt->ddt_histogram[type][class], tx) == 0);
 169 
 170         /*
 171          * Cache DDT statistics; this is the only time they'll change.
 172          */
 173         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 174 
 175         (void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
 176         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 177         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 178 }
 179 
 180 static int
 181 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 182     ddt_entry_t *dde)
 183 {
 184         if (!ddt_object_exists(ddt, type, class))
 185                 return (SET_ERROR(ENOENT));
 186 
 187         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 188             ddt->ddt_object[type][class], dde));
 189 }
 190 
 191 static void
 192 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde)
 194 {
 195         if (!ddt_object_exists(ddt, type, class))
 
 212 static int
 213 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 214     ddt_entry_t *dde, dmu_tx_t *tx)
 215 {
 216         ASSERT(ddt_object_exists(ddt, type, class));
 217 
 218         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 219             ddt->ddt_object[type][class], dde, tx));
 220 }
 221 
 222 int
 223 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 224     uint64_t *walk, ddt_entry_t *dde)
 225 {
 226         ASSERT(ddt_object_exists(ddt, type, class));
 227 
 228         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 229             ddt->ddt_object[type][class], dde, walk));
 230 }
 231 
 232 int
 233 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 234         uint64_t *count)
 235 {
 236         ASSERT(ddt_object_exists(ddt, type, class));
 237 
 238         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 239             ddt->ddt_object[type][class], count));
 240 }
 241 
 242 int
 243 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 244     dmu_object_info_t *doi)
 245 {
 246         if (!ddt_object_exists(ddt, type, class))
 247                 return (SET_ERROR(ENOENT));
 248 
 249         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 250             doi));
 251 }
 252 
 253 boolean_t
 254 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 255 {
 256         return (!!ddt->ddt_object[type][class]);
 257 }
 258 
 259 void
 
 355         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 356                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 357                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 358                         return (ddp);
 359         }
 360         return (NULL);
 361 }
 362 
 363 uint64_t
 364 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 365 {
 366         uint64_t refcnt = 0;
 367 
 368         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 369                 refcnt += dde->dde_phys[p].ddp_refcnt;
 370 
 371         return (refcnt);
 372 }
 373 
 374 static void
 375 ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
 376 {
 377         ddt_phys_t *ddp = dde->dde_phys;
 378         ddt_key_t *ddk = &dde->dde_key;
 379         uint64_t lsize = DDK_GET_LSIZE(ddk);
 380         uint64_t psize = DDK_GET_PSIZE(ddk);
 381 
 382         bzero(dds, sizeof (*dds));
 383 
 384         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 385                 uint64_t dsize = 0;
 386                 uint64_t refcnt = ddp->ddp_refcnt;
 387 
 388                 if (ddp->ddp_phys_birth == 0)
 389                         continue;
 390 
 391                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 392                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 393 
 394                 dds->dds_blocks += 1;
 395                 dds->dds_lsize += lsize;
 396                 dds->dds_psize += psize;
 
 400                 dds->dds_ref_lsize += lsize * refcnt;
 401                 dds->dds_ref_psize += psize * refcnt;
 402                 dds->dds_ref_dsize += dsize * refcnt;
 403         }
 404 }
 405 
 406 void
 407 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 408 {
 409         const uint64_t *s = (const uint64_t *)src;
 410         uint64_t *d = (uint64_t *)dst;
 411         uint64_t *d_end = (uint64_t *)(dst + 1);
 412 
 413         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 414 
 415         while (d < d_end)
 416                 *d++ += (*s++ ^ neg) - neg;
 417 }
 418 
 419 static void
 420 ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
 421     ddt_stat_t *dds, uint64_t neg)
 422 {
 423         ddt_histogram_t *ddh;
 424         int bucket = highbit64(dds->dds_ref_blocks) - 1;
 425         ASSERT(bucket >= 0);
 426 
 427         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 428         ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
 429 }
 430 
 431 static void
 432 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 433 {
 434         ddt_stat_t dds;
 435 
 436         ddt_stat_generate(ddt->ddt_spa, dde, &dds);
 437 
 438         ddt_stat_update_by_dds(ddt, dde, &dds, neg);
 439 }
 440 
 441 void
 442 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 443 {
 444         for (int h = 0; h < 64; h++)
 445                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 446 }
 447 
 448 void
 449 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 450 {
 451         bzero(dds, sizeof (*dds));
 452 
 453         for (int h = 0; h < 64; h++)
 454                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 455 }
 456 
 457 boolean_t
 458 ddt_histogram_empty(const ddt_histogram_t *ddh)
 459 {
 460         const uint64_t *s = (const uint64_t *)ddh;
 461         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 462 
 463         while (s < s_end)
 464                 if (*s++ != 0)
 465                         return (B_FALSE);
 466 
 467         return (B_TRUE);
 468 }
 469 
 470 void
 471 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 472 {
 473         /* Sum the statistics we cached in ddt_object_sync(). */
 474         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 475                 ddt_t *ddt = spa->spa_ddt[c];
 476                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 477                         for (enum ddt_class class = spa->spa_ddt_class_min;
 478                             class <= spa->spa_ddt_class_max; class++) {
 479                                 ddt_object_t *ddo =
 480                                     &ddt->ddt_object_stats[type][class];
 481                                 ddo_total->ddo_count += ddo->ddo_count;
 482                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 483                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 484                         }
 485                 }
 486         }
 487 
 488         /* ... and compute the averages. */
 489         if (ddo_total->ddo_count != 0) {
 490                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 491                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 492         }
 493 }
 494 
 495 void
 496 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 497 {
 498         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 499                 ddt_t *ddt = spa->spa_ddt[c];
 500                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 501                         for (enum ddt_class class = spa->spa_ddt_class_min;
 502                             class <= spa->spa_ddt_class_max; class++) {
 503                                 ddt_histogram_add(ddh,
 504                                     &ddt->ddt_histogram_cache[type][class]);
 505                         }
 506                 }
 507         }
 508 }
 509 
 510 void
 511 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 512 {
 513         /*
 514          * Avoid temporary allocation of ddt_histogram_t from heap
 515          * or on stack (probably too large) by unrolling ddt_histogram_add()
 516          */
 517         bzero(dds_total, sizeof (ddt_stat_t));
 518         /* sum up the stats across all the histograms */
 519         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 520                 ddt_t *ddt = spa->spa_ddt[c];
 521                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 522                         for (enum ddt_class class = spa->spa_ddt_class_min;
 523                             class <= spa->spa_ddt_class_max; class++) {
 524                                 /* unroll the ddt_histogram_add() */
 525                                 ddt_histogram_t *src =
 526                                     &ddt->ddt_histogram_cache[type][class];
 527                                 for (int h = 0; h < 64; h++) {
 528                                         ddt_stat_t *st = &src->ddh_stat[h];
 529                                         ddt_stat_add(dds_total, st, 0);
 530                                 }
 531                         }
 532                 }
 533         }
 534 }
 535 
 536 uint64_t
 537 ddt_get_dedup_dspace(spa_t *spa)
 538 {
 539         ddt_stat_t dds_total = { 0 };
 540 
 541         ddt_get_dedup_stats(spa, &dds_total);
 542         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 543 }
 544 
 545 uint64_t
 546 ddt_get_pool_dedup_ratio(spa_t *spa)
 547 {
 548         ddt_stat_t dds_total = { 0 };
 549 
 550         ddt_get_dedup_stats(spa, &dds_total);
 551         if (dds_total.dds_dsize == 0)
 552                 return (100);
 553 
 
 644                 bcopy(src, dst, d_len);
 645 
 646         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 647             (ZFS_HOST_BYTEORDER != 0))
 648                 byteswap_uint64_array(dst, d_len);
 649 }
 650 
 651 ddt_t *
 652 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 653 {
 654         return (spa->spa_ddt[c]);
 655 }
 656 
 657 ddt_t *
 658 ddt_select(spa_t *spa, const blkptr_t *bp)
 659 {
 660         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 661 }
 662 
 663 void
 664 ddt_enter(ddt_t *ddt, uint8_t hash)
 665 {
 666         mutex_enter(&ddt->ddt_lock[hash]);
 667 }
 668 
 669 void
 670 ddt_exit(ddt_t *ddt, uint8_t hash)
 671 {
 672         mutex_exit(&ddt->ddt_lock[hash]);
 673 }
 674 
 675 void
 676 dde_enter(ddt_entry_t *dde)
 677 {
 678         mutex_enter(&dde->dde_lock);
 679 }
 680 
 681 void
 682 dde_exit(ddt_entry_t *dde)
 683 {
 684         mutex_exit(&dde->dde_lock);
 685 }
 686 
 687 /*
      * kmem cache for ddt_entry_t structures: created by ddt_init(),
      * destroyed by ddt_fini(), and used by ddt_alloc() and ddt_free().
      */
 688 static kmem_cache_t *dde_cache;
 689 
 690 /* ARGSUSED */
 691 static int
 692 dde_cache_constr(void *buf, void *arg, int flags)
 693 {
 694         ddt_entry_t *dde = (ddt_entry_t *)buf;
 695         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 696         mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
 697         return (0);
 698 }
 699 
 700 /* ARGSUSED */
 701 static void
 702 dde_cache_destr(void *buf, void *arg)
 703 {
 704         ddt_entry_t *dde = (ddt_entry_t *)buf;
 705         cv_destroy(&dde->dde_cv);
 706         mutex_destroy(&dde->dde_lock);
 707 }
 708 
 709 void
 710 ddt_init(void)
 711 {
 712         dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
 713             0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
 714         VERIFY(dde_cache != NULL);
 715 }
 716 
 717 void
 718 ddt_fini(void)
 719 {
 720         if (dde_cache) {
 721                 kmem_cache_destroy(dde_cache);
 722                 dde_cache = NULL;
 723         }
 724 }
 725 
 726 static ddt_entry_t *
 727 ddt_alloc(const ddt_key_t *ddk)
 728 {
 729         ddt_entry_t *dde;
 730 
 731         dde = kmem_cache_alloc(dde_cache, KM_SLEEP);
 732 
 733         /* Init everything but the condvar and the mutex */
 734         dde->dde_key = *ddk;
 735         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_phys)),
 736             offsetof(ddt_entry_t, dde_cv)-offsetof(ddt_entry_t, dde_phys));
 737         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_node)),
 738             sizeof (avl_node_t));
 739 
 740         return (dde);
 741 }
 742 
 743 static void
 744 ddt_free(ddt_entry_t *dde)
 745 {
 746         ASSERT(!(dde->dde_state & DDE_LOADING));
 747 
 748         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 749                 ASSERT(dde->dde_lead_zio[p] == NULL);
 750 
 751         if (dde->dde_repair_abd != NULL)
 752                 abd_free(dde->dde_repair_abd);
 753 
 754         kmem_cache_free(dde_cache, dde);
 755 }
 756 
 757 /* for zdb usage */
 758 void
 759 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 760 {
 761         uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);
 762 
 763         avl_remove(&ddt->ddt_tree[hash], dde);
 764         ddt_free(dde);
 765 }
 766 
 /*
  * Find the in-core entry for the key of bp, optionally creating it, and
  * make sure its on-disk contents have been looked up.
  *
  * The entry is searched for in the AVL tree of the key's hash bucket while
  * holding that bucket's lock.  If it is absent, NULL is returned when add
  * is false; otherwise a fresh entry is allocated and inserted.
  *
  * The first thread to reach an entry that is not yet DDE_LOADED marks it
  * DDE_LOADING and searches every (type, class) object for it without
  * holding the entry lock; other threads wait on dde_cv until the load
  * finishes.  An entry found in no object is flagged DDE_NEW.
  *
  * Both non-NULL return paths leave dde_lock held: no dde_exit() follows
  * the last dde_enter().  Presumably the caller releases it with
  * dde_exit() -- confirm against the callers.
  */
 767 ddt_entry_t *
 768 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 769 {
 770         ddt_entry_t *dde, dde_search;
 771         enum ddt_type type;
 772         enum ddt_class class;
 773         avl_index_t where;
 774         uint8_t hash = DDT_HASHFN(bp->blk_cksum);
 775         int error;
 776 
 777         ddt_key_fill(&dde_search.dde_key, bp);
 778 
 779         ddt_enter(ddt, hash);
 780         /*
 781          * Do we have the dirty DDE in mem already?
 782          */
 783         dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
 784         if (dde == NULL) {
 785                 /* This DDE doesn't exist in the dirty tree */
 786                 if (!add) {
 787                         ddt_exit(ddt, hash);
 788                         return (NULL);
 789                 }
 790                 /* Since a dirty DDE didn't exist, create it */
 791                 dde = ddt_alloc(&dde_search.dde_key);
 792                 avl_insert(&ddt->ddt_tree[hash], dde, where);
 793         }
 794 
 795         ddt_exit(ddt, hash);
 796 
 797         /*
 798          * If we're already looking up this DDE
 799          * wait until we have the result
 800          */
 801         dde_enter(dde);
 802         while (dde->dde_state & DDE_LOADING)
 803                 cv_wait(&dde->dde_cv, &dde->dde_lock);
 804 
 805         /*
 806          * If we have loaded the DDE from disk return it
              * (with dde_lock still held)
 807          */
 808         if (dde->dde_state & DDE_LOADED)
 809                 return (dde);
 810 
 811         /*
 812          * If we didn't find this DDE, start looking up the DDE in ZAP
              * The entry lock is dropped for the duration of the lookup;
              * DDE_LOADING makes concurrent callers wait above.
 813          */
 814         dde->dde_state |= DDE_LOADING;
 815         dde_exit(dde);
 816 
 817         error = ENOENT;
 818 
 819         DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
             /*
              * Search every object of every type, stopping at the first
              * result other than ENOENT.  When nothing is found both loops
              * run to completion, leaving type == DDT_TYPES and
              * class == DDT_CLASSES.
              */
 820         for (type = 0; type < DDT_TYPES; type++) {
 821                 for (class = 0; class < DDT_CLASSES; class++) {
 822                         error = ddt_object_lookup(ddt, type, class, dde);
 823                         if (error != ENOENT)
 824                                 break;
 825                 }
 826                 if (error != ENOENT)
 827                         break;
 828         }
 829 
 830         ASSERT(error == 0 || error == ENOENT);
 831 
 832         dde_enter(dde);
 833 
 834         ASSERT(!(dde->dde_state & DDE_LOADED));
 835         ASSERT(dde->dde_state & DDE_LOADING);
 836 
 837         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 838         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 839         if (type == DDT_TYPES && class == DDT_CLASSES)
 840                 dde->dde_state |= DDE_NEW;
 841         dde->dde_state |= DDE_LOADED;
 842         dde->dde_state &= ~DDE_LOADING;
 843 
 844         DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
 845             enum ddt_class, dde->dde_class);
             /* Remember the stats of the entry as found on disk. */
 846         if (error == 0)
 847                 ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);
 848 
             /* Wake every thread waiting for this load to finish. */
 849         cv_broadcast(&dde->dde_cv);
 850 
 851         return (dde);
 852 }
 853 
 854 void
 855 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 856 {
 857         ddt_t *ddt;
 858         ddt_entry_t dde;
 859 
 860         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 861                 return;
 862 
 863         /*
 864          * We only remove the DDT once all tables are empty and only
 865          * prefetch dedup blocks when there are entries in the DDT.
 866          * Thus no locking is required as the DDT can't disappear on us.
 867          */
 868         ddt = ddt_select(spa, bp);
 869         ddt_key_fill(&dde.dde_key, bp);
 870 
 871         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 872                 for (enum ddt_class class = 0;
 873                     class < DDT_CLASSES; class++) {
 874                         ddt_object_prefetch(ddt, type, class, &dde);
 875                 }
 876         }
 877 }
 878 
 879 int
 880 ddt_entry_compare(const void *x1, const void *x2)
 881 {
 882         const ddt_entry_t *dde1 = x1;
 883         const ddt_entry_t *dde2 = x2;
 884         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 885         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 886 
 887         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 888                 if (u1[i] < u2[i])
 889                         return (-1);
 890                 if (u1[i] > u2[i])
 891                         return (1);
 892         }
 893 
 894         return (0);
 895 }
 896 
 897 static ddt_t *
 898 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 899 {
 900         ddt_t *ddt;
 901         uint_t i;
 902 
 903         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 904 
 905         for (i = 0; i < DDT_HASHSZ; i++) {
 906                 mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
 907                 avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
 908                     sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 909         }
 910         mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
 911 
 912         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 913             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 914         ddt->ddt_checksum = c;
 915         ddt->ddt_spa = spa;
 916         ddt->ddt_os = spa->spa_meta_objset;
 917 
 918         return (ddt);
 919 }
 920 
 921 static void
 922 ddt_table_free(ddt_t *ddt)
 923 {
 924         uint_t i;
 925 
 926         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 927 
 928         for (i = 0; i < DDT_HASHSZ; i++) {
 929                 ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
 930                 avl_destroy(&ddt->ddt_tree[i]);
 931                 mutex_destroy(&ddt->ddt_lock[i]);
 932         }
 933         avl_destroy(&ddt->ddt_repair_tree);
 934         mutex_destroy(&ddt->ddt_repair_lock);
 935         kmem_free(ddt, sizeof (*ddt));
 936 }
 937 
 938 void
 939 ddt_create(spa_t *spa)
 940 {
 941         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 942 
 943         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 944                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 945 }
 946 
 947 /*
 948  * Get the combined size of DDTs on all pools.
 949  * Returns either on disk (phys == B_TRUE) or in core combined DDTs size
 950  */
 951 uint64_t
 952 ddt_get_ddts_size(boolean_t phys)
 953 {
 954         uint64_t ddts_size = 0;
 955         spa_t *spa = NULL;
 956 
 957         while ((spa = spa_next(spa)) != NULL)
 958                 ddts_size += spa_get_ddts_size(spa, phys);
 959 
 960         return (ddts_size);
 961 }
 962 
 963 int
 964 ddt_load(spa_t *spa)
 965 {
 966         int error;
 967         ddt_object_t *ddo;
 968 
 969         ddt_create(spa);
 970 
 971         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 972             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 973             &spa->spa_ddt_stat_object);
 974 
 975         if (error)
 976                 return (error == ENOENT ? 0 : error);
 977 
 978         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 979                 ddt_t *ddt = spa->spa_ddt[c];
 980                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 981                         for (enum ddt_class class = 0;
 982                             class < DDT_CLASSES; class++) {
 983                                 error = ddt_object_load(ddt, type, class);
 984                                 if (error == ENOENT)
 985                                         continue;
 986                                 if (error != 0)
 987                                         return (error);
 988                                 ddo = &ddt->ddt_object_stats[type][class];
 989                                 atomic_add_64(&spa->spa_ddt_dsize,
 990                                     ddo->ddo_dspace);
 991                                 atomic_add_64(&spa->spa_ddt_msize,
 992                                     ddo->ddo_mspace);
 993                         }
 994                 }
 995 
 996                 /*
 997                  * Seed the cached histograms.
 998                  */
 999                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1000                     sizeof (ddt->ddt_histogram));
1001         }
1002         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1003 
1004         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1005                 /* notify that dedup cap is now active */
1006                 spa->spa_ddt_capped = 1;
1007                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1008         }
1009 
1010         return (0);
1011 }
1012 
1013 void
1014 ddt_unload(spa_t *spa)
1015 {
1016         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1017                 if (spa->spa_ddt[c]) {
1018                         ddt_table_free(spa->spa_ddt[c]);
1019                         spa->spa_ddt[c] = NULL;
1020                 }
1021         }
1022         spa->spa_ddt_dsize = 0;
1023         spa->spa_ddt_msize = 0;
1024         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1025 }
1026 
1027 boolean_t
1028 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
1029 {
1030         ddt_t *ddt;
1031         ddt_entry_t dde;
1032 
1033         if (!BP_GET_DEDUP(bp))
1034                 return (B_FALSE);
1035 
1036         if (max_class > spa->spa_ddt_class_max)
1037                 max_class = spa->spa_ddt_class_max;
1038 
1039         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1040 
1041         ddt_key_fill(&dde.dde_key, bp);
1042 
1043         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
1044                 for (enum ddt_class class = spa->spa_ddt_class_min;
1045                     class <= max_class; class++)
1046                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
1047                                 return (B_TRUE);
1048 
1049         return (B_FALSE);
1050 }
1051 
1052 ddt_entry_t *
1053 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1054 {
1055         ddt_key_t ddk;
1056         ddt_entry_t *dde;
1057 
1058         ddt_key_fill(&ddk, bp);
1059 
1060         dde = ddt_alloc(&ddk);
1061 
1062         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1063                 for (enum ddt_class class = 0;
1064                     class < DDT_CLASSES; class++) {
1065                         /*
1066                          * We can only do repair if there are multiple copies
1067                          * of the block.  For anything in the UNIQUE class,
1068                          * there's definitely only one copy, so don't even try.
1069                          */
1070                         if (class != DDT_CLASS_UNIQUE &&
1071                             ddt_object_lookup(ddt, type, class, dde) == 0)
1072                                 return (dde);
1073                 }
1074         }
1075 
1076         bzero(dde->dde_phys, sizeof (dde->dde_phys));
1077 
1078         return (dde);
1079 }
1080 
1081 void
1082 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1083 {
1084         avl_index_t where;
1085 
1086         mutex_enter(&ddt->ddt_repair_lock);
1087 
1088         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
1089             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1090                 avl_insert(&ddt->ddt_repair_tree, dde, where);
1091         else
1092                 ddt_free(dde);
1093 
1094         mutex_exit(&ddt->ddt_repair_lock);;
1095 }
1096 
1097 static void
1098 ddt_repair_entry_done(zio_t *zio)
1099 {
1100         ddt_entry_t *rdde = zio->io_private;
1101 
1102         ddt_free(rdde);
1103 }
1104 
1105 static void
1106 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1107 {
1108         ddt_phys_t *ddp = dde->dde_phys;
1109         ddt_phys_t *rddp = rdde->dde_phys;
1110         ddt_key_t *ddk = &dde->dde_key;
1111         ddt_key_t *rddk = &rdde->dde_key;
1112         zio_t *zio;
1113         blkptr_t blk;
1114 
 
1123                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1124                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1125                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
1126                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
1127         }
1128 
1129         zio_nowait(zio);
1130 }
1131 
1132 static void
1133 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1134 {
1135         spa_t *spa = ddt->ddt_spa;
1136         ddt_entry_t *dde, *rdde_next, *rdde;
1137         avl_tree_t *t = &ddt->ddt_repair_tree;
1138         blkptr_t blk;
1139 
1140         if (spa_sync_pass(spa) > 1)
1141                 return;
1142 
1143         mutex_enter(&ddt->ddt_repair_lock);
1144         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1145                 rdde_next = AVL_NEXT(t, rdde);
1146                 avl_remove(&ddt->ddt_repair_tree, rdde);
1147                 mutex_exit(&ddt->ddt_repair_lock);
1148 
1149                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
1150                 dde = ddt_repair_start(ddt, &blk);
1151                 ddt_repair_entry(ddt, dde, rdde, rio);
1152                 ddt_repair_done(ddt, dde);
1153 
1154                 mutex_enter(&ddt->ddt_repair_lock);
1155         }
1156         mutex_exit(&ddt->ddt_repair_lock);
1157 }
1158 
1159 static void
1160 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
1161 {
1162         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
1163         ddt_phys_t *ddp = dde->dde_phys;
1164         ddt_key_t *ddk = &dde->dde_key;
1165         spa_t *spa = ddt->ddt_spa;
1166         enum ddt_type otype = dde->dde_type;
1167         enum ddt_type ntype = DDT_TYPE_CURRENT;
1168         enum ddt_class oclass = dde->dde_class;
1169         enum ddt_class nclass;
1170         uint64_t total_refcnt = 0;
1171 
1172         ASSERT(dde->dde_state & DDE_LOADED);
1173         ASSERT(!(dde->dde_state & DDE_LOADING));
1174 
1175         /*
1176          * Propagate the stats generated at lookup time
1177          * this was delayed to avoid having to take locks
1178          * to protect ddt->ddt_histogram
1179          */
1180         if (dde->dde_lkstat.dds_ref_blocks != 0)
1181                 ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);
1182 
1183         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1184                 ASSERT(dde->dde_lead_zio[p] == NULL);
1185                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1186                 if (ddp->ddp_phys_birth == 0) {
1187                         ASSERT(ddp->ddp_refcnt == 0);
1188                         continue;
1189                 }
1190                 if (p == DDT_PHYS_DITTO) {
1191                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1192                                 ddt_phys_free(ddt, ddk, ddp, txg);
1193                         continue;
1194                 }
1195                 if (ddp->ddp_refcnt == 0)
1196                         ddt_phys_free(ddt, ddk, ddp, txg);
1197                 total_refcnt += ddp->ddp_refcnt;
1198         }
1199 
1200         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1201                 nclass = DDT_CLASS_DITTO;
1202         else if (total_refcnt > 1)
1203                 nclass = DDT_CLASS_DUPLICATE;
1204         else
1205                 nclass = DDT_CLASS_UNIQUE;
1206 
1207         if (nclass > spa->spa_ddt_class_max)
1208                 nclass = spa->spa_ddt_class_max;
1209 
1210         if (nclass < spa->spa_ddt_class_min)
1211                 nclass = spa->spa_ddt_class_min;
1212 
1213         DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);
1214 
1215         if (otype != DDT_TYPES &&
1216             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1217                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1218                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1219         }
1220 
1221         if (total_refcnt != 0) {
1222                 dde->dde_type = ntype;
1223                 dde->dde_class = nclass;
1224                 ddt_stat_update(ddt, dde, 0);
1225                 if (!ddt_object_exists(ddt, ntype, nclass))
1226                         ddt_object_create(ddt, ntype, nclass, tx);
1227                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1228 
1229                 /*
1230                  * If the class changes, the order that we scan this bp
1231                  * changes.  If it decreases, we could miss it, so
1232                  * scan it right now.  (This covers both class changing
1233                  * while we are doing ddt_walk(), and when we are
1234                  * traversing.)
1235                  */
1236                 if (nclass < oclass) {
1237                         dsl_scan_ddt_entry(dp->dp_scan,
1238                             ddt->ddt_checksum, dde, tx);
1239                 }
1240         }
1241         DTRACE_PROBE(ddt__stored__entry);
1242 }
1243 
1244 static void
1245 ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
1246 {
1247         void *cookie = NULL;
1248         ddt_entry_t *dde;
1249 
1250         while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
1251                 if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
1252                         ddt_sync_entry(ddt, dde, tx, txg);
1253                 } else { /* if we're not syncing this DDE it must be new */
1254                         ASSERT(dde->dde_state & DDE_NEW);
1255                 }
1256                 ddt_free(dde);
1257         }
1258 }
1259 
1260 static void
1261 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1262 {
1263         uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
1264         int64_t old_mbytes = 0;
1265         spa_t *spa = ddt->ddt_spa;
1266         uint_t i, numnodes = 0;
1267         ddt_object_t *ddo;
1268 
1269         for (i = 0; i < DDT_HASHSZ; i++)
1270                 numnodes += avl_numnodes(&ddt->ddt_tree[i]);
1271 
1272         if (numnodes == 0)
1273                 return;
1274 
1275         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1276 
1277         if (spa->spa_ddt_stat_object == 0) {
1278                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1279                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1280                     DMU_POOL_DDT_STATS, tx);
1281         }
1282 
1283 
1284         DTRACE_PROBE(ddt__syncing__avl);
1285         for (i = 0; i < DDT_HASHSZ; i++)
1286                 ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
1287         DTRACE_PROBE(ddt__synced__avl);
1288 
1289         DTRACE_PROBE(ddt__syncing__obj);
1290         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1291                 for (enum ddt_class class = spa->spa_ddt_class_min;
1292                     class <= spa->spa_ddt_class_max; class++) {
1293                         if (ddt_object_exists(ddt, type, class)) {
1294                                 ddo = &ddt->ddt_object_stats[type][class];
1295                                 old_mbytes += ddo->ddo_mspace;
1296 
1297                                 ddt_object_sync(ddt, type, class, tx);
1298                                 (void) ddt_object_count(ddt, type, class, &cnt);
1299                                 if (cnt == 0) {
1300                                         ddt_object_destroy(ddt, type, class,
1301                                             tx);
1302                                         continue;
1303                                 }
1304 
1305                                 num_dbytes += ddo->ddo_dspace;
1306                                 num_mbytes += ddo->ddo_mspace;
1307                         }
1308                 }
1309         }
1310         spa->spa_ddt_dsize = num_dbytes;
1311         spa->spa_ddt_msize = num_mbytes;
1312         atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
1313         DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
1314             uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
1315             zfs_ddts_msize);
1316 
1317         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1318                 /* notify that dedup cap is now active */
1319                 spa->spa_ddt_capped = 1;
1320                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1321         } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
1322                 /* notify that dedup cap is now inactive */
1323                 spa->spa_ddt_capped = 0;
1324                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
1325         }
1326 
1327         /* update the cached stats with the values calculated above */
1328         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1329             sizeof (ddt->ddt_histogram));
1330 }
1331 
1332 void
1333 ddt_sync(spa_t *spa, uint64_t txg)
1334 {
1335         dmu_tx_t *tx;
1336         zio_t *rio = zio_root(spa, NULL, NULL,
1337             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1338 
1339         ASSERT(spa_syncing_txg(spa) == txg);
1340 
1341         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1342 
1343         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1344                 ddt_t *ddt = spa->spa_ddt[c];
1345                 if (ddt == NULL)
1346                         continue;
1347                 ddt_sync_table(ddt, tx, txg);
1348                 ddt_repair_table(ddt, rio);
1349         }
1350 
1351         (void) zio_wait(rio);
1352 
1353         dmu_tx_commit(tx);
1354 }
1355 
1356 int
1357 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 
 |