Print this page
NEX-5856 ddt_capped isn't reset when deduped dataset is destroyed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (fix studio build)
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-3165 need some dedup improvements
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-3211 mismerge ddt_repair_start()
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Issue #2: optimize DDE lookup in DDT objects
Added option to control number of classes of DDE's in DDT.
New default is one, that is all DDE's are stored together
regardless of refcount.
re #12611 rb4105 zpool import panic in ddt_zap_count()
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint -courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.

  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/ddt.h>
  32 #include <sys/zap.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/arc.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/zio_compress.h>
  38 #include <sys/dsl_scan.h>
  39 #include <sys/abd.h>
  40 
  41 /*










  42  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  43  */
  44 int zfs_dedup_prefetch = 1;
  45 
  46 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
             /*
              * Operation vectors for the DDT storage implementations.  The
              * functions in this file index this table with an enum ddt_type
              * value (ddt_ops[type]->ddt_op_*).
              */
  47         &ddt_zap_ops,
  48 };
  49 
  50 static const char *ddt_class_name[DDT_CLASSES] = {
             /*
              * One printable name per DDT class.
              * NOTE(review): presumably indexed by enum ddt_class; the
              * consumer is not visible in this part of the file -- confirm.
              */
  51         "ditto",
  52         "duplicate",
  53         "unique",
  54 };
  55 



  56 static void
  57 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  58     dmu_tx_t *tx)
  59 {
  60         spa_t *spa = ddt->ddt_spa;
  61         objset_t *os = ddt->ddt_os;
  62         uint64_t *objectp = &ddt->ddt_object[type][class];
  63         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  64             ZCHECKSUM_FLAG_DEDUP;
  65         char name[DDT_NAMELEN];
  66 
  67         ddt_object_name(ddt, type, class, name);
  68 
  69         ASSERT(*objectp == 0);
  70         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  71         ASSERT(*objectp != 0);
  72 
  73         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  74             sizeof (uint64_t), 1, objectp, tx) == 0);
  75 
  76         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  77             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  78             &ddt->ddt_histogram[type][class], tx) == 0);
  79 }
  80 
  81 static void
  82 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  83     dmu_tx_t *tx)
  84 {
  85         spa_t *spa = ddt->ddt_spa;
  86         objset_t *os = ddt->ddt_os;
  87         uint64_t *objectp = &ddt->ddt_object[type][class];
  88         char name[DDT_NAMELEN];
  89 


  90         ddt_object_name(ddt, type, class, name);
  91 
  92         ASSERT(*objectp != 0);
  93         ASSERT(ddt_object_count(ddt, type, class) == 0);

  94         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  95         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  96         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  97         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  98         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  99 
 100         *objectp = 0;
 101 }
 102 
 103 static int
 104 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 105 {
 106         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 107         dmu_object_info_t doi;
 108         char name[DDT_NAMELEN];
 109         int error;
 110 
 111         ddt_object_name(ddt, type, class, name);
 112 
 113         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 114             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 115 
 116         if (error != 0)
 117                 return (error);
 118 
 119         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 120             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 121             &ddt->ddt_histogram[type][class]));
 122 
 123         /*
 124          * Seed the cached statistics.
 125          */
 126         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 127 
 128         ddo->ddo_count = ddt_object_count(ddt, type, class);





 129         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 130         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 131 
 132         return (0);
 133 }
 134 
 135 static void
 136 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 137     dmu_tx_t *tx)
 138 {
 139         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 140         dmu_object_info_t doi;
 141         char name[DDT_NAMELEN];
 142 
 143         ddt_object_name(ddt, type, class, name);
 144 
 145         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 146             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 147             &ddt->ddt_histogram[type][class], tx) == 0);
 148 
 149         /*
 150          * Cache DDT statistics; this is the only time they'll change.
 151          */
 152         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 153 
 154         ddo->ddo_count = ddt_object_count(ddt, type, class);
 155         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 156         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 157 }
 158 
 159 static int
 160 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 161     ddt_entry_t *dde)
 162 {
 163         if (!ddt_object_exists(ddt, type, class))
 164                 return (SET_ERROR(ENOENT));
 165 
 166         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 167             ddt->ddt_object[type][class], dde));
 168 }
 169 
 170 static void
 171 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 172     ddt_entry_t *dde)
 173 {
 174         if (!ddt_object_exists(ddt, type, class))


 191 static int
 192 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde, dmu_tx_t *tx)
 194 {
 195         ASSERT(ddt_object_exists(ddt, type, class));
 196 
 197         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 198             ddt->ddt_object[type][class], dde, tx));
 199 }
 200 
 201 int
 202 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 203     uint64_t *walk, ddt_entry_t *dde)
 204 {
 205         ASSERT(ddt_object_exists(ddt, type, class));
 206 
 207         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 208             ddt->ddt_object[type][class], dde, walk));
 209 }
 210 
 211 uint64_t
 212 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)

 213 {
 214         ASSERT(ddt_object_exists(ddt, type, class));
 215 
 216         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 217             ddt->ddt_object[type][class]));
 218 }
 219 
 220 int
 221 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 222     dmu_object_info_t *doi)
 223 {
 224         if (!ddt_object_exists(ddt, type, class))
 225                 return (SET_ERROR(ENOENT));
 226 
 227         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 228             doi));
 229 }
 230 
 231 boolean_t
 232 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 233 {
 234         return (!!ddt->ddt_object[type][class]);
 235 }
 236 
 237 void


 333         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 334                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 335                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 336                         return (ddp);
 337         }
 338         return (NULL);
 339 }
 340 
 341 uint64_t
 342 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 343 {
 344         uint64_t refcnt = 0;
 345 
 346         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 347                 refcnt += dde->dde_phys[p].ddp_refcnt;
 348 
 349         return (refcnt);
 350 }
 351 
 352 static void
 353 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 354 {
 355         spa_t *spa = ddt->ddt_spa;
 356         ddt_phys_t *ddp = dde->dde_phys;
 357         ddt_key_t *ddk = &dde->dde_key;
 358         uint64_t lsize = DDK_GET_LSIZE(ddk);
 359         uint64_t psize = DDK_GET_PSIZE(ddk);
 360 
 361         bzero(dds, sizeof (*dds));
 362 
 363         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 364                 uint64_t dsize = 0;
 365                 uint64_t refcnt = ddp->ddp_refcnt;
 366 
 367                 if (ddp->ddp_phys_birth == 0)
 368                         continue;
 369 
 370                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 371                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 372 
 373                 dds->dds_blocks += 1;
 374                 dds->dds_lsize += lsize;
 375                 dds->dds_psize += psize;


 379                 dds->dds_ref_lsize += lsize * refcnt;
 380                 dds->dds_ref_psize += psize * refcnt;
 381                 dds->dds_ref_dsize += dsize * refcnt;
 382         }
 383 }
 384 
 385 void
 386 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 387 {
 388         const uint64_t *s = (const uint64_t *)src;
 389         uint64_t *d = (uint64_t *)dst;
 390         uint64_t *d_end = (uint64_t *)(dst + 1);
 391 
 392         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 393 
 394         while (d < d_end)
 395                 *d++ += (*s++ ^ neg) - neg;
 396 }
 397 
 398 static void
 399 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)

 400 {
 401         ddt_stat_t dds;
 402         ddt_histogram_t *ddh;
 403         int bucket;
 404 
 405         ddt_stat_generate(ddt, dde, &dds);
 406 
 407         bucket = highbit64(dds.dds_ref_blocks) - 1;
 408         ASSERT(bucket >= 0);
 409 
 410         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];


 411 
 412         ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);







 413 }
 414 
 415 void
 416 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 417 {
 418         for (int h = 0; h < 64; h++)
 419                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 420 }
 421 
 422 void
 423 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 424 {
 425         bzero(dds, sizeof (*dds));
 426 
 427         for (int h = 0; h < 64; h++)
 428                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 429 }
 430 
 431 boolean_t
 432 ddt_histogram_empty(const ddt_histogram_t *ddh)
 433 {
 434         const uint64_t *s = (const uint64_t *)ddh;
 435         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 436 
 437         while (s < s_end)
 438                 if (*s++ != 0)
 439                         return (B_FALSE);
 440 
 441         return (B_TRUE);
 442 }
 443 
 444 void
 445 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 446 {
 447         /* Sum the statistics we cached in ddt_object_sync(). */
 448         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 449                 ddt_t *ddt = spa->spa_ddt[c];
 450                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 451                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 452                             class++) {
 453                                 ddt_object_t *ddo =
 454                                     &ddt->ddt_object_stats[type][class];
 455                                 ddo_total->ddo_count += ddo->ddo_count;
 456                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 457                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 458                         }
 459                 }
 460         }
 461 
 462         /* ... and compute the averages. */
 463         if (ddo_total->ddo_count != 0) {
 464                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 465                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 466         }
 467 }
 468 
 469 void
 470 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 471 {
 472         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 473                 ddt_t *ddt = spa->spa_ddt[c];
 474                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 475                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 476                             class++) {
 477                                 ddt_histogram_add(ddh,
 478                                     &ddt->ddt_histogram_cache[type][class]);
 479                         }
 480                 }
 481         }
 482 }
 483 
 484 void
 485 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 486 {
 487         ddt_histogram_t *ddh_total;
 488 
 489         ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 490         ddt_get_dedup_histogram(spa, ddh_total);
 491         ddt_histogram_stat(dds_total, ddh_total);
 492         kmem_free(ddh_total, sizeof (ddt_histogram_t));















 493 }
 494 
 495 uint64_t
 496 ddt_get_dedup_dspace(spa_t *spa)
 497 {
 498         ddt_stat_t dds_total = { 0 };
 499 
 500         ddt_get_dedup_stats(spa, &dds_total);
 501         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 502 }
 503 
 504 uint64_t
 505 ddt_get_pool_dedup_ratio(spa_t *spa)
 506 {
 507         ddt_stat_t dds_total = { 0 };
 508 
 509         ddt_get_dedup_stats(spa, &dds_total);
 510         if (dds_total.dds_dsize == 0)
 511                 return (100);
 512 


 603                 bcopy(src, dst, d_len);
 604 
 605         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 606             (ZFS_HOST_BYTEORDER != 0))
 607                 byteswap_uint64_array(dst, d_len);
 608 }
 609 
 610 ddt_t *
 611 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 612 {
 613         return (spa->spa_ddt[c]);
 614 }
 615 
 616 ddt_t *
 617 ddt_select(spa_t *spa, const blkptr_t *bp)
 618 {
 619         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 620 }
 621 
 622 void
 623 ddt_enter(ddt_t *ddt)
 624 {
 625         mutex_enter(&ddt->ddt_lock);
 626 }
 627 
 628 void
 629 ddt_exit(ddt_t *ddt)
 630 {
 631         mutex_exit(&ddt->ddt_lock);
 632 }
 633 



















































 634 static ddt_entry_t *
 635 ddt_alloc(const ddt_key_t *ddk)
 636 {
 637         ddt_entry_t *dde;
 638 
 639         dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 640         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 641 

 642         dde->dde_key = *ddk;




 643 
 644         return (dde);
 645 }
 646 
 647 static void
 648 ddt_free(ddt_entry_t *dde)
 649 {
 650         ASSERT(!dde->dde_loading);
 651 
 652         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 653                 ASSERT(dde->dde_lead_zio[p] == NULL);
 654 
 655         if (dde->dde_repair_abd != NULL)
 656                 abd_free(dde->dde_repair_abd);
 657 
 658         cv_destroy(&dde->dde_cv);
 659         kmem_free(dde, sizeof (*dde));
 660 }
 661 

 662 void
 663 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 664 {
 665         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 666 
 667         avl_remove(&ddt->ddt_tree, dde);
 668         ddt_free(dde);
 669 }
 670 
 671 ddt_entry_t *
 672 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 673 {
             /*
              * Find (and, if 'add' is set, create) the in-core entry for bp's
              * key, and make sure it has been loaded from the on-disk tables.
              *
              * The caller must hold ddt_lock (asserted below).  The lock is
              * dropped while the on-disk objects are searched and is held
              * again on return.  Returns NULL only when the key is not in the
              * in-core tree and 'add' is false.
              */
 674         ddt_entry_t *dde, dde_search;
 675         enum ddt_type type;
 676         enum ddt_class class;
 677         avl_index_t where;

 678         int error;
 679 
 680         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 681 
 682         ddt_key_fill(&dde_search.dde_key, bp);
 683 
 684         dde = avl_find(&ddt->ddt_tree, &dde_search, &where);




 685         if (dde == NULL) {
 686                 if (!add)


 687                         return (NULL);


 688                 dde = ddt_alloc(&dde_search.dde_key);
 689                 avl_insert(&ddt->ddt_tree, dde, where);
 690         }
 691 
             /*
              * If the entry is currently being loaded, wait on its condition
              * variable until the loader clears dde_loading.
              */
 692         while (dde->dde_loading)
 693                 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 694 
 695         if (dde->dde_loaded)











 696                 return (dde);
 697 
             /*
              * We are the loader.  dde_loading is set before the lock is
              * dropped so that other lookups of this key wait in the loop
              * above instead of starting a second load.
              */
 698         dde->dde_loading = B_TRUE;




 699 
 700         ddt_exit(ddt);
 701 
 702         error = ENOENT;
 703 

             /*
              * Probe every (type, class) table until one returns something
              * other than ENOENT; that result is asserted to be success.
              */
 704         for (type = 0; type < DDT_TYPES; type++) {
 705                 for (class = 0; class < DDT_CLASSES; class++) {
 706                         error = ddt_object_lookup(ddt, type, class, dde);
 707                         if (error != ENOENT) {
 708                                 ASSERT0(error);
 709                                 break;
 710                         }
 711                 }
 712                 if (error != ENOENT)
 713                         break;
 714         }
 715 
 716         ddt_enter(ddt);
 717 
 718         ASSERT(dde->dde_loaded == B_FALSE);
 719         ASSERT(dde->dde_loading == B_TRUE);
 720 



 721         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 722         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 723         dde->dde_loaded = B_TRUE;
 724         dde->dde_loading = B_FALSE;


 725 


             /*
              * The entry was found on disk: subtract its statistics from the
              * histogram (neg == -1ULL selects subtraction in ddt_stat_add()).
              * ddt_sync_entry() adds them back with neg == 0 for entries that
              * still have references.
              */
 726         if (error == 0)
 727                 ddt_stat_update(ddt, dde, -1ULL);
 728 
             /* Wake every thread waiting for this load to finish. */
 729         cv_broadcast(&dde->dde_cv);
 730 
 731         return (dde);
 732 }
 733 
 734 void
 735 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 736 {
 737         ddt_t *ddt;
 738         ddt_entry_t dde;
 739 
 740         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 741                 return;
 742 
 743         /*
 744          * We only remove the DDT once all tables are empty and only
 745          * prefetch dedup blocks when there are entries in the DDT.
 746          * Thus no locking is required as the DDT can't disappear on us.
 747          */
 748         ddt = ddt_select(spa, bp);
 749         ddt_key_fill(&dde.dde_key, bp);
 750 
 751         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 752                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {

 753                         ddt_object_prefetch(ddt, type, class, &dde);
 754                 }
 755         }
 756 }
 757 
 758 int
 759 ddt_entry_compare(const void *x1, const void *x2)
 760 {
 761         const ddt_entry_t *dde1 = x1;
 762         const ddt_entry_t *dde2 = x2;
 763         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 764         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 765 
 766         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 767                 if (u1[i] < u2[i])
 768                         return (-1);
 769                 if (u1[i] > u2[i])
 770                         return (1);
 771         }
 772 
 773         return (0);
 774 }
 775 
 776 static ddt_t *
 777 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 778 {
 779         ddt_t *ddt;

 780 
 781         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 782 
 783         mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 784         avl_create(&ddt->ddt_tree, ddt_entry_compare,

 785             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));



 786         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 787             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 788         ddt->ddt_checksum = c;
 789         ddt->ddt_spa = spa;
 790         ddt->ddt_os = spa->spa_meta_objset;
 791 
 792         return (ddt);
 793 }
 794 
 795 static void
 796 ddt_table_free(ddt_t *ddt)
 797 {
 798         ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);

 799         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 800         avl_destroy(&ddt->ddt_tree);





 801         avl_destroy(&ddt->ddt_repair_tree);
 802         mutex_destroy(&ddt->ddt_lock);
 803         kmem_free(ddt, sizeof (*ddt));
 804 }
 805 
 806 void
 807 ddt_create(spa_t *spa)
 808 {
 809         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 810 
 811         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 812                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 813 }
 814 
















 815 int
 816 ddt_load(spa_t *spa)
 817 {
 818         int error;

 819 
 820         ddt_create(spa);
 821 
 822         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 823             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 824             &spa->spa_ddt_stat_object);
 825 
 826         if (error)
 827                 return (error == ENOENT ? 0 : error);
 828 
 829         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 830                 ddt_t *ddt = spa->spa_ddt[c];
 831                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 832                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 833                             class++) {
 834                                 error = ddt_object_load(ddt, type, class);
 835                                 if (error != 0 && error != ENOENT)


 836                                         return (error);





 837                         }
 838                 }
 839 
 840                 /*
 841                  * Seed the cached histograms.
 842                  */
 843                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 844                     sizeof (ddt->ddt_histogram));
 845         }

 846 






 847         return (0);
 848 }
 849 
 850 void
 851 ddt_unload(spa_t *spa)
 852 {
 853         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 854                 if (spa->spa_ddt[c]) {
 855                         ddt_table_free(spa->spa_ddt[c]);
 856                         spa->spa_ddt[c] = NULL;
 857                 }
 858         }



 859 }
 860 
 861 boolean_t
 862 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 863 {
 864         ddt_t *ddt;
 865         ddt_entry_t dde;
 866 
 867         if (!BP_GET_DEDUP(bp))
 868                 return (B_FALSE);
 869 
 870         if (max_class == DDT_CLASS_UNIQUE)
 871                 return (B_TRUE);
 872 
 873         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 874 
 875         ddt_key_fill(&dde.dde_key, bp);
 876 
 877         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 878                 for (enum ddt_class class = 0; class <= max_class; class++)

 879                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 880                                 return (B_TRUE);
 881 
 882         return (B_FALSE);
 883 }
 884 
 885 ddt_entry_t *
 886 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 887 {
 888         ddt_key_t ddk;
 889         ddt_entry_t *dde;
 890 
 891         ddt_key_fill(&ddk, bp);
 892 
 893         dde = ddt_alloc(&ddk);
 894 
 895         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 896                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {

 897                         /*
 898                          * We can only do repair if there are multiple copies
 899                          * of the block.  For anything in the UNIQUE class,
 900                          * there's definitely only one copy, so don't even try.
 901                          */
 902                         if (class != DDT_CLASS_UNIQUE &&
 903                             ddt_object_lookup(ddt, type, class, dde) == 0)
 904                                 return (dde);
 905                 }
 906         }
 907 
 908         bzero(dde->dde_phys, sizeof (dde->dde_phys));
 909 
 910         return (dde);
 911 }
 912 
 913 void
 914 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 915 {
 916         avl_index_t where;
 917 
 918         ddt_enter(ddt);
 919 
 920         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 921             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 922                 avl_insert(&ddt->ddt_repair_tree, dde, where);
 923         else
 924                 ddt_free(dde);
 925 
 926         ddt_exit(ddt);
 927 }
 928 
 929 static void
 930 ddt_repair_entry_done(zio_t *zio)
 931 {
 932         ddt_entry_t *rdde = zio->io_private;
 933 
 934         ddt_free(rdde);
 935 }
 936 
 937 static void
 938 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 939 {
 940         ddt_phys_t *ddp = dde->dde_phys;
 941         ddt_phys_t *rddp = rdde->dde_phys;
 942         ddt_key_t *ddk = &dde->dde_key;
 943         ddt_key_t *rddk = &rdde->dde_key;
 944         zio_t *zio;
 945         blkptr_t blk;
 946 


 955                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 956                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 957                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
 958                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 959         }
 960 
 961         zio_nowait(zio);
 962 }
 963 
 964 static void
 965 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 966 {
 967         spa_t *spa = ddt->ddt_spa;
 968         ddt_entry_t *dde, *rdde_next, *rdde;
 969         avl_tree_t *t = &ddt->ddt_repair_tree;
 970         blkptr_t blk;
 971 
 972         if (spa_sync_pass(spa) > 1)
 973                 return;
 974 
 975         ddt_enter(ddt);
 976         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 977                 rdde_next = AVL_NEXT(t, rdde);
 978                 avl_remove(&ddt->ddt_repair_tree, rdde);
 979                 ddt_exit(ddt);

 980                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 981                 dde = ddt_repair_start(ddt, &blk);
 982                 ddt_repair_entry(ddt, dde, rdde, rio);
 983                 ddt_repair_done(ddt, dde);
 984                 ddt_enter(ddt);

 985         }
 986         ddt_exit(ddt);
 987 }
 988 
 989 static void
 990 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 991 {
             /*
              * Write one loaded in-core entry back to the on-disk tables:
              * free physical variants that are no longer needed, work out
              * which class the entry now belongs to, remove it from its old
              * table if it moved or died, and store it in the new table if it
              * still has references.
              */
 992         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 993         ddt_phys_t *ddp = dde->dde_phys;
 994         ddt_key_t *ddk = &dde->dde_key;

 995         enum ddt_type otype = dde->dde_type;
 996         enum ddt_type ntype = DDT_TYPE_CURRENT;
 997         enum ddt_class oclass = dde->dde_class;
 998         enum ddt_class nclass;
 999         uint64_t total_refcnt = 0;
1000 
1001         ASSERT(dde->dde_loaded);
1002         ASSERT(!dde->dde_loading);
1003 








             /*
              * Walk the physical variants.  Unborn ones are skipped.  The
              * DITTO variant is freed when ddt_ditto_copies_needed() returns
              * 0 and is never counted in total_refcnt.  Every other variant
              * is freed when its refcount is zero, and its refcount is added
              * to total_refcnt.
              */
1004         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1005                 ASSERT(dde->dde_lead_zio[p] == NULL);
1006                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1007                 if (ddp->ddp_phys_birth == 0) {
1008                         ASSERT(ddp->ddp_refcnt == 0);
1009                         continue;
1010                 }
1011                 if (p == DDT_PHYS_DITTO) {
1012                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1013                                 ddt_phys_free(ddt, ddk, ddp, txg);
1014                         continue;
1015                 }
1016                 if (ddp->ddp_refcnt == 0)
1017                         ddt_phys_free(ddt, ddk, ddp, txg);
1018                 total_refcnt += ddp->ddp_refcnt;
1019         }
1020 
             /*
              * New class: DITTO if a ditto variant has a birth txg, else
              * DUPLICATE if more than one reference remains, else UNIQUE.
              */
1021         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1022                 nclass = DDT_CLASS_DITTO;
1023         else if (total_refcnt > 1)
1024                 nclass = DDT_CLASS_DUPLICATE;
1025         else
1026                 nclass = DDT_CLASS_UNIQUE;
1027 








             /*
              * otype == DDT_TYPES means ddt_lookup() found no on-disk entry,
              * so there is nothing to remove.  Otherwise remove the old copy
              * when the entry changes table or has no references left.
              */
1028         if (otype != DDT_TYPES &&
1029             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1030                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1031                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1032         }
1033 
1034         if (total_refcnt != 0) {
                     /*
                      * dde_type/dde_class are updated before ddt_stat_update()
                      * because that function picks the histogram from them.
                      */
1035                 dde->dde_type = ntype;
1036                 dde->dde_class = nclass;
1037                 ddt_stat_update(ddt, dde, 0);
1038                 if (!ddt_object_exists(ddt, ntype, nclass))
1039                         ddt_object_create(ddt, ntype, nclass, tx);
1040                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1041 
1042                 /*
1043                  * If the class changes, the order that we scan this bp
1044                  * changes.  If it decreases, we could miss it, so
1045                  * scan it right now.  (This covers both class changing
1046                  * while we are doing ddt_walk(), and when we are
1047                  * traversing.)
1048                  */
1049                 if (nclass < oclass) {
1050                         dsl_scan_ddt_entry(dp->dp_scan,
1051                             ddt->ddt_checksum, dde, tx);
1052                 }
1053         }

1054 }
1055 
1056 static void
















1057 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1058 {
             /*
              * Sync one dedup table: push every entry of ddt_tree through
              * ddt_sync_entry() and free it, sync each on-disk object that
              * exists, destroy the objects of any type whose combined entry
              * count is zero, and copy ddt_histogram into
              * ddt_histogram_cache.  Returns at once when ddt_tree is empty.
              */


1059         spa_t *spa = ddt->ddt_spa;
1060         ddt_entry_t *dde;
1061         void *cookie = NULL;
1062 
1063         if (avl_numnodes(&ddt->ddt_tree) == 0)



1064                 return;
1065 
1066         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1067 
             /*
              * Create the DDT statistics ZAP on first use and link it into
              * the pool directory under DMU_POOL_DDT_STATS.
              */
1068         if (spa->spa_ddt_stat_object == 0) {
1069                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1070                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1071                     DMU_POOL_DDT_STATS, tx);
1072         }
1073 
             /* Drain the tree: each entry is synced and then freed. */
1074         while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1075                 ddt_sync_entry(ddt, dde, tx, txg);
1076                 ddt_free(dde);
1077         }
1078 






1079         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
                     /* Total number of entries across all classes of type. */
1080                 uint64_t count = 0;
1081                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1082                         if (ddt_object_exists(ddt, type, class)) {



1083                                 ddt_object_sync(ddt, type, class, tx);
1084                                 count += ddt_object_count(ddt, type, class);




1085                         }



1086                 }
                     /*
                      * Objects are destroyed only when the whole type is
                      * empty, i.e. no class of this type holds an entry.
                      */
1087                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1088                         if (count == 0 && ddt_object_exists(ddt, type, class))
1089                                 ddt_object_destroy(ddt, type, class, tx);
1090                 }
1091         }






1092 











1093         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1094             sizeof (ddt->ddt_histogram));
1095 }
1096 
1097 void
1098 ddt_sync(spa_t *spa, uint64_t txg)
1099 {
             /*
              * Sync all dedup tables of the pool for txg, which must be the
              * currently syncing txg (asserted below).  A single tx assigned
              * to txg is shared by every table.  rio is the root zio handed
              * to ddt_repair_table(); it is waited on before the tx commits.
              */
1100         dmu_tx_t *tx;
1101         zio_t *rio = zio_root(spa, NULL, NULL,
1102             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
1103 
1104         ASSERT(spa_syncing_txg(spa) == txg);
1105 
1106         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1107 
1108         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1109                 ddt_t *ddt = spa->spa_ddt[c];
                     /* Checksum functions without a table are skipped. */
1110                 if (ddt == NULL)
1111                         continue;
1112                 ddt_sync_table(ddt, tx, txg);
1113                 ddt_repair_table(ddt, rio);
1114         }
1115 
             /* The result of the wait is discarded. */
1116         (void) zio_wait(rio);
1117 
1118         dmu_tx_commit(tx);
1119 }
1120 
1121 int
1122 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/ddt.h>
  33 #include <sys/zap.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/dsl_pool.h>
  37 #include <sys/zio_checksum.h>
  38 #include <sys/zio_compress.h>
  39 #include <sys/dsl_scan.h>
  40 #include <sys/abd.h>
  41 
  42 /*
  43  * Almost every loop over the ZAP objects that hold DDT entries is
  44  * restricted to the range spa->spa_ddt_class_{min,max}. This makes a new
  45  * behavior possible: storing all entries in a single ZAP. However, a few
  46  * places deliberately iterate over all ZAPs: table creation, deletion,
  47  * loading, dde prefetching, and lookup. This maintains compatibility
  48  * with old pools and allows the old pool format to be converted
  49  * into the new one on the fly.
  50  */
  51 
  52 /*
  53  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  54  */
  55 int zfs_dedup_prefetch = 1;
  56 
  57 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
             /* Operation vectors, dispatched through as ddt_ops[type]. */
  58         &ddt_zap_ops,
  59 };
  60 
  61 static const char *ddt_class_name[DDT_CLASSES] = {
             /* NOTE(review): presumably indexed by enum ddt_class -- confirm */
  62         "ditto",
  63         "duplicate",
  64         "unique",
  65 };
  66 
  67 /* Possible in core size of all DDTs */
  68 uint64_t zfs_ddts_msize = 0;
  69 
  70 static void
  71 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  72     dmu_tx_t *tx)
  73 {
             /*
              * Create the on-disk object for (type, class).  The object
              * slot must be 0 on entry.  The new object number is added to
              * the pool directory, and this class's histogram to the DDT
              * stats object, both under the name that ddt_object_name()
              * produces.  Any failure trips a VERIFY.
              */
  74         spa_t *spa = ddt->ddt_spa;
  75         objset_t *os = ddt->ddt_os;
  76         uint64_t *objectp = &ddt->ddt_object[type][class];
             /* Nonzero when the checksum has ZCHECKSUM_FLAG_DEDUP set. */
  77         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  78             ZCHECKSUM_FLAG_DEDUP;
  79         char name[DDT_NAMELEN];
  80 
  81         ddt_object_name(ddt, type, class, name);
  82 
  83         ASSERT(*objectp == 0);
  84         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  85         ASSERT(*objectp != 0);
  86 
  87         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  88             sizeof (uint64_t), 1, objectp, tx) == 0);
  89 
             /* The histogram is stored as an array of uint64_t words. */
  90         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  91             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  92             &ddt->ddt_histogram[type][class], tx) == 0);
  93 }
  94 
  95 static void
  96 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  97     dmu_tx_t *tx)
  98 {
             /*
              * Destroy the on-disk object for (type, class): remove its
              * name from the pool directory and from the DDT stats object,
              * destroy the object through the type's ddt_op_destroy, zero
              * the cached object statistics and clear the object slot.
              * The asserts require the object to exist, to hold no entries
              * and to have an empty histogram.
              */
  99         spa_t *spa = ddt->ddt_spa;
 100         objset_t *os = ddt->ddt_os;
 101         uint64_t *objectp = &ddt->ddt_object[type][class];
 102         char name[DDT_NAMELEN];
             /*
              * count is referenced only by the ASSERT below.
              * NOTE(review): this relies on ASSERT compiling to nothing
              * whenever DEBUG is not set -- confirm for every build flavor.
              */
 103 #if DEBUG
 104         uint64_t count;
 105 #endif
 106         ddt_object_name(ddt, type, class, name);
 107 
 108         ASSERT(*objectp != 0);
 109         ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
 110             (count == 0));
 111         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 112         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
 113         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
 114         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
 115         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
 116 
             /* A zero slot is what ddt_object_exists() reports as absent. */
 117         *objectp = 0;
 118 }
 119 
 120 static int
 121 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 122 {
             /*
              * Load the in-core state of one (type, class) object: its
              * object number from the pool directory, its histogram from
              * the DDT stats object, and the cached count/space statistics.
              * Returns 0 on success or the error of the failing step.  A
              * failed directory lookup is returned before anything else is
              * read.
              */
 123         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 124         dmu_object_info_t doi;
 125         char name[DDT_NAMELEN];
 126         int error;
 127 
 128         ddt_object_name(ddt, type, class, name);
 129 
 130         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 131             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 132         if (error)

 133                 return (error);
 134 
             /* A missing histogram for an existing object trips VERIFY0. */
 135         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 136             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 137             &ddt->ddt_histogram[type][class]));
 138 
 139         /*
 140          * Seed the cached statistics.
 141          */
 142         error = ddt_object_info(ddt, type, class, &doi);
 143         /* Panic in debug mode */
 144         ASSERT(error == 0);
 145         if (error)
 146                 return (error);
 147         error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
 148         if (error)
 149                 return (error);
             /* dspace: 512-byte physical blocks converted to bytes. */
 150         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
             /* mspace: fill count times the data block size. */
 151         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 152 
 153         return (0);
 154 }
 155 
 156 static void
 157 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 158     dmu_tx_t *tx)
 159 {
             /*
              * Write the in-core histogram of (type, class) to the DDT
              * stats object and refresh the cached object statistics.
              * The histogram update and the object-info query are
              * VERIFY'd.
              */
 160         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 161         dmu_object_info_t doi;
 162         char name[DDT_NAMELEN];
 163 
 164         ddt_object_name(ddt, type, class, name);
 165 
 166         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 167             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 168             &ddt->ddt_histogram[type][class], tx) == 0);
 169 
 170         /*
 171          * Cache DDT statistics; this is the only time they'll change.
 172          */
 173         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 174 
             /* An error from the count query is ignored here. */
 175         (void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
 176         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 177         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 178 }
 179 
 180 static int
 181 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 182     ddt_entry_t *dde)
 183 {
             /*
              * Look dde up in the (type, class) object.  Returns ENOENT
              * when that object does not exist; otherwise returns whatever
              * the type's ddt_op_lookup returns.
              */
 184         if (!ddt_object_exists(ddt, type, class))
 185                 return (SET_ERROR(ENOENT));
 186 
 187         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 188             ddt->ddt_object[type][class], dde));
 189 }
 190 
 191 static void
 192 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde)
 194 {
 195         if (!ddt_object_exists(ddt, type, class))


 212 static int
 213 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 214     ddt_entry_t *dde, dmu_tx_t *tx)
 215 {
             /*
              * Remove dde from the (type, class) object via the type's
              * ddt_op_remove and return its result.  The object must
              * exist (asserted).
              */
 216         ASSERT(ddt_object_exists(ddt, type, class));
 217 
 218         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 219             ddt->ddt_object[type][class], dde, tx));
 220 }
 221 
 222 int
 223 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 224     uint64_t *walk, ddt_entry_t *dde)
 225 {
             /*
              * Forward to the type's ddt_op_walk for the (type, class)
              * object and return its result.  The object must exist
              * (asserted).  NOTE(review): walk is presumably the iteration
              * cursor that the op advances -- confirm against ddt_op_walk.
              */
 226         ASSERT(ddt_object_exists(ddt, type, class));
 227 
 228         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 229             ddt->ddt_object[type][class], dde, walk));
 230 }
 231 
 232 int
 233 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 234         uint64_t *count)
 235 {
             /*
              * Query the entry count of the (type, class) object through
              * the type's ddt_op_count, which receives count as its output
              * argument.  Returns the op's result.  The object must exist
              * (asserted).
              */
 236         ASSERT(ddt_object_exists(ddt, type, class));
 237 
 238         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 239             ddt->ddt_object[type][class], count));
 240 }
 241 
 242 int
 243 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 244     dmu_object_info_t *doi)
 245 {
             /*
              * Fetch the DMU object info of the (type, class) object into
              * doi.  Returns ENOENT when the object does not exist,
              * otherwise the result of dmu_object_info().
              */
 246         if (!ddt_object_exists(ddt, type, class))
 247                 return (SET_ERROR(ENOENT));
 248 
 249         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 250             doi));
 251 }
 252 
 253 boolean_t
 254 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 255 {
             /* An object exists iff its slot holds a nonzero object number. */
 256         return (!!ddt->ddt_object[type][class]);
 257 }
 258 
 259 void


 355         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 356                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 357                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 358                         return (ddp);
 359         }
 360         return (NULL);
 361 }
 362 
 363 uint64_t
 364 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 365 {
             /*
              * Return the sum of ddp_refcnt over the phys slots
              * DDT_PHYS_SINGLE through DDT_PHYS_TRIPLE inclusive; any slot
              * outside that range does not contribute.
              */
 366         uint64_t refcnt = 0;
 367 
 368         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 369                 refcnt += dde->dde_phys[p].ddp_refcnt;
 370 
 371         return (refcnt);
 372 }
 373 
 374 static void
 375 ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
 376 {
             /*
              * Compute the statistics contribution of dde into dds.  dds is
              * zeroed first, so any previous content is discarded.  The
              * logical and physical sizes come from the entry's key and are
              * the same for every phys slot.
              */

 377         ddt_phys_t *ddp = dde->dde_phys;
 378         ddt_key_t *ddk = &dde->dde_key;
 379         uint64_t lsize = DDK_GET_LSIZE(ddk);
 380         uint64_t psize = DDK_GET_PSIZE(ddk);
 381 
 382         bzero(dds, sizeof (*dds));
 383 
 384         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 385                 uint64_t dsize = 0;
 386                 uint64_t refcnt = ddp->ddp_refcnt;
 387 
                     /* Slots with a zero birth txg contribute nothing. */
 388                 if (ddp->ddp_phys_birth == 0)
 389                         continue;
 390 
                     /* dsize is summed over every DVA of the slot. */
 391                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 392                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 393 
 394                 dds->dds_blocks += 1;
 395                 dds->dds_lsize += lsize;
 396                 dds->dds_psize += psize;


                     /* Referenced sizes are weighted by the slot's refcount. */
 400                 dds->dds_ref_lsize += lsize * refcnt;
 401                 dds->dds_ref_psize += psize * refcnt;
 402                 dds->dds_ref_dsize += dsize * refcnt;
 403         }
 404 }
 405 
 406 void
 407 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 408 {
             /*
              * Add src to dst (neg == 0) or subtract it (neg == -1ULL),
              * treating both structures as arrays of uint64_t words.
              */
 409         const uint64_t *s = (const uint64_t *)src;
 410         uint64_t *d = (uint64_t *)dst;
 411         uint64_t *d_end = (uint64_t *)(dst + 1);
 412 
 413         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 414 
             /*
              * (x ^ neg) - neg is x when neg is 0, and ~x + 1 (the two's
              * complement negation of x) when neg is all ones.
              */
 415         while (d < d_end)
 416                 *d++ += (*s++ ^ neg) - neg;
 417 }
 418 
 419 static void
 420 ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
 421     ddt_stat_t *dds, uint64_t neg)
 422 {
             /*
              * Apply dds (added when neg is 0, subtracted when neg is
              * -1ULL) to one bucket of the histogram selected by dde's type
              * and class.  The bucket is derived from dds_ref_blocks and
              * must not be negative (asserted).
              * NOTE(review): highbit64() presumably returns the 1-based
              * index of the highest set bit, making the bucket
              * floor(log2(dds_ref_blocks)) -- confirm.
              */

 423         ddt_histogram_t *ddh;
 424         int bucket = highbit64(dds->dds_ref_blocks) - 1;




 425         ASSERT(bucket >= 0);
 426 
 427         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 428         ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
 429 }
 430 
 431 static void
 432 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 433 {
             /*
              * Generate dde's current statistics and apply them to the
              * histogram of dde's type and class; neg is passed through
              * to ddt_stat_update_by_dds().
              */
 434         ddt_stat_t dds;
 435 
 436         ddt_stat_generate(ddt->ddt_spa, dde, &dds);
 437 
 438         ddt_stat_update_by_dds(ddt, dde, &dds, neg);
 439 }
 440 
 441 void
 442 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 443 {
             /* Add src to dst bucket by bucket, over 64 buckets. */
 444         for (int h = 0; h < 64; h++)
 445                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 446 }
 447 
 448 void
 449 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 450 {
             /*
              * Collapse a histogram into one ddt_stat_t: dds is zeroed and
              * then receives the sum of all 64 buckets of ddh.
              */
 451         bzero(dds, sizeof (*dds));
 452 
 453         for (int h = 0; h < 64; h++)
 454                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 455 }
 456 
 457 boolean_t
 458 ddt_histogram_empty(const ddt_histogram_t *ddh)
 459 {
             /*
              * Return B_TRUE iff every uint64_t word of the histogram is
              * zero; the structure is scanned as a flat word array.
              */
 460         const uint64_t *s = (const uint64_t *)ddh;
 461         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 462 
 463         while (s < s_end)
 464                 if (*s++ != 0)
 465                         return (B_FALSE);
 466 
 467         return (B_TRUE);
 468 }
 469 
 470 void
 471 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 472 {
             /*
              * Accumulate the cached per-object statistics of every table
              * into ddo_total, for classes spa_ddt_class_min through
              * spa_ddt_class_max, then turn the space totals into
              * per-entry averages.  ddo_total is not zeroed here, so the
              * sums start from whatever the caller passed in.
              * NOTE(review): spa->spa_ddt[c] is dereferenced without a NULL
              * check although ddt_unload() stores NULL there -- confirm
              * callers cannot run after unload.
              */
 473         /* Sum the statistics we cached in ddt_object_sync(). */
 474         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 475                 ddt_t *ddt = spa->spa_ddt[c];
 476                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 477                         for (enum ddt_class class = spa->spa_ddt_class_min;
 478                             class <= spa->spa_ddt_class_max; class++) {
 479                                 ddt_object_t *ddo =
 480                                     &ddt->ddt_object_stats[type][class];
 481                                 ddo_total->ddo_count += ddo->ddo_count;
 482                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 483                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 484                         }
 485                 }
 486         }
 487 
 488         /* ... and compute the averages. */
             /* The guard avoids dividing by zero when there are no entries. */
 489         if (ddo_total->ddo_count != 0) {
 490                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 491                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 492         }
 493 }
 494 
 495 void
 496 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 497 {
             /*
              * Add the cached histograms of every table, for classes
              * spa_ddt_class_min through spa_ddt_class_max, into ddh.
              * ddh is not zeroed here; the caller supplies the starting
              * value.
              */
 498         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 499                 ddt_t *ddt = spa->spa_ddt[c];
 500                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 501                         for (enum ddt_class class = spa->spa_ddt_class_min;
 502                             class <= spa->spa_ddt_class_max; class++) {
 503                                 ddt_histogram_add(ddh,
 504                                     &ddt->ddt_histogram_cache[type][class]);
 505                         }
 506                 }
 507         }
 508 }
 509 
 510 void
 511 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 512 {
             /*
              * Compute pool-wide dedup statistics: dds_total is zeroed and
              * then receives the sum of every bucket of every cached
              * histogram, for classes spa_ddt_class_min through
              * spa_ddt_class_max.
              */
 513         /*
 514          * Avoid temporary allocation of ddt_histogram_t from heap
 515          * or on stack (probably too large) by unrolling ddt_histogram_add()
 516          */
 517         bzero(dds_total, sizeof (ddt_stat_t));
 518         /* sum up the stats across all the histograms */
 519         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 520                 ddt_t *ddt = spa->spa_ddt[c];
 521                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 522                         for (enum ddt_class class = spa->spa_ddt_class_min;
 523                             class <= spa->spa_ddt_class_max; class++) {
 524                                 /* unroll the ddt_histogram_add() */
 525                                 ddt_histogram_t *src =
 526                                     &ddt->ddt_histogram_cache[type][class];
 527                                 for (int h = 0; h < 64; h++) {
 528                                         ddt_stat_t *st = &src->ddh_stat[h];
 529                                         ddt_stat_add(dds_total, st, 0);
 530                                 }
 531                         }
 532                 }
 533         }
 534 }
 535 
 536 uint64_t
 537 ddt_get_dedup_dspace(spa_t *spa)
 538 {
             /*
              * Return the pool-wide difference between the referenced
              * dsize and the stored dsize, as reported by
              * ddt_get_dedup_stats().
              */
 539         ddt_stat_t dds_total = { 0 };
 540 
 541         ddt_get_dedup_stats(spa, &dds_total);
 542         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 543 }
 544 
 545 uint64_t
 546 ddt_get_pool_dedup_ratio(spa_t *spa)
 547 {
 548         ddt_stat_t dds_total = { 0 };
 549 
 550         ddt_get_dedup_stats(spa, &dds_total);
 551         if (dds_total.dds_dsize == 0)
 552                 return (100);
 553 


 644                 bcopy(src, dst, d_len);
 645 
 646         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 647             (ZFS_HOST_BYTEORDER != 0))
 648                 byteswap_uint64_array(dst, d_len);
 649 }
 650 
 651 ddt_t *
 652 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 653 {
             /* Return the pool's table slot for checksum function c. */
 654         return (spa->spa_ddt[c]);
 655 }
 656 
 657 ddt_t *
 658 ddt_select(spa_t *spa, const blkptr_t *bp)
 659 {
             /* Return the pool's table slot for the checksum that bp uses. */
 660         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 661 }
 662 
 663 void
 664 ddt_enter(ddt_t *ddt, uint8_t hash)
 665 {
             /* Take the table lock selected by hash. */
 666         mutex_enter(&ddt->ddt_lock[hash]);
 667 }
 668 
 669 void
 670 ddt_exit(ddt_t *ddt, uint8_t hash)
 671 {
             /* Release the table lock selected by hash. */
 672         mutex_exit(&ddt->ddt_lock[hash]);
 673 }
 674 
 675 void
 676 dde_enter(ddt_entry_t *dde)
 677 {
             /* Take the per-entry lock. */
 678         mutex_enter(&dde->dde_lock);
 679 }
 680 
 681 void
 682 dde_exit(ddt_entry_t *dde)
 683 {
             /* Release the per-entry lock. */
 684         mutex_exit(&dde->dde_lock);
 685 }
 686 
 687 /* cache for ddt_entry_t structures */
 688 static kmem_cache_t *dde_cache;
 689 
 690 /* ARGSUSED */
 691 static int
 692 dde_cache_constr(void *buf, void *arg, int flags)
 693 {
             /*
              * Constructor registered with dde_cache in ddt_init():
              * initializes the entry's condition variable and mutex.
              * arg and flags are unused.  Always returns 0.
              */
 694         ddt_entry_t *dde = (ddt_entry_t *)buf;
 695         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 696         mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
 697         return (0);
 698 }
 699 
 700 /* ARGSUSED */
 701 static void
 702 dde_cache_destr(void *buf, void *arg)
 703 {
             /*
              * Destructor registered with dde_cache in ddt_init(): destroys
              * the condition variable and mutex set up by the constructor.
              * arg is unused.
              */
 704         ddt_entry_t *dde = (ddt_entry_t *)buf;
 705         cv_destroy(&dde->dde_cv);
 706         mutex_destroy(&dde->dde_lock);
 707 }
 708 
 709 void
 710 ddt_init(void)
 711 {
             /*
              * Create the kmem cache that backs ddt_entry_t allocations,
              * with dde_cache_constr/dde_cache_destr as its constructor and
              * destructor.  A NULL cache trips the VERIFY.
              */
 712         dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
 713             0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
 714         VERIFY(dde_cache != NULL);
 715 }
 716 
 717 void
 718 ddt_fini(void)
 719 {
             /*
              * Destroy the entry cache if it exists.  The pointer is
              * cleared afterwards, so a repeated call does nothing.
              */
 720         if (dde_cache) {
 721                 kmem_cache_destroy(dde_cache);
 722                 dde_cache = NULL;
 723         }
 724 }
 725 
 726 static ddt_entry_t *
 727 ddt_alloc(const ddt_key_t *ddk)
 728 {
             /*
              * Allocate an entry from dde_cache (KM_SLEEP) and initialize it
              * for key *ddk.  The condvar and mutex are left alone because
              * the cache constructor already initialized them.
              */
 729         ddt_entry_t *dde;
 730 
 731         dde = kmem_cache_alloc(dde_cache, KM_SLEEP);

 732 
 733         /* Init everything but the condvar and the mutex */
 734         dde->dde_key = *ddk;
             /*
              * Zero the bytes from dde_phys up to, but not including,
              * dde_cv, then zero dde_node separately.
              * NOTE(review): assumes no field that needs resetting lies
              * outside these two ranges -- confirm against the
              * ddt_entry_t layout.
              */
 735         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_phys)),
 736             offsetof(ddt_entry_t, dde_cv)-offsetof(ddt_entry_t, dde_phys));
 737         bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_node)),
 738             sizeof (avl_node_t));
 739 
 740         return (dde);
 741 }
 742 
 743 static void
 744 ddt_free(ddt_entry_t *dde)
 745 {
             /*
              * Return dde to dde_cache, first freeing its repair buffer if
              * one is attached.  The entry must not be loading and must
              * have no lead zio in any phys slot (asserted).
              */
 746         ASSERT(!(dde->dde_state & DDE_LOADING));
 747 
 748         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 749                 ASSERT(dde->dde_lead_zio[p] == NULL);
 750 
 751         if (dde->dde_repair_abd != NULL)
 752                 abd_free(dde->dde_repair_abd);
 753 
 754         kmem_cache_free(dde_cache, dde);

 755 }
 756 
 757 /* for zdb usage */
 758 void
 759 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 760 {
             /*
              * Remove dde from the in-core tree selected by the hash of its
              * checksum, then free it.  No table lock is taken here.
              */
 761         uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);
 762 
 763         avl_remove(&ddt->ddt_tree[hash], dde);
 764         ddt_free(dde);
 765 }
 766 
 767 ddt_entry_t *
 768 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 769 {
             /*
              * Find the in-core entry for bp's key, creating it when add is
              * set, and make sure its on-disk state has been looked up.
              * Returns NULL only when the entry is not in the in-core tree
              * and add is B_FALSE.  Every non-NULL return leaves the entry's
              * dde_lock held: dde_enter() below has no matching dde_exit()
              * on those paths.
              * NOTE(review): callers presumably drop it with dde_exit() --
              * confirm.
              */
 770         ddt_entry_t *dde, dde_search;
 771         enum ddt_type type;
 772         enum ddt_class class;
 773         avl_index_t where;
 774         uint8_t hash = DDT_HASHFN(bp->blk_cksum);
 775         int error;
 776 


 777         ddt_key_fill(&dde_search.dde_key, bp);
 778 
 779         ddt_enter(ddt, hash);
 780         /*
 781          * Do we have the dirty DDE in mem already?
 782          */
 783         dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
 784         if (dde == NULL) {
 785                 /* This DDE doesn't exist in the dirty tree */
 786                 if (!add) {
 787                         ddt_exit(ddt, hash);
 788                         return (NULL);
 789                 }
 790                 /* Since a dirty DDE didn't exist, create it */
 791                 dde = ddt_alloc(&dde_search.dde_key);
 792                 avl_insert(&ddt->ddt_tree[hash], dde, where);
 793         }
 794 
             /* The table lock only covers the tree search and insert. */
 795         ddt_exit(ddt, hash);

 796 
 797         /*
 798          * If we're already looking up this DDE
 799          * wait until we have the result
 800          */
 801         dde_enter(dde);
 802         while (dde->dde_state & DDE_LOADING)
 803                 cv_wait(&dde->dde_cv, &dde->dde_lock);
 804 
 805         /*
 806          * If we have loaded the DDE from disk return it
 807          */
 808         if (dde->dde_state & DDE_LOADED)
 809                 return (dde);
 810 
 811         /*
 812          * If we didn't find this DDE, start looking up the DDE in ZAP
 813          */
             /*
              * Mark the entry as loading, then drop dde_lock for the
              * on-disk search; other threads wait on dde_cv in the loop
              * above until DDE_LOADING is cleared.
              */
 814         dde->dde_state |= DDE_LOADING;
 815         dde_exit(dde);
 816 


 817         error = ENOENT;
 818 
 819         DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
             /*
              * Search every class of every type; stop at the first result
              * other than ENOENT, leaving type and class at the hit.
              */
 820         for (type = 0; type < DDT_TYPES; type++) {
 821                 for (class = 0; class < DDT_CLASSES; class++) {
 822                         error = ddt_object_lookup(ddt, type, class, dde);
 823                         if (error != ENOENT)

 824                                 break;
 825                 }

 826                 if (error != ENOENT)
 827                         break;
 828         }
 829 
 830         ASSERT(error == 0 || error == ENOENT);
 831 
 832         dde_enter(dde);

 833 
 834         ASSERT(!(dde->dde_state & DDE_LOADED));
 835         ASSERT(dde->dde_state & DDE_LOADING);
 836 
 837         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 838         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
             /* Both loops ran to completion: the entry is not on disk. */
 839         if (type == DDT_TYPES && class == DDT_CLASSES)
 840                 dde->dde_state |= DDE_NEW;
 841         dde->dde_state |= DDE_LOADED;
 842         dde->dde_state &= ~DDE_LOADING;
 843 
 844         DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
 845             enum ddt_class, dde->dde_class);
             /* Record the statistics of the entry as found on disk. */
 846         if (error == 0)
 847                 ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);
 848 
             /* Wake every thread waiting for the load to finish. */
 849         cv_broadcast(&dde->dde_cv);
 850 
 851         return (dde);
 852 }
 853 
 854 void
 855 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 856 {
             /*
              * Issue a prefetch for bp's key against every (type, class)
              * object of the table that matches bp's checksum.  Does nothing
              * when zfs_dedup_prefetch is 0, bp is NULL, or bp is not a
              * dedup block.  Only the key of the stack-local dde is
              * filled in.
              */
 857         ddt_t *ddt;
 858         ddt_entry_t dde;
 859 
 860         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 861                 return;
 862 
 863         /*
 864          * We only remove the DDT once all tables are empty and only
 865          * prefetch dedup blocks when there are entries in the DDT.
 866          * Thus no locking is required as the DDT can't disappear on us.
 867          */
 868         ddt = ddt_select(spa, bp);
 869         ddt_key_fill(&dde.dde_key, bp);
 870 
 871         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 872                 for (enum ddt_class class = 0;
 873                     class < DDT_CLASSES; class++) {
 874                         ddt_object_prefetch(ddt, type, class, &dde);
 875                 }
 876         }
 877 }
 878 
 879 int
 880 ddt_entry_compare(const void *x1, const void *x2)
 881 {
             /*
              * AVL comparator for ddt_entry_t (see ddt_table_alloc()):
              * orders entries by their keys, compared as DDT_KEY_WORDS
              * uint64_t words from first to last.  Returns -1, 0 or 1.
              */
 882         const ddt_entry_t *dde1 = x1;
 883         const ddt_entry_t *dde2 = x2;
 884         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 885         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 886 
             /* The first differing word decides the order. */
 887         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 888                 if (u1[i] < u2[i])
 889                         return (-1);
 890                 if (u1[i] > u2[i])
 891                         return (1);
 892         }
 893 
 894         return (0);
 895 }
 896 
 897 static ddt_t *
 898 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 899 {
             /*
              * Allocate and initialize the in-core table for checksum c:
              * zeroed memory, DDT_HASHSZ lock/tree pairs, the repair lock
              * and repair tree, and the back pointers to the pool and its
              * meta objset.  All trees use ddt_entry_compare() and link
              * through dde_node.
              */
 900         ddt_t *ddt;
 901         uint_t i;
 902 
 903         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 904 
 905         for (i = 0; i < DDT_HASHSZ; i++) {
 906                 mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
 907                 avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
 908                     sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 909         }
 910         mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
 911 
 912         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 913             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 914         ddt->ddt_checksum = c;
 915         ddt->ddt_spa = spa;
 916         ddt->ddt_os = spa->spa_meta_objset;
 917 
 918         return (ddt);
 919 }
 920 
 921 static void
 922 ddt_table_free(ddt_t *ddt)
 923 {
             /*
              * Tear down a table built by ddt_table_alloc(): destroy every
              * tree and lock, then free the structure.  All trees,
              * including the repair tree, must already be empty (asserted).
              */
 924         uint_t i;
 925 
 926         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 927 
 928         for (i = 0; i < DDT_HASHSZ; i++) {
 929                 ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
 930                 avl_destroy(&ddt->ddt_tree[i]);
 931                 mutex_destroy(&ddt->ddt_lock[i]);
 932         }
 933         avl_destroy(&ddt->ddt_repair_tree);
 934         mutex_destroy(&ddt->ddt_repair_lock);
 935         kmem_free(ddt, sizeof (*ddt));
 936 }
 937 
 938 void
 939 ddt_create(spa_t *spa)
 940 {
             /*
              * Set the pool's dedup checksum to ZIO_DEDUPCHECKSUM and
              * allocate an in-core table for every checksum function.
              */
 941         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 942 
 943         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 944                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 945 }
 946 
 947 /*
 948  * Get the combined size of DDTs on all pools.
 949  * Returns the on-disk size when phys is B_TRUE, otherwise the in-core size.
 950  */
 951 uint64_t
 952 ddt_get_ddts_size(boolean_t phys)
 953 {
 954         uint64_t ddts_size = 0;
 955         spa_t *spa = NULL;
 956 
             /* spa_next(NULL) starts the walk; a NULL result ends it. */
 957         while ((spa = spa_next(spa)) != NULL)
 958                 ddts_size += spa_get_ddts_size(spa, phys);
 959 
 960         return (ddts_size);
 961 }
 962 
 963 int
 964 ddt_load(spa_t *spa)
 965 {
             /*
              * Allocate the pool's in-core tables and load their on-disk
              * state.  A pool without a DDT stats object (ENOENT) has
              * nothing to load and yields 0.  Any other lookup or load
              * error is returned as is.
              * NOTE(review): on an error return the tables allocated by
              * ddt_create() stay in place; presumably the caller releases
              * them via ddt_unload() -- confirm.
              */
 966         int error;
 967         ddt_object_t *ddo;
 968 
 969         ddt_create(spa);
 970 
 971         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 972             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 973             &spa->spa_ddt_stat_object);
 974 
 975         if (error)
 976                 return (error == ENOENT ? 0 : error);
 977 
 978         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 979                 ddt_t *ddt = spa->spa_ddt[c];
 980                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 981                         for (enum ddt_class class = 0;
 982                             class < DDT_CLASSES; class++) {
 983                                 error = ddt_object_load(ddt, type, class);
                                     /* ENOENT: no object for this slot. */
 984                                 if (error == ENOENT)
 985                                         continue;
 986                                 if (error != 0)
 987                                         return (error);
                                     /* Add the object's sizes to the pool totals. */
 988                                 ddo = &ddt->ddt_object_stats[type][class];
 989                                 atomic_add_64(&spa->spa_ddt_dsize,
 990                                     ddo->ddo_dspace);
 991                                 atomic_add_64(&spa->spa_ddt_msize,
 992                                     ddo->ddo_mspace);
 993                         }
 994                 }
 995 
 996                 /*
 997                  * Seed the cached histograms.
 998                  */
 999                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1000                     sizeof (ddt->ddt_histogram));
1001         }
             /* Refresh the global in-core DDT size across all pools. */
1002         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1003 
1004         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1005                 /* notify that dedup cap is now active */
1006                 spa->spa_ddt_capped = 1;
1007                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1008         }
1009 
1010         return (0);
1011 }
1012 
1013 void
1014 ddt_unload(spa_t *spa)
1015 {
             /*
              * Free every in-core table of the pool and clear its slot,
              * reset the pool's DDT size counters, and refresh the global
              * in-core DDT size.  Slots that are already NULL are skipped.
              */
1016         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1017                 if (spa->spa_ddt[c]) {
1018                         ddt_table_free(spa->spa_ddt[c]);
1019                         spa->spa_ddt[c] = NULL;
1020                 }
1021         }
1022         spa->spa_ddt_dsize = 0;
1023         spa->spa_ddt_msize = 0;
1024         zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
1025 }
1026 
1027 boolean_t
1028 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
1029 {
             /*
              * Return B_TRUE iff bp is a dedup block whose key is found in
              * an on-disk object of class spa_ddt_class_min through
              * max_class.  max_class is clamped to spa_ddt_class_max.  The
              * lookup uses a stack-local entry with only the key filled in.
              */
1030         ddt_t *ddt;
1031         ddt_entry_t dde;
1032 
1033         if (!BP_GET_DEDUP(bp))
1034                 return (B_FALSE);
1035 
1036         if (max_class > spa->spa_ddt_class_max)
1037                 max_class = spa->spa_ddt_class_max;
1038 
1039         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1040 
1041         ddt_key_fill(&dde.dde_key, bp);
1042 
1043         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
1044                 for (enum ddt_class class = spa->spa_ddt_class_min;
1045                     class <= max_class; class++)
1046                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
1047                                 return (B_TRUE);
1048 
1049         return (B_FALSE);
1050 }
1051 
1052 ddt_entry_t *
1053 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1054 {
             /*
              * Allocate a fresh entry for bp's key and try to load it from
              * any on-disk object outside the UNIQUE class.  The entry is
              * always returned.  When no lookup succeeds, its dde_phys
              * array is zeroed before returning.
              */
1055         ddt_key_t ddk;
1056         ddt_entry_t *dde;
1057 
1058         ddt_key_fill(&ddk, bp);
1059 
1060         dde = ddt_alloc(&ddk);
1061 
1062         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1063                 for (enum ddt_class class = 0;
1064                     class < DDT_CLASSES; class++) {
1065                         /*
1066                          * We can only do repair if there are multiple copies
1067                          * of the block.  For anything in the UNIQUE class,
1068                          * there's definitely only one copy, so don't even try.
1069                          */
1070                         if (class != DDT_CLASS_UNIQUE &&
1071                             ddt_object_lookup(ddt, type, class, dde) == 0)
1072                                 return (dde);
1073                 }
1074         }
1075 
1076         bzero(dde->dde_phys, sizeof (dde->dde_phys));
1077 
1078         return (dde);
1079 }
1080 
1081 void
1082 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1083 {
1084         avl_index_t where;
1085 
1086         mutex_enter(&ddt->ddt_repair_lock);
1087 
1088         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
1089             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1090                 avl_insert(&ddt->ddt_repair_tree, dde, where);
1091         else
1092                 ddt_free(dde);
1093 
1094         mutex_exit(&ddt->ddt_repair_lock);;
1095 }
1096 
1097 static void
1098 ddt_repair_entry_done(zio_t *zio)
1099 {
1100         ddt_entry_t *rdde = zio->io_private;
1101 
1102         ddt_free(rdde);
1103 }
1104 
1105 static void
1106 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1107 {
1108         ddt_phys_t *ddp = dde->dde_phys;
1109         ddt_phys_t *rddp = rdde->dde_phys;
1110         ddt_key_t *ddk = &dde->dde_key;
1111         ddt_key_t *rddk = &rdde->dde_key;
1112         zio_t *zio;
1113         blkptr_t blk;
1114 


1123                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1124                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1125                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
1126                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
1127         }
1128 
1129         zio_nowait(zio);
1130 }
1131 
1132 static void
1133 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1134 {
1135         spa_t *spa = ddt->ddt_spa;
1136         ddt_entry_t *dde, *rdde_next, *rdde;
1137         avl_tree_t *t = &ddt->ddt_repair_tree;
1138         blkptr_t blk;
1139 
1140         if (spa_sync_pass(spa) > 1)
1141                 return;
1142 
1143         mutex_enter(&ddt->ddt_repair_lock);
1144         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1145                 rdde_next = AVL_NEXT(t, rdde);
1146                 avl_remove(&ddt->ddt_repair_tree, rdde);
1147                 mutex_exit(&ddt->ddt_repair_lock);
1148 
1149                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
1150                 dde = ddt_repair_start(ddt, &blk);
1151                 ddt_repair_entry(ddt, dde, rdde, rio);
1152                 ddt_repair_done(ddt, dde);
1153 
1154                 mutex_enter(&ddt->ddt_repair_lock);
1155         }
1156         mutex_exit(&ddt->ddt_repair_lock);
1157 }
1158 
1159 static void
1160 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
1161 {
1162         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
1163         ddt_phys_t *ddp = dde->dde_phys;
1164         ddt_key_t *ddk = &dde->dde_key;
1165         spa_t *spa = ddt->ddt_spa;
1166         enum ddt_type otype = dde->dde_type;
1167         enum ddt_type ntype = DDT_TYPE_CURRENT;
1168         enum ddt_class oclass = dde->dde_class;
1169         enum ddt_class nclass;
1170         uint64_t total_refcnt = 0;
1171 
1172         ASSERT(dde->dde_state & DDE_LOADED);
1173         ASSERT(!(dde->dde_state & DDE_LOADING));
1174 
1175         /*
1176          * Propagate the stats generated at lookup time
1177          * this was delayed to avoid having to take locks
1178          * to protect ddt->ddt_histogram
1179          */
1180         if (dde->dde_lkstat.dds_ref_blocks != 0)
1181                 ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);
1182 
1183         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1184                 ASSERT(dde->dde_lead_zio[p] == NULL);
1185                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1186                 if (ddp->ddp_phys_birth == 0) {
1187                         ASSERT(ddp->ddp_refcnt == 0);
1188                         continue;
1189                 }
1190                 if (p == DDT_PHYS_DITTO) {
1191                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1192                                 ddt_phys_free(ddt, ddk, ddp, txg);
1193                         continue;
1194                 }
1195                 if (ddp->ddp_refcnt == 0)
1196                         ddt_phys_free(ddt, ddk, ddp, txg);
1197                 total_refcnt += ddp->ddp_refcnt;
1198         }
1199 
1200         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1201                 nclass = DDT_CLASS_DITTO;
1202         else if (total_refcnt > 1)
1203                 nclass = DDT_CLASS_DUPLICATE;
1204         else
1205                 nclass = DDT_CLASS_UNIQUE;
1206 
1207         if (nclass > spa->spa_ddt_class_max)
1208                 nclass = spa->spa_ddt_class_max;
1209 
1210         if (nclass < spa->spa_ddt_class_min)
1211                 nclass = spa->spa_ddt_class_min;
1212 
1213         DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);
1214 
1215         if (otype != DDT_TYPES &&
1216             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1217                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1218                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1219         }
1220 
1221         if (total_refcnt != 0) {
1222                 dde->dde_type = ntype;
1223                 dde->dde_class = nclass;
1224                 ddt_stat_update(ddt, dde, 0);
1225                 if (!ddt_object_exists(ddt, ntype, nclass))
1226                         ddt_object_create(ddt, ntype, nclass, tx);
1227                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1228 
1229                 /*
1230                  * If the class changes, the order that we scan this bp
1231                  * changes.  If it decreases, we could miss it, so
1232                  * scan it right now.  (This covers both class changing
1233                  * while we are doing ddt_walk(), and when we are
1234                  * traversing.)
1235                  */
1236                 if (nclass < oclass) {
1237                         dsl_scan_ddt_entry(dp->dp_scan,
1238                             ddt->ddt_checksum, dde, tx);
1239                 }
1240         }
1241         DTRACE_PROBE(ddt__stored__entry);
1242 }
1243 
1244 static void
1245 ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
1246 {
1247         void *cookie = NULL;
1248         ddt_entry_t *dde;
1249 
1250         while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
1251                 if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
1252                         ddt_sync_entry(ddt, dde, tx, txg);
1253                 } else { /* if we're not syncing this DDE it must be new */
1254                         ASSERT(dde->dde_state & DDE_NEW);
1255                 }
1256                 ddt_free(dde);
1257         }
1258 }
1259 
1260 static void
1261 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1262 {
1263         uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
1264         int64_t old_mbytes = 0;
1265         spa_t *spa = ddt->ddt_spa;
1266         uint_t i, numnodes = 0;
1267         ddt_object_t *ddo;
1268 
1269         for (i = 0; i < DDT_HASHSZ; i++)
1270                 numnodes += avl_numnodes(&ddt->ddt_tree[i]);
1271 
1272         if (numnodes == 0)
1273                 return;
1274 
1275         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1276 
1277         if (spa->spa_ddt_stat_object == 0) {
1278                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1279                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1280                     DMU_POOL_DDT_STATS, tx);
1281         }
1282 




1283 
1284         DTRACE_PROBE(ddt__syncing__avl);
1285         for (i = 0; i < DDT_HASHSZ; i++)
1286                 ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
1287         DTRACE_PROBE(ddt__synced__avl);
1288 
1289         DTRACE_PROBE(ddt__syncing__obj);
1290         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1291                 for (enum ddt_class class = spa->spa_ddt_class_min;
1292                     class <= spa->spa_ddt_class_max; class++) {
1293                         if (ddt_object_exists(ddt, type, class)) {
1294                                 ddo = &ddt->ddt_object_stats[type][class];
1295                                 old_mbytes += ddo->ddo_mspace;
1296 
1297                                 ddt_object_sync(ddt, type, class, tx);
1298                                 (void) ddt_object_count(ddt, type, class, &cnt);
1299                                 if (cnt == 0) {
1300                                         ddt_object_destroy(ddt, type, class,
1301                                             tx);
1302                                         continue;
1303                                 }
1304 
1305                                 num_dbytes += ddo->ddo_dspace;
1306                                 num_mbytes += ddo->ddo_mspace;
1307                         }



1308                 }
1309         }
1310         spa->spa_ddt_dsize = num_dbytes;
1311         spa->spa_ddt_msize = num_mbytes;
1312         atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
1313         DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
1314             uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
1315             zfs_ddts_msize);
1316 
1317         if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1318                 /* notify that dedup cap is now active */
1319                 spa->spa_ddt_capped = 1;
1320                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1321         } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
1322                 /* notify that dedup cap is now inactive */
1323                 spa->spa_ddt_capped = 0;
1324                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
1325         }
1326 
1327         /* update the cached stats with the values calculated above */
1328         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1329             sizeof (ddt->ddt_histogram));
1330 }
1331 
1332 void
1333 ddt_sync(spa_t *spa, uint64_t txg)
1334 {
1335         dmu_tx_t *tx;
1336         zio_t *rio = zio_root(spa, NULL, NULL,
1337             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1338 
1339         ASSERT(spa_syncing_txg(spa) == txg);
1340 
1341         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1342 
1343         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1344                 ddt_t *ddt = spa->spa_ddt[c];
1345                 if (ddt == NULL)
1346                         continue;
1347                 ddt_sync_table(ddt, tx, txg);
1348                 ddt_repair_table(ddt, rio);
1349         }
1350 
1351         (void) zio_wait(rio);
1352 
1353         dmu_tx_commit(tx);
1354 }
1355 
1356 int
1357 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)