1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/ddt.h>
  32 #include <sys/zap.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/arc.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/zio_compress.h>
  38 #include <sys/dsl_scan.h>
  39 #include <sys/abd.h>
  40 
  41 /*
  42  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  43  */
  44 int zfs_dedup_prefetch = 1;
  45 
  46 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  47         &ddt_zap_ops,
  48 };
  49 
  50 static const char *ddt_class_name[DDT_CLASSES] = {
  51         "ditto",
  52         "duplicate",
  53         "unique",
  54 };
  55 
  56 static void
  57 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  58     dmu_tx_t *tx)
  59 {
  60         spa_t *spa = ddt->ddt_spa;
  61         objset_t *os = ddt->ddt_os;
  62         uint64_t *objectp = &ddt->ddt_object[type][class];
  63         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  64             ZCHECKSUM_FLAG_DEDUP;
  65         char name[DDT_NAMELEN];
  66 
  67         ddt_object_name(ddt, type, class, name);
  68 
  69         ASSERT(*objectp == 0);
  70         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  71         ASSERT(*objectp != 0);
  72 
  73         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  74             sizeof (uint64_t), 1, objectp, tx) == 0);
  75 
  76         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  77             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  78             &ddt->ddt_histogram[type][class], tx) == 0);
  79 }
  80 
  81 static void
  82 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  83     dmu_tx_t *tx)
  84 {
  85         spa_t *spa = ddt->ddt_spa;
  86         objset_t *os = ddt->ddt_os;
  87         uint64_t *objectp = &ddt->ddt_object[type][class];
  88         char name[DDT_NAMELEN];
  89 
  90         ddt_object_name(ddt, type, class, name);
  91 
  92         ASSERT(*objectp != 0);
  93         ASSERT(ddt_object_count(ddt, type, class) == 0);
  94         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  95         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  96         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  97         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  98         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  99 
 100         *objectp = 0;
 101 }
 102 
 103 static int
 104 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 105 {
 106         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 107         dmu_object_info_t doi;
 108         char name[DDT_NAMELEN];
 109         int error;
 110 
 111         ddt_object_name(ddt, type, class, name);
 112 
 113         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 114             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 115 
 116         if (error != 0)
 117                 return (error);
 118 
 119         VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 120             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 121             &ddt->ddt_histogram[type][class]));
 122 
 123         /*
 124          * Seed the cached statistics.
 125          */
 126         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 127 
 128         ddo->ddo_count = ddt_object_count(ddt, type, class);
 129         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 130         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 131 
 132         return (0);
 133 }
 134 
 135 static void
 136 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 137     dmu_tx_t *tx)
 138 {
 139         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 140         dmu_object_info_t doi;
 141         char name[DDT_NAMELEN];
 142 
 143         ddt_object_name(ddt, type, class, name);
 144 
 145         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 146             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 147             &ddt->ddt_histogram[type][class], tx) == 0);
 148 
 149         /*
 150          * Cache DDT statistics; this is the only time they'll change.
 151          */
 152         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 153 
 154         ddo->ddo_count = ddt_object_count(ddt, type, class);
 155         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 156         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 157 }
 158 
 159 static int
 160 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 161     ddt_entry_t *dde)
 162 {
 163         if (!ddt_object_exists(ddt, type, class))
 164                 return (SET_ERROR(ENOENT));
 165 
 166         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 167             ddt->ddt_object[type][class], dde));
 168 }
 169 
 170 static void
 171 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 172     ddt_entry_t *dde)
 173 {
 174         if (!ddt_object_exists(ddt, type, class))
 175                 return;
 176 
 177         ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
 178             ddt->ddt_object[type][class], dde);
 179 }
 180 
 181 int
 182 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 183     ddt_entry_t *dde, dmu_tx_t *tx)
 184 {
 185         ASSERT(ddt_object_exists(ddt, type, class));
 186 
 187         return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 188             ddt->ddt_object[type][class], dde, tx));
 189 }
 190 
 191 static int
 192 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 193     ddt_entry_t *dde, dmu_tx_t *tx)
 194 {
 195         ASSERT(ddt_object_exists(ddt, type, class));
 196 
 197         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 198             ddt->ddt_object[type][class], dde, tx));
 199 }
 200 
 201 int
 202 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 203     uint64_t *walk, ddt_entry_t *dde)
 204 {
 205         ASSERT(ddt_object_exists(ddt, type, class));
 206 
 207         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 208             ddt->ddt_object[type][class], dde, walk));
 209 }
 210 
 211 uint64_t
 212 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 213 {
 214         ASSERT(ddt_object_exists(ddt, type, class));
 215 
 216         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 217             ddt->ddt_object[type][class]));
 218 }
 219 
 220 int
 221 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 222     dmu_object_info_t *doi)
 223 {
 224         if (!ddt_object_exists(ddt, type, class))
 225                 return (SET_ERROR(ENOENT));
 226 
 227         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 228             doi));
 229 }
 230 
 231 boolean_t
 232 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 233 {
 234         return (!!ddt->ddt_object[type][class]);
 235 }
 236 
 237 void
 238 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 239     char *name)
 240 {
 241         (void) sprintf(name, DMU_POOL_DDT,
 242             zio_checksum_table[ddt->ddt_checksum].ci_name,
 243             ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
 244 }
 245 
 246 void
 247 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
 248 {
 249         ASSERT(txg != 0);
 250 
 251         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 252                 bp->blk_dva[d] = ddp->ddp_dva[d];
 253         BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
 254 }
 255 
 256 void
 257 ddt_bp_create(enum zio_checksum checksum,
 258     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 259 {
 260         BP_ZERO(bp);
 261 
 262         if (ddp != NULL)
 263                 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 264 
 265         bp->blk_cksum = ddk->ddk_cksum;
 266         bp->blk_fill = 1;
 267 
 268         BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 269         BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 270         BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 271         BP_SET_CHECKSUM(bp, checksum);
 272         BP_SET_TYPE(bp, DMU_OT_DEDUP);
 273         BP_SET_LEVEL(bp, 0);
 274         BP_SET_DEDUP(bp, 0);
 275         BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 276 }
 277 
 278 void
 279 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 280 {
 281         ddk->ddk_cksum = bp->blk_cksum;
 282         ddk->ddk_prop = 0;
 283 
 284         DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
 285         DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
 286         DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
 287 }
 288 
 289 void
 290 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 291 {
 292         ASSERT(ddp->ddp_phys_birth == 0);
 293 
 294         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 295                 ddp->ddp_dva[d] = bp->blk_dva[d];
 296         ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
 297 }
 298 
 299 void
 300 ddt_phys_clear(ddt_phys_t *ddp)
 301 {
 302         bzero(ddp, sizeof (*ddp));
 303 }
 304 
 305 void
 306 ddt_phys_addref(ddt_phys_t *ddp)
 307 {
 308         ddp->ddp_refcnt++;
 309 }
 310 
 311 void
 312 ddt_phys_decref(ddt_phys_t *ddp)
 313 {
 314         ASSERT((int64_t)ddp->ddp_refcnt > 0);
 315         ddp->ddp_refcnt--;
 316 }
 317 
 318 void
 319 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 320 {
 321         blkptr_t blk;
 322 
 323         ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 324         ddt_phys_clear(ddp);
 325         zio_free(ddt->ddt_spa, txg, &blk);
 326 }
 327 
 328 ddt_phys_t *
 329 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 330 {
 331         ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
 332 
 333         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 334                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 335                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 336                         return (ddp);
 337         }
 338         return (NULL);
 339 }
 340 
 341 uint64_t
 342 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 343 {
 344         uint64_t refcnt = 0;
 345 
 346         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 347                 refcnt += dde->dde_phys[p].ddp_refcnt;
 348 
 349         return (refcnt);
 350 }
 351 
 352 static void
 353 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 354 {
 355         spa_t *spa = ddt->ddt_spa;
 356         ddt_phys_t *ddp = dde->dde_phys;
 357         ddt_key_t *ddk = &dde->dde_key;
 358         uint64_t lsize = DDK_GET_LSIZE(ddk);
 359         uint64_t psize = DDK_GET_PSIZE(ddk);
 360 
 361         bzero(dds, sizeof (*dds));
 362 
 363         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 364                 uint64_t dsize = 0;
 365                 uint64_t refcnt = ddp->ddp_refcnt;
 366 
 367                 if (ddp->ddp_phys_birth == 0)
 368                         continue;
 369 
 370                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 371                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 372 
 373                 dds->dds_blocks += 1;
 374                 dds->dds_lsize += lsize;
 375                 dds->dds_psize += psize;
 376                 dds->dds_dsize += dsize;
 377 
 378                 dds->dds_ref_blocks += refcnt;
 379                 dds->dds_ref_lsize += lsize * refcnt;
 380                 dds->dds_ref_psize += psize * refcnt;
 381                 dds->dds_ref_dsize += dsize * refcnt;
 382         }
 383 }
 384 
 385 void
 386 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 387 {
 388         const uint64_t *s = (const uint64_t *)src;
 389         uint64_t *d = (uint64_t *)dst;
 390         uint64_t *d_end = (uint64_t *)(dst + 1);
 391 
 392         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 393 
 394         while (d < d_end)
 395                 *d++ += (*s++ ^ neg) - neg;
 396 }
 397 
 398 static void
 399 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 400 {
 401         ddt_stat_t dds;
 402         ddt_histogram_t *ddh;
 403         int bucket;
 404 
 405         ddt_stat_generate(ddt, dde, &dds);
 406 
 407         bucket = highbit64(dds.dds_ref_blocks) - 1;
 408         ASSERT(bucket >= 0);
 409 
 410         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 411 
 412         ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
 413 }
 414 
 415 void
 416 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 417 {
 418         for (int h = 0; h < 64; h++)
 419                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 420 }
 421 
 422 void
 423 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 424 {
 425         bzero(dds, sizeof (*dds));
 426 
 427         for (int h = 0; h < 64; h++)
 428                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 429 }
 430 
 431 boolean_t
 432 ddt_histogram_empty(const ddt_histogram_t *ddh)
 433 {
 434         const uint64_t *s = (const uint64_t *)ddh;
 435         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 436 
 437         while (s < s_end)
 438                 if (*s++ != 0)
 439                         return (B_FALSE);
 440 
 441         return (B_TRUE);
 442 }
 443 
 444 void
 445 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 446 {
 447         /* Sum the statistics we cached in ddt_object_sync(). */
 448         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 449                 ddt_t *ddt = spa->spa_ddt[c];
 450                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 451                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 452                             class++) {
 453                                 ddt_object_t *ddo =
 454                                     &ddt->ddt_object_stats[type][class];
 455                                 ddo_total->ddo_count += ddo->ddo_count;
 456                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 457                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 458                         }
 459                 }
 460         }
 461 
 462         /* ... and compute the averages. */
 463         if (ddo_total->ddo_count != 0) {
 464                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 465                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 466         }
 467 }
 468 
 469 void
 470 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 471 {
 472         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 473                 ddt_t *ddt = spa->spa_ddt[c];
 474                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 475                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 476                             class++) {
 477                                 ddt_histogram_add(ddh,
 478                                     &ddt->ddt_histogram_cache[type][class]);
 479                         }
 480                 }
 481         }
 482 }
 483 
 484 void
 485 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 486 {
 487         ddt_histogram_t *ddh_total;
 488 
 489         ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 490         ddt_get_dedup_histogram(spa, ddh_total);
 491         ddt_histogram_stat(dds_total, ddh_total);
 492         kmem_free(ddh_total, sizeof (ddt_histogram_t));
 493 }
 494 
 495 uint64_t
 496 ddt_get_dedup_dspace(spa_t *spa)
 497 {
 498         ddt_stat_t dds_total = { 0 };
 499 
 500         ddt_get_dedup_stats(spa, &dds_total);
 501         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 502 }
 503 
 504 uint64_t
 505 ddt_get_pool_dedup_ratio(spa_t *spa)
 506 {
 507         ddt_stat_t dds_total = { 0 };
 508 
 509         ddt_get_dedup_stats(spa, &dds_total);
 510         if (dds_total.dds_dsize == 0)
 511                 return (100);
 512 
 513         return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
 514 }
 515 
 516 int
 517 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
 518 {
 519         spa_t *spa = ddt->ddt_spa;
 520         uint64_t total_refcnt = 0;
 521         uint64_t ditto = spa->spa_dedup_ditto;
 522         int total_copies = 0;
 523         int desired_copies = 0;
 524 
 525         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 526                 ddt_phys_t *ddp = &dde->dde_phys[p];
 527                 zio_t *zio = dde->dde_lead_zio[p];
 528                 uint64_t refcnt = ddp->ddp_refcnt;   /* committed refs */
 529                 if (zio != NULL)
 530                         refcnt += zio->io_parent_count;      /* pending refs */
 531                 if (ddp == ddp_willref)
 532                         refcnt++;                       /* caller's ref */
 533                 if (refcnt != 0) {
 534                         total_refcnt += refcnt;
 535                         total_copies += p;
 536                 }
 537         }
 538 
 539         if (ditto == 0 || ditto > UINT32_MAX)
 540                 ditto = UINT32_MAX;
 541 
 542         if (total_refcnt >= 1)
 543                 desired_copies++;
 544         if (total_refcnt >= ditto)
 545                 desired_copies++;
 546         if (total_refcnt >= ditto * ditto)
 547                 desired_copies++;
 548 
 549         return (MAX(desired_copies, total_copies) - total_copies);
 550 }
 551 
 552 int
 553 ddt_ditto_copies_present(ddt_entry_t *dde)
 554 {
 555         ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
 556         dva_t *dva = ddp->ddp_dva;
 557         int copies = 0 - DVA_GET_GANG(dva);
 558 
 559         for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
 560                 if (DVA_IS_VALID(dva))
 561                         copies++;
 562 
 563         ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
 564 
 565         return (copies);
 566 }
 567 
 568 size_t
 569 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
 570 {
 571         uchar_t *version = dst++;
 572         int cpfunc = ZIO_COMPRESS_ZLE;
 573         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 574         size_t c_len;
 575 
 576         ASSERT(d_len >= s_len + 1);  /* no compression plus version byte */
 577 
 578         c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
 579 
 580         if (c_len == s_len) {
 581                 cpfunc = ZIO_COMPRESS_OFF;
 582                 bcopy(src, dst, s_len);
 583         }
 584 
 585         *version = cpfunc;
 586         /* CONSTCOND */
 587         if (ZFS_HOST_BYTEORDER)
 588                 *version |= DDT_COMPRESS_BYTEORDER_MASK;
 589 
 590         return (c_len + 1);
 591 }
 592 
 593 void
 594 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 595 {
 596         uchar_t version = *src++;
 597         int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
 598         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 599 
 600         if (ci->ci_decompress != NULL)
 601                 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 602         else
 603                 bcopy(src, dst, d_len);
 604 
 605         if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
 606             (ZFS_HOST_BYTEORDER != 0))
 607                 byteswap_uint64_array(dst, d_len);
 608 }
 609 
 610 ddt_t *
 611 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 612 {
 613         return (spa->spa_ddt[c]);
 614 }
 615 
 616 ddt_t *
 617 ddt_select(spa_t *spa, const blkptr_t *bp)
 618 {
 619         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 620 }
 621 
 622 void
 623 ddt_enter(ddt_t *ddt)
 624 {
 625         mutex_enter(&ddt->ddt_lock);
 626 }
 627 
 628 void
 629 ddt_exit(ddt_t *ddt)
 630 {
 631         mutex_exit(&ddt->ddt_lock);
 632 }
 633 
 634 static ddt_entry_t *
 635 ddt_alloc(const ddt_key_t *ddk)
 636 {
 637         ddt_entry_t *dde;
 638 
 639         dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 640         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 641 
 642         dde->dde_key = *ddk;
 643 
 644         return (dde);
 645 }
 646 
 647 static void
 648 ddt_free(ddt_entry_t *dde)
 649 {
 650         ASSERT(!dde->dde_loading);
 651 
 652         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 653                 ASSERT(dde->dde_lead_zio[p] == NULL);
 654 
 655         if (dde->dde_repair_abd != NULL)
 656                 abd_free(dde->dde_repair_abd);
 657 
 658         cv_destroy(&dde->dde_cv);
 659         kmem_free(dde, sizeof (*dde));
 660 }
 661 
 662 void
 663 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 664 {
 665         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 666 
 667         avl_remove(&ddt->ddt_tree, dde);
 668         ddt_free(dde);
 669 }
 670 
 671 ddt_entry_t *
 672 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 673 {
 674         ddt_entry_t *dde, dde_search;
 675         enum ddt_type type;
 676         enum ddt_class class;
 677         avl_index_t where;
 678         int error;
 679 
 680         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 681 
 682         ddt_key_fill(&dde_search.dde_key, bp);
 683 
 684         dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
 685         if (dde == NULL) {
 686                 if (!add)
 687                         return (NULL);
 688                 dde = ddt_alloc(&dde_search.dde_key);
 689                 avl_insert(&ddt->ddt_tree, dde, where);
 690         }
 691 
 692         while (dde->dde_loading)
 693                 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 694 
 695         if (dde->dde_loaded)
 696                 return (dde);
 697 
 698         dde->dde_loading = B_TRUE;
 699 
 700         ddt_exit(ddt);
 701 
 702         error = ENOENT;
 703 
 704         for (type = 0; type < DDT_TYPES; type++) {
 705                 for (class = 0; class < DDT_CLASSES; class++) {
 706                         error = ddt_object_lookup(ddt, type, class, dde);
 707                         if (error != ENOENT) {
 708                                 ASSERT0(error);
 709                                 break;
 710                         }
 711                 }
 712                 if (error != ENOENT)
 713                         break;
 714         }
 715 
 716         ddt_enter(ddt);
 717 
 718         ASSERT(dde->dde_loaded == B_FALSE);
 719         ASSERT(dde->dde_loading == B_TRUE);
 720 
 721         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 722         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 723         dde->dde_loaded = B_TRUE;
 724         dde->dde_loading = B_FALSE;
 725 
 726         if (error == 0)
 727                 ddt_stat_update(ddt, dde, -1ULL);
 728 
 729         cv_broadcast(&dde->dde_cv);
 730 
 731         return (dde);
 732 }
 733 
 734 void
 735 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 736 {
 737         ddt_t *ddt;
 738         ddt_entry_t dde;
 739 
 740         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 741                 return;
 742 
 743         /*
 744          * We only remove the DDT once all tables are empty and only
 745          * prefetch dedup blocks when there are entries in the DDT.
 746          * Thus no locking is required as the DDT can't disappear on us.
 747          */
 748         ddt = ddt_select(spa, bp);
 749         ddt_key_fill(&dde.dde_key, bp);
 750 
 751         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 752                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 753                         ddt_object_prefetch(ddt, type, class, &dde);
 754                 }
 755         }
 756 }
 757 
 758 int
 759 ddt_entry_compare(const void *x1, const void *x2)
 760 {
 761         const ddt_entry_t *dde1 = x1;
 762         const ddt_entry_t *dde2 = x2;
 763         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 764         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 765 
 766         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 767                 if (u1[i] < u2[i])
 768                         return (-1);
 769                 if (u1[i] > u2[i])
 770                         return (1);
 771         }
 772 
 773         return (0);
 774 }
 775 
 776 static ddt_t *
 777 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 778 {
 779         ddt_t *ddt;
 780 
 781         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 782 
 783         mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 784         avl_create(&ddt->ddt_tree, ddt_entry_compare,
 785             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 786         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 787             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 788         ddt->ddt_checksum = c;
 789         ddt->ddt_spa = spa;
 790         ddt->ddt_os = spa->spa_meta_objset;
 791 
 792         return (ddt);
 793 }
 794 
 795 static void
 796 ddt_table_free(ddt_t *ddt)
 797 {
 798         ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
 799         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 800         avl_destroy(&ddt->ddt_tree);
 801         avl_destroy(&ddt->ddt_repair_tree);
 802         mutex_destroy(&ddt->ddt_lock);
 803         kmem_free(ddt, sizeof (*ddt));
 804 }
 805 
 806 void
 807 ddt_create(spa_t *spa)
 808 {
 809         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 810 
 811         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 812                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 813 }
 814 
 815 int
 816 ddt_load(spa_t *spa)
 817 {
 818         int error;
 819 
 820         ddt_create(spa);
 821 
 822         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 823             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 824             &spa->spa_ddt_stat_object);
 825 
 826         if (error)
 827                 return (error == ENOENT ? 0 : error);
 828 
 829         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 830                 ddt_t *ddt = spa->spa_ddt[c];
 831                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 832                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 833                             class++) {
 834                                 error = ddt_object_load(ddt, type, class);
 835                                 if (error != 0 && error != ENOENT)
 836                                         return (error);
 837                         }
 838                 }
 839 
 840                 /*
 841                  * Seed the cached histograms.
 842                  */
 843                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 844                     sizeof (ddt->ddt_histogram));
 845         }
 846 
 847         return (0);
 848 }
 849 
 850 void
 851 ddt_unload(spa_t *spa)
 852 {
 853         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 854                 if (spa->spa_ddt[c]) {
 855                         ddt_table_free(spa->spa_ddt[c]);
 856                         spa->spa_ddt[c] = NULL;
 857                 }
 858         }
 859 }
 860 
 861 boolean_t
 862 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 863 {
 864         ddt_t *ddt;
 865         ddt_entry_t dde;
 866 
 867         if (!BP_GET_DEDUP(bp))
 868                 return (B_FALSE);
 869 
 870         if (max_class == DDT_CLASS_UNIQUE)
 871                 return (B_TRUE);
 872 
 873         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 874 
 875         ddt_key_fill(&dde.dde_key, bp);
 876 
 877         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 878                 for (enum ddt_class class = 0; class <= max_class; class++)
 879                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 880                                 return (B_TRUE);
 881 
 882         return (B_FALSE);
 883 }
 884 
 885 ddt_entry_t *
 886 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 887 {
 888         ddt_key_t ddk;
 889         ddt_entry_t *dde;
 890 
 891         ddt_key_fill(&ddk, bp);
 892 
 893         dde = ddt_alloc(&ddk);
 894 
 895         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 896                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 897                         /*
 898                          * We can only do repair if there are multiple copies
 899                          * of the block.  For anything in the UNIQUE class,
 900                          * there's definitely only one copy, so don't even try.
 901                          */
 902                         if (class != DDT_CLASS_UNIQUE &&
 903                             ddt_object_lookup(ddt, type, class, dde) == 0)
 904                                 return (dde);
 905                 }
 906         }
 907 
 908         bzero(dde->dde_phys, sizeof (dde->dde_phys));
 909 
 910         return (dde);
 911 }
 912 
 913 void
 914 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 915 {
 916         avl_index_t where;
 917 
 918         ddt_enter(ddt);
 919 
 920         if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 921             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 922                 avl_insert(&ddt->ddt_repair_tree, dde, where);
 923         else
 924                 ddt_free(dde);
 925 
 926         ddt_exit(ddt);
 927 }
 928 
 929 static void
 930 ddt_repair_entry_done(zio_t *zio)
 931 {
 932         ddt_entry_t *rdde = zio->io_private;
 933 
 934         ddt_free(rdde);
 935 }
 936 
 937 static void
 938 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 939 {
 940         ddt_phys_t *ddp = dde->dde_phys;
 941         ddt_phys_t *rddp = rdde->dde_phys;
 942         ddt_key_t *ddk = &dde->dde_key;
 943         ddt_key_t *rddk = &rdde->dde_key;
 944         zio_t *zio;
 945         blkptr_t blk;
 946 
 947         zio = zio_null(rio, rio->io_spa, NULL,
 948             ddt_repair_entry_done, rdde, rio->io_flags);
 949 
 950         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
 951                 if (ddp->ddp_phys_birth == 0 ||
 952                     ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 953                     bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
 954                         continue;
 955                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 956                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 957                     rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
 958                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 959         }
 960 
 961         zio_nowait(zio);
 962 }
 963 
 964 static void
 965 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 966 {
 967         spa_t *spa = ddt->ddt_spa;
 968         ddt_entry_t *dde, *rdde_next, *rdde;
 969         avl_tree_t *t = &ddt->ddt_repair_tree;
 970         blkptr_t blk;
 971 
 972         if (spa_sync_pass(spa) > 1)
 973                 return;
 974 
 975         ddt_enter(ddt);
 976         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 977                 rdde_next = AVL_NEXT(t, rdde);
 978                 avl_remove(&ddt->ddt_repair_tree, rdde);
 979                 ddt_exit(ddt);
 980                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 981                 dde = ddt_repair_start(ddt, &blk);
 982                 ddt_repair_entry(ddt, dde, rdde, rio);
 983                 ddt_repair_done(ddt, dde);
 984                 ddt_enter(ddt);
 985         }
 986         ddt_exit(ddt);
 987 }
 988 
 989 static void
 990 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 991 {
 992         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 993         ddt_phys_t *ddp = dde->dde_phys;
 994         ddt_key_t *ddk = &dde->dde_key;
 995         enum ddt_type otype = dde->dde_type;
 996         enum ddt_type ntype = DDT_TYPE_CURRENT;
 997         enum ddt_class oclass = dde->dde_class;
 998         enum ddt_class nclass;
 999         uint64_t total_refcnt = 0;
1000 
1001         ASSERT(dde->dde_loaded);
1002         ASSERT(!dde->dde_loading);
1003 
1004         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1005                 ASSERT(dde->dde_lead_zio[p] == NULL);
1006                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1007                 if (ddp->ddp_phys_birth == 0) {
1008                         ASSERT(ddp->ddp_refcnt == 0);
1009                         continue;
1010                 }
1011                 if (p == DDT_PHYS_DITTO) {
1012                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1013                                 ddt_phys_free(ddt, ddk, ddp, txg);
1014                         continue;
1015                 }
1016                 if (ddp->ddp_refcnt == 0)
1017                         ddt_phys_free(ddt, ddk, ddp, txg);
1018                 total_refcnt += ddp->ddp_refcnt;
1019         }
1020 
1021         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1022                 nclass = DDT_CLASS_DITTO;
1023         else if (total_refcnt > 1)
1024                 nclass = DDT_CLASS_DUPLICATE;
1025         else
1026                 nclass = DDT_CLASS_UNIQUE;
1027 
1028         if (otype != DDT_TYPES &&
1029             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1030                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1031                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1032         }
1033 
1034         if (total_refcnt != 0) {
1035                 dde->dde_type = ntype;
1036                 dde->dde_class = nclass;
1037                 ddt_stat_update(ddt, dde, 0);
1038                 if (!ddt_object_exists(ddt, ntype, nclass))
1039                         ddt_object_create(ddt, ntype, nclass, tx);
1040                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1041 
1042                 /*
1043                  * If the class changes, the order that we scan this bp
1044                  * changes.  If it decreases, we could miss it, so
1045                  * scan it right now.  (This covers both class changing
1046                  * while we are doing ddt_walk(), and when we are
1047                  * traversing.)
1048                  */
1049                 if (nclass < oclass) {
1050                         dsl_scan_ddt_entry(dp->dp_scan,
1051                             ddt->ddt_checksum, dde, tx);
1052                 }
1053         }
1054 }
1055 
1056 static void
1057 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1058 {
1059         spa_t *spa = ddt->ddt_spa;
1060         ddt_entry_t *dde;
1061         void *cookie = NULL;
1062 
1063         if (avl_numnodes(&ddt->ddt_tree) == 0)
1064                 return;
1065 
1066         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1067 
1068         if (spa->spa_ddt_stat_object == 0) {
1069                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1070                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1071                     DMU_POOL_DDT_STATS, tx);
1072         }
1073 
1074         while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1075                 ddt_sync_entry(ddt, dde, tx, txg);
1076                 ddt_free(dde);
1077         }
1078 
1079         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1080                 uint64_t count = 0;
1081                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1082                         if (ddt_object_exists(ddt, type, class)) {
1083                                 ddt_object_sync(ddt, type, class, tx);
1084                                 count += ddt_object_count(ddt, type, class);
1085                         }
1086                 }
1087                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1088                         if (count == 0 && ddt_object_exists(ddt, type, class))
1089                                 ddt_object_destroy(ddt, type, class, tx);
1090                 }
1091         }
1092 
1093         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1094             sizeof (ddt->ddt_histogram));
1095 }
1096 
1097 void
1098 ddt_sync(spa_t *spa, uint64_t txg)
1099 {
1100         dmu_tx_t *tx;
1101         zio_t *rio = zio_root(spa, NULL, NULL,
1102             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
1103 
1104         ASSERT(spa_syncing_txg(spa) == txg);
1105 
1106         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1107 
1108         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1109                 ddt_t *ddt = spa->spa_ddt[c];
1110                 if (ddt == NULL)
1111                         continue;
1112                 ddt_sync_table(ddt, tx, txg);
1113                 ddt_repair_table(ddt, rio);
1114         }
1115 
1116         (void) zio_wait(rio);
1117 
1118         dmu_tx_commit(tx);
1119 }
1120 
1121 int
1122 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
1123 {
1124         do {
1125                 do {
1126                         do {
1127                                 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
1128                                 int error = ENOENT;
1129                                 if (ddt_object_exists(ddt, ddb->ddb_type,
1130                                     ddb->ddb_class)) {
1131                                         error = ddt_object_walk(ddt,
1132                                             ddb->ddb_type, ddb->ddb_class,
1133                                             &ddb->ddb_cursor, dde);
1134                                 }
1135                                 dde->dde_type = ddb->ddb_type;
1136                                 dde->dde_class = ddb->ddb_class;
1137                                 if (error == 0)
1138                                         return (0);
1139                                 if (error != ENOENT)
1140                                         return (error);
1141                                 ddb->ddb_cursor = 0;
1142                         } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
1143                         ddb->ddb_checksum = 0;
1144                 } while (++ddb->ddb_type < DDT_TYPES);
1145                 ddb->ddb_type = 0;
1146         } while (++ddb->ddb_class < DDT_CLASSES);
1147 
1148         return (SET_ERROR(ENOENT));
1149 }