NEX-5856 ddt_capped isn't reset when deduped dataset is destroyed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (fix studio build)
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-3165 need some dedup improvements
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-3211 mismerge ddt_repair_start()
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Issue #2: optimize DDE lookup in DDT objects
Added an option to control the number of DDE classes in the DDT.
The new default is one, i.e. all DDEs are stored together regardless
of refcount (see the illustrative sketch after this change log).
re #12611 rb4105 zpool import panic in ddt_zap_count()
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint - courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
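
The single-class behavior described above boils down to clamping each entry's
computed class into the spa's [class_min, class_max] range (see ddt_sync_entry()
in the diff below). A minimal standalone sketch of that clamping, using
hypothetical model_* names rather than the kernel structures:

/*
 * Toy model of DDE class selection with a configurable class range.
 * With a single-class range, unique and duplicate entries end up in the
 * same class regardless of refcount; with the full range they are split
 * as before.  Names are illustrative only.
 */
#include <stdio.h>

enum model_class { MC_DITTO, MC_DUPLICATE, MC_UNIQUE };

static enum model_class
model_classify(unsigned int refcnt, enum model_class cls_min,
    enum model_class cls_max)
{
        enum model_class cls = (refcnt > 1) ? MC_DUPLICATE : MC_UNIQUE;

        /* clamp into the configured class range, as ddt_sync_entry() does */
        if (cls > cls_max)
                cls = cls_max;
        if (cls < cls_min)
                cls = cls_min;
        return (cls);
}

int
main(void)
{
        /* single-class range: refcnt 1 and refcnt 5 land in the same class */
        printf("%d %d\n",
            model_classify(1, MC_DUPLICATE, MC_DUPLICATE),
            model_classify(5, MC_DUPLICATE, MC_DUPLICATE));
        /* full range: entries are split by refcount as before */
        printf("%d %d\n",
            model_classify(1, MC_DITTO, MC_UNIQUE),
            model_classify(5, MC_DITTO, MC_UNIQUE));
        return (0);
}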

--- old/usr/src/uts/common/fs/zfs/ddt.c
+++ new/usr/src/uts/common/fs/zfs/ddt.c
(14 lines elided)
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
       25 + * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/spa.h>
  29   30  #include <sys/spa_impl.h>
  30   31  #include <sys/zio.h>
  31   32  #include <sys/ddt.h>
  32   33  #include <sys/zap.h>
  33   34  #include <sys/dmu_tx.h>
  34   35  #include <sys/arc.h>
  35   36  #include <sys/dsl_pool.h>
  36   37  #include <sys/zio_checksum.h>
  37   38  #include <sys/zio_compress.h>
  38   39  #include <sys/dsl_scan.h>
  39   40  #include <sys/abd.h>
  40   41  
  41   42  /*
       43 + * Almost all iteration over the ZAPs that hold DDT entries is
       44 + * restricted to the classes in spa->spa_ddt_class_{min,max}, which
       45 + * makes it possible to store all entries in a single ZAP. However,
       46 + * a few places still walk every ZAP unconditionally: table creation,
       47 + * deletion, loading, DDE prefetching, and lookup. This preserves
       48 + * compatibility with old pools and allows the old pool format to be
       49 + * converted to the new one on-the-fly.
       50 + */
       51 +
       52 +/*
  42   53   * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  43   54   */
  44   55  int zfs_dedup_prefetch = 1;
  45   56  
  46   57  static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  47   58          &ddt_zap_ops,
  48   59  };
  49   60  
  50   61  static const char *ddt_class_name[DDT_CLASSES] = {
  51   62          "ditto",
  52   63          "duplicate",
  53   64          "unique",
  54   65  };
  55   66  
       67 +/* Potential in-core size of all DDTs, across all pools */
       68 +uint64_t zfs_ddts_msize = 0;
       69 +
  56   70  static void
  57   71  ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  58   72      dmu_tx_t *tx)
  59   73  {
  60   74          spa_t *spa = ddt->ddt_spa;
  61   75          objset_t *os = ddt->ddt_os;
  62   76          uint64_t *objectp = &ddt->ddt_object[type][class];
  63   77          boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
  64   78              ZCHECKSUM_FLAG_DEDUP;
  65   79          char name[DDT_NAMELEN];
(13 lines elided)
  79   93  }
  80   94  
  81   95  static void
  82   96  ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  83   97      dmu_tx_t *tx)
  84   98  {
  85   99          spa_t *spa = ddt->ddt_spa;
  86  100          objset_t *os = ddt->ddt_os;
  87  101          uint64_t *objectp = &ddt->ddt_object[type][class];
  88  102          char name[DDT_NAMELEN];
  89      -
      103 +#if DEBUG
      104 +        uint64_t count;
      105 +#endif
  90  106          ddt_object_name(ddt, type, class, name);
  91  107  
  92  108          ASSERT(*objectp != 0);
  93      -        ASSERT(ddt_object_count(ddt, type, class) == 0);
      109 +        ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
      110 +            (count == 0));
  94  111          ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  95  112          VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  96  113          VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  97  114          VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  98  115          bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  99  116  
 100  117          *objectp = 0;
 101  118  }
 102  119  
 103  120  static int
(1 line elided)
 105  122  {
 106  123          ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 107  124          dmu_object_info_t doi;
 108  125          char name[DDT_NAMELEN];
 109  126          int error;
 110  127  
 111  128          ddt_object_name(ddt, type, class, name);
 112  129  
 113  130          error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 114  131              sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 115      -
 116      -        if (error != 0)
      132 +        if (error)
 117  133                  return (error);
 118  134  
 119  135          VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 120  136              sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 121  137              &ddt->ddt_histogram[type][class]));
 122  138  
 123  139          /*
 124  140           * Seed the cached statistics.
 125  141           */
 126      -        VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 127      -
 128      -        ddo->ddo_count = ddt_object_count(ddt, type, class);
      142 +        error = ddt_object_info(ddt, type, class, &doi);
      143 +        /* Panic in debug mode */
      144 +        ASSERT(error == 0);
      145 +        if (error)
      146 +                return (error);
      147 +        error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
      148 +        if (error)
      149 +                return (error);
 129  150          ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 130  151          ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 131  152  
 132  153          return (0);
 133  154  }
 134  155  
 135  156  static void
 136  157  ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 137  158      dmu_tx_t *tx)
 138  159  {
(5 lines elided)
 144  165  
 145  166          VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 146  167              sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 147  168              &ddt->ddt_histogram[type][class], tx) == 0);
 148  169  
 149  170          /*
 150  171           * Cache DDT statistics; this is the only time they'll change.
 151  172           */
 152  173          VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 153  174  
 154      -        ddo->ddo_count = ddt_object_count(ddt, type, class);
      175 +        (void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
 155  176          ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 156  177          ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 157  178  }
 158  179  
 159  180  static int
 160  181  ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 161  182      ddt_entry_t *dde)
 162  183  {
 163  184          if (!ddt_object_exists(ddt, type, class))
 164  185                  return (SET_ERROR(ENOENT));
(36 lines elided)
 201  222  int
 202  223  ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 203  224      uint64_t *walk, ddt_entry_t *dde)
 204  225  {
 205  226          ASSERT(ddt_object_exists(ddt, type, class));
 206  227  
 207  228          return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 208  229              ddt->ddt_object[type][class], dde, walk));
 209  230  }
 210  231  
 211      -uint64_t
 212      -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
      232 +int
      233 +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
      234 +        uint64_t *count)
 213  235  {
 214  236          ASSERT(ddt_object_exists(ddt, type, class));
 215  237  
 216  238          return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 217      -            ddt->ddt_object[type][class]));
      239 +            ddt->ddt_object[type][class], count));
 218  240  }
 219  241  
 220  242  int
 221  243  ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 222  244      dmu_object_info_t *doi)
 223  245  {
 224  246          if (!ddt_object_exists(ddt, type, class))
 225  247                  return (SET_ERROR(ENOENT));
 226  248  
 227  249          return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
(115 lines elided)
 343  365  {
 344  366          uint64_t refcnt = 0;
 345  367  
 346  368          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 347  369                  refcnt += dde->dde_phys[p].ddp_refcnt;
 348  370  
 349  371          return (refcnt);
 350  372  }
 351  373  
 352  374  static void
 353      -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
      375 +ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
 354  376  {
 355      -        spa_t *spa = ddt->ddt_spa;
 356  377          ddt_phys_t *ddp = dde->dde_phys;
 357  378          ddt_key_t *ddk = &dde->dde_key;
 358  379          uint64_t lsize = DDK_GET_LSIZE(ddk);
 359  380          uint64_t psize = DDK_GET_PSIZE(ddk);
 360  381  
 361  382          bzero(dds, sizeof (*dds));
 362  383  
 363  384          for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 364  385                  uint64_t dsize = 0;
 365  386                  uint64_t refcnt = ddp->ddp_refcnt;
(23 lines elided)
 389  410          uint64_t *d = (uint64_t *)dst;
 390  411          uint64_t *d_end = (uint64_t *)(dst + 1);
 391  412  
 392  413          ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 393  414  
 394  415          while (d < d_end)
 395  416                  *d++ += (*s++ ^ neg) - neg;
 396  417  }
 397  418  
 398  419  static void
 399      -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
      420 +ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
      421 +    ddt_stat_t *dds, uint64_t neg)
 400  422  {
 401      -        ddt_stat_t dds;
 402  423          ddt_histogram_t *ddh;
 403      -        int bucket;
 404      -
 405      -        ddt_stat_generate(ddt, dde, &dds);
 406      -
 407      -        bucket = highbit64(dds.dds_ref_blocks) - 1;
      424 +        int bucket = highbit64(dds->dds_ref_blocks) - 1;
 408  425          ASSERT(bucket >= 0);
 409  426  
 410  427          ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
      428 +        ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
      429 +}
 411  430  
 412      -        ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
      431 +static void
      432 +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
      433 +{
      434 +        ddt_stat_t dds;
      435 +
      436 +        ddt_stat_generate(ddt->ddt_spa, dde, &dds);
      437 +
      438 +        ddt_stat_update_by_dds(ddt, dde, &dds, neg);
 413  439  }
 414  440  
 415  441  void
 416  442  ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 417  443  {
 418  444          for (int h = 0; h < 64; h++)
 419  445                  ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 420  446  }
 421  447  
 422  448  void
(18 lines elided)
 441  467          return (B_TRUE);
 442  468  }
 443  469  
 444  470  void
 445  471  ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 446  472  {
 447  473          /* Sum the statistics we cached in ddt_object_sync(). */
 448  474          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 449  475                  ddt_t *ddt = spa->spa_ddt[c];
 450  476                  for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 451      -                        for (enum ddt_class class = 0; class < DDT_CLASSES;
 452      -                            class++) {
      477 +                        for (enum ddt_class class = spa->spa_ddt_class_min;
      478 +                            class <= spa->spa_ddt_class_max; class++) {
 453  479                                  ddt_object_t *ddo =
 454  480                                      &ddt->ddt_object_stats[type][class];
 455  481                                  ddo_total->ddo_count += ddo->ddo_count;
 456  482                                  ddo_total->ddo_dspace += ddo->ddo_dspace;
 457  483                                  ddo_total->ddo_mspace += ddo->ddo_mspace;
 458  484                          }
 459  485                  }
 460  486          }
 461  487  
 462  488          /* ... and compute the averages. */
(2 lines elided)
 465  491                  ddo_total->ddo_mspace /= ddo_total->ddo_count;
 466  492          }
 467  493  }
 468  494  
 469  495  void
 470  496  ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 471  497  {
 472  498          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 473  499                  ddt_t *ddt = spa->spa_ddt[c];
 474  500                  for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 475      -                        for (enum ddt_class class = 0; class < DDT_CLASSES;
 476      -                            class++) {
      501 +                        for (enum ddt_class class = spa->spa_ddt_class_min;
      502 +                            class <= spa->spa_ddt_class_max; class++) {
 477  503                                  ddt_histogram_add(ddh,
 478  504                                      &ddt->ddt_histogram_cache[type][class]);
 479  505                          }
 480  506                  }
 481  507          }
 482  508  }
 483  509  
 484  510  void
 485  511  ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 486  512  {
 487      -        ddt_histogram_t *ddh_total;
 488      -
 489      -        ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 490      -        ddt_get_dedup_histogram(spa, ddh_total);
 491      -        ddt_histogram_stat(dds_total, ddh_total);
 492      -        kmem_free(ddh_total, sizeof (ddt_histogram_t));
      513 +        /*
      514 +         * Avoid temporary allocation of ddt_histogram_t from heap
      515 +         * or on stack (probably too large) by unrolling ddt_histogram_add()
      516 +         */
      517 +        bzero(dds_total, sizeof (ddt_stat_t));
      518 +        /* sum up the stats across all the histograms */
      519 +        for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
      520 +                ddt_t *ddt = spa->spa_ddt[c];
      521 +                for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
      522 +                        for (enum ddt_class class = spa->spa_ddt_class_min;
      523 +                            class <= spa->spa_ddt_class_max; class++) {
      524 +                                /* unroll the ddt_histogram_add() */
      525 +                                ddt_histogram_t *src =
      526 +                                    &ddt->ddt_histogram_cache[type][class];
      527 +                                for (int h = 0; h < 64; h++) {
      528 +                                        ddt_stat_t *st = &src->ddh_stat[h];
      529 +                                        ddt_stat_add(dds_total, st, 0);
      530 +                                }
      531 +                        }
      532 +                }
      533 +        }
 493  534  }
 494  535  
 495  536  uint64_t
 496  537  ddt_get_dedup_dspace(spa_t *spa)
 497  538  {
 498  539          ddt_stat_t dds_total = { 0 };
 499  540  
 500  541          ddt_get_dedup_stats(spa, &dds_total);
 501  542          return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 502  543  }
(110 lines elided)
 613  654          return (spa->spa_ddt[c]);
 614  655  }
 615  656  
 616  657  ddt_t *
 617  658  ddt_select(spa_t *spa, const blkptr_t *bp)
 618  659  {
 619  660          return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 620  661  }
 621  662  
 622  663  void
 623      -ddt_enter(ddt_t *ddt)
      664 +ddt_enter(ddt_t *ddt, uint8_t hash)
 624  665  {
 625      -        mutex_enter(&ddt->ddt_lock);
      666 +        mutex_enter(&ddt->ddt_lock[hash]);
 626  667  }
 627  668  
 628  669  void
 629      -ddt_exit(ddt_t *ddt)
      670 +ddt_exit(ddt_t *ddt, uint8_t hash)
 630  671  {
 631      -        mutex_exit(&ddt->ddt_lock);
      672 +        mutex_exit(&ddt->ddt_lock[hash]);
 632  673  }
 633  674  
      675 +void
      676 +dde_enter(ddt_entry_t *dde)
      677 +{
      678 +        mutex_enter(&dde->dde_lock);
      679 +}
      680 +
      681 +void
      682 +dde_exit(ddt_entry_t *dde)
      683 +{
      684 +        mutex_exit(&dde->dde_lock);
      685 +}
      686 +
      687 +/* cache for ddt_entry_t structures */
      688 +static kmem_cache_t *dde_cache;
      689 +
      690 +/* ARGSUSED */
      691 +static int
      692 +dde_cache_constr(void *buf, void *arg, int flags)
      693 +{
      694 +        ddt_entry_t *dde = (ddt_entry_t *)buf;
      695 +        cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
      696 +        mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
      697 +        return (0);
      698 +}
      699 +
      700 +/* ARGSUSED */
      701 +static void
      702 +dde_cache_destr(void *buf, void *arg)
      703 +{
      704 +        ddt_entry_t *dde = (ddt_entry_t *)buf;
      705 +        cv_destroy(&dde->dde_cv);
      706 +        mutex_destroy(&dde->dde_lock);
      707 +}
      708 +
      709 +void
      710 +ddt_init(void)
      711 +{
      712 +        dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
      713 +            0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
      714 +        VERIFY(dde_cache != NULL);
      715 +}
      716 +
      717 +void
      718 +ddt_fini(void)
      719 +{
      720 +        if (dde_cache) {
      721 +                kmem_cache_destroy(dde_cache);
      722 +                dde_cache = NULL;
      723 +        }
      724 +}
      725 +
 634  726  static ddt_entry_t *
 635  727  ddt_alloc(const ddt_key_t *ddk)
 636  728  {
 637  729          ddt_entry_t *dde;
 638  730  
 639      -        dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 640      -        cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
      731 +        dde = kmem_cache_alloc(dde_cache, KM_SLEEP);
 641  732  
      733 +        /* Init everything but the condvar and the mutex */
 642  734          dde->dde_key = *ddk;
      735 +        bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_phys)),
      736 +            offsetof(ddt_entry_t, dde_cv) - offsetof(ddt_entry_t, dde_phys));
      737 +        bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_node)),
      738 +            sizeof (avl_node_t));
 643  739  
 644  740          return (dde);
 645  741  }
 646  742  
 647  743  static void
 648  744  ddt_free(ddt_entry_t *dde)
 649  745  {
 650      -        ASSERT(!dde->dde_loading);
      746 +        ASSERT(!(dde->dde_state & DDE_LOADING));
 651  747  
 652  748          for (int p = 0; p < DDT_PHYS_TYPES; p++)
 653  749                  ASSERT(dde->dde_lead_zio[p] == NULL);
 654  750  
 655  751          if (dde->dde_repair_abd != NULL)
 656  752                  abd_free(dde->dde_repair_abd);
 657  753  
 658      -        cv_destroy(&dde->dde_cv);
 659      -        kmem_free(dde, sizeof (*dde));
      754 +        kmem_cache_free(dde_cache, dde);
 660  755  }
 661  756  
      757 +/* for zdb usage */
 662  758  void
 663  759  ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 664  760  {
 665      -        ASSERT(MUTEX_HELD(&ddt->ddt_lock));
      761 +        uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);
 666  762  
 667      -        avl_remove(&ddt->ddt_tree, dde);
      763 +        avl_remove(&ddt->ddt_tree[hash], dde);
 668  764          ddt_free(dde);
 669  765  }
 670  766  
 671  767  ddt_entry_t *
 672  768  ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 673  769  {
 674  770          ddt_entry_t *dde, dde_search;
 675  771          enum ddt_type type;
 676  772          enum ddt_class class;
 677  773          avl_index_t where;
      774 +        uint8_t hash = DDT_HASHFN(bp->blk_cksum);
 678  775          int error;
 679  776  
 680      -        ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 681      -
 682  777          ddt_key_fill(&dde_search.dde_key, bp);
 683  778  
 684      -        dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
      779 +        ddt_enter(ddt, hash);
      780 +        /*
      781 +         * Do we have the dirty DDE in mem already?
      782 +         */
      783 +        dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
 685  784          if (dde == NULL) {
 686      -                if (!add)
      785 +                        /* This DDE doesn't exist in the dirty tree */
      786 +                if (!add) {
      787 +                        ddt_exit(ddt, hash);
 687  788                          return (NULL);
      789 +                }
      790 +                /* Since a dirty DDE didn't exist, create it */
 688  791                  dde = ddt_alloc(&dde_search.dde_key);
 689      -                avl_insert(&ddt->ddt_tree, dde, where);
      792 +                avl_insert(&ddt->ddt_tree[hash], dde, where);
 690  793          }
 691  794  
 692      -        while (dde->dde_loading)
 693      -                cv_wait(&dde->dde_cv, &ddt->ddt_lock);
      795 +        ddt_exit(ddt, hash);
 694  796  
 695      -        if (dde->dde_loaded)
      797 +        /*
      798 +         * If another thread is already looking up this DDE,
      799 +         * wait until it has the result.
      800 +         */
      801 +        dde_enter(dde);
      802 +        while (dde->dde_state & DDE_LOADING)
      803 +                cv_wait(&dde->dde_cv, &dde->dde_lock);
      804 +
      805 +        /*
      806 +         * If the DDE has already been loaded from disk, return it.
      807 +         */
      808 +        if (dde->dde_state & DDE_LOADED)
 696  809                  return (dde);
 697  810  
 698      -        dde->dde_loading = B_TRUE;
      811 +        /*
      812 +         * The DDE is not loaded yet; look it up in the on-disk ZAPs.
      813 +         */
      814 +        dde->dde_state |= DDE_LOADING;
      815 +        dde_exit(dde);
 699  816  
 700      -        ddt_exit(ddt);
 701      -
 702  817          error = ENOENT;
 703  818  
      819 +        DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
 704  820          for (type = 0; type < DDT_TYPES; type++) {
 705  821                  for (class = 0; class < DDT_CLASSES; class++) {
 706  822                          error = ddt_object_lookup(ddt, type, class, dde);
 707      -                        if (error != ENOENT) {
 708      -                                ASSERT0(error);
      823 +                        if (error != ENOENT)
 709  824                                  break;
 710      -                        }
 711  825                  }
 712  826                  if (error != ENOENT)
 713  827                          break;
 714  828          }
 715  829  
 716      -        ddt_enter(ddt);
      830 +        ASSERT(error == 0 || error == ENOENT);
 717  831  
 718      -        ASSERT(dde->dde_loaded == B_FALSE);
 719      -        ASSERT(dde->dde_loading == B_TRUE);
      832 +        dde_enter(dde);
 720  833  
      834 +        ASSERT(!(dde->dde_state & DDE_LOADED));
      835 +        ASSERT(dde->dde_state & DDE_LOADING);
      836 +
 721  837          dde->dde_type = type;   /* will be DDT_TYPES if no entry found */
 722  838          dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
 723      -        dde->dde_loaded = B_TRUE;
 724      -        dde->dde_loading = B_FALSE;
      839 +        if (type == DDT_TYPES && class == DDT_CLASSES)
      840 +                dde->dde_state |= DDE_NEW;
      841 +        dde->dde_state |= DDE_LOADED;
      842 +        dde->dde_state &= ~DDE_LOADING;
 725  843  
      844 +        DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
      845 +            enum ddt_class, dde->dde_class);
 726  846          if (error == 0)
 727      -                ddt_stat_update(ddt, dde, -1ULL);
      847 +                ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);
 728  848  
 729  849          cv_broadcast(&dde->dde_cv);
 730  850  
 731  851          return (dde);
 732  852  }
 733  853  
 734  854  void
 735  855  ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 736  856  {
 737  857          ddt_t *ddt;
(4 lines elided)
 742  862  
 743  863          /*
 744  864           * We only remove the DDT once all tables are empty and only
 745  865           * prefetch dedup blocks when there are entries in the DDT.
 746  866           * Thus no locking is required as the DDT can't disappear on us.
 747  867           */
 748  868          ddt = ddt_select(spa, bp);
 749  869          ddt_key_fill(&dde.dde_key, bp);
 750  870  
 751  871          for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 752      -                for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
      872 +                for (enum ddt_class class = 0;
      873 +                    class < DDT_CLASSES; class++) {
 753  874                          ddt_object_prefetch(ddt, type, class, &dde);
 754  875                  }
 755  876          }
 756  877  }
 757  878  
 758  879  int
 759  880  ddt_entry_compare(const void *x1, const void *x2)
 760  881  {
 761  882          const ddt_entry_t *dde1 = x1;
 762  883          const ddt_entry_t *dde2 = x2;
(7 lines elided)
 770  891                          return (1);
 771  892          }
 772  893  
 773  894          return (0);
 774  895  }
 775  896  
 776  897  static ddt_t *
 777  898  ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 778  899  {
 779  900          ddt_t *ddt;
      901 +        uint_t i;
 780  902  
 781  903          ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 782  904  
 783      -        mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 784      -        avl_create(&ddt->ddt_tree, ddt_entry_compare,
 785      -            sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
      905 +        for (i = 0; i < DDT_HASHSZ; i++) {
      906 +                mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
      907 +                avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
      908 +                    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
      909 +        }
      910 +        mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
      911 +
 786  912          avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 787  913              sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 788  914          ddt->ddt_checksum = c;
 789  915          ddt->ddt_spa = spa;
 790  916          ddt->ddt_os = spa->spa_meta_objset;
 791  917  
 792  918          return (ddt);
 793  919  }
 794  920  
 795  921  static void
 796  922  ddt_table_free(ddt_t *ddt)
 797  923  {
 798      -        ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
      924 +        uint_t i;
      925 +
 799  926          ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 800      -        avl_destroy(&ddt->ddt_tree);
      927 +
      928 +        for (i = 0; i < DDT_HASHSZ; i++) {
      929 +                ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
      930 +                avl_destroy(&ddt->ddt_tree[i]);
      931 +                mutex_destroy(&ddt->ddt_lock[i]);
      932 +        }
 801  933          avl_destroy(&ddt->ddt_repair_tree);
 802      -        mutex_destroy(&ddt->ddt_lock);
      934 +        mutex_destroy(&ddt->ddt_repair_lock);
 803  935          kmem_free(ddt, sizeof (*ddt));
 804  936  }
 805  937  
 806  938  void
 807  939  ddt_create(spa_t *spa)
 808  940  {
 809  941          spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 810  942  
 811  943          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 812  944                  spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 813  945  }
 814  946  
      947 +/*
      948 + * Get the combined size of the DDTs across all pools.
      949 + * Returns the on-disk (phys == B_TRUE) or in-core (phys == B_FALSE) size.
      950 + */
      951 +uint64_t
      952 +ddt_get_ddts_size(boolean_t phys)
      953 +{
      954 +        uint64_t ddts_size = 0;
      955 +        spa_t *spa = NULL;
      956 +
      957 +        while ((spa = spa_next(spa)) != NULL)
      958 +                ddts_size += spa_get_ddts_size(spa, phys);
      959 +
      960 +        return (ddts_size);
      961 +}
      962 +
 815  963  int
 816  964  ddt_load(spa_t *spa)
 817  965  {
 818  966          int error;
      967 +        ddt_object_t *ddo;
 819  968  
 820  969          ddt_create(spa);
 821  970  
 822  971          error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 823  972              DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 824  973              &spa->spa_ddt_stat_object);
 825  974  
 826  975          if (error)
 827  976                  return (error == ENOENT ? 0 : error);
 828  977  
 829  978          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 830  979                  ddt_t *ddt = spa->spa_ddt[c];
 831  980                  for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 832      -                        for (enum ddt_class class = 0; class < DDT_CLASSES;
 833      -                            class++) {
      981 +                        for (enum ddt_class class = 0;
      982 +                            class < DDT_CLASSES; class++) {
 834  983                                  error = ddt_object_load(ddt, type, class);
 835      -                                if (error != 0 && error != ENOENT)
      984 +                                if (error == ENOENT)
      985 +                                        continue;
      986 +                                if (error != 0)
 836  987                                          return (error);
      988 +                                ddo = &ddt->ddt_object_stats[type][class];
      989 +                                atomic_add_64(&spa->spa_ddt_dsize,
      990 +                                    ddo->ddo_dspace);
      991 +                                atomic_add_64(&spa->spa_ddt_msize,
      992 +                                    ddo->ddo_mspace);
 837  993                          }
 838  994                  }
 839  995  
 840  996                  /*
 841  997                   * Seed the cached histograms.
 842  998                   */
 843  999                  bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 844 1000                      sizeof (ddt->ddt_histogram));
 845 1001          }
     1002 +        zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
 846 1003  
     1004 +        if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
     1005 +                /* notify that dedup cap is now active */
     1006 +                spa->spa_ddt_capped = 1;
     1007 +                spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
     1008 +        }
     1009 +
 847 1010          return (0);
 848 1011  }
 849 1012  
 850 1013  void
 851 1014  ddt_unload(spa_t *spa)
 852 1015  {
 853 1016          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 854 1017                  if (spa->spa_ddt[c]) {
 855 1018                          ddt_table_free(spa->spa_ddt[c]);
 856 1019                          spa->spa_ddt[c] = NULL;
 857 1020                  }
 858 1021          }
     1022 +        spa->spa_ddt_dsize = 0;
     1023 +        spa->spa_ddt_msize = 0;
     1024 +        zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
 859 1025  }
 860 1026  
 861 1027  boolean_t
 862 1028  ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 863 1029  {
 864 1030          ddt_t *ddt;
 865 1031          ddt_entry_t dde;
 866 1032  
 867 1033          if (!BP_GET_DEDUP(bp))
 868 1034                  return (B_FALSE);
 869 1035  
 870      -        if (max_class == DDT_CLASS_UNIQUE)
 871      -                return (B_TRUE);
     1036 +        if (max_class > spa->spa_ddt_class_max)
     1037 +                max_class = spa->spa_ddt_class_max;
 872 1038  
 873 1039          ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 874 1040  
 875 1041          ddt_key_fill(&dde.dde_key, bp);
 876 1042  
 877 1043          for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 878      -                for (enum ddt_class class = 0; class <= max_class; class++)
     1044 +                for (enum ddt_class class = spa->spa_ddt_class_min;
     1045 +                    class <= max_class; class++)
 879 1046                          if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 880 1047                                  return (B_TRUE);
 881 1048  
 882 1049          return (B_FALSE);
 883 1050  }
 884 1051  
 885 1052  ddt_entry_t *
 886 1053  ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 887 1054  {
 888 1055          ddt_key_t ddk;
 889 1056          ddt_entry_t *dde;
 890 1057  
 891 1058          ddt_key_fill(&ddk, bp);
 892 1059  
 893 1060          dde = ddt_alloc(&ddk);
 894 1061  
 895 1062          for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 896      -                for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
     1063 +                for (enum ddt_class class = 0;
     1064 +                    class < DDT_CLASSES; class++) {
 897 1065                          /*
 898 1066                           * We can only do repair if there are multiple copies
 899 1067                           * of the block.  For anything in the UNIQUE class,
 900 1068                           * there's definitely only one copy, so don't even try.
 901 1069                           */
 902 1070                          if (class != DDT_CLASS_UNIQUE &&
 903 1071                              ddt_object_lookup(ddt, type, class, dde) == 0)
 904 1072                                  return (dde);
 905 1073                  }
 906 1074          }
(1 line elided)
 908 1076          bzero(dde->dde_phys, sizeof (dde->dde_phys));
 909 1077  
 910 1078          return (dde);
 911 1079  }
 912 1080  
 913 1081  void
 914 1082  ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 915 1083  {
 916 1084          avl_index_t where;
 917 1085  
 918      -        ddt_enter(ddt);
     1086 +        mutex_enter(&ddt->ddt_repair_lock);
 919 1087  
 920 1088          if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 921 1089              avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 922 1090                  avl_insert(&ddt->ddt_repair_tree, dde, where);
 923 1091          else
 924 1092                  ddt_free(dde);
 925 1093  
 926      -        ddt_exit(ddt);
     1094 +        mutex_exit(&ddt->ddt_repair_lock);
 927 1095  }
 928 1096  
 929 1097  static void
 930 1098  ddt_repair_entry_done(zio_t *zio)
 931 1099  {
 932 1100          ddt_entry_t *rdde = zio->io_private;
 933 1101  
 934 1102          ddt_free(rdde);
 935 1103  }
 936 1104  
(28 lines elided)
 965 1133  ddt_repair_table(ddt_t *ddt, zio_t *rio)
 966 1134  {
 967 1135          spa_t *spa = ddt->ddt_spa;
 968 1136          ddt_entry_t *dde, *rdde_next, *rdde;
 969 1137          avl_tree_t *t = &ddt->ddt_repair_tree;
 970 1138          blkptr_t blk;
 971 1139  
 972 1140          if (spa_sync_pass(spa) > 1)
 973 1141                  return;
 974 1142  
 975      -        ddt_enter(ddt);
     1143 +        mutex_enter(&ddt->ddt_repair_lock);
 976 1144          for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 977 1145                  rdde_next = AVL_NEXT(t, rdde);
 978 1146                  avl_remove(&ddt->ddt_repair_tree, rdde);
 979      -                ddt_exit(ddt);
     1147 +                mutex_exit(&ddt->ddt_repair_lock);
     1148 +
 980 1149                  ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 981 1150                  dde = ddt_repair_start(ddt, &blk);
 982 1151                  ddt_repair_entry(ddt, dde, rdde, rio);
 983 1152                  ddt_repair_done(ddt, dde);
 984      -                ddt_enter(ddt);
     1153 +
     1154 +                mutex_enter(&ddt->ddt_repair_lock);
 985 1155          }
 986      -        ddt_exit(ddt);
     1156 +        mutex_exit(&ddt->ddt_repair_lock);
 987 1157  }
 988 1158  
 989 1159  static void
 990 1160  ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 991 1161  {
 992 1162          dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 993 1163          ddt_phys_t *ddp = dde->dde_phys;
 994 1164          ddt_key_t *ddk = &dde->dde_key;
     1165 +        spa_t *spa = ddt->ddt_spa;
 995 1166          enum ddt_type otype = dde->dde_type;
 996 1167          enum ddt_type ntype = DDT_TYPE_CURRENT;
 997 1168          enum ddt_class oclass = dde->dde_class;
 998 1169          enum ddt_class nclass;
 999 1170          uint64_t total_refcnt = 0;
1000 1171  
1001      -        ASSERT(dde->dde_loaded);
1002      -        ASSERT(!dde->dde_loading);
     1172 +        ASSERT(dde->dde_state & DDE_LOADED);
     1173 +        ASSERT(!(dde->dde_state & DDE_LOADING));
1003 1174  
     1175 +        /*
     1176 +         * Propagate the stats generated at lookup time.  This was
     1177 +         * delayed to avoid having to take locks to protect
     1178 +         * ddt->ddt_histogram.
     1179 +         */
     1180 +        if (dde->dde_lkstat.dds_ref_blocks != 0)
     1181 +                ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);
     1182 +
1004 1183          for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1005 1184                  ASSERT(dde->dde_lead_zio[p] == NULL);
1006 1185                  ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1007 1186                  if (ddp->ddp_phys_birth == 0) {
1008 1187                          ASSERT(ddp->ddp_refcnt == 0);
1009 1188                          continue;
1010 1189                  }
1011 1190                  if (p == DDT_PHYS_DITTO) {
1012 1191                          if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1013 1192                                  ddt_phys_free(ddt, ddk, ddp, txg);
↓ open down ↓ 4 lines elided ↑ open up ↑
1018 1197                  total_refcnt += ddp->ddp_refcnt;
1019 1198          }
1020 1199  
1021 1200          if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1022 1201                  nclass = DDT_CLASS_DITTO;
1023 1202          else if (total_refcnt > 1)
1024 1203                  nclass = DDT_CLASS_DUPLICATE;
1025 1204          else
1026 1205                  nclass = DDT_CLASS_UNIQUE;
1027 1206  
     1207 +        if (nclass > spa->spa_ddt_class_max)
     1208 +                nclass = spa->spa_ddt_class_max;
     1209 +
     1210 +        if (nclass < spa->spa_ddt_class_min)
     1211 +                nclass = spa->spa_ddt_class_min;
     1212 +
     1213 +        DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);
     1214 +
1028 1215          if (otype != DDT_TYPES &&
1029 1216              (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1030 1217                  VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1031 1218                  ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1032 1219          }
1033 1220  
1034 1221          if (total_refcnt != 0) {
1035 1222                  dde->dde_type = ntype;
1036 1223                  dde->dde_class = nclass;
1037 1224                  ddt_stat_update(ddt, dde, 0);
(6 lines elided)
1044 1231                   * changes.  If it decreases, we could miss it, so
1045 1232                   * scan it right now.  (This covers both class changing
1046 1233                   * while we are doing ddt_walk(), and when we are
1047 1234                   * traversing.)
1048 1235                   */
1049 1236                  if (nclass < oclass) {
1050 1237                          dsl_scan_ddt_entry(dp->dp_scan,
1051 1238                              ddt->ddt_checksum, dde, tx);
1052 1239                  }
1053 1240          }
     1241 +        DTRACE_PROBE(ddt__stored__entry);
1054 1242  }
1055 1243  
1056 1244  static void
     1245 +ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
     1246 +{
     1247 +        void *cookie = NULL;
     1248 +        ddt_entry_t *dde;
     1249 +
     1250 +        while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
     1251 +                if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
     1252 +                        ddt_sync_entry(ddt, dde, tx, txg);
     1253 +                } else { /* if we're not syncing this DDE, it must be new */
     1254 +                        ASSERT(dde->dde_state & DDE_NEW);
     1255 +                }
     1256 +                ddt_free(dde);
     1257 +        }
     1258 +}
     1259 +
     1260 +static void
1057 1261  ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1058 1262  {
     1263 +        uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
     1264 +        int64_t old_mbytes = 0;
1059 1265          spa_t *spa = ddt->ddt_spa;
1060      -        ddt_entry_t *dde;
1061      -        void *cookie = NULL;
     1266 +        uint_t i, numnodes = 0;
     1267 +        ddt_object_t *ddo;
1062 1268  
1063      -        if (avl_numnodes(&ddt->ddt_tree) == 0)
     1269 +        for (i = 0; i < DDT_HASHSZ; i++)
     1270 +                numnodes += avl_numnodes(&ddt->ddt_tree[i]);
     1271 +
     1272 +        if (numnodes == 0)
1064 1273                  return;
1065 1274  
1066 1275          ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1067 1276  
1068 1277          if (spa->spa_ddt_stat_object == 0) {
1069 1278                  spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1070 1279                      DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1071 1280                      DMU_POOL_DDT_STATS, tx);
1072 1281          }
1073 1282  
1074      -        while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1075      -                ddt_sync_entry(ddt, dde, tx, txg);
1076      -                ddt_free(dde);
1077      -        }
1078 1283  
     1284 +        DTRACE_PROBE(ddt__syncing__avl);
     1285 +        for (i = 0; i < DDT_HASHSZ; i++)
     1286 +                ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
     1287 +        DTRACE_PROBE(ddt__synced__avl);
     1288 +
     1289 +        DTRACE_PROBE(ddt__syncing__obj);
1079 1290          for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1080      -                uint64_t count = 0;
1081      -                for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
     1291 +                for (enum ddt_class class = spa->spa_ddt_class_min;
     1292 +                    class <= spa->spa_ddt_class_max; class++) {
1082 1293                          if (ddt_object_exists(ddt, type, class)) {
     1294 +                                ddo = &ddt->ddt_object_stats[type][class];
     1295 +                                old_mbytes += ddo->ddo_mspace;
     1296 +
1083 1297                                  ddt_object_sync(ddt, type, class, tx);
1084      -                                count += ddt_object_count(ddt, type, class);
     1298 +                                (void) ddt_object_count(ddt, type, class, &cnt);
     1299 +                                if (cnt == 0) {
     1300 +                                        ddt_object_destroy(ddt, type, class,
     1301 +                                            tx);
     1302 +                                        continue;
     1303 +                                }
     1304 +
     1305 +                                num_dbytes += ddo->ddo_dspace;
     1306 +                                num_mbytes += ddo->ddo_mspace;
1085 1307                          }
1086 1308                  }
1087      -                for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1088      -                        if (count == 0 && ddt_object_exists(ddt, type, class))
1089      -                                ddt_object_destroy(ddt, type, class, tx);
1090      -                }
1091 1309          }
     1310 +        spa->spa_ddt_dsize = num_dbytes;
     1311 +        spa->spa_ddt_msize = num_mbytes;
     1312 +        atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
     1313 +        DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
     1314 +            uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
     1315 +            zfs_ddts_msize);
1092 1316  
     1317 +        if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
     1318 +                /* notify that dedup cap is now active */
     1319 +                spa->spa_ddt_capped = 1;
     1320 +                spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
     1321 +        } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
     1322 +                /* notify that dedup cap is now inactive */
     1323 +                spa->spa_ddt_capped = 0;
     1324 +                spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
     1325 +        }
     1326 +
     1327 +        /* update the cached histograms with the values calculated above */
1093 1328          bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1094 1329              sizeof (ddt->ddt_histogram));
1095 1330  }
1096 1331  
1097 1332  void
1098 1333  ddt_sync(spa_t *spa, uint64_t txg)
1099 1334  {
1100 1335          dmu_tx_t *tx;
1101 1336          zio_t *rio = zio_root(spa, NULL, NULL,
1102      -            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
     1337 +            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1103 1338  
1104 1339          ASSERT(spa_syncing_txg(spa) == txg);
1105 1340  
1106 1341          tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1107 1342  
1108 1343          for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1109 1344                  ddt_t *ddt = spa->spa_ddt[c];
1110 1345                  if (ddt == NULL)
1111 1346                          continue;
1112 1347                  ddt_sync_table(ddt, tx, txg);
(37 lines elided)
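
The other major change in this diff is the move from a single ddt_lock/ddt_tree
pair to DDT_HASHSZ hash-partitioned locks and AVL trees, plus a per-DDE
mutex/condvar, so that lookups of different keys no longer serialize on one
mutex. A minimal standalone model of that partitioning, with hypothetical
model_* names (pthread mutexes stand in for the kernel mutexes):

/*
 * Toy model of hash-partitioned DDT locking: the key's checksum picks a
 * bucket, and each bucket has its own lock and (here, trivially) its own
 * container.  Illustrative names only; not the in-kernel implementation.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_HASHSZ 64

typedef struct model_ddt {
        pthread_mutex_t md_lock[MODEL_HASHSZ];
        uint64_t        md_count[MODEL_HASHSZ]; /* stands in for an AVL tree */
} model_ddt_t;

static uint8_t
model_hash(uint64_t cksum_word)
{
        /* fold the checksum down to a bucket index */
        return ((uint8_t)(cksum_word % MODEL_HASHSZ));
}

static void
model_insert(model_ddt_t *ddt, uint64_t cksum_word)
{
        uint8_t h = model_hash(cksum_word);

        pthread_mutex_lock(&ddt->md_lock[h]);   /* like ddt_enter(ddt, hash) */
        ddt->md_count[h]++;                     /* insert into the bucket */
        pthread_mutex_unlock(&ddt->md_lock[h]); /* like ddt_exit(ddt, hash) */
}

int
main(void)
{
        model_ddt_t ddt;

        for (int i = 0; i < MODEL_HASHSZ; i++) {
                pthread_mutex_init(&ddt.md_lock[i], NULL);
                ddt.md_count[i] = 0;
        }

        model_insert(&ddt, 0x1234);
        model_insert(&ddt, 0x1234 + MODEL_HASHSZ);      /* same bucket */
        model_insert(&ddt, 0x5678);                     /* different bucket */

        printf("bucket %u holds %llu entries\n", model_hash(0x1234),
            (unsigned long long)ddt.md_count[model_hash(0x1234)]);
        return (0);
}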