/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>

/*
 * Almost all iteration over the ZAPs that contain DDT entries is
 * restricted to the range spa->spa_ddt_class_{min,max}, which makes it
 * possible to store all entries in a single ZAP.  However, a few paths
 * still iterate over every ZAP unconditionally: table creation,
 * destruction, loading, dde prefetching, and lookup.  This preserves
 * compatibility with old pools and allows the old pool format to be
 * converted to the new one on the fly.
 */

/*
 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
 */
int zfs_dedup_prefetch = 1;

static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,
};

static const char *ddt_class_name[DDT_CLASSES] = {
	"ditto",
	"duplicate",
	"unique",
};

/* Combined in-core size of the DDTs of all pools */
uint64_t zfs_ddts_msize = 0;

static void
ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
	    ZCHECKSUM_FLAG_DEDUP;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp == 0);
	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
	ASSERT(*objectp != 0);

	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, objectp, tx) == 0);

	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);
}

static void
ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	char name[DDT_NAMELEN];
#ifdef DEBUG
	uint64_t count;
#endif
	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp != 0);
	ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
	    (count == 0));
	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));

	*objectp = 0;
}

static int
ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];
	int error;

	ddt_object_name(ddt, type, class, name);

	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
	if (error)
		return (error);

	VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class]));

	/*
	 * Seed the cached statistics.
	 */
	error = ddt_object_info(ddt, type, class, &doi);
	/* Panic in debug mode */
	ASSERT(error == 0);
	if (error)
		return (error);
	error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
	if (error)
		return (error);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;

	return (0);
}

static void
ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);

	/*
	 * Cache DDT statistics; this is the only time they'll change.
	 */
	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);

	(void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}

static int
ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde));
}

static void
ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return;

	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde);
}

int
ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}

static int
ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}

int
ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    uint64_t *walk, ddt_entry_t *dde)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, walk));
}

int
ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    uint64_t *count)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
	    ddt->ddt_object[type][class], count));
}

int
ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_object_info_t *doi)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
	    doi));
}

boolean_t
ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	return (!!ddt->ddt_object[type][class]);
}

void
ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    char *name)
{
	(void) sprintf(name, DMU_POOL_DDT,
	    zio_checksum_table[ddt->ddt_checksum].ci_name,
	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
}

void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
	ASSERT(txg != 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		bp->blk_dva[d] = ddp->ddp_dva[d];
	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}

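/*
 * Construct a template block pointer from a DDT key and, when ddp is
 * non-NULL, the DVAs and birth txg of that phys variant.  Used when
 * freeing, repairing, and reading dedup-ed blocks.
 */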
void
ddt_bp_create(enum zio_checksum checksum,
    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
	BP_ZERO(bp);

	if (ddp != NULL)
		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

	bp->blk_cksum = ddk->ddk_cksum;
	bp->blk_fill = 1;

	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_TYPE(bp, DMU_OT_DEDUP);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}

void
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
{
	ddk->ddk_cksum = bp->blk_cksum;
	ddk->ddk_prop = 0;

	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
}

void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
	ASSERT(ddp->ddp_phys_birth == 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		ddp->ddp_dva[d] = bp->blk_dva[d];
	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
}

void
ddt_phys_clear(ddt_phys_t *ddp)
{
	bzero(ddp, sizeof (*ddp));
}

void
ddt_phys_addref(ddt_phys_t *ddp)
{
	ddp->ddp_refcnt++;
}

void
ddt_phys_decref(ddt_phys_t *ddp)
{
	ASSERT((int64_t)ddp->ddp_refcnt > 0);
	ddp->ddp_refcnt--;
}

void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
	blkptr_t blk;

	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
	ddt_phys_clear(ddp);
	zio_free(ddt->ddt_spa, txg, &blk);
}

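/*
 * Return the phys variant whose first DVA and physical birth txg match
 * the given block pointer, or NULL if bp does not belong to this entry.
 */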
ddt_phys_t *
ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
{
	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
			return (ddp);
	}
	return (NULL);
}

uint64_t
ddt_phys_total_refcnt(const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
		refcnt += dde->dde_phys[p].ddp_refcnt;

	return (refcnt);
}

static void
ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
{
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	uint64_t lsize = DDK_GET_LSIZE(ddk);
	uint64_t psize = DDK_GET_PSIZE(ddk);

	bzero(dds, sizeof (*dds));

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		uint64_t dsize = 0;
		uint64_t refcnt = ddp->ddp_refcnt;

		if (ddp->ddp_phys_birth == 0)
			continue;

		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);

		dds->dds_blocks += 1;
		dds->dds_lsize += lsize;
		dds->dds_psize += psize;
		dds->dds_dsize += dsize;

		dds->dds_ref_blocks += refcnt;
		dds->dds_ref_lsize += lsize * refcnt;
		dds->dds_ref_psize += psize * refcnt;
		dds->dds_ref_dsize += dsize * refcnt;
	}
}

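/*
 * Add (neg == 0) or subtract (neg == -1ULL) src from dst, treating both
 * structures as flat arrays of uint64_t.  The expression (x ^ neg) - neg
 * is x when neg == 0 and the two's-complement negation -x when
 * neg == -1ULL, so one loop handles both directions.
 */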
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
{
	const uint64_t *s = (const uint64_t *)src;
	uint64_t *d = (uint64_t *)dst;
	uint64_t *d_end = (uint64_t *)(dst + 1);

	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */

	while (d < d_end)
		*d++ += (*s++ ^ neg) - neg;
}

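/*
 * Charge (or, with neg == -1ULL, refund) this entry's stats to the
 * histogram bucket selected by the total reference count: bucket b
 * holds entries with 2^b <= dds_ref_blocks < 2^(b+1), so refcnt 1 goes
 * to bucket 0 and refcnt 5 to bucket 2.
 */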
static void
ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
    ddt_stat_t *dds, uint64_t neg)
{
	ddt_histogram_t *ddh;
	int bucket = highbit64(dds->dds_ref_blocks) - 1;
	ASSERT(bucket >= 0);

	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
	ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
}

static void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
{
	ddt_stat_t dds;

	ddt_stat_generate(ddt->ddt_spa, dde, &dds);

	ddt_stat_update_by_dds(ddt, dde, &dds, neg);
}

void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
	for (int h = 0; h < 64; h++)
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
}

void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
	bzero(dds, sizeof (*dds));

	for (int h = 0; h < 64; h++)
		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
}

boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
	const uint64_t *s = (const uint64_t *)ddh;
	const uint64_t *s_end = (const uint64_t *)(ddh + 1);

	while (s < s_end)
		if (*s++ != 0)
			return (B_FALSE);

	return (B_TRUE);
}

void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
	/* Sum the statistics we cached in ddt_object_sync(). */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				ddt_object_t *ddo =
				    &ddt->ddt_object_stats[type][class];
				ddo_total->ddo_count += ddo->ddo_count;
				ddo_total->ddo_dspace += ddo->ddo_dspace;
				ddo_total->ddo_mspace += ddo->ddo_mspace;
			}
		}
	}

	/* ... and compute the averages. */
	if (ddo_total->ddo_count != 0) {
		ddo_total->ddo_dspace /= ddo_total->ddo_count;
		ddo_total->ddo_mspace /= ddo_total->ddo_count;
	}
}

void
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				ddt_histogram_add(ddh,
				    &ddt->ddt_histogram_cache[type][class]);
			}
		}
	}
}

void
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
{
	/*
	 * Avoid a temporary heap or stack allocation of a ddt_histogram_t
	 * (it is probably too large for the stack) by unrolling
	 * ddt_histogram_add().
	 */
	bzero(dds_total, sizeof (ddt_stat_t));
	/* sum up the stats across all the histograms */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				/* unroll the ddt_histogram_add() */
				ddt_histogram_t *src =
				    &ddt->ddt_histogram_cache[type][class];
				for (int h = 0; h < 64; h++) {
					ddt_stat_t *st = &src->ddh_stat[h];
					ddt_stat_add(dds_total, st, 0);
				}
			}
		}
	}
}

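/*
 * Space saved by dedup: the difference between what all references
 * would consume as full copies and what the deduped blocks actually
 * allocate on disk.
 */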
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
}

uint64_t
ddt_get_pool_dedup_ratio(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	if (dds_total.dds_dsize == 0)
		return (100);

	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
}

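/*
 * Compute how many additional copies of this block the pool's dedup
 * ditto policy wants: one copy for any reference, a second once the
 * total refcount reaches spa_dedup_ditto, and a third at ditto^2
 * references.  Returns the number of copies still missing.
 */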
int
ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
{
	spa_t *spa = ddt->ddt_spa;
	uint64_t total_refcnt = 0;
	uint64_t ditto = spa->spa_dedup_ditto;
	int total_copies = 0;
	int desired_copies = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];
		zio_t *zio = dde->dde_lead_zio[p];
		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
		if (zio != NULL)
			refcnt += zio->io_parent_count;	/* pending refs */
		if (ddp == ddp_willref)
			refcnt++;			/* caller's ref */
		if (refcnt != 0) {
			total_refcnt += refcnt;
			total_copies += p;
		}
	}

	if (ditto == 0 || ditto > UINT32_MAX)
		ditto = UINT32_MAX;

	if (total_refcnt >= 1)
		desired_copies++;
	if (total_refcnt >= ditto)
		desired_copies++;
	if (total_refcnt >= ditto * ditto)
		desired_copies++;

	return (MAX(desired_copies, total_copies) - total_copies);
}

int
ddt_ditto_copies_present(ddt_entry_t *dde)
{
	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
	dva_t *dva = ddp->ddp_dva;
	int copies = 0 - DVA_GET_GANG(dva);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
		if (DVA_IS_VALID(dva))
			copies++;

	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);

	return (copies);
}

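/*
 * Compress a DDT block image with ZLE, falling back to no compression
 * when nothing is saved.  The leading byte of the destination records
 * the compression function plus a host-byteorder flag so that
 * ddt_decompress() can undo both.
 */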
size_t
ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
{
	uchar_t *version = dst++;
	int cpfunc = ZIO_COMPRESS_ZLE;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	size_t c_len;

	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */

	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);

	if (c_len == s_len) {
		cpfunc = ZIO_COMPRESS_OFF;
		bcopy(src, dst, s_len);
	}

	*version = cpfunc;
	/* CONSTCOND */
	if (ZFS_HOST_BYTEORDER)
		*version |= DDT_COMPRESS_BYTEORDER_MASK;

	return (c_len + 1);
}

void
ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
	uchar_t version = *src++;
	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	if (ci->ci_decompress != NULL)
		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
	else
		bcopy(src, dst, d_len);

	if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
	    (ZFS_HOST_BYTEORDER != 0))
		byteswap_uint64_array(dst, d_len);
}

ddt_t *
ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
{
	return (spa->spa_ddt[c]);
}

ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}

void
ddt_enter(ddt_t *ddt, uint8_t hash)
{
	mutex_enter(&ddt->ddt_lock[hash]);
}

void
ddt_exit(ddt_t *ddt, uint8_t hash)
{
	mutex_exit(&ddt->ddt_lock[hash]);
}

void
dde_enter(ddt_entry_t *dde)
{
	mutex_enter(&dde->dde_lock);
}

void
dde_exit(ddt_entry_t *dde)
{
	mutex_exit(&dde->dde_lock);
}

/* cache for ddt_entry_t structures */
static kmem_cache_t *dde_cache;

/* ARGSUSED */
static int
dde_cache_constr(void *buf, void *arg, int flags)
{
	ddt_entry_t *dde = (ddt_entry_t *)buf;
	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
dde_cache_destr(void *buf, void *arg)
{
	ddt_entry_t *dde = (ddt_entry_t *)buf;
	cv_destroy(&dde->dde_cv);
	mutex_destroy(&dde->dde_lock);
}

void
ddt_init(void)
{
	dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
	    0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
	VERIFY(dde_cache != NULL);
}

void
ddt_fini(void)
{
	if (dde_cache) {
		kmem_cache_destroy(dde_cache);
		dde_cache = NULL;
	}
}

static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
	ddt_entry_t *dde;

	dde = kmem_cache_alloc(dde_cache, KM_SLEEP);

	/* Init everything but the condvar and the mutex */
	dde->dde_key = *ddk;
	bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_phys)),
	    offsetof(ddt_entry_t, dde_cv) - offsetof(ddt_entry_t, dde_phys));
	bzero((void *)((uintptr_t)dde + offsetof(ddt_entry_t, dde_node)),
	    sizeof (avl_node_t));

	return (dde);
}

static void
ddt_free(ddt_entry_t *dde)
{
	ASSERT(!(dde->dde_state & DDE_LOADING));

	for (int p = 0; p < DDT_PHYS_TYPES; p++)
		ASSERT(dde->dde_lead_zio[p] == NULL);

	if (dde->dde_repair_abd != NULL)
		abd_free(dde->dde_repair_abd);

	kmem_cache_free(dde_cache, dde);
}

/* for zdb usage */
void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
	uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);

	avl_remove(&ddt->ddt_tree[hash], dde);
	ddt_free(dde);
}

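/*
 * Find (or, if add is set, create) the in-core entry for bp's key.
 * The per-hash AVL tree lock is held only long enough to find or
 * insert the entry; the on-disk lookup is serialized per entry via
 * the DDE_LOADING state and dde_cv, so concurrent lookups of the same
 * key block until the first loader has the result.
 */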
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
	ddt_entry_t *dde, dde_search;
	enum ddt_type type;
	enum ddt_class class;
	avl_index_t where;
	uint8_t hash = DDT_HASHFN(bp->blk_cksum);
	int error;

	ddt_key_fill(&dde_search.dde_key, bp);

	ddt_enter(ddt, hash);
	/*
	 * Do we already have the dirty DDE in memory?
	 */
	dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
	if (dde == NULL) {
		/* This DDE doesn't exist in the dirty tree */
		if (!add) {
			ddt_exit(ddt, hash);
			return (NULL);
		}
		/* Since a dirty DDE didn't exist, create it */
		dde = ddt_alloc(&dde_search.dde_key);
		avl_insert(&ddt->ddt_tree[hash], dde, where);
	}

	ddt_exit(ddt, hash);

	/*
	 * If another thread is already looking up this DDE, wait until
	 * it has the result.
	 */
	dde_enter(dde);
	while (dde->dde_state & DDE_LOADING)
		cv_wait(&dde->dde_cv, &dde->dde_lock);

	/*
	 * If we have loaded the DDE from disk, return it.
	 */
	if (dde->dde_state & DDE_LOADED)
		return (dde);

	/*
	 * We didn't find this DDE in memory, so start looking it up in
	 * the on-disk ZAPs.
	 */
	dde->dde_state |= DDE_LOADING;
	dde_exit(dde);

	error = ENOENT;

	DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
	for (type = 0; type < DDT_TYPES; type++) {
		for (class = 0; class < DDT_CLASSES; class++) {
			error = ddt_object_lookup(ddt, type, class, dde);
			if (error != ENOENT)
				break;
		}
		if (error != ENOENT)
			break;
	}

	ASSERT(error == 0 || error == ENOENT);

	dde_enter(dde);

	ASSERT(!(dde->dde_state & DDE_LOADED));
	ASSERT(dde->dde_state & DDE_LOADING);

	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
	if (type == DDT_TYPES && class == DDT_CLASSES)
		dde->dde_state |= DDE_NEW;
	dde->dde_state |= DDE_LOADED;
	dde->dde_state &= ~DDE_LOADING;

	DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
	    enum ddt_class, dde->dde_class);
	if (error == 0)
		ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);

	cv_broadcast(&dde->dde_cv);

	return (dde);
}

void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t dde;

	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
		return;

	/*
	 * We only remove the DDT once all tables are empty and only
	 * prefetch dedup blocks when there are entries in the DDT.
	 * Thus no locking is required as the DDT can't disappear on us.
	 */
	ddt = ddt_select(spa, bp);
	ddt_key_fill(&dde.dde_key, bp);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = 0;
		    class < DDT_CLASSES; class++) {
			ddt_object_prefetch(ddt, type, class, &dde);
		}
	}
}

int
ddt_entry_compare(const void *x1, const void *x2)
{
	const ddt_entry_t *dde1 = x1;
	const ddt_entry_t *dde2 = x2;
	const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
	const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;

	for (int i = 0; i < DDT_KEY_WORDS; i++) {
		if (u1[i] < u2[i])
			return (-1);
		if (u1[i] > u2[i])
			return (1);
	}

	return (0);
}

static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
	ddt_t *ddt;
	uint_t i;

	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);

	for (i = 0; i < DDT_HASHSZ; i++) {
		mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
		avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
		    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	}
	mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	ddt->ddt_checksum = c;
	ddt->ddt_spa = spa;
	ddt->ddt_os = spa->spa_meta_objset;

	return (ddt);
}

static void
ddt_table_free(ddt_t *ddt)
{
	uint_t i;

	ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);

	for (i = 0; i < DDT_HASHSZ; i++) {
		ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
		avl_destroy(&ddt->ddt_tree[i]);
		mutex_destroy(&ddt->ddt_lock[i]);
	}
	avl_destroy(&ddt->ddt_repair_tree);
	mutex_destroy(&ddt->ddt_repair_lock);
	kmem_free(ddt, sizeof (*ddt));
}

void
ddt_create(spa_t *spa)
{
	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
		spa->spa_ddt[c] = ddt_table_alloc(spa, c);
}

/*
 * Get the combined size of the DDTs across all pools.
 * Returns the on-disk (phys == B_TRUE) or in-core (phys == B_FALSE) total.
 */
uint64_t
ddt_get_ddts_size(boolean_t phys)
{
	uint64_t ddts_size = 0;
	spa_t *spa = NULL;

	while ((spa = spa_next(spa)) != NULL)
		ddts_size += spa_get_ddts_size(spa, phys);

	return (ddts_size);
}

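/*
 * Load all DDT objects and their cached stats at pool-open time.
 * A missing DDT statistics ZAP simply means the pool has never written
 * any dedup-ed data, which is not an error.
 */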
int
ddt_load(spa_t *spa)
{
	int error;
	ddt_object_t *ddo;

	ddt_create(spa);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
	    &spa->spa_ddt_stat_object);

	if (error)
		return (error == ENOENT ? 0 : error);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0;
			    class < DDT_CLASSES; class++) {
				error = ddt_object_load(ddt, type, class);
				if (error == ENOENT)
					continue;
				if (error != 0)
					return (error);
				ddo = &ddt->ddt_object_stats[type][class];
				atomic_add_64(&spa->spa_ddt_dsize,
				    ddo->ddo_dspace);
				atomic_add_64(&spa->spa_ddt_msize,
				    ddo->ddo_mspace);
			}
		}

		/*
		 * Seed the cached histograms.
		 */
		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
		    sizeof (ddt->ddt_histogram));
	}
	zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);

	if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
		/* notify that dedup cap is now active */
		spa->spa_ddt_capped = 1;
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
	}

	return (0);
}

void
ddt_unload(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (spa->spa_ddt[c]) {
			ddt_table_free(spa->spa_ddt[c]);
			spa->spa_ddt[c] = NULL;
		}
	}
	spa->spa_ddt_dsize = 0;
	spa->spa_ddt_msize = 0;
	zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
}

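/*
 * Check whether bp's entry is present in any class up to max_class
 * (clamped to the highest class this pool uses); the scan code uses
 * this to tell whether a dedup-ed bp has been (or will be) visited
 * via the DDT walk instead.
 */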
boolean_t
ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t dde;

	if (!BP_GET_DEDUP(bp))
		return (B_FALSE);

	if (max_class > spa->spa_ddt_class_max)
		max_class = spa->spa_ddt_class_max;

	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];

	ddt_key_fill(&dde.dde_key, bp);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++)
		for (enum ddt_class class = spa->spa_ddt_class_min;
		    class <= max_class; class++)
			if (ddt_object_lookup(ddt, type, class, &dde) == 0)
				return (B_TRUE);

	return (B_FALSE);
}

ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
	ddt_key_t ddk;
	ddt_entry_t *dde;

	ddt_key_fill(&ddk, bp);

	dde = ddt_alloc(&ddk);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = 0;
		    class < DDT_CLASSES; class++) {
			/*
			 * We can only do repair if there are multiple copies
			 * of the block. For anything in the UNIQUE class,
			 * there's definitely only one copy, so don't even try.
			 */
			if (class != DDT_CLASS_UNIQUE &&
			    ddt_object_lookup(ddt, type, class, dde) == 0)
				return (dde);
		}
	}

	bzero(dde->dde_phys, sizeof (dde->dde_phys));

	return (dde);
}

void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
	avl_index_t where;

	mutex_enter(&ddt->ddt_repair_lock);

	if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
		avl_insert(&ddt->ddt_repair_tree, dde, where);
	else
		ddt_free(dde);

	mutex_exit(&ddt->ddt_repair_lock);
}

static void
ddt_repair_entry_done(zio_t *zio)
{
	ddt_entry_t *rdde = zio->io_private;

	ddt_free(rdde);
}

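/*
 * Rewrite the damaged block: for each phys variant whose birth txg and
 * DVAs still match the repair entry, issue a child rewrite of the
 * known-good data held in rdde->dde_repair_abd.
 */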
static void
ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
{
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_phys_t *rddp = rdde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	ddt_key_t *rddk = &rdde->dde_key;
	zio_t *zio;
	blkptr_t blk;

	zio = zio_null(rio, rio->io_spa, NULL,
	    ddt_repair_entry_done, rdde, rio->io_flags);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
		    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
	}

	zio_nowait(zio);
}

static void
ddt_repair_table(ddt_t *ddt, zio_t *rio)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde, *rdde_next, *rdde;
	avl_tree_t *t = &ddt->ddt_repair_tree;
	blkptr_t blk;

	if (spa_sync_pass(spa) > 1)
		return;

	mutex_enter(&ddt->ddt_repair_lock);
	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
		rdde_next = AVL_NEXT(t, rdde);
		avl_remove(&ddt->ddt_repair_tree, rdde);
		mutex_exit(&ddt->ddt_repair_lock);

		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
		dde = ddt_repair_start(ddt, &blk);
		ddt_repair_entry(ddt, dde, rdde, rio);
		ddt_repair_done(ddt, dde);

		mutex_enter(&ddt->ddt_repair_lock);
	}
	mutex_exit(&ddt->ddt_repair_lock);
}

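/*
 * Write one dirty entry back to the on-disk tables: free phys variants
 * whose refcounts have dropped to zero, recompute the entry's class
 * (ditto/duplicate/unique, clamped to the pool's class range), and
 * move the entry between objects when its type or class has changed.
 */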
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	spa_t *spa = ddt->ddt_spa;
	enum ddt_type otype = dde->dde_type;
	enum ddt_type ntype = DDT_TYPE_CURRENT;
	enum ddt_class oclass = dde->dde_class;
	enum ddt_class nclass;
	uint64_t total_refcnt = 0;

	ASSERT(dde->dde_state & DDE_LOADED);
	ASSERT(!(dde->dde_state & DDE_LOADING));

	/*
	 * Propagate the stats generated at lookup time; this was delayed
	 * to avoid having to take locks to protect ddt->ddt_histogram.
	 */
	if (dde->dde_lkstat.dds_ref_blocks != 0)
		ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		ASSERT(dde->dde_lead_zio[p] == NULL);
		ASSERT((int64_t)ddp->ddp_refcnt >= 0);
		if (ddp->ddp_phys_birth == 0) {
			ASSERT(ddp->ddp_refcnt == 0);
			continue;
		}
		if (p == DDT_PHYS_DITTO) {
			if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
				ddt_phys_free(ddt, ddk, ddp, txg);
			continue;
		}
		if (ddp->ddp_refcnt == 0)
			ddt_phys_free(ddt, ddk, ddp, txg);
		total_refcnt += ddp->ddp_refcnt;
	}

	if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
		nclass = DDT_CLASS_DITTO;
	else if (total_refcnt > 1)
		nclass = DDT_CLASS_DUPLICATE;
	else
		nclass = DDT_CLASS_UNIQUE;

	if (nclass > spa->spa_ddt_class_max)
		nclass = spa->spa_ddt_class_max;

	if (nclass < spa->spa_ddt_class_min)
		nclass = spa->spa_ddt_class_min;

	DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);

	if (otype != DDT_TYPES &&
	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
		VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
		ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
	}

	if (total_refcnt != 0) {
		dde->dde_type = ntype;
		dde->dde_class = nclass;
		ddt_stat_update(ddt, dde, 0);
		if (!ddt_object_exists(ddt, ntype, nclass))
			ddt_object_create(ddt, ntype, nclass, tx);
		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);

		/*
		 * If the class changes, the order that we scan this bp
		 * changes. If it decreases, we could miss it, so
		 * scan it right now. (This covers both class changing
		 * while we are doing ddt_walk(), and when we are
		 * traversing.)
		 */
		if (nclass < oclass) {
			dsl_scan_ddt_entry(dp->dp_scan,
			    ddt->ddt_checksum, dde, tx);
		}
	}
	DTRACE_PROBE(ddt__stored__entry);
}

static void
ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
{
	void *cookie = NULL;
	ddt_entry_t *dde;

	while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
		if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
			ddt_sync_entry(ddt, dde, tx, txg);
		} else { /* if we're not syncing this DDE it must be new */
			ASSERT(dde->dde_state & DDE_NEW);
		}
		ddt_free(dde);
	}
}

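/*
 * Sync one checksum's table: flush every dirty AVL entry, then update
 * (or destroy, when empty) each on-disk object and refresh the pool's
 * cached DDT size counters.
 */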
static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
	uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
	int64_t old_mbytes = 0;
	spa_t *spa = ddt->ddt_spa;
	uint_t i, numnodes = 0;
	ddt_object_t *ddo;

	for (i = 0; i < DDT_HASHSZ; i++)
		numnodes += avl_numnodes(&ddt->ddt_tree[i]);

	if (numnodes == 0)
		return;

	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);

	if (spa->spa_ddt_stat_object == 0) {
		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, tx);
	}

	DTRACE_PROBE(ddt__syncing__avl);
	for (i = 0; i < DDT_HASHSZ; i++)
		ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
	DTRACE_PROBE(ddt__synced__avl);

	DTRACE_PROBE(ddt__syncing__obj);
	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = spa->spa_ddt_class_min;
		    class <= spa->spa_ddt_class_max; class++) {
			if (ddt_object_exists(ddt, type, class)) {
				ddo = &ddt->ddt_object_stats[type][class];
				old_mbytes += ddo->ddo_mspace;

				ddt_object_sync(ddt, type, class, tx);
				(void) ddt_object_count(ddt, type, class, &cnt);
				if (cnt == 0) {
					ddt_object_destroy(ddt, type, class,
					    tx);
					continue;
				}

				num_dbytes += ddo->ddo_dspace;
				num_mbytes += ddo->ddo_mspace;
			}
		}
	}
	spa->spa_ddt_dsize = num_dbytes;
	spa->spa_ddt_msize = num_mbytes;
	atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
	DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
	    uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
	    zfs_ddts_msize);

	if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
		/* notify that dedup cap is now active */
		spa->spa_ddt_capped = 1;
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
	} else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
		/* notify that dedup cap is now inactive */
		spa->spa_ddt_capped = 0;
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
	}

	/* update the cached stats with the values calculated above */
	bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
	    sizeof (ddt->ddt_histogram));
}

void
ddt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	zio_t *rio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	ASSERT(spa_syncing_txg(spa) == txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL)
			continue;
		ddt_sync_table(ddt, tx, txg);
		ddt_repair_table(ddt, rio);
	}

	(void) zio_wait(rio);

	dmu_tx_commit(tx);
}

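/*
 * Iterate over every entry in every DDT object, resuming from the given
 * bookmark.  The cursor walks the entries of one (checksum, type, class)
 * object; when it is exhausted we advance checksum, then type, then
 * class, and return ENOENT once all tables have been visited.
 */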
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
{
	do {
		do {
			do {
				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
				int error = ENOENT;
				if (ddt_object_exists(ddt, ddb->ddb_type,
				    ddb->ddb_class)) {
					error = ddt_object_walk(ddt,
					    ddb->ddb_type, ddb->ddb_class,
					    &ddb->ddb_cursor, dde);
				}
				dde->dde_type = ddb->ddb_type;
				dde->dde_class = ddb->ddb_class;
				if (error == 0)
					return (0);
				if (error != ENOENT)
					return (error);
				ddb->ddb_cursor = 0;
			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
			ddb->ddb_checksum = 0;
		} while (++ddb->ddb_type < DDT_TYPES);
		ddb->ddb_type = 0;
	} while (++ddb->ddb_class < DDT_CLASSES);

	return (SET_ERROR(ENOENT));
}