Print this page
NEX-5856 ddt_capped isn't reset when deduped dataset is destroyed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (fix studio build)
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-3165 need some dedup improvements
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-3211 mismerge ddt_repair_start()
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Issue #2: optimize DDE lookup in DDT objects
Added option to control number of classes of DDE's in DDT.
New default is one, that is all DDE's are stored together
regardless of refcount.
re #12611 rb4105 zpool import panic in ddt_zap_count()
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint -courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/ddt.c
+++ new/usr/src/uts/common/fs/zfs/ddt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
25 26 */
26 27
27 28 #include <sys/zfs_context.h>
28 29 #include <sys/spa.h>
29 30 #include <sys/spa_impl.h>
30 31 #include <sys/zio.h>
31 32 #include <sys/ddt.h>
32 33 #include <sys/zap.h>
33 34 #include <sys/dmu_tx.h>
34 35 #include <sys/arc.h>
35 36 #include <sys/dsl_pool.h>
36 37 #include <sys/zio_checksum.h>
37 38 #include <sys/zio_compress.h>
38 39 #include <sys/dsl_scan.h>
39 40 #include <sys/abd.h>
40 41
/*
 * Almost all iteration over the ZAPs that hold DDT entries is restricted
 * to the class range spa->spa_ddt_class_min .. spa->spa_ddt_class_max,
 * which makes it possible to store all entries in a single ZAP.  However,
 * a few places still iterate over every ZAP unconditionally -- table
 * creation, deletion, loading, DDE prefetching, and lookup -- so that old
 * pool formats remain readable and can be converted to the new format
 * on the fly.
 */
51 +
52 +/*
42 53 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
43 54 */
44 55 int zfs_dedup_prefetch = 1;
45 56
46 57 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
47 58 &ddt_zap_ops,
48 59 };
49 60
50 61 static const char *ddt_class_name[DDT_CLASSES] = {
51 62 "ditto",
52 63 "duplicate",
53 64 "unique",
54 65 };
55 66
67 +/* Possible in core size of all DDTs */
68 +uint64_t zfs_ddts_msize = 0;
69 +
/*
 * Create the on-disk object backing one (type, class) slot of this DDT and
 * register it, by name, in both the pool directory and the pool-wide DDT
 * statistics object.  Must be called in syncing context (tx).
 */
static void
ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	/* Prehash only if the checksum is strong enough to dedup on */
	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
	    ZCHECKSUM_FLAG_DEDUP;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp == 0);
	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
	ASSERT(*objectp != 0);

	/* Make the object findable by name in the pool directory. */
	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, objectp, tx) == 0);

	/* Seed the persistent histogram for this (type, class). */
	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);
}
80 94
/*
 * Destroy the on-disk object for one (type, class) slot and remove its
 * directory and statistics entries.  The object must already be empty and
 * its histogram zeroed.  Must be called in syncing context (tx).
 */
static void
ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	char name[DDT_NAMELEN];
#if DEBUG
	/*
	 * Only needed by the ASSERT below; ASSERT compiles away when
	 * !DEBUG, so declare count under DEBUG to avoid an unused variable.
	 */
	uint64_t count;
#endif
	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp != 0);
	/* The count lookup itself must succeed AND report zero entries. */
	ASSERT((ddt_object_count(ddt, type, class, &count) == 0) &&
	    (count == 0));
	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));

	*objectp = 0;
}
102 119
/*
 * Load one (type, class) DDT object: resolve its object number from the
 * pool directory, read its persistent histogram, and seed the cached
 * object statistics (count, disk space, memory space).
 *
 * Returns 0 on success, or the zap/dmu error (ENOENT if the object does
 * not exist).
 */
static int
ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];
	int error;

	ddt_object_name(ddt, type, class, name);

	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
	if (error)
		return (error);

	/* The histogram entry was created alongside the object. */
	VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class]));

	/*
	 * Seed the cached statistics.
	 */
	error = ddt_object_info(ddt, type, class, &doi);
	/* Panic in debug mode */
	ASSERT(error == 0);
	if (error)
		return (error);
	error = ddt_object_count(ddt, type, class, &ddo->ddo_count);
	if (error)
		return (error);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;

	return (0);
}
134 155
/*
 * Write back the in-core histogram for one (type, class) object and
 * refresh the cached object statistics.  Called from syncing context;
 * this is the only time the cached statistics change.
 */
static void
ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);

	/*
	 * Cache DDT statistics; this is the only time they'll change.
	 */
	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);

	/* Count failure is tolerated here; ddo_count keeps its old value. */
	(void) ddt_object_count(ddt, type, class, &ddo->ddo_count);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}
158 179
/*
 * Look up dde's key in one (type, class) object, filling in dde_phys on
 * success.  Returns ENOENT if the object itself does not exist.
 */
static int
ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde));
}

/*
 * Issue a prefetch for dde's key in one (type, class) object, if that
 * object exists.  Best-effort; no result is returned.
 */
static void
ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return;

	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde);
}

/*
 * Write (insert or overwrite) dde into its (type, class) object.
 * The object must already exist.  Syncing context (tx).
 */
int
ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}

/*
 * Remove dde from its (type, class) object.  The object must exist.
 * Syncing context (tx).
 */
static int
ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}
200 221
|
↓ open down ↓ |
36 lines elided |
↑ open up ↑ |
/*
 * Iterate over one (type, class) object; *walk is the opaque cursor,
 * advanced on each call.  Returns the backend's error (e.g. ENOENT at
 * end of iteration).  The object must exist.
 */
int
ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    uint64_t *walk, ddt_entry_t *dde)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, walk));
}

/*
 * Fetch the entry count of one (type, class) object into *count.
 * Returns 0 on success or the backend's error; the object must exist.
 * (Formerly returned the count directly; the error is now propagated.)
 */
int
ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    uint64_t *count)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
	    ddt->ddt_object[type][class], count));
}
219 241
/*
 * Fetch dmu object info for one (type, class) object.
 * Returns ENOENT if the object does not exist.
 */
int
ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_object_info_t *doi)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
	    doi));
}

/*
 * A (type, class) slot exists iff its object number is nonzero.
 */
boolean_t
ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	return (!!ddt->ddt_object[type][class]);
}
236 258
237 259 void
238 260 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
239 261 char *name)
240 262 {
241 263 (void) sprintf(name, DMU_POOL_DDT,
242 264 zio_checksum_table[ddt->ddt_checksum].ci_name,
243 265 ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
244 266 }
245 267
/*
 * Copy a ddt_phys_t's DVAs and physical birth txg into bp, stamping the
 * logical birth with txg.
 */
void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
	ASSERT(txg != 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		bp->blk_dva[d] = ddp->ddp_dva[d];
	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}

/*
 * Build a complete block pointer from a DDT key (and optionally a phys
 * entry for the DVAs).  The result describes a level-0 DMU_OT_DEDUP block
 * with dedup off -- suitable for reading/freeing the deduped data itself.
 */
void
ddt_bp_create(enum zio_checksum checksum,
    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
	BP_ZERO(bp);

	if (ddp != NULL)
		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

	bp->blk_cksum = ddk->ddk_cksum;
	bp->blk_fill = 1;

	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_TYPE(bp, DMU_OT_DEDUP);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}

/*
 * Derive a DDT key from a block pointer: the checksum plus the packed
 * lsize/psize/compress properties.
 */
void
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
{
	ddk->ddk_cksum = bp->blk_cksum;
	ddk->ddk_prop = 0;

	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
}

/*
 * Record bp's DVAs and physical birth into an (empty) phys entry.
 */
void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
	ASSERT(ddp->ddp_phys_birth == 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		ddp->ddp_dva[d] = bp->blk_dva[d];
	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
}

/* Reset a phys entry to the empty state. */
void
ddt_phys_clear(ddt_phys_t *ddp)
{
	bzero(ddp, sizeof (*ddp));
}

/* Take one reference on a phys entry. */
void
ddt_phys_addref(ddt_phys_t *ddp)
{
	ddp->ddp_refcnt++;
}

/* Drop one reference on a phys entry; must not underflow. */
void
ddt_phys_decref(ddt_phys_t *ddp)
{
	ASSERT((int64_t)ddp->ddp_refcnt > 0);
	ddp->ddp_refcnt--;
}

/*
 * Free the on-disk block described by (ddk, ddp) and clear the phys entry.
 */
void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
	blkptr_t blk;

	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
	ddt_phys_clear(ddp);
	zio_free(ddt->ddt_spa, txg, &blk);
}
327 349
/*
 * Find the phys entry within dde that matches bp's first DVA and physical
 * birth txg, or NULL if none matches.
 */
ddt_phys_t *
ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
{
	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
			return (ddp);
	}
	return (NULL);
}

/*
 * Total references held across the SINGLE..TRIPLE phys entries
 * (the DITTO slot is intentionally excluded).
 */
uint64_t
ddt_phys_total_refcnt(const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
		refcnt += dde->dde_phys[p].ddp_refcnt;

	return (refcnt);
}
351 373
/*
 * Compute the dedup statistics contributed by one entry: per allocated
 * phys variant, one block of lsize/psize/dsize, plus the referenced
 * totals scaled by that variant's refcount.  (Takes spa directly rather
 * than ddt so callers without a ddt_t can use it.)
 */
static void
ddt_stat_generate(spa_t *spa, ddt_entry_t *dde, ddt_stat_t *dds)
{
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	uint64_t lsize = DDK_GET_LSIZE(ddk);
	uint64_t psize = DDK_GET_PSIZE(ddk);

	bzero(dds, sizeof (*dds));

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		uint64_t dsize = 0;
		uint64_t refcnt = ddp->ddp_refcnt;

		/* Unallocated variants contribute nothing. */
		if (ddp->ddp_phys_birth == 0)
			continue;

		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);

		dds->dds_blocks += 1;
		dds->dds_lsize += lsize;
		dds->dds_psize += psize;
		dds->dds_dsize += dsize;

		dds->dds_ref_blocks += refcnt;
		dds->dds_ref_lsize += lsize * refcnt;
		dds->dds_ref_psize += psize * refcnt;
		dds->dds_ref_dsize += dsize * refcnt;
	}
}

/*
 * Add (neg == 0) or subtract (neg == -1ULL) src from dst, field by field.
 * (x ^ -1) - (-1) == -x in two's complement, so one loop handles both.
 */
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
{
	const uint64_t *s = (const uint64_t *)src;
	uint64_t *d = (uint64_t *)dst;
	uint64_t *d_end = (uint64_t *)(dst + 1);

	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */

	while (d < d_end)
		*d++ += (*s++ ^ neg) - neg;
}
397 418
/*
 * Apply a precomputed stat delta to the histogram bucket for dde's
 * (type, class), indexed by log2 of the reference count.
 *
 * NOTE(review): if dds->dds_ref_blocks is 0, bucket is -1 and only the
 * ASSERT catches it -- callers must only pass stats with at least one
 * referenced block.
 */
static void
ddt_stat_update_by_dds(ddt_t *ddt, ddt_entry_t *dde,
    ddt_stat_t *dds, uint64_t neg)
{
	ddt_histogram_t *ddh;
	int bucket = highbit64(dds->dds_ref_blocks) - 1;
	ASSERT(bucket >= 0);

	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
	ddt_stat_add(&ddh->ddh_stat[bucket], dds, neg);
}

/*
 * Generate dde's current stats and add (neg == 0) or subtract
 * (neg == -1ULL) them from the in-core histogram.
 */
static void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
{
	ddt_stat_t dds;

	ddt_stat_generate(ddt->ddt_spa, dde, &dds);

	ddt_stat_update_by_dds(ddt, dde, &dds, neg);
}
414 440
415 441 void
416 442 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
417 443 {
418 444 for (int h = 0; h < 64; h++)
419 445 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
420 446 }
421 447
422 448 void
423 449 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
424 450 {
425 451 bzero(dds, sizeof (*dds));
426 452
427 453 for (int h = 0; h < 64; h++)
428 454 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
429 455 }
430 456
431 457 boolean_t
432 458 ddt_histogram_empty(const ddt_histogram_t *ddh)
433 459 {
434 460 const uint64_t *s = (const uint64_t *)ddh;
435 461 const uint64_t *s_end = (const uint64_t *)(ddh + 1);
436 462
437 463 while (s < s_end)
438 464 if (*s++ != 0)
439 465 return (B_FALSE);
440 466
|
↓ open down ↓ |
18 lines elided |
↑ open up ↑ |
441 467 return (B_TRUE);
442 468 }
443 469
/*
 * Sum the cached per-object statistics (seeded in ddt_object_load() and
 * refreshed in ddt_object_sync()) across all checksums, types, and the
 * configured class range, then convert the space totals to per-entry
 * averages.
 */
void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
	/* Sum the statistics we cached in ddt_object_sync(). */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			/* Only the classes this pool actually uses. */
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				ddt_object_t *ddo =
				    &ddt->ddt_object_stats[type][class];
				ddo_total->ddo_count += ddo->ddo_count;
				ddo_total->ddo_dspace += ddo->ddo_dspace;
				ddo_total->ddo_mspace += ddo->ddo_mspace;
			}
		}
	}

	/* ... and compute the averages. */
	if (ddo_total->ddo_count != 0) {
		ddo_total->ddo_dspace /= ddo_total->ddo_count;
		ddo_total->ddo_mspace /= ddo_total->ddo_count;
	}
}
468 494
/*
 * Accumulate the cached histograms of every checksum/type over the
 * configured class range into ddh.  Caller supplies a zeroed ddh.
 */
void
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				ddt_histogram_add(ddh,
				    &ddt->ddt_histogram_cache[type][class]);
			}
		}
	}
}
483 509
/*
 * Compute pool-wide dedup totals directly into dds_total.  Equivalent to
 * ddt_get_dedup_histogram() + ddt_histogram_stat(), but folds each bucket
 * straight into the running total so no ddt_histogram_t temporary is
 * needed (too large for the stack, and previously heap-allocated).
 */
void
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
{
	/*
	 * Avoid temporary allocation of ddt_histogram_t from heap
	 * or on stack (probably too large) by unrolling ddt_histogram_add()
	 */
	bzero(dds_total, sizeof (ddt_stat_t));
	/* sum up the stats across all the histograms */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = spa->spa_ddt_class_min;
			    class <= spa->spa_ddt_class_max; class++) {
				/* unroll the ddt_histogram_add() */
				ddt_histogram_t *src =
				    &ddt->ddt_histogram_cache[type][class];
				for (int h = 0; h < 64; h++) {
					ddt_stat_t *st = &src->ddh_stat[h];
					ddt_stat_add(dds_total, st, 0);
				}
			}
		}
	}
}
494 535
/*
 * Disk space saved by dedup: referenced size minus actually-allocated size.
 */
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
}

/*
 * Pool dedup ratio as a percentage (100 == no dedup benefit).
 * Returns 100 when no deduped data exists to avoid dividing by zero.
 */
uint64_t
ddt_get_pool_dedup_ratio(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	if (dds_total.dds_dsize == 0)
		return (100);

	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
}
515 556
/*
 * Decide how many additional copies of dde's data should be written,
 * given the total (committed + pending + caller's) reference count and
 * the pool's dedup-ditto threshold: one copy for any refs, a second at
 * `ditto` refs, a third at ditto^2 refs.  Returns the number of copies
 * still missing (0 if enough already exist or are in flight).
 */
int
ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
{
	spa_t *spa = ddt->ddt_spa;
	uint64_t total_refcnt = 0;
	uint64_t ditto = spa->spa_dedup_ditto;
	int total_copies = 0;
	int desired_copies = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];
		zio_t *zio = dde->dde_lead_zio[p];
		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
		if (zio != NULL)
			refcnt += zio->io_parent_count;	/* pending refs */
		if (ddp == ddp_willref)
			refcnt++;			/* caller's ref */
		if (refcnt != 0) {
			total_refcnt += refcnt;
			/* slot index p == number of copies it holds */
			total_copies += p;
		}
	}

	/* ditto == 0 means "never make extra copies"; clamp to UINT32_MAX. */
	if (ditto == 0 || ditto > UINT32_MAX)
		ditto = UINT32_MAX;

	if (total_refcnt >= 1)
		desired_copies++;
	if (total_refcnt >= ditto)
		desired_copies++;
	if (total_refcnt >= ditto * ditto)
		desired_copies++;

	return (MAX(desired_copies, total_copies) - total_copies);
}
551 592
/*
 * Count the valid DVAs in the DITTO phys slot.  A gang block burns one
 * DVA for the gang header, hence the initial -1 when the first DVA is a
 * gang pointer.
 */
int
ddt_ditto_copies_present(ddt_entry_t *dde)
{
	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
	dva_t *dva = ddp->ddp_dva;
	int copies = 0 - DVA_GET_GANG(dva);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
		if (DVA_IS_VALID(dva))
			copies++;

	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);

	return (copies);
}
567 608
/*
 * Compress s_len bytes from src into dst using ZLE, falling back to a raw
 * copy if ZLE achieves nothing.  The first output byte records the
 * compression function used plus a host-byteorder flag; returns the
 * total output length including that version byte.
 */
size_t
ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
{
	uchar_t *version = dst++;
	int cpfunc = ZIO_COMPRESS_ZLE;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	size_t c_len;

	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */

	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);

	/* c_len == s_len signals "incompressible"; store uncompressed. */
	if (c_len == s_len) {
		cpfunc = ZIO_COMPRESS_OFF;
		bcopy(src, dst, s_len);
	}

	*version = cpfunc;
	/* CONSTCOND */
	if (ZFS_HOST_BYTEORDER)
		*version |= DDT_COMPRESS_BYTEORDER_MASK;

	return (c_len + 1);
}
592 633
/*
 * Inverse of ddt_compress(): read the version byte, decompress (or copy)
 * the payload, and byteswap if the data was written on a machine of the
 * opposite endianness.
 */
void
ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
	uchar_t version = *src++;
	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	if (ci->ci_decompress != NULL)
		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
	else
		bcopy(src, dst, d_len);

	/* Swap iff the stored byteorder differs from ours. */
	if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
	    (ZFS_HOST_BYTEORDER != 0))
		byteswap_uint64_array(dst, d_len);
}
609 650
610 651 ddt_t *
611 652 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
612 653 {
|
↓ open down ↓ |
110 lines elided |
↑ open up ↑ |
613 654 return (spa->spa_ddt[c]);
614 655 }
615 656
616 657 ddt_t *
617 658 ddt_select(spa_t *spa, const blkptr_t *bp)
618 659 {
619 660 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
620 661 }
621 662
/*
 * Lock one hash bucket of the DDT's dirty-entry trees.  `hash` selects
 * which of the DDT_HASHSZ per-bucket locks to take.
 */
void
ddt_enter(ddt_t *ddt, uint8_t hash)
{
	mutex_enter(&ddt->ddt_lock[hash]);
}

/* Release the hash-bucket lock taken by ddt_enter(). */
void
ddt_exit(ddt_t *ddt, uint8_t hash)
{
	mutex_exit(&ddt->ddt_lock[hash]);
}

/* Lock an individual entry (protects dde_state and the load condvar). */
void
dde_enter(ddt_entry_t *dde)
{
	mutex_enter(&dde->dde_lock);
}

/* Release the per-entry lock taken by dde_enter(). */
void
dde_exit(ddt_entry_t *dde)
{
	mutex_exit(&dde->dde_lock);
}
686 +
/* cache for ddt_entry_t structures */
static kmem_cache_t *dde_cache;

/*
 * kmem cache constructor: initialize the condvar and mutex once per
 * cached object; ddt_alloc() preserves them across reuse.
 */
/* ARGSUSED */
static int
dde_cache_constr(void *buf, void *arg, int flags)
{
	ddt_entry_t *dde = (ddt_entry_t *)buf;
	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&dde->dde_lock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* kmem cache destructor: tear down what the constructor set up. */
/* ARGSUSED */
static void
dde_cache_destr(void *buf, void *arg)
{
	ddt_entry_t *dde = (ddt_entry_t *)buf;
	cv_destroy(&dde->dde_cv);
	mutex_destroy(&dde->dde_lock);
}

/*
 * Module init: create the ddt_entry_t kmem cache.
 */
void
ddt_init(void)
{
	dde_cache = kmem_cache_create("ddt_entry_t", sizeof (ddt_entry_t),
	    0, dde_cache_constr, dde_cache_destr, NULL, NULL, NULL, 0);
	VERIFY(dde_cache != NULL);
}

/*
 * Module fini: destroy the cache (idempotent if never initialized).
 */
void
ddt_fini(void)
{
	if (dde_cache) {
		kmem_cache_destroy(dde_cache);
		dde_cache = NULL;
	}
}
725 +
/*
 * Allocate a ddt_entry_t from the kmem cache and initialize it for key
 * ddk.  The condvar and mutex were set up by the cache constructor and
 * must survive, so only the payload fields and the AVL node are zeroed.
 *
 * NOTE(review): the bzero offsets assume the ddt_entry_t field layout is
 * dde_key, then dde_phys .. (everything up to) dde_cv, with dde_node
 * elsewhere -- verify against the struct definition in ddt.h whenever
 * fields are added or reordered.
 */
static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
	ddt_entry_t *dde;

	dde = kmem_cache_alloc(dde_cache, KM_SLEEP);

	/* Init everything but the condvar and the mutex */
	dde->dde_key = *ddk;
	bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_phys)),
	    offsetof(ddt_entry_t, dde_cv)-offsetof(ddt_entry_t, dde_phys));
	bzero((void*)((uintptr_t)dde+offsetof(ddt_entry_t, dde_node)),
	    sizeof (avl_node_t));

	return (dde);
}
646 742
/*
 * Return an entry to the kmem cache.  Must not be mid-load, and all lead
 * zios must have completed.  Frees any attached repair buffer.
 */
static void
ddt_free(ddt_entry_t *dde)
{
	ASSERT(!(dde->dde_state & DDE_LOADING));

	for (int p = 0; p < DDT_PHYS_TYPES; p++)
		ASSERT(dde->dde_lead_zio[p] == NULL);

	if (dde->dde_repair_abd != NULL)
		abd_free(dde->dde_repair_abd);

	/* cv/mutex stay initialized for reuse; destructor handles teardown */
	kmem_cache_free(dde_cache, dde);
}
661 756
/*
 * Remove and free a dirty entry from its hash bucket's tree.
 * For zdb usage: no bucket lock is taken, so only safe single-threaded.
 */
void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
	uint8_t hash = DDT_HASHFN(dde->dde_key.ddk_cksum);

	avl_remove(&ddt->ddt_tree[hash], dde);
	ddt_free(dde);
}
670 766
/*
 * Find (or, if add, create) the in-core dirty entry for bp, loading its
 * on-disk state from the DDT objects if this is the first lookup.
 *
 * Locking: the hash-bucket lock is held only around the AVL find/insert.
 * The entry's own lock serializes loading; DDE_LOADING marks an in-flight
 * ZAP lookup and concurrent callers wait on dde_cv.
 *
 * NOTE(review): both successful return paths exit while still holding
 * dde->dde_lock (taken via dde_enter()) -- the caller is apparently
 * responsible for dde_exit(); confirm against callers before relying on
 * this.
 */
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
	ddt_entry_t *dde, dde_search;
	enum ddt_type type;
	enum ddt_class class;
	avl_index_t where;
	uint8_t hash = DDT_HASHFN(bp->blk_cksum);
	int error;

	ddt_key_fill(&dde_search.dde_key, bp);

	ddt_enter(ddt, hash);
	/*
	 * Do we have the dirty DDE in mem already?
	 */
	dde = avl_find(&ddt->ddt_tree[hash], &dde_search, &where);
	if (dde == NULL) {
		/* This DDE doesn't exist in the dirty tree */
		if (!add) {
			ddt_exit(ddt, hash);
			return (NULL);
		}
		/* Since a dirty DDE didn't exist, create it */
		dde = ddt_alloc(&dde_search.dde_key);
		avl_insert(&ddt->ddt_tree[hash], dde, where);
	}

	ddt_exit(ddt, hash);

	/*
	 * If we're already looking up this DDE
	 * wait until we have the result
	 */
	dde_enter(dde);
	while (dde->dde_state & DDE_LOADING)
		cv_wait(&dde->dde_cv, &dde->dde_lock);

	/*
	 * If we have loaded the DDE from disk return it
	 */
	if (dde->dde_state & DDE_LOADED)
		return (dde);

	/*
	 * If we didn't find this DDE, start looking up the DDE in ZAP
	 */
	dde->dde_state |= DDE_LOADING;
	dde_exit(dde);

	error = ENOENT;

	DTRACE_PROBE1(ddt__loading, ddt_key_t *, &dde->dde_key);
	/* Search every type and class for compatibility with old pools. */
	for (type = 0; type < DDT_TYPES; type++) {
		for (class = 0; class < DDT_CLASSES; class++) {
			error = ddt_object_lookup(ddt, type, class, dde);
			if (error != ENOENT)
				break;
		}
		if (error != ENOENT)
			break;
	}

	ASSERT(error == 0 || error == ENOENT);

	dde_enter(dde);

	ASSERT(!(dde->dde_state & DDE_LOADED));
	ASSERT(dde->dde_state & DDE_LOADING);

	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
	if (type == DDT_TYPES && class == DDT_CLASSES)
		dde->dde_state |= DDE_NEW;
	dde->dde_state |= DDE_LOADED;
	dde->dde_state &= ~DDE_LOADING;

	DTRACE_PROBE2(ddt__loaded, ddt_key_t *, &dde->dde_key,
	    enum ddt_class, dde->dde_class);
	/* Snapshot the on-disk stats at load time for later adjustment. */
	if (error == 0)
		ddt_stat_generate(ddt->ddt_spa, dde, &dde->dde_lkstat);

	/* Wake anyone who blocked on DDE_LOADING above. */
	cv_broadcast(&dde->dde_cv);

	return (dde);
}
733 853
/*
 * Issue prefetches for bp's DDT entry across every type and class (all
 * classes, not just the configured range, for old-pool compatibility).
 * No-op unless dedup prefetch is enabled and bp is a dedup block.
 */
void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t dde;

	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
		return;

	/*
	 * We only remove the DDT once all tables are empty and only
	 * prefetch dedup blocks when there are entries in the DDT.
	 * Thus no locking is required as the DDT can't disappear on us.
	 */
	ddt = ddt_select(spa, bp);
	ddt_key_fill(&dde.dde_key, bp);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = 0;
		    class < DDT_CLASSES; class++) {
			ddt_object_prefetch(ddt, type, class, &dde);
		}
	}
}
757 878
758 879 int
759 880 ddt_entry_compare(const void *x1, const void *x2)
760 881 {
761 882 const ddt_entry_t *dde1 = x1;
762 883 const ddt_entry_t *dde2 = x2;
763 884 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
764 885 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
765 886
766 887 for (int i = 0; i < DDT_KEY_WORDS; i++) {
767 888 if (u1[i] < u2[i])
768 889 return (-1);
769 890 if (u1[i] > u2[i])
|
↓ open down ↓ |
7 lines elided |
↑ open up ↑ |
770 891 return (1);
771 892 }
772 893
773 894 return (0);
774 895 }
775 896
776 897 static ddt_t *
777 898 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
778 899 {
779 900 ddt_t *ddt;
901 + uint_t i;
780 902
781 903 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
782 904
783 - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
784 - avl_create(&ddt->ddt_tree, ddt_entry_compare,
785 - sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
905 + for (i = 0; i < DDT_HASHSZ; i++) {
906 + mutex_init(&ddt->ddt_lock[i], NULL, MUTEX_DEFAULT, NULL);
907 + avl_create(&ddt->ddt_tree[i], ddt_entry_compare,
908 + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
909 + }
910 + mutex_init(&ddt->ddt_repair_lock, NULL, MUTEX_DEFAULT, NULL);
911 +
786 912 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
787 913 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
788 914 ddt->ddt_checksum = c;
789 915 ddt->ddt_spa = spa;
790 916 ddt->ddt_os = spa->spa_meta_objset;
791 917
792 918 return (ddt);
793 919 }
794 920
795 921 static void
796 922 ddt_table_free(ddt_t *ddt)
797 923 {
798 - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
924 + uint_t i;
925 +
799 926 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
800 - avl_destroy(&ddt->ddt_tree);
927 +
928 + for (i = 0; i < DDT_HASHSZ; i++) {
929 + ASSERT(avl_numnodes(&ddt->ddt_tree[i]) == 0);
930 + avl_destroy(&ddt->ddt_tree[i]);
931 + mutex_destroy(&ddt->ddt_lock[i]);
932 + }
801 933 avl_destroy(&ddt->ddt_repair_tree);
802 - mutex_destroy(&ddt->ddt_lock);
934 + mutex_destroy(&ddt->ddt_repair_lock);
803 935 kmem_free(ddt, sizeof (*ddt));
804 936 }
805 937
806 938 void
807 939 ddt_create(spa_t *spa)
808 940 {
809 941 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
810 942
811 943 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
812 944 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
813 945 }
814 946
947 +/*
948 + * Get the combined size of DDTs on all pools.
 949 + * Returns either the on-disk (phys == B_TRUE) or the in-core combined size.
950 + */
951 +uint64_t
952 +ddt_get_ddts_size(boolean_t phys)
953 +{
954 + uint64_t ddts_size = 0;
955 + spa_t *spa = NULL;
956 +
957 + while ((spa = spa_next(spa)) != NULL)
958 + ddts_size += spa_get_ddts_size(spa, phys);
959 +
960 + return (ddts_size);
961 +}
962 +
815 963 int
816 964 ddt_load(spa_t *spa)
817 965 {
818 966 int error;
967 + ddt_object_t *ddo;
819 968
820 969 ddt_create(spa);
821 970
822 971 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
823 972 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
824 973 &spa->spa_ddt_stat_object);
825 974
826 975 if (error)
827 976 return (error == ENOENT ? 0 : error);
828 977
829 978 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
830 979 ddt_t *ddt = spa->spa_ddt[c];
831 980 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
832 - for (enum ddt_class class = 0; class < DDT_CLASSES;
833 - class++) {
981 + for (enum ddt_class class = 0;
982 + class < DDT_CLASSES; class++) {
834 983 error = ddt_object_load(ddt, type, class);
835 - if (error != 0 && error != ENOENT)
984 + if (error == ENOENT)
985 + continue;
986 + if (error != 0)
836 987 return (error);
988 + ddo = &ddt->ddt_object_stats[type][class];
989 + atomic_add_64(&spa->spa_ddt_dsize,
990 + ddo->ddo_dspace);
991 + atomic_add_64(&spa->spa_ddt_msize,
992 + ddo->ddo_mspace);
837 993 }
838 994 }
839 995
840 996 /*
841 997 * Seed the cached histograms.
842 998 */
843 999 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
844 1000 sizeof (ddt->ddt_histogram));
845 1001 }
1002 + zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
846 1003
1004 + if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1005 + /* notify that dedup cap is now active */
1006 + spa->spa_ddt_capped = 1;
1007 + spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1008 + }
1009 +
847 1010 return (0);
848 1011 }
849 1012
850 1013 void
851 1014 ddt_unload(spa_t *spa)
852 1015 {
853 1016 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
854 1017 if (spa->spa_ddt[c]) {
855 1018 ddt_table_free(spa->spa_ddt[c]);
856 1019 spa->spa_ddt[c] = NULL;
857 1020 }
858 1021 }
1022 + spa->spa_ddt_dsize = 0;
1023 + spa->spa_ddt_msize = 0;
1024 + zfs_ddts_msize = ddt_get_ddts_size(B_FALSE);
859 1025 }
860 1026
861 1027 boolean_t
862 1028 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
863 1029 {
864 1030 ddt_t *ddt;
865 1031 ddt_entry_t dde;
866 1032
867 1033 if (!BP_GET_DEDUP(bp))
868 1034 return (B_FALSE);
869 1035
870 - if (max_class == DDT_CLASS_UNIQUE)
871 - return (B_TRUE);
1036 + if (max_class > spa->spa_ddt_class_max)
1037 + max_class = spa->spa_ddt_class_max;
872 1038
873 1039 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
874 1040
875 1041 ddt_key_fill(&dde.dde_key, bp);
876 1042
877 1043 for (enum ddt_type type = 0; type < DDT_TYPES; type++)
878 - for (enum ddt_class class = 0; class <= max_class; class++)
1044 + for (enum ddt_class class = spa->spa_ddt_class_min;
1045 + class <= max_class; class++)
879 1046 if (ddt_object_lookup(ddt, type, class, &dde) == 0)
880 1047 return (B_TRUE);
881 1048
882 1049 return (B_FALSE);
883 1050 }
884 1051
885 1052 ddt_entry_t *
886 1053 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
887 1054 {
888 1055 ddt_key_t ddk;
889 1056 ddt_entry_t *dde;
890 1057
891 1058 ddt_key_fill(&ddk, bp);
892 1059
893 1060 dde = ddt_alloc(&ddk);
894 1061
895 1062 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
896 - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1063 + for (enum ddt_class class = 0;
1064 + class < DDT_CLASSES; class++) {
897 1065 /*
898 1066 * We can only do repair if there are multiple copies
899 1067 * of the block. For anything in the UNIQUE class,
900 1068 * there's definitely only one copy, so don't even try.
901 1069 */
902 1070 if (class != DDT_CLASS_UNIQUE &&
903 1071 ddt_object_lookup(ddt, type, class, dde) == 0)
904 1072 return (dde);
905 1073 }
906 1074 }
907 1075
|
↓ open down ↓ |
1 lines elided |
↑ open up ↑ |
908 1076 bzero(dde->dde_phys, sizeof (dde->dde_phys));
909 1077
910 1078 return (dde);
911 1079 }
912 1080
913 1081 void
914 1082 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
915 1083 {
916 1084 avl_index_t where;
917 1085
918 - ddt_enter(ddt);
1086 + mutex_enter(&ddt->ddt_repair_lock);
919 1087
920 1088 if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
921 1089 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
922 1090 avl_insert(&ddt->ddt_repair_tree, dde, where);
923 1091 else
924 1092 ddt_free(dde);
925 1093
926 - ddt_exit(ddt);
 1094 +	mutex_exit(&ddt->ddt_repair_lock);
927 1095 }
928 1096
929 1097 static void
930 1098 ddt_repair_entry_done(zio_t *zio)
931 1099 {
932 1100 ddt_entry_t *rdde = zio->io_private;
933 1101
934 1102 ddt_free(rdde);
935 1103 }
936 1104
937 1105 static void
938 1106 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
939 1107 {
940 1108 ddt_phys_t *ddp = dde->dde_phys;
941 1109 ddt_phys_t *rddp = rdde->dde_phys;
942 1110 ddt_key_t *ddk = &dde->dde_key;
943 1111 ddt_key_t *rddk = &rdde->dde_key;
944 1112 zio_t *zio;
945 1113 blkptr_t blk;
946 1114
947 1115 zio = zio_null(rio, rio->io_spa, NULL,
948 1116 ddt_repair_entry_done, rdde, rio->io_flags);
949 1117
950 1118 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
951 1119 if (ddp->ddp_phys_birth == 0 ||
952 1120 ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
953 1121 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
954 1122 continue;
955 1123 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
956 1124 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
957 1125 rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
958 1126 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
959 1127 }
960 1128
961 1129 zio_nowait(zio);
962 1130 }
963 1131
964 1132 static void
|
↓ open down ↓ |
28 lines elided |
↑ open up ↑ |
965 1133 ddt_repair_table(ddt_t *ddt, zio_t *rio)
966 1134 {
967 1135 spa_t *spa = ddt->ddt_spa;
968 1136 ddt_entry_t *dde, *rdde_next, *rdde;
969 1137 avl_tree_t *t = &ddt->ddt_repair_tree;
970 1138 blkptr_t blk;
971 1139
972 1140 if (spa_sync_pass(spa) > 1)
973 1141 return;
974 1142
975 - ddt_enter(ddt);
1143 + mutex_enter(&ddt->ddt_repair_lock);
976 1144 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
977 1145 rdde_next = AVL_NEXT(t, rdde);
978 1146 avl_remove(&ddt->ddt_repair_tree, rdde);
979 - ddt_exit(ddt);
1147 + mutex_exit(&ddt->ddt_repair_lock);
1148 +
980 1149 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
981 1150 dde = ddt_repair_start(ddt, &blk);
982 1151 ddt_repair_entry(ddt, dde, rdde, rio);
983 1152 ddt_repair_done(ddt, dde);
984 - ddt_enter(ddt);
1153 +
1154 + mutex_enter(&ddt->ddt_repair_lock);
985 1155 }
986 - ddt_exit(ddt);
1156 + mutex_exit(&ddt->ddt_repair_lock);
987 1157 }
988 1158
989 1159 static void
990 1160 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
991 1161 {
992 1162 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
993 1163 ddt_phys_t *ddp = dde->dde_phys;
994 1164 ddt_key_t *ddk = &dde->dde_key;
1165 + spa_t *spa = ddt->ddt_spa;
995 1166 enum ddt_type otype = dde->dde_type;
996 1167 enum ddt_type ntype = DDT_TYPE_CURRENT;
997 1168 enum ddt_class oclass = dde->dde_class;
998 1169 enum ddt_class nclass;
999 1170 uint64_t total_refcnt = 0;
1000 1171
1001 - ASSERT(dde->dde_loaded);
1002 - ASSERT(!dde->dde_loading);
1172 + ASSERT(dde->dde_state & DDE_LOADED);
1173 + ASSERT(!(dde->dde_state & DDE_LOADING));
1003 1174
1175 + /*
1176 + * Propagate the stats generated at lookup time
1177 + * this was delayed to avoid having to take locks
1178 + * to protect ddt->ddt_histogram
1179 + */
1180 + if (dde->dde_lkstat.dds_ref_blocks != 0)
1181 + ddt_stat_update_by_dds(ddt, dde, &dde->dde_lkstat, -1ULL);
1182 +
1004 1183 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1005 1184 ASSERT(dde->dde_lead_zio[p] == NULL);
1006 1185 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1007 1186 if (ddp->ddp_phys_birth == 0) {
1008 1187 ASSERT(ddp->ddp_refcnt == 0);
1009 1188 continue;
1010 1189 }
1011 1190 if (p == DDT_PHYS_DITTO) {
1012 1191 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1013 1192 ddt_phys_free(ddt, ddk, ddp, txg);
1014 1193 continue;
1015 1194 }
1016 1195 if (ddp->ddp_refcnt == 0)
1017 1196 ddt_phys_free(ddt, ddk, ddp, txg);
|
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
1018 1197 total_refcnt += ddp->ddp_refcnt;
1019 1198 }
1020 1199
1021 1200 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1022 1201 nclass = DDT_CLASS_DITTO;
1023 1202 else if (total_refcnt > 1)
1024 1203 nclass = DDT_CLASS_DUPLICATE;
1025 1204 else
1026 1205 nclass = DDT_CLASS_UNIQUE;
1027 1206
1207 + if (nclass > spa->spa_ddt_class_max)
1208 + nclass = spa->spa_ddt_class_max;
1209 +
1210 + if (nclass < spa->spa_ddt_class_min)
1211 + nclass = spa->spa_ddt_class_min;
1212 +
1213 + DTRACE_PROBE1(ddt__storing__entry, uint64_t, (uint64_t)nclass);
1214 +
1028 1215 if (otype != DDT_TYPES &&
1029 1216 (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1030 1217 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1031 1218 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1032 1219 }
1033 1220
1034 1221 if (total_refcnt != 0) {
1035 1222 dde->dde_type = ntype;
1036 1223 dde->dde_class = nclass;
1037 1224 ddt_stat_update(ddt, dde, 0);
1038 1225 if (!ddt_object_exists(ddt, ntype, nclass))
1039 1226 ddt_object_create(ddt, ntype, nclass, tx);
1040 1227 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1041 1228
1042 1229 /*
1043 1230 * If the class changes, the order that we scan this bp
|
↓ open down ↓ |
6 lines elided |
↑ open up ↑ |
1044 1231 * changes. If it decreases, we could miss it, so
1045 1232 * scan it right now. (This covers both class changing
1046 1233 * while we are doing ddt_walk(), and when we are
1047 1234 * traversing.)
1048 1235 */
1049 1236 if (nclass < oclass) {
1050 1237 dsl_scan_ddt_entry(dp->dp_scan,
1051 1238 ddt->ddt_checksum, dde, tx);
1052 1239 }
1053 1240 }
1241 + DTRACE_PROBE(ddt__stored__entry);
1054 1242 }
1055 1243
1056 1244 static void
1245 +ddt_sync_avl(ddt_t *ddt, avl_tree_t *avl, dmu_tx_t *tx, uint64_t txg)
1246 +{
1247 + void *cookie = NULL;
1248 + ddt_entry_t *dde;
1249 +
1250 + while ((dde = avl_destroy_nodes(avl, &cookie)) != NULL) {
1251 + if ((dde->dde_state & DDE_DONT_SYNC) != DDE_DONT_SYNC) {
1252 + ddt_sync_entry(ddt, dde, tx, txg);
1253 + } else { /* if we're not syncing this DDE it must be new */
1254 + ASSERT(dde->dde_state & DDE_NEW);
1255 + }
1256 + ddt_free(dde);
1257 + }
1258 +}
1259 +
1260 +static void
1057 1261 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1058 1262 {
1263 + uint64_t cnt, num_dbytes = 0, num_mbytes = 0;
1264 + int64_t old_mbytes = 0;
1059 1265 spa_t *spa = ddt->ddt_spa;
1060 - ddt_entry_t *dde;
1061 - void *cookie = NULL;
1266 + uint_t i, numnodes = 0;
1267 + ddt_object_t *ddo;
1062 1268
1063 - if (avl_numnodes(&ddt->ddt_tree) == 0)
1269 + for (i = 0; i < DDT_HASHSZ; i++)
1270 + numnodes += avl_numnodes(&ddt->ddt_tree[i]);
1271 +
1272 + if (numnodes == 0)
1064 1273 return;
1065 1274
1066 1275 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1067 1276
1068 1277 if (spa->spa_ddt_stat_object == 0) {
1069 1278 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1070 1279 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1071 1280 DMU_POOL_DDT_STATS, tx);
1072 1281 }
1073 1282
1074 - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1075 - ddt_sync_entry(ddt, dde, tx, txg);
1076 - ddt_free(dde);
1077 - }
1078 1283
1284 + DTRACE_PROBE(ddt__syncing__avl);
1285 + for (i = 0; i < DDT_HASHSZ; i++)
1286 + ddt_sync_avl(ddt, &ddt->ddt_tree[i], tx, txg);
1287 + DTRACE_PROBE(ddt__synced__avl);
1288 +
1289 + DTRACE_PROBE(ddt__syncing__obj);
1079 1290 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1080 - uint64_t count = 0;
1081 - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1291 + for (enum ddt_class class = spa->spa_ddt_class_min;
1292 + class <= spa->spa_ddt_class_max; class++) {
1082 1293 if (ddt_object_exists(ddt, type, class)) {
1294 + ddo = &ddt->ddt_object_stats[type][class];
1295 + old_mbytes += ddo->ddo_mspace;
1296 +
1083 1297 ddt_object_sync(ddt, type, class, tx);
1084 - count += ddt_object_count(ddt, type, class);
1298 + (void) ddt_object_count(ddt, type, class, &cnt);
1299 + if (cnt == 0) {
1300 + ddt_object_destroy(ddt, type, class,
1301 + tx);
1302 + continue;
1303 + }
1304 +
1305 + num_dbytes += ddo->ddo_dspace;
1306 + num_mbytes += ddo->ddo_mspace;
1085 1307 }
1086 1308 }
1087 - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1088 - if (count == 0 && ddt_object_exists(ddt, type, class))
1089 - ddt_object_destroy(ddt, type, class, tx);
1090 - }
1091 1309 }
1310 + spa->spa_ddt_dsize = num_dbytes;
1311 + spa->spa_ddt_msize = num_mbytes;
1312 + atomic_add_64(&zfs_ddts_msize, ((int64_t)num_mbytes) - old_mbytes);
1313 + DTRACE_PROBE4(ddt__synced__obj, char *, spa->spa_name,
1314 + uint64_t, num_dbytes, uint64_t, num_mbytes, uint64_t,
1315 + zfs_ddts_msize);
1092 1316
1317 + if (spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 0) {
1318 + /* notify that dedup cap is now active */
1319 + spa->spa_ddt_capped = 1;
1320 + spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_OFF);
1321 + } else if (!spa_enable_dedup_cap(spa) && spa->spa_ddt_capped == 1) {
1322 + /* notify that dedup cap is now inactive */
1323 + spa->spa_ddt_capped = 0;
1324 + spa_event_notify(spa, NULL, NULL, ESC_ZFS_DEDUP_ON);
1325 + }
1326 +
1327 + /* update the cached stats with the values calculated above */
1093 1328 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1094 1329 sizeof (ddt->ddt_histogram));
1095 1330 }
1096 1331
1097 1332 void
1098 1333 ddt_sync(spa_t *spa, uint64_t txg)
1099 1334 {
1100 1335 dmu_tx_t *tx;
1101 1336 zio_t *rio = zio_root(spa, NULL, NULL,
1102 - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
1337 + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1103 1338
1104 1339 ASSERT(spa_syncing_txg(spa) == txg);
1105 1340
1106 1341 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1107 1342
1108 1343 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1109 1344 ddt_t *ddt = spa->spa_ddt[c];
1110 1345 if (ddt == NULL)
1111 1346 continue;
1112 1347 ddt_sync_table(ddt, tx, txg);
1113 1348 ddt_repair_table(ddt, rio);
1114 1349 }
1115 1350
1116 1351 (void) zio_wait(rio);
1117 1352
1118 1353 dmu_tx_commit(tx);
1119 1354 }
1120 1355
1121 1356 int
1122 1357 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
1123 1358 {
1124 1359 do {
1125 1360 do {
1126 1361 do {
1127 1362 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
1128 1363 int error = ENOENT;
1129 1364 if (ddt_object_exists(ddt, ddb->ddb_type,
1130 1365 ddb->ddb_class)) {
1131 1366 error = ddt_object_walk(ddt,
1132 1367 ddb->ddb_type, ddb->ddb_class,
1133 1368 &ddb->ddb_cursor, dde);
1134 1369 }
1135 1370 dde->dde_type = ddb->ddb_type;
1136 1371 dde->dde_class = ddb->ddb_class;
1137 1372 if (error == 0)
1138 1373 return (0);
1139 1374 if (error != ENOENT)
1140 1375 return (error);
1141 1376 ddb->ddb_cursor = 0;
1142 1377 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
1143 1378 ddb->ddb_checksum = 0;
1144 1379 } while (++ddb->ddb_type < DDT_TYPES);
1145 1380 ddb->ddb_type = 0;
1146 1381 } while (++ddb->ddb_class < DDT_CLASSES);
1147 1382
1148 1383 return (SET_ERROR(ENOENT));
1149 1384 }
|
↓ open down ↓ |
37 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX