/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>

/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */

boolean_t
sm_entry_is_debug(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
}

boolean_t
sm_entry_is_single_word(uint64_t e)
{
	uint8_t prefix = SM_PREFIX_DECODE(e);
	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
}

boolean_t
sm_entry_is_double_word(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
}

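/*
 * Illustrative sketch (not part of the original file): the predicates
 * above classify raw on-disk words by their prefix bits. Assuming only
 * the encode macros from sys/space_map.h that the writer below already
 * uses, a one-word entry round-trips as single-word, and a first word
 * carrying SM2_PREFIX round-trips as double-word.
 */
static boolean_t
sm_entry_example_roundtrip(void)
{
	uint64_t one_word = SM_OFFSET_ENCODE(128) |
	    SM_TYPE_ENCODE(SM_ALLOC) | SM_RUN_ENCODE(16);
	uint64_t two_word_first = SM_PREFIX_ENCODE(SM2_PREFIX) |
	    SM2_RUN_ENCODE(16) | SM2_VDEV_ENCODE(SM_NO_VDEVID);

	return ((sm_entry_is_single_word(one_word) &&
	    sm_entry_is_double_word(two_word_first)) ? B_TRUE : B_FALSE);
}
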
/*
 * Iterate through the space map, invoking the callback on each (non-debug)
 * space map entry.
 */
int
space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
{
	uint64_t sm_len = space_map_length(sm);
	ASSERT3U(sm->sm_blksz, !=, 0);

	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
	    ZIO_PRIORITY_SYNC_READ);

	uint64_t blksz = sm->sm_blksz;
	int error = 0;
	for (uint64_t block_base = 0; block_base < sm_len && error == 0;
	    block_base += blksz) {
		dmu_buf_t *db;
		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
		    block_base, FTAG, &db, DMU_READ_PREFETCH);
		if (error != 0)
			return (error);

		uint64_t *block_start = db->db_data;
		uint64_t block_length = MIN(sm_len - block_base, blksz);
		uint64_t *block_end = block_start +
		    (block_length / sizeof (uint64_t));

		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
		VERIFY3U(block_length, !=, 0);
		ASSERT3U(blksz, ==, db->db_size);

		for (uint64_t *block_cursor = block_start;
		    block_cursor < block_end && error == 0; block_cursor++) {
			uint64_t e = *block_cursor;

			if (sm_entry_is_debug(e))	/* Skip debug entries */
				continue;

			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
/*
 * Reads the entries from the last block of the space map into
 * buf in reverse order. Populates nwords with the number of words
 * in the last block.
 *
 * Refer to the block comment within space_map_incremental_destroy()
 * to understand why this function is needed.
 */
static int
space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
    uint64_t bufsz, uint64_t *nwords)
{
	int error = 0;
	dmu_buf_t *db;

	/*
	 * Find the offset of the last word in the space map and use
	 * that to read the last block of the space map with
	 * dmu_buf_hold().
	 */
	uint64_t last_word_offset =
	    sm->sm_phys->smp_objsize - sizeof (uint64_t);
	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
	    FTAG, &db, DMU_READ_NO_PREFETCH);
	if (error != 0)
		return (error);

	ASSERT3U(sm->sm_object, ==, db->db_object);
	ASSERT3U(sm->sm_blksz, ==, db->db_size);
	ASSERT3U(bufsz, >=, db->db_size);
	ASSERT(nwords != NULL);

	uint64_t *words = db->db_data;
	*nwords =
	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);

	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));

	uint64_t n = *nwords;
	uint64_t j = n - 1;
	for (uint64_t i = 0; i < n; i++) {
		uint64_t entry = words[i];
		if (sm_entry_is_double_word(entry)) {
			/*
			 * Since we are populating the buffer backwards
			 * we have to be extra careful and add the two
			 * words of the double-word entry in the right
			 * order.
			 */
			ASSERT3U(j, >, 0);
			buf[j - 1] = entry;

			i++;
			ASSERT3U(i, <, n);
			entry = words[i];
	/*
	 *    each entry.
	 * 3] If there are no more entries in the space map or the callback
	 *    returns a value other than 0, we stop iterating over the
	 *    space map. If there are entries remaining and the callback
	 *    returned 0, we go back to step [1].
	 */
	int error = 0;
	while (space_map_length(sm) > 0 && error == 0) {
		uint64_t nwords = 0;
		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
		    &nwords);
		if (error != 0)
			break;

		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));

		for (uint64_t i = 0; i < nwords; i++) {
			uint64_t e = buf[i];

			if (sm_entry_is_debug(e)) {
				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
				space_map_update(sm);
				continue;
			}

			int words = 1;
			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				ASSERT(sm_entry_is_double_word(e));
				words = 2;

				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move to the second word */
				i++;
			VERIFY3U(entry_offset, >=, sm->sm_start);
			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
			VERIFY3U(entry_run, <=, sm->sm_size);
			VERIFY3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run
			};
			error = callback(&sme, arg);
			if (error != 0)
				break;

			if (type == SM_ALLOC)
				sm->sm_phys->smp_alloc -= entry_run;
			else
				sm->sm_phys->smp_alloc += entry_run;
			sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
			space_map_update(sm);
		}
	}

	if (space_map_length(sm) == 0) {
		ASSERT0(error);
		ASSERT0(sm->sm_phys->smp_objsize);
		ASSERT0(sm->sm_alloc);
	}

	zio_buf_free(buf, bufsz);
	return (error);
}

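/*
 * Hedged usage sketch (not from the original file): a callback of the
 * sm_cb_t shape consumed by the incremental destroy loop above. The
 * function name and the byte tally are hypothetical; only the
 * space_map_entry_t fields come from this file.
 */
static int
example_destroy_cb(space_map_entry_t *sme, void *arg)
{
	uint64_t *bytes_seen = arg;

	/* Tally the run length; a real callback would free the range. */
	*bytes_seen += sme->sme_run;
	return (0);
}
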
typedef struct space_map_load_arg {
	space_map_t	*smla_sm;
	range_tree_t	*smla_rt;
	maptype_t	smla_type;
} space_map_load_arg_t;

static int
space_map_load_callback(space_map_entry_t *sme, void *arg)
{
	space_map_load_arg_t *smla = arg;
	if (sme->sme_type == smla->smla_type) {
		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
		    smla->smla_sm->sm_size);
		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
	} else {
		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
	}

	return (0);
}

/*
 * Load the on-disk space map into the specified range tree. Segments of
 * maptype are added to the range tree; segments of other types are removed.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	uint64_t space;
	int err;
	space_map_load_arg_t smla;

	VERIFY0(range_tree_space(rt));
	space = space_map_allocated(sm);

	if (maptype == SM_FREE) {
		range_tree_add(rt, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	smla.smla_rt = rt;
	smla.smla_sm = sm;
	smla.smla_type = maptype;
	err = space_map_iterate(sm, space_map_load_callback, &smla);

	if (err == 0) {
		VERIFY3U(range_tree_space(rt), ==, space);
	} else {
		range_tree_vacate(rt, NULL, NULL);
	}

	return (err);
}

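/*
 * Worked example (illustrative): for a space map covering
 * [sm_start, sm_start + sm_size) with sm_size = 1G and 300M allocated,
 * an SM_FREE load first adds the full 1G range and then replays the
 * log: ALLOC entries punch holes and FREE entries fill them back in,
 * so the tree converges on 1G - 300M = 700M, which the VERIFY above
 * checks against 'space'.
 */
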
void
space_map_histogram_clear(space_map_t *sm)
{
	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
}

boolean_t
space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
{
	/*
	 * Verify that the in-core range tree does not have any
	 * ranges smaller than our sm_shift size.
	 */
	for (int i = 0; i < sm->sm_shift; i++) {
		if (rt->rt_histogram[i] != 0)
			return (B_FALSE);
	}
		 * larger than the max bucket size into the last bucket.
		 */
		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + sm->sm_shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}

static void
space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
	    sizeof (dentry), &dentry, tx);

	sm->sm_phys->smp_objsize += sizeof (dentry);
}

/*
 * Writes one or more entries given a segment.
 *
 * Note: The function may release the dbuf from the pointer initially
 * passed to it, and return a different dbuf. Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
static void
space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
{
	ASSERT3U(words, !=, 0);
	ASSERT3U(words, <=, 2);

	/* ensure the vdev_id can be represented by the space map */
	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

	/*
	 * if this is a single word entry, ensure that no vdev was
	 * specified.
	 */
	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

	dmu_buf_t *db = *dbp;
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	uint64_t *block_base = db->db_data;
	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
	uint64_t *block_cursor = block_base +
	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);

	ASSERT3P(block_cursor, <=, block_end);

	uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;

	ASSERT3U(rs->rs_start, >=, sm->sm_start);
	ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
	ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
	ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);

	while (size != 0) {
		ASSERT3P(block_cursor, <=, block_end);

		/*
		 * If we are at the end of this block, flush it and start
		 * writing again from the beginning.
		 */
		if (block_cursor == block_end) {
			dmu_buf_rele(db, tag);

			uint64_t next_word_offset = sm->sm_phys->smp_objsize;
			VERIFY0(dmu_buf_hold(sm->sm_os,
			    space_map_object(sm), next_word_offset,
			    tag, &db, DMU_READ_PREFETCH));
			dmu_buf_will_dirty(db, tx);

			/* update caller's dbuf */
			*dbp = db;

			ASSERT3U(db->db_size, ==, sm->sm_blksz);

			block_base = db->db_data;
			block_cursor = block_base;
			block_end = block_base +
			    (db->db_size / sizeof (uint64_t));
		}

		/*
		 * If we are writing a two-word entry and we only have one
		 * word left on this block, just pad it with an empty debug
		 * entry and write the two-word entry in the next block.
		 */
		uint64_t *next_entry = block_cursor + 1;
		if (next_entry == block_end && words > 1) {
			ASSERT3U(words, ==, 2);
			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
			    SM_DEBUG_ACTION_ENCODE(0) |
			    SM_DEBUG_SYNCPASS_ENCODE(0) |
			    SM_DEBUG_TXG_ENCODE(0);
			block_cursor++;
			sm->sm_phys->smp_objsize += sizeof (uint64_t);
			ASSERT3P(block_cursor, ==, block_end);
			continue;
		}

		uint64_t run_len = MIN(size, run_max);
		switch (words) {
		case 1:
			*block_cursor = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);
			block_cursor++;
			break;
		case 2:
			/* write the first word of the entry */
			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(run_len) |
			    SM2_VDEV_ENCODE(vdev_id);
			block_cursor++;

			/* move on to the second word of the entry */
			ASSERT3P(block_cursor, <, block_end);
			*block_cursor = SM2_TYPE_ENCODE(maptype) |
			    SM2_OFFSET_ENCODE(start);
			block_cursor++;
			break;
		default:
			panic("%d-word space map entries are not supported",
			    words);
			break;
		}
		sm->sm_phys->smp_objsize += words * sizeof (uint64_t);

		start += run_len;
		size -= run_len;
	}
	ASSERT0(size);
}

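/*
 * Worked example (illustrative): with words == 1, a segment spanning
 * 2.5 * SM_RUN_MAX shift units leaves the loop above as three one-word
 * entries, two of run_len == SM_RUN_MAX and a third holding the
 * remaining half, with 'start' advanced by run_len after each write.
 */
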
/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 */
static void
space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dmu_buf_t *db;

	space_map_write_intro_debug(sm, maptype, tx);

#ifdef DEBUG
	/*
	 * We do this right after we write the intro debug entry
	 * because the estimate does not take it into account.
	 */
	uint64_t initial_objsize = sm->sm_phys->smp_objsize;
	uint64_t estimated_growth =
	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

	/*
	 * Find the offset right after the last word in the space map
	 * and use that to get a hold of the last block, so we can
	 * start appending to it.
	 */
	uint64_t next_word_offset = sm->sm_phys->smp_objsize;
	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	dmu_buf_will_dirty(db, tx);

	avl_tree_t *t = &rt->rt_root;
	for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
		uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
		uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
		uint8_t words = 1;

		/*
		 * We only write two-word entries when both of the following
		 * are true:
		 *
		 * [1] The feature is enabled.
		 * [2] The offset or run is too big for a single-word entry,
		 *     or the vdev_id is set (meaning not equal to
		 *     SM_NO_VDEVID).
		 */
		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    (offset >= (1ULL << SM_OFFSET_BITS) ||
		    length > SM_RUN_MAX ||
		    vdev_id != SM_NO_VDEVID ||
		    (zfs_force_some_double_word_sm_entries &&
		    spa_get_random(100) == 0)))
			words = 2;

		space_map_write_seg(sm, rs, maptype, vdev_id, words,
		    &db, FTAG, tx);
	}

	dmu_buf_rele(db, FTAG);

#ifdef DEBUG
	/*
	 * We expect our estimation to be based on the worst case
	 * scenario [see comment in space_map_estimate_optimal_size()].
	 * Therefore we expect the actual objsize to be equal to or less
	 * than whatever we estimated it to be.
	 */
	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
 */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	objset_t *os = sm->sm_os;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	VERIFY3U(space_map_object(sm), !=, 0);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
		    doi.doi_bonus_size, doi.doi_data_block_size);

		space_map_free(sm, tx);
		dmu_buf_rele(sm->sm_dbuf, sm);

		sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
		VERIFY0(space_map_open_impl(sm));
	} else {
		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

		/*
		 * If the spacemap is reallocated, its histogram
		 * will be reset. Do the same in the common case so that
		 * bugs related to the uncommon case do not go unnoticed.
		 */
		bzero(sm->sm_phys->smp_histogram,
		    sizeof (sm->sm_phys->smp_histogram));
	}

	dmu_buf_will_dirty(sm->sm_dbuf, tx);
	sm->sm_phys->smp_objsize = 0;
	sm->sm_phys->smp_alloc = 0;
}

/*
 * Update the in-core space_map allocation and length values.
 */
void
space_map_update(space_map_t *sm)
{
	if (sm == NULL)
		return;

	sm->sm_alloc = sm->sm_phys->smp_alloc;
	sm->sm_length = sm->sm_phys->smp_objsize;
}

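/*
 * Illustrative note (not from the original file): the destroy loop
 * above calls space_map_update() after it adjusts smp_alloc and
 * smp_objsize, which is what keeps the in-core sm_alloc and sm_length
 * that space_map_allocated() and space_map_length() report below in
 * step with the phys fields.
 */
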
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	uint64_t object;
	int bonuslen;

	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		bonuslen = sizeof (space_map_phys_t);
		ASSERT3U(bonuslen, <=, dmu_bonus_max());
	} else {
		bonuslen = SPACE_MAP_SIZE_V0;
	}

	object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
	    space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

	return (object);
}
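
/*
 * Illustrative note (not from the original file): the bonus length
 * chosen here is what space_map_histogram_clear() tests against later;
 * a V0 bonus (SPACE_MAP_SIZE_V0, assumed to be the pre-histogram prefix
 * of space_map_phys_t) is smaller than sizeof (space_map_phys_t), so
 * the histogram code leaves old-format space maps alone.
 */
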
		size += histogram[idx] *
		    entries_for_seg * 2 * sizeof (uint64_t);
	}

	/*
	 * Assume the worst case where we start with the padding at the end
	 * of the current block and we add an extra padding entry at the end
	 * of all subsequent blocks.
	 */
	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);

	return (size);
}

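/*
 * Worked example (illustrative): with sm_blksz == 4096 and an estimated
 * size of 10000 bytes before padding, the term above adds
 * ((10000 / 4096) + 1) * 8 == 24 bytes: one padding word for the
 * current block plus one per additional block the entries may spill
 * into.
 */
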
uint64_t
space_map_object(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_object : 0);
}

/*
 * Returns the already synced, on-disk allocated space.
 */
uint64_t
space_map_allocated(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_alloc : 0);
}

/*
 * Returns the already synced, on-disk length.
 */
uint64_t
space_map_length(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_length : 0);
}

/*
 * Returns the allocated space that is currently syncing.
 */
int64_t
space_map_alloc_delta(space_map_t *sm)
{
	if (sm == NULL)
		return (0);
	ASSERT(sm->sm_dbuf != NULL);
	return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
}
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>

/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */

boolean_t
sm_entry_is_debug(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
}

boolean_t
sm_entry_is_single_word(uint64_t e)
{
	uint8_t prefix = SM_PREFIX_DECODE(e);
	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
}

boolean_t
sm_entry_is_double_word(uint64_t e)
{
	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
}

/*
 * Iterate through the space map, invoking the callback on each (non-debug)
 * space map entry. Stop after reading 'end' bytes of the space map.
 */
int
space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
	uint64_t blksz = sm->sm_blksz;

	ASSERT3U(blksz, !=, 0);
	ASSERT3U(end, <=, space_map_length(sm));
	ASSERT0(P2PHASE(end, sizeof (uint64_t)));

	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
	    ZIO_PRIORITY_SYNC_READ);

	int error = 0;
	for (uint64_t block_base = 0; block_base < end && error == 0;
	    block_base += blksz) {
		dmu_buf_t *db;
		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
		    block_base, FTAG, &db, DMU_READ_PREFETCH);
		if (error != 0)
			return (error);

		uint64_t *block_start = db->db_data;
		uint64_t block_length = MIN(end - block_base, blksz);
		uint64_t *block_end = block_start +
		    (block_length / sizeof (uint64_t));

		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
		VERIFY3U(block_length, !=, 0);
		ASSERT3U(blksz, ==, db->db_size);

		for (uint64_t *block_cursor = block_start;
		    block_cursor < block_end && error == 0; block_cursor++) {
			uint64_t e = *block_cursor;

			if (sm_entry_is_debug(e))	/* Skip debug entries */
				continue;

			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
/*
 * Reads the entries from the last block of the space map into
 * buf in reverse order. Populates nwords with the number of words
 * in the last block.
 *
 * Refer to the block comment within space_map_incremental_destroy()
 * to understand why this function is needed.
 */
static int
space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
    uint64_t bufsz, uint64_t *nwords)
{
	int error = 0;
	dmu_buf_t *db;

	/*
	 * Find the offset of the last word in the space map and use
	 * that to read the last block of the space map with
	 * dmu_buf_hold().
	 */
	uint64_t last_word_offset =
	    sm->sm_phys->smp_length - sizeof (uint64_t);
	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
	    FTAG, &db, DMU_READ_NO_PREFETCH);
	if (error != 0)
		return (error);

	ASSERT3U(sm->sm_object, ==, db->db_object);
	ASSERT3U(sm->sm_blksz, ==, db->db_size);
	ASSERT3U(bufsz, >=, db->db_size);
	ASSERT(nwords != NULL);

	uint64_t *words = db->db_data;
	*nwords =
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));

	uint64_t n = *nwords;
	uint64_t j = n - 1;
	for (uint64_t i = 0; i < n; i++) {
		uint64_t entry = words[i];
		if (sm_entry_is_double_word(entry)) {
			/*
			 * Since we are populating the buffer backwards
			 * we have to be extra careful and add the two
			 * words of the double-word entry in the right
			 * order.
			 */
			ASSERT3U(j, >, 0);
			buf[j - 1] = entry;

			i++;
			ASSERT3U(i, <, n);
			entry = words[i];
	/*
	 *    each entry.
	 * 3] If there are no more entries in the space map or the callback
	 *    returns a value other than 0, we stop iterating over the
	 *    space map. If there are entries remaining and the callback
	 *    returned 0, we go back to step [1].
	 */
	int error = 0;
	while (space_map_length(sm) > 0 && error == 0) {
		uint64_t nwords = 0;
		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
		    &nwords);
		if (error != 0)
			break;

		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));

		for (uint64_t i = 0; i < nwords; i++) {
			uint64_t e = buf[i];

			if (sm_entry_is_debug(e)) {
				sm->sm_phys->smp_length -= sizeof (uint64_t);
				continue;
			}

			int words = 1;
			uint64_t raw_offset, raw_run, vdev_id;
			maptype_t type;
			if (sm_entry_is_single_word(e)) {
				type = SM_TYPE_DECODE(e);
				vdev_id = SM_NO_VDEVID;
				raw_offset = SM_OFFSET_DECODE(e);
				raw_run = SM_RUN_DECODE(e);
			} else {
				ASSERT(sm_entry_is_double_word(e));
				words = 2;

				raw_run = SM2_RUN_DECODE(e);
				vdev_id = SM2_VDEV_DECODE(e);

				/* move to the second word */
				i++;
			VERIFY3U(entry_offset, >=, sm->sm_start);
			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
			VERIFY3U(entry_run, <=, sm->sm_size);
			VERIFY3U(entry_offset + entry_run, <=,
			    sm->sm_start + sm->sm_size);

			space_map_entry_t sme = {
			    .sme_type = type,
			    .sme_vdev = vdev_id,
			    .sme_offset = entry_offset,
			    .sme_run = entry_run
			};
			error = callback(&sme, arg);
			if (error != 0)
				break;

			if (type == SM_ALLOC)
				sm->sm_phys->smp_alloc -= entry_run;
			else
				sm->sm_phys->smp_alloc += entry_run;
			sm->sm_phys->smp_length -= words * sizeof (uint64_t);
		}
	}

	if (space_map_length(sm) == 0) {
		ASSERT0(error);
		ASSERT0(space_map_allocated(sm));
	}

	zio_buf_free(buf, bufsz);
	return (error);
}

typedef struct space_map_load_arg {
	space_map_t	*smla_sm;
	range_tree_t	*smla_rt;
	maptype_t	smla_type;
} space_map_load_arg_t;

static int
space_map_load_callback(space_map_entry_t *sme, void *arg)
{
	space_map_load_arg_t *smla = arg;
	if (sme->sme_type == smla->smla_type) {
		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
		    smla->smla_sm->sm_size);
		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
	} else {
		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
	}

	return (0);
}

/*
 * Load the space map into the given range tree, as space_map_load()
 * does, but read only the first 'length' bytes of the space map.
 */
int
space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t length)
{
	space_map_load_arg_t smla;

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE)
		range_tree_add(rt, sm->sm_start, sm->sm_size);

	smla.smla_rt = rt;
	smla.smla_sm = sm;
	smla.smla_type = maptype;
	int err = space_map_iterate(sm, length,
	    space_map_load_callback, &smla);

	if (err != 0)
		range_tree_vacate(rt, NULL, NULL);

	return (err);
}

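/*
 * Hedged usage sketch (not from the original file): replaying only the
 * portion of a space map that had been synced by some earlier point,
 * e.g. a previously recorded length, instead of the full map. The
 * function name and 'synced_len' parameter are hypothetical.
 */
static int
example_partial_load(space_map_t *sm, range_tree_t *rt, uint64_t synced_len)
{
	/* Replay only the first synced_len bytes of entries. */
	return (space_map_load_length(sm, rt, SM_FREE, synced_len));
}
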
/*
 * Load the on-disk space map into the specified range tree. Segments of
 * maptype are added to the range tree; segments of other types are removed.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
}

void
space_map_histogram_clear(space_map_t *sm)
{
	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
		return;

	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
}

boolean_t
space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
{
	/*
	 * Verify that the in-core range tree does not have any
	 * ranges smaller than our sm_shift size.
	 */
	for (int i = 0; i < sm->sm_shift; i++) {
		if (rt->rt_histogram[i] != 0)
			return (B_FALSE);
	}
		 * larger than the max bucket size into the last bucket.
		 */
		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + sm->sm_shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}

static void
space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
	    sizeof (dentry), &dentry, tx);

	sm->sm_phys->smp_length += sizeof (dentry);
}

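/*
 * Hedged sketch (not from the original file): reading back the fields
 * of an intro debug entry, assuming the matching SM_DEBUG_*_DECODE
 * macros from sys/space_map.h that mirror the encoders used above.
 */
static void
example_decode_debug_entry(uint64_t dentry, maptype_t *action,
    uint64_t *syncpass, uint64_t *txg)
{
	ASSERT(sm_entry_is_debug(dentry));
	*action = SM_DEBUG_ACTION_DECODE(dentry);
	*syncpass = SM_DEBUG_SYNCPASS_DECODE(dentry);
	*txg = SM_DEBUG_TXG_DECODE(dentry);
}
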
/*
 * Writes one or more entries given a segment.
 *
 * Note: The function may release the dbuf from the pointer initially
 * passed to it, and return a different dbuf. Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
static void
space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
{
	ASSERT3U(words, !=, 0);
	ASSERT3U(words, <=, 2);

	/* ensure the vdev_id can be represented by the space map */
	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

	/*
	 * if this is a single word entry, ensure that no vdev was
	 * specified.
	 */
	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

	dmu_buf_t *db = *dbp;
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	uint64_t *block_base = db->db_data;
	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
	uint64_t *block_cursor = block_base +
	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);

	ASSERT3P(block_cursor, <=, block_end);

	uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;

	ASSERT3U(rs->rs_start, >=, sm->sm_start);
	ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
	ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
	ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);

	while (size != 0) {
		ASSERT3P(block_cursor, <=, block_end);

		/*
		 * If we are at the end of this block, flush it and start
		 * writing again from the beginning.
		 */
		if (block_cursor == block_end) {
			dmu_buf_rele(db, tag);

			uint64_t next_word_offset = sm->sm_phys->smp_length;
			VERIFY0(dmu_buf_hold(sm->sm_os,
			    space_map_object(sm), next_word_offset,
			    tag, &db, DMU_READ_PREFETCH));
			dmu_buf_will_dirty(db, tx);

			/* update caller's dbuf */
			*dbp = db;

			ASSERT3U(db->db_size, ==, sm->sm_blksz);

			block_base = db->db_data;
			block_cursor = block_base;
			block_end = block_base +
			    (db->db_size / sizeof (uint64_t));
		}

		/*
		 * If we are writing a two-word entry and we only have one
		 * word left on this block, just pad it with an empty debug
		 * entry and write the two-word entry in the next block.
		 */
		uint64_t *next_entry = block_cursor + 1;
		if (next_entry == block_end && words > 1) {
			ASSERT3U(words, ==, 2);
			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
			    SM_DEBUG_ACTION_ENCODE(0) |
			    SM_DEBUG_SYNCPASS_ENCODE(0) |
			    SM_DEBUG_TXG_ENCODE(0);
			block_cursor++;
			sm->sm_phys->smp_length += sizeof (uint64_t);
			ASSERT3P(block_cursor, ==, block_end);
			continue;
		}

		uint64_t run_len = MIN(size, run_max);
		switch (words) {
		case 1:
			*block_cursor = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);
			block_cursor++;
			break;
		case 2:
			/* write the first word of the entry */
			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(run_len) |
			    SM2_VDEV_ENCODE(vdev_id);
			block_cursor++;

			/* move on to the second word of the entry */
			ASSERT3P(block_cursor, <, block_end);
			*block_cursor = SM2_TYPE_ENCODE(maptype) |
			    SM2_OFFSET_ENCODE(start);
			block_cursor++;
			break;
		default:
			panic("%d-word space map entries are not supported",
			    words);
			break;
		}
		sm->sm_phys->smp_length += words * sizeof (uint64_t);

		start += run_len;
		size -= run_len;
	}
	ASSERT0(size);
}

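/*
 * Worked example (illustrative): if one word remains in the current
 * block and a two-word entry is pending, the padding branch above burns
 * that word on an all-zero debug entry, rolls over to the next block,
 * and only then emits the SM2_PREFIX word followed by the type/offset
 * word, so a double-word entry never straddles a block boundary.
 */
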
/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 */
static void
space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dmu_buf_t *db;

	space_map_write_intro_debug(sm, maptype, tx);

#ifdef DEBUG
	/*
	 * We do this right after we write the intro debug entry
	 * because the estimate does not take it into account.
	 */
	uint64_t initial_objsize = sm->sm_phys->smp_length;
	uint64_t estimated_growth =
	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

	/*
	 * Find the offset right after the last word in the space map
	 * and use that to get a hold of the last block, so we can
	 * start appending to it.
	 */
	uint64_t next_word_offset = sm->sm_phys->smp_length;
	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
	ASSERT3U(db->db_size, ==, sm->sm_blksz);

	dmu_buf_will_dirty(db, tx);

	avl_tree_t *t = &rt->rt_root;
	for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
		uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
		uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
		uint8_t words = 1;

		/*
		 * We only write two-word entries when both of the following
		 * are true:
		 *
		 * [1] The feature is enabled.
		 * [2] The offset or run is too big for a single-word entry,
		 *     or the vdev_id is set (meaning not equal to
		 *     SM_NO_VDEVID).
		 */
		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
		    (offset >= (1ULL << SM_OFFSET_BITS) ||
		    length > SM_RUN_MAX ||
		    vdev_id != SM_NO_VDEVID ||
		    (zfs_force_some_double_word_sm_entries &&
		    spa_get_random(100) == 0)))
			words = 2;

		space_map_write_seg(sm, rs, maptype, vdev_id, words,
		    &db, FTAG, tx);
	}

	dmu_buf_rele(db, FTAG);

#ifdef DEBUG
	/*
	 * We expect our estimation to be based on the worst case
	 * scenario [see comment in space_map_estimate_optimal_size()].
	 * Therefore we expect the actual objsize to be equal to or less
	 * than whatever we estimated it to be.
	 */
	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
 */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
	objset_t *os = sm->sm_os;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	VERIFY3U(space_map_object(sm), !=, 0);

	dmu_buf_will_dirty(sm->sm_dbuf, tx);

	/*
		    doi.doi_bonus_size, doi.doi_data_block_size);

		space_map_free(sm, tx);
		dmu_buf_rele(sm->sm_dbuf, sm);

		sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
		VERIFY0(space_map_open_impl(sm));
	} else {
		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

		/*
		 * If the spacemap is reallocated, its histogram
		 * will be reset. Do the same in the common case so that
		 * bugs related to the uncommon case do not go unnoticed.
		 */
		bzero(sm->sm_phys->smp_histogram,
		    sizeof (sm->sm_phys->smp_histogram));
	}

	dmu_buf_will_dirty(sm->sm_dbuf, tx);
	sm->sm_phys->smp_length = 0;
	sm->sm_phys->smp_alloc = 0;
}

uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	uint64_t object;
	int bonuslen;

	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
		bonuslen = sizeof (space_map_phys_t);
		ASSERT3U(bonuslen, <=, dmu_bonus_max());
	} else {
		bonuslen = SPACE_MAP_SIZE_V0;
	}

	object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
	    space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

	return (object);
}
		size += histogram[idx] *
		    entries_for_seg * 2 * sizeof (uint64_t);
	}

	/*
	 * Assume the worst case where we start with the padding at the end
	 * of the current block and we add an extra padding entry at the end
	 * of all subsequent blocks.
	 */
	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);

	return (size);
}

uint64_t
space_map_object(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_object : 0);
}

int64_t
space_map_allocated(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

uint64_t
space_map_length(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_length : 0);
}