3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35
36 /*
37 * Allow allocations to switch to gang blocks quickly. We do this to
38 * avoid having to load lots of space_maps in a given txg. There are,
39 * however, some cases where we want to avoid "fast" ganging and instead
40 * we want to do an exhaustive search of all metaslabs on this device.
41 * Currently we don't allow any gang, zil, or dump device related allocations
42 * to "fast" gang.
43 */
676 *cursor = 0;
677 return (metaslab_block_picker(t, cursor, size, align));
678 }
679
680 /*
681 * ==========================================================================
682 * The first-fit block allocator
683 * ==========================================================================
684 */
685 static uint64_t
686 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
687 {
688 /*
689 * Find the largest power of 2 block size that evenly divides the
690 * requested size. This is used to try to allocate blocks with similar
691 * alignment from the same area of the metaslab (i.e. same cursor
692 * bucket) but it does not guarantee that other allocations sizes
693 * may exist in the same region.
694 */
695 uint64_t align = size & -size;
696 uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
697 avl_tree_t *t = &msp->ms_tree->rt_root;
698
699 return (metaslab_block_picker(t, cursor, size, align));
700 }
701
/* ARGSUSED */
/*
 * Fragmentation predicate for the first-fit allocator.  First-fit makes
 * no attempt to measure fragmentation, so every metaslab is reported as
 * fragmented unconditionally; the msp argument is unused.
 */
static boolean_t
metaslab_ff_fragmented(metaslab_t *msp)
{
	return (B_TRUE);
}
708
/*
 * Operations vector for the first-fit block allocator: the allocation
 * routine and fragmentation predicate defined above.
 */
static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc,
	metaslab_ff_fragmented
};
713
714 /*
715 * ==========================================================================
716 * Dynamic block allocator -
717 * Uses the first-fit allocation scheme until space gets low and then
718 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
719 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
720 * ==========================================================================
721 */
722 static uint64_t
723 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
724 {
725 /*
726 * Find the largest power of 2 block size that evenly divides the
727 * requested size. This is used to try to allocate blocks with similar
728 * alignment from the same area of the metaslab (i.e. same cursor
729 * bucket) but it does not guarantee that other allocations sizes
730 * may exist in the same region.
731 */
732 uint64_t align = size & -size;
733 uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
734 range_tree_t *rt = msp->ms_tree;
735 avl_tree_t *t = &rt->rt_root;
736 uint64_t max_size = metaslab_block_maxsize(msp);
737 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
738
739 ASSERT(MUTEX_HELD(&msp->ms_lock));
740 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
741
742 if (max_size < size)
743 return (-1ULL);
744
745 /*
746 * If we're running low on space switch to using the size
747 * sorted AVL tree (best-fit).
748 */
749 if (max_size < metaslab_df_alloc_threshold ||
750 free_pct < metaslab_df_free_pct) {
751 t = &msp->ms_size_tree;
752 *cursor = 0;
753 }
829 * ==========================================================================
830 * New dynamic fit allocator -
831 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
832 * contiguous blocks. If no region is found then just use the largest segment
833 * that remains.
834 * ==========================================================================
835 */
836
837 /*
838 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
839 * to request from the allocator.
840 */
841 uint64_t metaslab_ndf_clump_shift = 4;
842
843 static uint64_t
844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
845 {
846 avl_tree_t *t = &msp->ms_tree->rt_root;
847 avl_index_t where;
848 range_seg_t *rs, rsearch;
849 uint64_t hbit = highbit(size);
850 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
851 uint64_t max_size = metaslab_block_maxsize(msp);
852
853 ASSERT(MUTEX_HELD(&msp->ms_lock));
854 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
855
856 if (max_size < size)
857 return (-1ULL);
858
859 rsearch.rs_start = *cursor;
860 rsearch.rs_end = *cursor + size;
861
862 rs = avl_find(t, &rsearch, &where);
863 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
864 t = &msp->ms_size_tree;
865
866 rsearch.rs_start = 0;
867 rsearch.rs_end = MIN(max_size,
868 1ULL << (hbit + metaslab_ndf_clump_shift));
869 rs = avl_find(t, &rsearch, &where);
1072 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073 * 2) multiply by the power of 2 exponent and the sm_shift value:
1074 * 16384 * 21 * 9 = 3096576
1075 * This value will be added to the weighting of the metaslab.
1076 */
1077 static uint64_t
1078 metaslab_weight_factor(metaslab_t *msp)
1079 {
1080 uint64_t factor = 0;
1081 uint64_t sectors;
1082 int i;
1083
1084 /*
1085 * A null space map means that the entire metaslab is free,
1086 * calculate a weight factor that spans the entire size of the
1087 * metaslab.
1088 */
1089 if (msp->ms_sm == NULL) {
1090 vdev_t *vd = msp->ms_group->mg_vd;
1091
1092 i = highbit(msp->ms_size) - 1;
1093 sectors = msp->ms_size >> vd->vdev_ashift;
1094 return (sectors * i * vd->vdev_ashift);
1095 }
1096
1097 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098 return (0);
1099
1100 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102 continue;
1103
1104 /*
1105 * Determine the number of sm_shift sectors in the region
1106 * indicated by the histogram. For example, given an
1107 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108 * that we're looking at an 8K region in the histogram
1109 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110 * number of sm_shift sectors (512 bytes in this example),
1111 * we would take 8192 / 512 = 16. Since the histogram
1112 * is offset by sm_shift we can simply use the value of
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35
36 /*
37 * Allow allocations to switch to gang blocks quickly. We do this to
38 * avoid having to load lots of space_maps in a given txg. There are,
39 * however, some cases where we want to avoid "fast" ganging and instead
40 * we want to do an exhaustive search of all metaslabs on this device.
41 * Currently we don't allow any gang, zil, or dump device related allocations
42 * to "fast" gang.
43 */
676 *cursor = 0;
677 return (metaslab_block_picker(t, cursor, size, align));
678 }
679
680 /*
681 * ==========================================================================
682 * The first-fit block allocator
683 * ==========================================================================
684 */
685 static uint64_t
686 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
687 {
688 /*
689 * Find the largest power of 2 block size that evenly divides the
690 * requested size. This is used to try to allocate blocks with similar
691 * alignment from the same area of the metaslab (i.e. same cursor
692 * bucket) but it does not guarantee that other allocations sizes
693 * may exist in the same region.
694 */
695 uint64_t align = size & -size;
696 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
697 avl_tree_t *t = &msp->ms_tree->rt_root;
698
699 return (metaslab_block_picker(t, cursor, size, align));
700 }
701
/* ARGSUSED */
/*
 * Fragmentation predicate for the first-fit allocator.  First-fit makes
 * no attempt to measure fragmentation, so every metaslab is reported as
 * fragmented unconditionally; the msp argument is unused.
 */
static boolean_t
metaslab_ff_fragmented(metaslab_t *msp)
{
	return (B_TRUE);
}
708
/*
 * Operations vector for the first-fit block allocator: the allocation
 * routine and fragmentation predicate defined above.
 */
static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc,
	metaslab_ff_fragmented
};
713
714 /*
715 * ==========================================================================
716 * Dynamic block allocator -
717 * Uses the first-fit allocation scheme until space gets low and then
718 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
719 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
720 * ==========================================================================
721 */
722 static uint64_t
723 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
724 {
725 /*
726 * Find the largest power of 2 block size that evenly divides the
727 * requested size. This is used to try to allocate blocks with similar
728 * alignment from the same area of the metaslab (i.e. same cursor
729 * bucket) but it does not guarantee that other allocations sizes
730 * may exist in the same region.
731 */
732 uint64_t align = size & -size;
733 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
734 range_tree_t *rt = msp->ms_tree;
735 avl_tree_t *t = &rt->rt_root;
736 uint64_t max_size = metaslab_block_maxsize(msp);
737 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
738
739 ASSERT(MUTEX_HELD(&msp->ms_lock));
740 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
741
742 if (max_size < size)
743 return (-1ULL);
744
745 /*
746 * If we're running low on space switch to using the size
747 * sorted AVL tree (best-fit).
748 */
749 if (max_size < metaslab_df_alloc_threshold ||
750 free_pct < metaslab_df_free_pct) {
751 t = &msp->ms_size_tree;
752 *cursor = 0;
753 }
829 * ==========================================================================
830 * New dynamic fit allocator -
831 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
832 * contiguous blocks. If no region is found then just use the largest segment
833 * that remains.
834 * ==========================================================================
835 */
836
837 /*
838 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
839 * to request from the allocator.
840 */
841 uint64_t metaslab_ndf_clump_shift = 4;
842
843 static uint64_t
844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
845 {
846 avl_tree_t *t = &msp->ms_tree->rt_root;
847 avl_index_t where;
848 range_seg_t *rs, rsearch;
849 uint64_t hbit = highbit64(size);
850 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
851 uint64_t max_size = metaslab_block_maxsize(msp);
852
853 ASSERT(MUTEX_HELD(&msp->ms_lock));
854 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
855
856 if (max_size < size)
857 return (-1ULL);
858
859 rsearch.rs_start = *cursor;
860 rsearch.rs_end = *cursor + size;
861
862 rs = avl_find(t, &rsearch, &where);
863 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
864 t = &msp->ms_size_tree;
865
866 rsearch.rs_start = 0;
867 rsearch.rs_end = MIN(max_size,
868 1ULL << (hbit + metaslab_ndf_clump_shift));
869 rs = avl_find(t, &rsearch, &where);
1072 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073 * 2) multiply by the power of 2 exponent and the sm_shift value:
1074 * 16384 * 21 * 9 = 3096576
1075 * This value will be added to the weighting of the metaslab.
1076 */
1077 static uint64_t
1078 metaslab_weight_factor(metaslab_t *msp)
1079 {
1080 uint64_t factor = 0;
1081 uint64_t sectors;
1082 int i;
1083
1084 /*
1085 * A null space map means that the entire metaslab is free,
1086 * calculate a weight factor that spans the entire size of the
1087 * metaslab.
1088 */
1089 if (msp->ms_sm == NULL) {
1090 vdev_t *vd = msp->ms_group->mg_vd;
1091
1092 i = highbit64(msp->ms_size) - 1;
1093 sectors = msp->ms_size >> vd->vdev_ashift;
1094 return (sectors * i * vd->vdev_ashift);
1095 }
1096
1097 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098 return (0);
1099
1100 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102 continue;
1103
1104 /*
1105 * Determine the number of sm_shift sectors in the region
1106 * indicated by the histogram. For example, given an
1107 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108 * that we're looking at an 8K region in the histogram
1109 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110 * number of sm_shift sectors (512 bytes in this example),
1111 * we would take 8192 / 512 = 16. Since the histogram
1112 * is offset by sm_shift we can simply use the value of
|