Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/space_map.h>
  31 #include <sys/metaslab_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/spa_impl.h>
  35 
  36 /*
  37  * Allow allocations to switch to gang blocks quickly. We do this to
  38  * avoid having to load lots of space_maps in a given txg. There are,
  39  * however, some cases where we want to avoid "fast" ganging and instead
  40  * we want to do an exhaustive search of all metaslabs on this device.
  41  * Currently we don't allow any gang, zil, or dump device related allocations
  42  * to "fast" gang.
  43  */


 676         *cursor = 0;
 677         return (metaslab_block_picker(t, cursor, size, align));
 678 }
 679 
 680 /*
 681  * ==========================================================================
 682  * The first-fit block allocator
 683  * ==========================================================================
 684  */
 685 static uint64_t
 686 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 687 {
 688         /*
 689          * Find the largest power of 2 block size that evenly divides the
 690          * requested size. This is used to try to allocate blocks with similar
 691          * alignment from the same area of the metaslab (i.e. same cursor
 692          * bucket) but it does not guarantee that other allocations sizes
 693          * may exist in the same region.
 694          */
 695         uint64_t align = size & -size;
 696         uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
 697         avl_tree_t *t = &msp->ms_tree->rt_root;
 698 
 699         return (metaslab_block_picker(t, cursor, size, align));
 700 }
 701 
 702 /* ARGSUSED */
 703 static boolean_t
 704 metaslab_ff_fragmented(metaslab_t *msp)
 705 {
 706         return (B_TRUE);
 707 }
 708 
 709 static metaslab_ops_t metaslab_ff_ops = {
 710         metaslab_ff_alloc,
 711         metaslab_ff_fragmented
 712 };
 713 
 714 /*
 715  * ==========================================================================
 716  * Dynamic block allocator -
 717  * Uses the first fit allocation scheme until space get low and then
 718  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 719  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 720  * ==========================================================================
 721  */
 722 static uint64_t
 723 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 724 {
 725         /*
 726          * Find the largest power of 2 block size that evenly divides the
 727          * requested size. This is used to try to allocate blocks with similar
 728          * alignment from the same area of the metaslab (i.e. same cursor
 729          * bucket) but it does not guarantee that other allocations sizes
 730          * may exist in the same region.
 731          */
 732         uint64_t align = size & -size;
 733         uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
 734         range_tree_t *rt = msp->ms_tree;
 735         avl_tree_t *t = &rt->rt_root;
 736         uint64_t max_size = metaslab_block_maxsize(msp);
 737         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 738 
 739         ASSERT(MUTEX_HELD(&msp->ms_lock));
 740         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 741 
 742         if (max_size < size)
 743                 return (-1ULL);
 744 
 745         /*
 746          * If we're running low on space switch to using the size
 747          * sorted AVL tree (best-fit).
 748          */
 749         if (max_size < metaslab_df_alloc_threshold ||
 750             free_pct < metaslab_df_free_pct) {
 751                 t = &msp->ms_size_tree;
 752                 *cursor = 0;
 753         }


 829  * ==========================================================================
 830  * New dynamic fit allocator -
 831  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 832  * contiguous blocks. If no region is found then just use the largest segment
 833  * that remains.
 834  * ==========================================================================
 835  */
 836 
 837 /*
 838  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 839  * to request from the allocator.
 840  */
 841 uint64_t metaslab_ndf_clump_shift = 4;
 842 
 843 static uint64_t
 844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 845 {
 846         avl_tree_t *t = &msp->ms_tree->rt_root;
 847         avl_index_t where;
 848         range_seg_t *rs, rsearch;
 849         uint64_t hbit = highbit(size);
 850         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 851         uint64_t max_size = metaslab_block_maxsize(msp);
 852 
 853         ASSERT(MUTEX_HELD(&msp->ms_lock));
 854         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 855 
 856         if (max_size < size)
 857                 return (-1ULL);
 858 
 859         rsearch.rs_start = *cursor;
 860         rsearch.rs_end = *cursor + size;
 861 
 862         rs = avl_find(t, &rsearch, &where);
 863         if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
 864                 t = &msp->ms_size_tree;
 865 
 866                 rsearch.rs_start = 0;
 867                 rsearch.rs_end = MIN(max_size,
 868                     1ULL << (hbit + metaslab_ndf_clump_shift));
 869                 rs = avl_find(t, &rsearch, &where);


1072  *      2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073  * 2) multiply by the power of 2 exponent and the sm_shift value:
1074  *      16384 * 21 * 9 = 3096576
1075  * This value will be added to the weighting of the metaslab.
1076  */
1077 static uint64_t
1078 metaslab_weight_factor(metaslab_t *msp)
1079 {
1080         uint64_t factor = 0;
1081         uint64_t sectors;
1082         int i;
1083 
1084         /*
1085          * A null space map means that the entire metaslab is free,
1086          * calculate a weight factor that spans the entire size of the
1087          * metaslab.
1088          */
1089         if (msp->ms_sm == NULL) {
1090                 vdev_t *vd = msp->ms_group->mg_vd;
1091 
1092                 i = highbit(msp->ms_size) - 1;
1093                 sectors = msp->ms_size >> vd->vdev_ashift;
1094                 return (sectors * i * vd->vdev_ashift);
1095         }
1096 
1097         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098                 return (0);
1099 
1100         for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102                         continue;
1103 
1104                 /*
1105                  * Determine the number of sm_shift sectors in the region
1106                  * indicated by the histogram. For example, given an
1107                  * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108                  * that we're looking at an 8K region in the histogram
1109                  * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110                  * number of sm_shift sectors (512 bytes in this example),
1111                  * we would take 8192 / 512 = 16. Since the histogram
1112                  * is offset by sm_shift we can simply use the value of




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/space_map.h>
  31 #include <sys/metaslab_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/spa_impl.h>
  35 
  36 /*
  37  * Allow allocations to switch to gang blocks quickly. We do this to
  38  * avoid having to load lots of space_maps in a given txg. There are,
  39  * however, some cases where we want to avoid "fast" ganging and instead
  40  * we want to do an exhaustive search of all metaslabs on this device.
  41  * Currently we don't allow any gang, zil, or dump device related allocations
  42  * to "fast" gang.
  43  */


 676         *cursor = 0;
 677         return (metaslab_block_picker(t, cursor, size, align));
 678 }
 679 
 680 /*
 681  * ==========================================================================
 682  * The first-fit block allocator
 683  * ==========================================================================
 684  */
 685 static uint64_t
 686 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 687 {
 688         /*
 689          * Find the largest power of 2 block size that evenly divides the
 690          * requested size. This is used to try to allocate blocks with similar
 691          * alignment from the same area of the metaslab (i.e. same cursor
 692          * bucket) but it does not guarantee that other allocations sizes
 693          * may exist in the same region.
 694          */
 695         uint64_t align = size & -size;
 696         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 697         avl_tree_t *t = &msp->ms_tree->rt_root;
 698 
 699         return (metaslab_block_picker(t, cursor, size, align));
 700 }
 701 
 702 /* ARGSUSED */
 703 static boolean_t
 704 metaslab_ff_fragmented(metaslab_t *msp)
 705 {
 706         return (B_TRUE);
 707 }
 708 
 709 static metaslab_ops_t metaslab_ff_ops = {
 710         metaslab_ff_alloc,
 711         metaslab_ff_fragmented
 712 };
 713 
 714 /*
 715  * ==========================================================================
 716  * Dynamic block allocator -
 717  * Uses the first fit allocation scheme until space get low and then
 718  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 719  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 720  * ==========================================================================
 721  */
 722 static uint64_t
 723 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 724 {
 725         /*
 726          * Find the largest power of 2 block size that evenly divides the
 727          * requested size. This is used to try to allocate blocks with similar
 728          * alignment from the same area of the metaslab (i.e. same cursor
 729          * bucket) but it does not guarantee that other allocations sizes
 730          * may exist in the same region.
 731          */
 732         uint64_t align = size & -size;
 733         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 734         range_tree_t *rt = msp->ms_tree;
 735         avl_tree_t *t = &rt->rt_root;
 736         uint64_t max_size = metaslab_block_maxsize(msp);
 737         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 738 
 739         ASSERT(MUTEX_HELD(&msp->ms_lock));
 740         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 741 
 742         if (max_size < size)
 743                 return (-1ULL);
 744 
 745         /*
 746          * If we're running low on space switch to using the size
 747          * sorted AVL tree (best-fit).
 748          */
 749         if (max_size < metaslab_df_alloc_threshold ||
 750             free_pct < metaslab_df_free_pct) {
 751                 t = &msp->ms_size_tree;
 752                 *cursor = 0;
 753         }


 829  * ==========================================================================
 830  * New dynamic fit allocator -
 831  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 832  * contiguous blocks. If no region is found then just use the largest segment
 833  * that remains.
 834  * ==========================================================================
 835  */
 836 
 837 /*
 838  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 839  * to request from the allocator.
 840  */
 841 uint64_t metaslab_ndf_clump_shift = 4;
 842 
 843 static uint64_t
 844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 845 {
 846         avl_tree_t *t = &msp->ms_tree->rt_root;
 847         avl_index_t where;
 848         range_seg_t *rs, rsearch;
 849         uint64_t hbit = highbit64(size);
 850         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 851         uint64_t max_size = metaslab_block_maxsize(msp);
 852 
 853         ASSERT(MUTEX_HELD(&msp->ms_lock));
 854         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 855 
 856         if (max_size < size)
 857                 return (-1ULL);
 858 
 859         rsearch.rs_start = *cursor;
 860         rsearch.rs_end = *cursor + size;
 861 
 862         rs = avl_find(t, &rsearch, &where);
 863         if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
 864                 t = &msp->ms_size_tree;
 865 
 866                 rsearch.rs_start = 0;
 867                 rsearch.rs_end = MIN(max_size,
 868                     1ULL << (hbit + metaslab_ndf_clump_shift));
 869                 rs = avl_find(t, &rsearch, &where);


1072  *      2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073  * 2) multiply by the power of 2 exponent and the sm_shift value:
1074  *      16384 * 21 * 9 = 3096576
1075  * This value will be added to the weighting of the metaslab.
1076  */
1077 static uint64_t
1078 metaslab_weight_factor(metaslab_t *msp)
1079 {
1080         uint64_t factor = 0;
1081         uint64_t sectors;
1082         int i;
1083 
1084         /*
1085          * A null space map means that the entire metaslab is free,
1086          * calculate a weight factor that spans the entire size of the
1087          * metaslab.
1088          */
1089         if (msp->ms_sm == NULL) {
1090                 vdev_t *vd = msp->ms_group->mg_vd;
1091 
1092                 i = highbit64(msp->ms_size) - 1;
1093                 sectors = msp->ms_size >> vd->vdev_ashift;
1094                 return (sectors * i * vd->vdev_ashift);
1095         }
1096 
1097         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098                 return (0);
1099 
1100         for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102                         continue;
1103 
1104                 /*
1105                  * Determine the number of sm_shift sectors in the region
1106                  * indicated by the histogram. For example, given an
1107                  * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108                  * that we're looking at an 8K region in the histogram
1109                  * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110                  * number of sm_shift sectors (512 bytes in this example),
1111                  * we would take 8192 / 512 = 16. Since the histogram
1112                  * is offset by sm_shift we can simply use the value of