10592 Wdiff usr/src/uts/common/fs/zfs/metaslab.c

Print this page

10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/metaslab.c
          +++ new/usr/src/uts/common/fs/zfs/metaslab.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   * Copyright (c) 2014 Integros [integros.com]
  26   26   * Copyright (c) 2017, Intel Corporation.
  27   27   */
  28   28  
  29   29  #include <sys/zfs_context.h>
  30   30  #include <sys/dmu.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/space_map.h>
  33   33  #include <sys/metaslab_impl.h>
  34   34  #include <sys/vdev_impl.h>
  35   35  #include <sys/zio.h>
  36   36  #include <sys/spa_impl.h>
  37   37  #include <sys/zfeature.h>
  38   38  #include <sys/vdev_indirect_mapping.h>
  39   39  #include <sys/zap.h>
  40   40  
  41   41  #define GANG_ALLOCATION(flags) \
  42   42          ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  43   43  
           /*
            * GANG_ALLOCATION() above evaluates to non-zero when either gang flag
            * (METASLAB_GANG_CHILD or METASLAB_GANG_HEADER) is set in 'flags'.
            *
            * metaslab_aliquot starts out at 512 KiB (512ULL << 10 bytes).
            * metaslab_force_ganging defaults to one byte more than
            * SPA_MAXBLOCKSIZE.
            */
  44   44  uint64_t metaslab_aliquot = 512ULL << 10;
  45   45  uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
  46   46  
  47   47  /*
  48   48   * Since we can touch multiple metaslabs (and their respective space maps)
  49   49   * with each transaction group, we benefit from having a smaller space map
  50   50   * block size since it allows us to issue more I/O operations scattered
  51   51   * around the disk.
  52   52   */
  53   53  int zfs_metaslab_sm_blksz = (1 << 12);
  54   54  
  55   55  /*
  56   56   * The in-core space map representation is more compact than its on-disk form.
  57   57   * The zfs_condense_pct determines how much more compact the in-core
  58   58   * space map representation must be before we compact it on-disk.
  59   59   * Values should be greater than or equal to 100.
  60   60   */
  61   61  int zfs_condense_pct = 200;
  62   62  
  63   63  /*
  64   64   * Condensing a metaslab is not guaranteed to actually reduce the amount of
  65   65   * space used on disk. In particular, a space map uses data in increments of
  66   66   * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
  67   67   * same number of blocks after condensing. Since the goal of condensing is to
  68   68   * reduce the number of IOPs required to read the space map, we only want to
  69   69   * condense when we can be sure we will reduce the number of blocks used by the
  70   70   * space map. Unfortunately, we cannot precisely compute whether or not this is
  71   71   * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  72   72   * we apply the following heuristic: do not condense a spacemap unless the
  73   73   * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  74   74   * blocks.
  75   75   */
  76   76  int zfs_metaslab_condense_block_threshold = 4;
  77   77  
  78   78  /*
  79   79   * The zfs_mg_noalloc_threshold defines which metaslab groups should
  80   80   * be eligible for allocation. The value is defined as a percentage of
  81   81   * free space. Metaslab groups that have more free space than
  82   82   * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  83   83   * a metaslab group's free space is less than or equal to the
  84   84   * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  85   85   * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  86   86   * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  87   87   * groups are allowed to accept allocations. Gang blocks are always
  88   88   * eligible to allocate on any metaslab group. The default value of 0 means
  89   89   * no metaslab group will be excluded based on this criterion.
  90   90   */
  91   91  int zfs_mg_noalloc_threshold = 0;
  92   92  
  93   93  /*
  94   94   * Metaslab groups are considered eligible for allocations if their
  95   95   * fragmentation metric (measured as a percentage) is less than or equal to
  96   96   * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  97   97   * then it will be skipped unless all metaslab groups within the metaslab
  98   98   * class have also crossed this threshold.
  99   99   */
 100  100  int zfs_mg_fragmentation_threshold = 85;
 101  101  
 102  102  /*
 103  103   * Allow metaslabs to keep their active state as long as their fragmentation
 104  104   * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 105  105   * active metaslab that exceeds this threshold will no longer keep its active
 106  106   * status allowing better metaslabs to be selected.
 107  107   */
 108  108  int zfs_metaslab_fragmentation_threshold = 70;
 109  109  
 110  110  /*
 111  111   * When set will load all metaslabs when pool is first opened.
 112  112   */
 113  113  int metaslab_debug_load = 0;
 114  114  
 115  115  /*
 116  116   * When set will prevent metaslabs from being unloaded.
 117  117   */
 118  118  int metaslab_debug_unload = 0;
 119  119  
 120  120  /*
 121  121   * Minimum size which forces the dynamic allocator to change
 122  122   * its allocation strategy.  Once the space map cannot satisfy
 123  123   * an allocation of this size then it switches to using a more
 124  124   * aggressive strategy (i.e. search by size rather than offset).
 125  125   */
 126  126  uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 127  127  
 128  128  /*
 129  129   * The minimum free space, in percent, which must be available
 130  130   * in a space map to continue allocations in a first-fit fashion.
 131  131   * Once the space map's free space drops below this level we dynamically
 132  132   * switch to using best-fit allocations.
 133  133   */
 134  134  int metaslab_df_free_pct = 4;
 135  135  
 136  136  /*
 137  137   * A metaslab is considered "free" if it contains a contiguous
 138  138   * segment which is greater than metaslab_min_alloc_size.
 139  139   */
 140  140  uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 141  141  
 142  142  /*
 143  143   * Percentage of all cpus that can be used by the metaslab taskq.
 144  144   */
 145  145  int metaslab_load_pct = 50;
 146  146  
 147  147  /*
 148  148   * Determines how many txgs a metaslab may remain loaded without having any
 149  149   * allocations from it. As long as a metaslab continues to be used we will
 150  150   * keep it loaded.
 151  151   */
 152  152  int metaslab_unload_delay = TXG_SIZE * 2;
 153  153  
 154  154  /*
 155  155   * Max number of metaslabs per group to preload.
 156  156   */
 157  157  int metaslab_preload_limit = SPA_DVAS_PER_BP;
 158  158  
 159  159  /*
 160  160   * Enable/disable preloading of metaslab.
 161  161   */
 162  162  boolean_t metaslab_preload_enabled = B_TRUE;
 163  163  
 164  164  /*
 165  165   * Enable/disable fragmentation weighting on metaslabs.
 166  166   */
 167  167  boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
 168  168  
 169  169  /*
 170  170   * Enable/disable lba weighting (i.e. outer tracks are given preference).
 171  171   */
 172  172  boolean_t metaslab_lba_weighting_enabled = B_TRUE;
 173  173  
 174  174  /*
 175  175   * Enable/disable metaslab group biasing.
 176  176   */
 177  177  boolean_t metaslab_bias_enabled = B_TRUE;
 178  178  
 179  179  /*
 180  180   * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 181  181   */
 182  182  boolean_t zfs_remap_blkptr_enable = B_TRUE;
 183  183  
 184  184  /*
 185  185   * Enable/disable segment-based metaslab selection.
 186  186   */
 187  187  boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
 188  188  
 189  189  /*
 190  190   * When using segment-based metaslab selection, we will continue
 191  191   * allocating from the active metaslab until we have exhausted
 192  192   * zfs_metaslab_switch_threshold of its buckets.
 193  193   */
 194  194  int zfs_metaslab_switch_threshold = 2;
 195  195  
 196  196  /*
 197  197   * Internal switch to enable/disable the metaslab allocation tracing
 198  198   * facility.
 199  199   */
 200  200  boolean_t metaslab_trace_enabled = B_TRUE;
 201  201  
 202  202  /*
 203  203   * Maximum entries that the metaslab allocation tracing facility will keep
 204  204   * in a given list when running in non-debug mode. We limit the number
 205  205   * of entries in non-debug mode to prevent us from using up too much memory.
 206  206   * The limit should be sufficiently large that we don't expect any allocation
 207  207   * to ever exceed this value. In debug mode, the system will panic if this
 208  208   * limit is ever reached allowing for further investigation.
 209  209   */
 210  210  uint64_t metaslab_trace_max_entries = 5000;
 211  211  
 212  212  static uint64_t metaslab_weight(metaslab_t *);
 213  213  static void metaslab_set_fragmentation(metaslab_t *);
 214  214  static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 215  215  static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 216  216  static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 217  217  static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 218  218  
 219  219  kmem_cache_t *metaslab_alloc_trace_cache;
 220  220  
 221  221  /*
 222  222   * ==========================================================================
 223  223   * Metaslab classes
 224  224   * ==========================================================================
 225  225   */
 226  226  metaslab_class_t *
 227  227  metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 228  228  {
                   /*
                    * Allocate and initialize a metaslab class for 'spa' that uses
                    * the allocator operations 'ops'. The class starts with an empty
                    * rotor (no metaslab groups). Returns the new class, which is
                    * torn down by metaslab_class_destroy().
                    */
 229  229          metaslab_class_t *mc;
 230  230  
 231  231          mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 232  232  
 233  233          mc->mc_spa = spa;
 234  234          mc->mc_rotor = NULL;
 235  235          mc->mc_ops = ops;
 236  236          mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
                   /*
                    * Both per-allocator arrays hold spa->spa_alloc_count entries:
                    * one refcount and one uint64_t max-slot value per allocator.
                    */
 237  237          mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
 238  238              sizeof (zfs_refcount_t), KM_SLEEP);
 239  239          mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
 240  240              sizeof (uint64_t), KM_SLEEP);
 241  241          for (int i = 0; i < spa->spa_alloc_count; i++)
 242  242                  zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
 243  243  
 244  244          return (mc);
 245  245  }
 246  246  
 247  247  void
 248  248  metaslab_class_destroy(metaslab_class_t *mc)
 249  249  {
                   /*
                    * Free a class built by metaslab_class_create(). The class must
                    * already be empty: no groups on the rotor and all four space
                    * counters back at zero (checked by the ASSERTs below).
                    */
 250  250          ASSERT(mc->mc_rotor == NULL);
 251  251          ASSERT(mc->mc_alloc == 0);
 252  252          ASSERT(mc->mc_deferred == 0);
 253  253          ASSERT(mc->mc_space == 0);
 254  254          ASSERT(mc->mc_dspace == 0);
 255  255  
                   /*
                    * Release the per-allocator state using the same
                    * spa_alloc_count-based sizes that were used to allocate it.
                    */
 256  256          for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
 257  257                  zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
 258  258          kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
 259  259              sizeof (zfs_refcount_t));
 260  260          kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
 261  261              sizeof (uint64_t));
 262  262          mutex_destroy(&mc->mc_lock);
 263  263          kmem_free(mc, sizeof (metaslab_class_t));
 264  264  }
 265  265  
 266  266  int
 267  267  metaslab_class_validate(metaslab_class_t *mc)
 268  268  {
                   /*
                    * Sanity-check every metaslab group on the class's rotor.
                    * All checks are ASSERTs; the function itself always returns 0.
                    */
 269  269          metaslab_group_t *mg;
 270  270          vdev_t *vd;
 271  271  
 272  272          /*
 273  273           * Must hold one of the spa_config locks.
 274  274           */
 275  275          ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 276  276              spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 277  277  
                   /* An empty rotor means there are no groups to check. */
 278  278          if ((mg = mc->mc_rotor) == NULL)
 279  279                  return (0);
 280  280  
                   /*
                    * Follow mg_next until we arrive back at the rotor. Each group
                    * must belong to this class and hang off a top-level vdev that
                    * is not a hole.
                    */
 281  281          do {
 282  282                  vd = mg->mg_vd;
 283  283                  ASSERT(vd->vdev_mg != NULL);
 284  284                  ASSERT3P(vd->vdev_top, ==, vd);
 285  285                  ASSERT3P(mg->mg_class, ==, mc);
 286  286                  ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 287  287          } while ((mg = mg->mg_next) != mc->mc_rotor);
 288  288  
 289  289          return (0);
 290  290  }
 291  291  
 292  292  static void
 293  293  metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 294  294      int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 295  295  {
                   /*
                    * Apply the four signed deltas to the class's space counters.
                    * Each counter is updated with its own atomic add; mc_lock is
                    * not taken here.
                    */
 296  296          atomic_add_64(&mc->mc_alloc, alloc_delta);
 297  297          atomic_add_64(&mc->mc_deferred, defer_delta);
 298  298          atomic_add_64(&mc->mc_space, space_delta);
 299  299          atomic_add_64(&mc->mc_dspace, dspace_delta);
 300  300  }
 301  301  
 302  302  uint64_t
 303  303  metaslab_class_get_alloc(metaslab_class_t *mc)
 304  304  {
                   /* Return the class's mc_alloc counter (plain read, no lock). */
 305  305          return (mc->mc_alloc);
 306  306  }
 307  307  
 308  308  uint64_t
 309  309  metaslab_class_get_deferred(metaslab_class_t *mc)
 310  310  {
                   /* Return the class's mc_deferred counter (plain read, no lock). */
 311  311          return (mc->mc_deferred);
 312  312  }
 313  313  
 314  314  uint64_t
 315  315  metaslab_class_get_space(metaslab_class_t *mc)
 316  316  {
                   /* Return the class's mc_space counter (plain read, no lock). */
 317  317          return (mc->mc_space);
 318  318  }
 319  319  
 320  320  uint64_t
 321  321  metaslab_class_get_dspace(metaslab_class_t *mc)
 322  322  {
                   /*
                    * Return mc_dspace when spa_deflate() is true for the pool;
                    * otherwise fall back to mc_space.
                    */
 323  323          return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 324  324  }
 325  325  
 326  326  void
 327  327  metaslab_class_histogram_verify(metaslab_class_t *mc)
 328  328  {
                   /*
                    * Debug check: recompute the class histogram by summing the
                    * histograms of all metaslab groups in this class and VERIFY
                    * that it matches mc->mc_histogram bucket for bucket. Does
                    * nothing unless ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags.
                    */
 329  329          spa_t *spa = mc->mc_spa;
 330  330          vdev_t *rvd = spa->spa_root_vdev;
 331  331          uint64_t *mc_hist;
 332  332          int i;
 333  333  
 334  334          if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 335  335                  return;
 336  336  
                   /* Zero-filled scratch histogram; freed before returning. */
 337  337          mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 338  338              KM_SLEEP);
 339  339  
 340  340          for (int c = 0; c < rvd->vdev_children; c++) {
 341  341                  vdev_t *tvd = rvd->vdev_child[c];
 342  342                  metaslab_group_t *mg = tvd->vdev_mg;
 343  343  
 344  344                  /*
 345  345                   * Skip any holes, uninitialized top-levels, or
 346  346                   * vdevs that are not in this metaslab class.
 347  347                   */
 348  348                  if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 349  349                      mg->mg_class != mc) {
 350  350                          continue;
 351  351                  }
 352  352  
 353  353                  for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 354  354                          mc_hist[i] += mg->mg_histogram[i];
 355  355          }
 356  356  
                   /* VERIFY (not ASSERT): a mismatch is fatal whenever the flag is set. */
 357  357          for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 358  358                  VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 359  359  
 360  360          kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 361  361  }
 362  362  
 363  363  /*
 364  364   * Calculate the metaslab class's fragmentation metric. The metric
 365  365   * is weighted based on the space contribution of each metaslab group.
 366  366   * The return value will be a number between 0 and 100 (inclusive), or
 367  367   * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 368  368   * zfs_frag_table for more information about the metric.
 369  369   */
 370  370  uint64_t
 371  371  metaslab_class_fragmentation(metaslab_class_t *mc)
 372  372  {
 373  373          vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 374  374          uint64_t fragmentation = 0;
 375  375  
                   /*
                    * SCL_VDEV is held as reader for the whole walk; both return
                    * paths below drop it.
                    */
 376  376          spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 377  377  
 378  378          for (int c = 0; c < rvd->vdev_children; c++) {
 379  379                  vdev_t *tvd = rvd->vdev_child[c];
 380  380                  metaslab_group_t *mg = tvd->vdev_mg;
 381  381  
 382  382                  /*
 383  383                   * Skip any holes, uninitialized top-levels,
 384  384                   * or vdevs that are not in this metaslab class.
 385  385                   */
 386  386                  if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 387  387                      mg->mg_class != mc) {
 388  388                          continue;
 389  389                  }
 390  390  
 391  391                  /*
 392  392                   * If a metaslab group does not contain a fragmentation
 393  393                   * metric then just bail out.
 394  394                   */
 395  395                  if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 396  396                          spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 397  397                          return (ZFS_FRAG_INVALID);
 398  398                  }
 399  399  
 400  400                  /*
 401  401                   * Determine how much this metaslab_group is contributing
 402  402                   * to the overall pool fragmentation metric.
 403  403                   */
 404  404                  fragmentation += mg->mg_fragmentation *
 405  405                      metaslab_group_get_space(mg);
 406  406          }
                   /*
                    * Turn the space-weighted sum into a weighted average.
                    * NOTE(review): there is no guard here against
                    * metaslab_class_get_space(mc) being 0 -- confirm that callers
                    * only invoke this on a class that has space.
                    */
 407  407          fragmentation /= metaslab_class_get_space(mc);
 408  408  
 409  409          ASSERT3U(fragmentation, <=, 100);
 410  410          spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 411  411          return (fragmentation);
 412  412  }
 413  413  
 414  414  /*
 415  415   * Calculate the amount of expandable space that is available in
 416  416   * this metaslab class. If a device is expanded then its expandable
 417  417   * space will be the amount of allocatable space that is currently not
 418  418   * part of this metaslab class.
 419  419   */
 420  420  uint64_t
 421  421  metaslab_class_expandable_space(metaslab_class_t *mc)
 422  422  {
 423  423          vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 424  424          uint64_t space = 0;
 425  425  
                   /* SCL_VDEV is held as reader while the top-level vdevs are walked. */
 426  426          spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 427  427          for (int c = 0; c < rvd->vdev_children; c++) {
 428  428                  uint64_t tspace;
 429  429                  vdev_t *tvd = rvd->vdev_child[c];
 430  430                  metaslab_group_t *mg = tvd->vdev_mg;
 431  431  
                           /*
                            * Same filter as the other per-class walks: skip holes,
                            * uninitialized top-levels and vdevs of other classes.
                            */
 432  432                  if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 433  433                      mg->mg_class != mc) {
 434  434                          continue;
 435  435                  }
 436  436  
 437  437                  /*
 438  438                   * Calculate if we have enough space to add additional
 439  439                   * metaslabs. We report the expandable space in terms
 440  440                   * of the metaslab size since that's the unit of expansion.
 441  441                   * Adjust by efi system partition size.
 442  442                   */
                           /*
                            * NOTE(review): unsigned subtraction; assumes
                            * vdev_max_asize >= vdev_asize -- confirm.
                            */
 443  443                  tspace = tvd->vdev_max_asize - tvd->vdev_asize;
                           /*
                            * Only subtract spa_bootsize when tspace is strictly
                            * larger, so the subtraction cannot wrap.
                            */
 444  444                  if (tspace > mc->mc_spa->spa_bootsize) {
 445  445                          tspace -= mc->mc_spa->spa_bootsize;
 446  446                  }
                           /* Round down to a whole number of metaslabs. */
 447  447                  space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
 448  448          }
 449  449          spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 450  450          return (space);
 451  451  }
 452  452  
 453  453  static int
 454  454  metaslab_compare(const void *x1, const void *x2)
 455  455  {
                   /*
                    * Comparator passed to avl_create() for a group's
                    * mg_metaslab_tree. Orders metaslabs by, in turn:
                    *   1. activation state (inactive, then primary, then secondary),
                    *   2. weight, heaviest first,
                    *   3. starting offset, lowest first.
                    * Returns -1, 0 or 1; 0 is only expected when both arguments are
                    * the same metaslab.
                    */
 456  456          const metaslab_t *m1 = x1;
 457  457          const metaslab_t *m2 = x2;
 458  458  
                   /*
                    * sortN: 0 = not assigned to an allocator (ms_allocator == -1),
                    * 1 = active primary, 2 = active secondary.
                    */
 459  459          int sort1 = 0;
 460  460          int sort2 = 0;
 461  461          if (m1->ms_allocator != -1 && m1->ms_primary)
 462  462                  sort1 = 1;
 463  463          else if (m1->ms_allocator != -1 && !m1->ms_primary)
 464  464                  sort1 = 2;
 465  465          if (m2->ms_allocator != -1 && m2->ms_primary)
 466  466                  sort2 = 1;
 467  467          else if (m2->ms_allocator != -1 && !m2->ms_primary)
 468  468                  sort2 = 2;
 469  469  
 470  470          /*
 471  471           * Sort inactive metaslabs first, then primaries, then secondaries. When
 472  472           * selecting a metaslab to allocate from, an allocator first tries its
 473  473           * primary, then secondary active metaslab. If it doesn't have active
 474  474           * metaslabs, or can't allocate from them, it searches for an inactive
 475  475           * metaslab to activate. If it can't find a suitable one, it will steal
 476  476           * a primary or secondary metaslab from another allocator.
 477  477           */
 478  478          if (sort1 < sort2)
 479  479                  return (-1);
 480  480          if (sort1 > sort2)
 481  481                  return (1);
 482  482  
                   /* Deliberately inverted: the larger weight sorts earlier. */
 483  483          if (m1->ms_weight < m2->ms_weight)
 484  484                  return (1);
 485  485          if (m1->ms_weight > m2->ms_weight)
 486  486                  return (-1);
 487  487  
 488  488          /*
 489  489           * If the weights are identical, use the offset to force uniqueness.
 490  490           */

↓ open down ↓

490 lines elided

↑ open up ↑

 491  491          if (m1->ms_start < m2->ms_start)
 492  492                  return (-1);
 493  493          if (m1->ms_start > m2->ms_start)
 494  494                  return (1);
 495  495  
                   /* Same state, weight and offset: must be the very same metaslab. */
 496  496          ASSERT3P(m1, ==, m2);
 497  497  
 498  498          return (0);
 499  499  }
 500  500  
      501 +uint64_t
      502 +metaslab_allocated_space(metaslab_t *msp)
      503 +{
                   /* Return the metaslab's ms_allocated_space field as-is. */
      504 +        return (msp->ms_allocated_space);
      505 +}
      506 +
 501  507  /*
 502  508   * Verify that the space accounting on disk matches the in-core range_trees.
 503  509   */
 504      -void
      510 +static void
 505  511  metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 506  512  {
 507  513          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 508      -        uint64_t allocated = 0;
      514 +        uint64_t allocating = 0;
 509  515          uint64_t sm_free_space, msp_free_space;
 510  516  
                   /* Caller must hold ms_lock and must not be mid-condense. */
 511  517          ASSERT(MUTEX_HELD(&msp->ms_lock));
      518 +        ASSERT(!msp->ms_condensing);
 512  519  
                   /* Nothing is checked unless ZFS_DEBUG_METASLAB_VERIFY is set. */
 513  520          if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 514  521                  return;
 515  522  
 516  523          /*
 517  524           * We can only verify the metaslab space when we're called
 518      -         * from syncing context with a loaded metaslab that has an allocated
 519      -         * space map. Calling this in non-syncing context does not
 520      -         * provide a consistent view of the metaslab since we're performing
 521      -         * allocations in the future.
      525 +         * from syncing context with a loaded metaslab that has an
      526 +         * allocated space map. Calling this in non-syncing context
      527 +         * does not provide a consistent view of the metaslab since
      528 +         * we're performing allocations in the future.
 522  529           */
 523  530          if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 524  531              !msp->ms_loaded)
 525  532                  return;
 526  533  
 527      -        sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 528      -            space_map_alloc_delta(msp->ms_sm);
      534 +        /*
      535 +         * Even though the smp_alloc field can get negative (e.g.
      536 +         * see vdev_checkpoint_sm), that should never be the case
      537 +         * when it comes to a metaslab's space map.
      538 +         */
      539 +        ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 529  540  
                   /* Free space according to the metaslab's allocated-space total. */
      541 +        sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
      542 +
 530  543          /*
 531      -         * Account for future allocations since we would have already
 532      -         * deducted that space from the ms_freetree.
      544 +         * Account for future allocations since we would have
      545 +         * already deducted that space from the ms_allocatable.
 533  546           */
 534  547          for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 535      -                allocated +=
      548 +                allocating +=
 536  549                      range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 537  550          }
 538  551  
                   /* The cached defer total must match the two defer trees. */
 539      -        msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
      552 +        ASSERT3U(msp->ms_deferspace, ==,
      553 +            range_tree_space(msp->ms_defer[0]) +
      554 +            range_tree_space(msp->ms_defer[1]));
      555 +
                   /*
                    * Free space according to the in-core trees: allocatable, plus
                    * in-flight allocations, plus deferred frees, plus ms_freed.
                    */
      556 +        msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
 540  557              msp->ms_deferspace + range_tree_space(msp->ms_freed);
 541  558  
                   /* VERIFY (not ASSERT): a mismatch is fatal whenever the flag is set. */
 542  559          VERIFY3U(sm_free_space, ==, msp_free_space);
 543  560  }
 544  561  
 545  562  /*
 546  563   * ==========================================================================
 547  564   * Metaslab groups
 548  565   * ==========================================================================
 549  566   */

 550  567  /*
 551  568   * Update the allocatable flag and the metaslab group's capacity.
 552  569   * The allocatable flag is set to true if the group is active, its free
 553  570   * capacity is above zfs_mg_noalloc_threshold, and its fragmentation is
 554  571   * unknown or at most zfs_mg_fragmentation_threshold. If a metaslab group
 555  572   * transitions from allocatable to non-allocatable or vice versa then the
 556  573   * metaslab group's class is updated to reflect the transition.
 557  574   */
 558  575  static void
 559  576  metaslab_group_alloc_update(metaslab_group_t *mg)
 560  577  {
 561  578          vdev_t *vd = mg->mg_vd;
 562  579          metaslab_class_t *mc = mg->mg_class;
 563  580          vdev_stat_t *vs = &vd->vdev_stat;
 564  581          boolean_t was_allocatable;
 565  582          boolean_t was_initialized;
 566  583  
                   /* Only valid on a top-level vdev, with SCL_ALLOC held as reader. */
 567  584          ASSERT(vd == vd->vdev_top);
 568  585          ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 569  586              SCL_ALLOC);
 570  587  
                   /*
                    * Remember the previous state so that only real transitions
                    * adjust the class-wide counters below.
                    */
 571  588          mutex_enter(&mg->mg_lock);
 572  589          was_allocatable = mg->mg_allocatable;
 573  590          was_initialized = mg->mg_initialized;
 574  591  
                   /*
                    * Free space as a percentage of total space. The + 1 keeps the
                    * divisor non-zero when vs_space is 0.
                    */
 575  592          mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 576  593              (vs->vs_space + 1);
 577  594  
                   /* mc_lock nests inside mg_lock for the class counter updates. */
 578  595          mutex_enter(&mc->mc_lock);
 579  596  
 580  597          /*
 581  598           * If the metaslab group was just added then it won't
 582  599           * have any space until we finish syncing out this txg.
 583  600           * At that point we will consider it initialized and available
 584  601           * for allocations.  We also don't consider non-activated
 585  602           * metaslab groups (e.g. vdevs that are in the middle of being removed)
 586  603           * to be initialized, because they can't be used for allocation.
 587  604           */
 588  605          mg->mg_initialized = metaslab_group_initialized(mg);
 589  606          if (!was_initialized && mg->mg_initialized) {
 590  607                  mc->mc_groups++;
 591  608          } else if (was_initialized && !mg->mg_initialized) {
 592  609                  ASSERT3U(mc->mc_groups, >, 0);
 593  610                  mc->mc_groups--;
 594  611          }
 595  612          if (mg->mg_initialized)
 596  613                  mg->mg_no_free_space = B_FALSE;
 597  614  
 598  615          /*
 599  616           * A metaslab group is considered allocatable if it has plenty
 600  617           * of free space or is not heavily fragmented. We only take
 601  618           * fragmentation into account if the metaslab group has a valid
 602  619           * fragmentation metric (i.e. a value between 0 and 100).
 603  620           */
                   /*
                    * Note that the expression below requires all three conditions:
                    * an activated group, free capacity strictly above the
                    * threshold, and fragmentation that is either invalid (unknown)
                    * or within the threshold.
                    */
 604  621          mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 605  622              mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 606  623              (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 607  624              mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 608  625  
 609  626          /*
 610  627           * The mc_alloc_groups maintains a count of the number of
 611  628           * groups in this metaslab class that are still above the
 612  629           * zfs_mg_noalloc_threshold. This is used by the allocating
 613  630           * threads to determine if they should avoid allocations to
 614  631           * a given group. The allocator will avoid allocations to a group
 615  632           * if that group has reached or is below the zfs_mg_noalloc_threshold
 616  633           * and there are still other groups that are above the threshold.
 617  634           * When a group transitions from allocatable to non-allocatable or
 618  635           * vice versa we update the metaslab class to reflect that change.
 619  636           * When the mc_alloc_groups value drops to 0 that means that all
 620  637           * groups have reached the zfs_mg_noalloc_threshold making all groups
 621  638           * eligible for allocations. This effectively means that all devices
 622  639           * are balanced again.
 623  640           */
 624  641          if (was_allocatable && !mg->mg_allocatable)
 625  642                  mc->mc_alloc_groups--;
 626  643          else if (!was_allocatable && mg->mg_allocatable)
 627  644                  mc->mc_alloc_groups++;
 628  645          mutex_exit(&mc->mc_lock);
 629  646  
 630  647          mutex_exit(&mg->mg_lock);
 631  648  }
 632  649  
 633  650  metaslab_group_t *
 634  651  metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 635  652  {
                   /*
                    * Allocate and initialize a metaslab group for vdev 'vd' in class
                    * 'mc', with per-allocator state for 'allocators' allocators.
                    * The new group is not yet activated (activation count 0), is
                    * marked uninitialized, and is flagged as having no free space.
                    * Returns the new group.
                    */
 636  653          metaslab_group_t *mg;
 637  654  
 638  655          mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 639  656          mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 640  657          mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 641  658          cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
                   /* One primary and one secondary metaslab pointer per allocator. */
 642  659          mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 643  660              KM_SLEEP);
 644  661          mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 645  662              KM_SLEEP);
                   /* The group's metaslabs are kept in an AVL tree ordered by metaslab_compare(). */
 646  663          avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 647  664              sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 648  665          mg->mg_vd = vd;
 649  666          mg->mg_class = mc;
 650  667          mg->mg_activation_count = 0;
 651  668          mg->mg_initialized = B_FALSE;
 652  669          mg->mg_no_free_space = B_TRUE;
 653  670          mg->mg_allocators = allocators;
 654  671  
                   /* Per-allocator queue-depth refcounts and their current maxima. */
 655  672          mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
 656  673              sizeof (zfs_refcount_t), KM_SLEEP);
 657  674          mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
 658  675              sizeof (uint64_t), KM_SLEEP);
 659  676          for (int i = 0; i < allocators; i++) {
 660  677                  zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
 661  678                  mg->mg_cur_max_alloc_queue_depth[i] = 0;
 662  679          }
 663  680  
                   /*
                    * With TASKQ_THREADS_CPU_PCT the thread-count argument
                    * (metaslab_load_pct) is a percentage of CPUs, not an absolute
                    * number of threads.
                    */
 664  681          mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 665  682              minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 666  683  
 667  684          return (mg);
 668  685  }
 669  686  
 670  687  void
 671  688  metaslab_group_destroy(metaslab_group_t *mg)
 672  689  {
 673  690          ASSERT(mg->mg_prev == NULL);
 674  691          ASSERT(mg->mg_next == NULL);
 675  692          /*
 676  693           * We may have gone below zero with the activation count
 677  694           * either because we never activated in the first place or
 678  695           * because we're done, and possibly removing the vdev.
 679  696           */
 680  697          ASSERT(mg->mg_activation_count <= 0);
 681  698  
 682  699          taskq_destroy(mg->mg_taskq);
 683  700          avl_destroy(&mg->mg_metaslab_tree);
 684  701          kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
 685  702          kmem_free(mg->mg_secondaries, mg->mg_allocators *
 686  703              sizeof (metaslab_t *));
 687  704          mutex_destroy(&mg->mg_lock);
 688  705          mutex_destroy(&mg->mg_ms_initialize_lock);
 689  706          cv_destroy(&mg->mg_ms_initialize_cv);
 690  707  
 691  708          for (int i = 0; i < mg->mg_allocators; i++) {
 692  709                  zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
 693  710                  mg->mg_cur_max_alloc_queue_depth[i] = 0;
 694  711          }
 695  712          kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
 696  713              sizeof (zfs_refcount_t));
 697  714          kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
 698  715              sizeof (uint64_t));
 699  716  
 700  717          kmem_free(mg, sizeof (metaslab_group_t));
 701  718  }
 702  719  
 703  720  void
 704  721  metaslab_group_activate(metaslab_group_t *mg)
 705  722  {
 706  723          metaslab_class_t *mc = mg->mg_class;
 707  724          metaslab_group_t *mgprev, *mgnext;
 708  725  
 709  726          ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 710  727  
 711  728          ASSERT(mc->mc_rotor != mg);
 712  729          ASSERT(mg->mg_prev == NULL);
 713  730          ASSERT(mg->mg_next == NULL);
 714  731          ASSERT(mg->mg_activation_count <= 0);
 715  732  
 716  733          if (++mg->mg_activation_count <= 0)
 717  734                  return;
 718  735  
 719  736          mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 720  737          metaslab_group_alloc_update(mg);
 721  738  
 722  739          if ((mgprev = mc->mc_rotor) == NULL) {
 723  740                  mg->mg_prev = mg;
 724  741                  mg->mg_next = mg;
 725  742          } else {
 726  743                  mgnext = mgprev->mg_next;
 727  744                  mg->mg_prev = mgprev;
 728  745                  mg->mg_next = mgnext;
 729  746                  mgprev->mg_next = mg;
 730  747                  mgnext->mg_prev = mg;
 731  748          }
 732  749          mc->mc_rotor = mg;
 733  750  }
 734  751  
 735  752  /*
 736  753   * Passivate a metaslab group and remove it from the allocation rotor.
 737  754   * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 738  755   * a metaslab group. This function will momentarily drop spa_config_locks
 739  756   * that are lower than the SCL_ALLOC lock (see comment below).
 740  757   */
 741  758  void
 742  759  metaslab_group_passivate(metaslab_group_t *mg)
 743  760  {
 744  761          metaslab_class_t *mc = mg->mg_class;
 745  762          spa_t *spa = mc->mc_spa;
 746  763          metaslab_group_t *mgprev, *mgnext;
 747  764          int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 748  765  
 749  766          ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 750  767              (SCL_ALLOC | SCL_ZIO));
 751  768  
 752  769          if (--mg->mg_activation_count != 0) {
 753  770                  ASSERT(mc->mc_rotor != mg);
 754  771                  ASSERT(mg->mg_prev == NULL);
 755  772                  ASSERT(mg->mg_next == NULL);
 756  773                  ASSERT(mg->mg_activation_count < 0);
 757  774                  return;
 758  775          }
 759  776  
 760  777          /*
 761  778           * The spa_config_lock is an array of rwlocks, ordered as
 762  779           * follows (from highest to lowest):
 763  780           *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 764  781           *      SCL_ZIO > SCL_FREE > SCL_VDEV
 765  782           * (For more information about the spa_config_lock see spa_misc.c)
 766  783           * The higher the lock, the broader its coverage. When we passivate
 767  784           * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 768  785           * config locks. However, the metaslab group's taskq might be trying
 769  786           * to preload metaslabs so we must drop the SCL_ZIO lock and any
 770  787           * lower locks to allow the I/O to complete. At a minimum,
 771  788           * we continue to hold the SCL_ALLOC lock, which prevents any future
 772  789           * allocations from taking place and any changes to the vdev tree.
 773  790           */
 774  791          spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 775  792          taskq_wait(mg->mg_taskq);
 776  793          spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 777  794          metaslab_group_alloc_update(mg);
 778  795          for (int i = 0; i < mg->mg_allocators; i++) {
 779  796                  metaslab_t *msp = mg->mg_primaries[i];
 780  797                  if (msp != NULL) {
 781  798                          mutex_enter(&msp->ms_lock);
 782  799                          metaslab_passivate(msp,
 783  800                              metaslab_weight_from_range_tree(msp));
 784  801                          mutex_exit(&msp->ms_lock);
 785  802                  }
 786  803                  msp = mg->mg_secondaries[i];
 787  804                  if (msp != NULL) {
 788  805                          mutex_enter(&msp->ms_lock);
 789  806                          metaslab_passivate(msp,
 790  807                              metaslab_weight_from_range_tree(msp));
 791  808                          mutex_exit(&msp->ms_lock);
 792  809                  }
 793  810          }
 794  811  
 795  812          mgprev = mg->mg_prev;
 796  813          mgnext = mg->mg_next;
 797  814  
 798  815          if (mg == mgnext) {
 799  816                  mc->mc_rotor = NULL;
 800  817          } else {
 801  818                  mc->mc_rotor = mgnext;
 802  819                  mgprev->mg_next = mgnext;
 803  820                  mgnext->mg_prev = mgprev;
 804  821          }
 805  822  
 806  823          mg->mg_prev = NULL;
 807  824          mg->mg_next = NULL;
 808  825  }
 809  826  
 810  827  boolean_t
 811  828  metaslab_group_initialized(metaslab_group_t *mg)
 812  829  {
 813  830          vdev_t *vd = mg->mg_vd;
 814  831          vdev_stat_t *vs = &vd->vdev_stat;
 815  832  
 816  833          return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 817  834  }
 818  835  
 819  836  uint64_t
 820  837  metaslab_group_get_space(metaslab_group_t *mg)
 821  838  {
 822  839          return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 823  840  }
 824  841  
 825  842  void
 826  843  metaslab_group_histogram_verify(metaslab_group_t *mg)
 827  844  {
 828  845          uint64_t *mg_hist;
 829  846          vdev_t *vd = mg->mg_vd;
 830  847          uint64_t ashift = vd->vdev_ashift;
 831  848          int i;
 832  849  
 833  850          if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)

↓ open down ↓

284 lines elided

↑ open up ↑

 834  851                  return;
 835  852  
 836  853          mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 837  854              KM_SLEEP);
 838  855  
 839  856          ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 840  857              SPACE_MAP_HISTOGRAM_SIZE + ashift);
 841  858  
 842  859          for (int m = 0; m < vd->vdev_ms_count; m++) {
 843  860                  metaslab_t *msp = vd->vdev_ms[m];
      861 +                ASSERT(msp != NULL);
 844  862  
 845  863                  /* skip if not active or not a member */
 846  864                  if (msp->ms_sm == NULL || msp->ms_group != mg)
 847  865                          continue;
 848  866  
 849  867                  for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 850  868                          mg_hist[i + ashift] +=
 851  869                              msp->ms_sm->sm_phys->smp_histogram[i];
 852  870          }
 853  871

 854  872          for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 855  873                  VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 856  874  
 857  875          kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 858  876  }
 859  877  
 860  878  static void
 861  879  metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 862  880  {
 863  881          metaslab_class_t *mc = mg->mg_class;
 864  882          uint64_t ashift = mg->mg_vd->vdev_ashift;
 865  883  
 866  884          ASSERT(MUTEX_HELD(&msp->ms_lock));
 867  885          if (msp->ms_sm == NULL)
 868  886                  return;
 869  887  
 870  888          mutex_enter(&mg->mg_lock);
 871  889          for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 872  890                  mg->mg_histogram[i + ashift] +=
 873  891                      msp->ms_sm->sm_phys->smp_histogram[i];
 874  892                  mc->mc_histogram[i + ashift] +=
 875  893                      msp->ms_sm->sm_phys->smp_histogram[i];
 876  894          }
 877  895          mutex_exit(&mg->mg_lock);
 878  896  }
 879  897  
 880  898  void
 881  899  metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 882  900  {
 883  901          metaslab_class_t *mc = mg->mg_class;
 884  902          uint64_t ashift = mg->mg_vd->vdev_ashift;
 885  903  
 886  904          ASSERT(MUTEX_HELD(&msp->ms_lock));
 887  905          if (msp->ms_sm == NULL)
 888  906                  return;
 889  907  
 890  908          mutex_enter(&mg->mg_lock);
 891  909          for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 892  910                  ASSERT3U(mg->mg_histogram[i + ashift], >=,
 893  911                      msp->ms_sm->sm_phys->smp_histogram[i]);
 894  912                  ASSERT3U(mc->mc_histogram[i + ashift], >=,
 895  913                      msp->ms_sm->sm_phys->smp_histogram[i]);
 896  914  
 897  915                  mg->mg_histogram[i + ashift] -=
 898  916                      msp->ms_sm->sm_phys->smp_histogram[i];
 899  917                  mc->mc_histogram[i + ashift] -=
 900  918                      msp->ms_sm->sm_phys->smp_histogram[i];
 901  919          }
 902  920          mutex_exit(&mg->mg_lock);
 903  921  }
 904  922  
 905  923  static void
 906  924  metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 907  925  {
 908  926          ASSERT(msp->ms_group == NULL);
 909  927          mutex_enter(&mg->mg_lock);
 910  928          msp->ms_group = mg;
 911  929          msp->ms_weight = 0;
 912  930          avl_add(&mg->mg_metaslab_tree, msp);
 913  931          mutex_exit(&mg->mg_lock);
 914  932  
 915  933          mutex_enter(&msp->ms_lock);
 916  934          metaslab_group_histogram_add(mg, msp);
 917  935          mutex_exit(&msp->ms_lock);
 918  936  }
 919  937  
 920  938  static void
 921  939  metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 922  940  {
 923  941          mutex_enter(&msp->ms_lock);
 924  942          metaslab_group_histogram_remove(mg, msp);
 925  943          mutex_exit(&msp->ms_lock);
 926  944  
 927  945          mutex_enter(&mg->mg_lock);
 928  946          ASSERT(msp->ms_group == mg);
 929  947          avl_remove(&mg->mg_metaslab_tree, msp);
 930  948          msp->ms_group = NULL;
 931  949          mutex_exit(&mg->mg_lock);
 932  950  }
 933  951  
 934  952  static void
 935  953  metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 936  954  {
 937  955          ASSERT(MUTEX_HELD(&mg->mg_lock));
 938  956          ASSERT(msp->ms_group == mg);
 939  957          avl_remove(&mg->mg_metaslab_tree, msp);
 940  958          msp->ms_weight = weight;
 941  959          avl_add(&mg->mg_metaslab_tree, msp);
 942  960  
 943  961  }
 944  962  
 945  963  static void
 946  964  metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 947  965  {
 948  966          /*
 949  967           * Although in principle the weight can be any value, in
 950  968           * practice we do not use values in the range [1, 511].
 951  969           */
 952  970          ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 953  971          ASSERT(MUTEX_HELD(&msp->ms_lock));
 954  972  
 955  973          mutex_enter(&mg->mg_lock);
 956  974          metaslab_group_sort_impl(mg, msp, weight);
 957  975          mutex_exit(&mg->mg_lock);
 958  976  }
 959  977  
 960  978  /*
 961  979   * Calculate the fragmentation for a given metaslab group. We can use
 962  980   * a simple average here since all metaslabs within the group must have
 963  981   * the same size. The return value will be a value between 0 and 100
 964  982   * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
 965  983   * group have a fragmentation metric.
 966  984   */
 967  985  uint64_t
 968  986  metaslab_group_fragmentation(metaslab_group_t *mg)
 969  987  {
 970  988          vdev_t *vd = mg->mg_vd;
 971  989          uint64_t fragmentation = 0;
 972  990          uint64_t valid_ms = 0;
 973  991  
 974  992          for (int m = 0; m < vd->vdev_ms_count; m++) {
 975  993                  metaslab_t *msp = vd->vdev_ms[m];
 976  994  
 977  995                  if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 978  996                          continue;
 979  997                  if (msp->ms_group != mg)
 980  998                          continue;
 981  999  
 982 1000                  valid_ms++;
 983 1001                  fragmentation += msp->ms_fragmentation;
 984 1002          }
 985 1003  
 986 1004          if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
 987 1005                  return (ZFS_FRAG_INVALID);
 988 1006  
 989 1007          fragmentation /= valid_ms;
 990 1008          ASSERT3U(fragmentation, <=, 100);
 991 1009          return (fragmentation);
 992 1010  }
 993 1011  
 994 1012  /*
 995 1013   * Determine if a given metaslab group should skip allocations. A metaslab
 996 1014   * group should avoid allocations if its free capacity is less than the
 997 1015   * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 998 1016   * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 999 1017   * that can still handle allocations. If the allocation throttle is enabled
1000 1018   * then we skip allocations to devices that have reached their maximum
1001 1019   * allocation queue depth unless the selected metaslab group is the only
1002 1020   * eligible group remaining.
1003 1021   */
1004 1022  static boolean_t
1005 1023  metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1006 1024      uint64_t psize, int allocator)
1007 1025  {
1008 1026          spa_t *spa = mg->mg_vd->vdev_spa;
1009 1027          metaslab_class_t *mc = mg->mg_class;
1010 1028  
1011 1029          /*
1012 1030           * We can only consider skipping this metaslab group if it's
1013 1031           * in the normal metaslab class and there are other metaslab
1014 1032           * groups to select from. Otherwise, we always consider it eligible
1015 1033           * for allocations.
1016 1034           */
1017 1035          if ((mc != spa_normal_class(spa) &&
1018 1036              mc != spa_special_class(spa) &&
1019 1037              mc != spa_dedup_class(spa)) ||
1020 1038              mc->mc_groups <= 1)
1021 1039                  return (B_TRUE);
1022 1040  
1023 1041          /*
1024 1042           * If the metaslab group's mg_allocatable flag is set (see comments
1025 1043           * in metaslab_group_alloc_update() for more information) and
1026 1044           * the allocation throttle is disabled then allow allocations to this
1027 1045           * device. However, if the allocation throttle is enabled then
1028 1046           * check if we have reached our allocation limit (mg_alloc_queue_depth)
1029 1047           * to determine if we should allow allocations to this metaslab group.
1030 1048           * If all metaslab groups are no longer considered allocatable
1031 1049           * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1032 1050           * gang block size then we allow allocations on this metaslab group
1033 1051           * regardless of the mg_allocatable or throttle settings.
1034 1052           */
1035 1053          if (mg->mg_allocatable) {
1036 1054                  metaslab_group_t *mgp;
1037 1055                  int64_t qdepth;
1038 1056                  uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1039 1057  
1040 1058                  if (!mc->mc_alloc_throttle_enabled)
1041 1059                          return (B_TRUE);
1042 1060  
1043 1061                  /*
1044 1062                   * If this metaslab group does not have any free space, then
1045 1063                   * there is no point in looking further.
1046 1064                   */
1047 1065                  if (mg->mg_no_free_space)
1048 1066                          return (B_FALSE);
1049 1067  
1050 1068                  qdepth = zfs_refcount_count(
1051 1069                      &mg->mg_alloc_queue_depth[allocator]);
1052 1070  
1053 1071                  /*
1054 1072                   * If this metaslab group is below its qmax or it's
1055 1073                   * the only allocatable metasable group, then attempt
1056 1074                   * to allocate from it.
1057 1075                   */
1058 1076                  if (qdepth < qmax || mc->mc_alloc_groups == 1)
1059 1077                          return (B_TRUE);
1060 1078                  ASSERT3U(mc->mc_alloc_groups, >, 1);
1061 1079  
1062 1080                  /*
1063 1081                   * Since this metaslab group is at or over its qmax, we
1064 1082                   * need to determine if there are metaslab groups after this
1065 1083                   * one that might be able to handle this allocation. This is
1066 1084                   * racy since we can't hold the locks for all metaslab
1067 1085                   * groups at the same time when we make this check.
1068 1086                   */
1069 1087                  for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1070 1088                          qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1071 1089  
1072 1090                          qdepth = zfs_refcount_count(
1073 1091                              &mgp->mg_alloc_queue_depth[allocator]);
1074 1092  
1075 1093                          /*
1076 1094                           * If there is another metaslab group that
1077 1095                           * might be able to handle the allocation, then
1078 1096                           * we return false so that we skip this group.
1079 1097                           */
1080 1098                          if (qdepth < qmax && !mgp->mg_no_free_space)
1081 1099                                  return (B_FALSE);
1082 1100                  }
1083 1101  
1084 1102                  /*
1085 1103                   * We didn't find another group to handle the allocation
1086 1104                   * so we can't skip this metaslab group even though
1087 1105                   * we are at or over our qmax.
1088 1106                   */
1089 1107                  return (B_TRUE);
1090 1108  
1091 1109          } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1092 1110                  return (B_TRUE);
1093 1111          }
1094 1112          return (B_FALSE);
1095 1113  }
1096 1114  
1097 1115  /*
1098 1116   * ==========================================================================
1099 1117   * Range tree callbacks
1100 1118   * ==========================================================================
1101 1119   */
1102 1120  
1103 1121  /*
1104 1122   * Comparison function for the private size-ordered tree. Tree is sorted
1105 1123   * by size, larger sizes at the end of the tree.
1106 1124   */
1107 1125  static int
1108 1126  metaslab_rangesize_compare(const void *x1, const void *x2)
1109 1127  {
1110 1128          const range_seg_t *r1 = x1;
1111 1129          const range_seg_t *r2 = x2;
1112 1130          uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1113 1131          uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1114 1132  
1115 1133          if (rs_size1 < rs_size2)
1116 1134                  return (-1);
1117 1135          if (rs_size1 > rs_size2)
1118 1136                  return (1);
1119 1137  
1120 1138          if (r1->rs_start < r2->rs_start)
1121 1139                  return (-1);
1122 1140  
1123 1141          if (r1->rs_start > r2->rs_start)
1124 1142                  return (1);
1125 1143  
1126 1144          return (0);
1127 1145  }
1128 1146  
1129 1147  /*
1130 1148   * Create any block allocator specific components. The current allocators
1131 1149   * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1132 1150   */
1133 1151  static void
1134 1152  metaslab_rt_create(range_tree_t *rt, void *arg)
1135 1153  {
1136 1154          metaslab_t *msp = arg;
1137 1155  
1138 1156          ASSERT3P(rt->rt_arg, ==, msp);
1139 1157          ASSERT(msp->ms_allocatable == NULL);
1140 1158  
1141 1159          avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
1142 1160              sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1143 1161  }
1144 1162  
1145 1163  /*
1146 1164   * Destroy the block allocator specific components.
1147 1165   */
1148 1166  static void
1149 1167  metaslab_rt_destroy(range_tree_t *rt, void *arg)
1150 1168  {
1151 1169          metaslab_t *msp = arg;
1152 1170  
1153 1171          ASSERT3P(rt->rt_arg, ==, msp);
1154 1172          ASSERT3P(msp->ms_allocatable, ==, rt);
1155 1173          ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
1156 1174  
1157 1175          avl_destroy(&msp->ms_allocatable_by_size);
1158 1176  }
1159 1177  
1160 1178  static void
1161 1179  metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1162 1180  {
1163 1181          metaslab_t *msp = arg;
1164 1182  
1165 1183          ASSERT3P(rt->rt_arg, ==, msp);
1166 1184          ASSERT3P(msp->ms_allocatable, ==, rt);
1167 1185          VERIFY(!msp->ms_condensing);
1168 1186          avl_add(&msp->ms_allocatable_by_size, rs);
1169 1187  }
1170 1188  
1171 1189  static void
1172 1190  metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1173 1191  {
1174 1192          metaslab_t *msp = arg;
1175 1193  
1176 1194          ASSERT3P(rt->rt_arg, ==, msp);
1177 1195          ASSERT3P(msp->ms_allocatable, ==, rt);
1178 1196          VERIFY(!msp->ms_condensing);
1179 1197          avl_remove(&msp->ms_allocatable_by_size, rs);
1180 1198  }
1181 1199  
1182 1200  static void
1183 1201  metaslab_rt_vacate(range_tree_t *rt, void *arg)
1184 1202  {
1185 1203          metaslab_t *msp = arg;
1186 1204  
1187 1205          ASSERT3P(rt->rt_arg, ==, msp);
1188 1206          ASSERT3P(msp->ms_allocatable, ==, rt);
1189 1207  
1190 1208          /*
1191 1209           * Normally one would walk the tree freeing nodes along the way.
1192 1210           * Since the nodes are shared with the range trees we can avoid
1193 1211           * walking all nodes and just reinitialize the avl tree. The nodes
1194 1212           * will be freed by the range tree, so we don't want to free them here.
1195 1213           */
1196 1214          avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
1197 1215              sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1198 1216  }
1199 1217  
/*
 * Callback table that ties the metaslab_rt_* functions above to a range
 * tree. The initializer is positional, so the entries must stay in the
 * member order of range_tree_ops_t.
 * NOTE(review): the struct is declared outside this file section;
 * presumably the order is create, destroy, add, remove, vacate - confirm
 * against its declaration before reordering.
 */
static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};
1207 1225  
1208 1226  /*
1209 1227   * ==========================================================================
1210 1228   * Common allocator routines
1211 1229   * ==========================================================================
1212 1230   */
1213 1231  
1214 1232  /*
1215 1233   * Return the maximum contiguous segment within the metaslab.
1216 1234   */
1217 1235  uint64_t
1218 1236  metaslab_block_maxsize(metaslab_t *msp)
1219 1237  {
1220 1238          avl_tree_t *t = &msp->ms_allocatable_by_size;
1221 1239          range_seg_t *rs;
1222 1240  
1223 1241          if (t == NULL || (rs = avl_last(t)) == NULL)
1224 1242                  return (0ULL);
1225 1243  
1226 1244          return (rs->rs_end - rs->rs_start);
1227 1245  }
1228 1246  
1229 1247  static range_seg_t *
1230 1248  metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1231 1249  {
1232 1250          range_seg_t *rs, rsearch;
1233 1251          avl_index_t where;
1234 1252  
1235 1253          rsearch.rs_start = start;
1236 1254          rsearch.rs_end = start + size;
1237 1255  
1238 1256          rs = avl_find(t, &rsearch, &where);
1239 1257          if (rs == NULL) {
1240 1258                  rs = avl_nearest(t, where, AVL_AFTER);
1241 1259          }
1242 1260  
1243 1261          return (rs);
1244 1262  }
1245 1263  
1246 1264  /*
1247 1265   * This is a helper function that can be used by the allocator to find
1248 1266   * a suitable block to allocate. This will search the specified AVL
1249 1267   * tree looking for a block that matches the specified criteria.
1250 1268   */
1251 1269  static uint64_t
1252 1270  metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1253 1271      uint64_t align)
1254 1272  {
1255 1273          range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1256 1274  
1257 1275          while (rs != NULL) {
1258 1276                  uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1259 1277  
1260 1278                  if (offset + size <= rs->rs_end) {
1261 1279                          *cursor = offset + size;
1262 1280                          return (offset);
1263 1281                  }
1264 1282                  rs = AVL_NEXT(t, rs);
1265 1283          }
1266 1284  
1267 1285          /*
1268 1286           * If we know we've searched the whole map (*cursor == 0), give up.
1269 1287           * Otherwise, reset the cursor to the beginning and try again.
1270 1288           */
1271 1289          if (*cursor == 0)
1272 1290                  return (-1ULL);
1273 1291  
1274 1292          *cursor = 0;
1275 1293          return (metaslab_block_picker(t, cursor, size, align));
1276 1294  }
1277 1295  
1278 1296  /*
1279 1297   * ==========================================================================
1280 1298   * The first-fit block allocator
1281 1299   * ==========================================================================
1282 1300   */
1283 1301  static uint64_t
1284 1302  metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1285 1303  {
1286 1304          /*
1287 1305           * Find the largest power of 2 block size that evenly divides the
1288 1306           * requested size. This is used to try to allocate blocks with similar
1289 1307           * alignment from the same area of the metaslab (i.e. same cursor
1290 1308           * bucket) but it does not guarantee that other allocations sizes
1291 1309           * may exist in the same region.
1292 1310           */
1293 1311          uint64_t align = size & -size;
1294 1312          uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1295 1313          avl_tree_t *t = &msp->ms_allocatable->rt_root;
1296 1314  
1297 1315          return (metaslab_block_picker(t, cursor, size, align));
1298 1316  }
1299 1317  
/*
 * Allocator ops table for the first-fit allocator; its single positional
 * entry is the allocation routine.
 */
static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};
1303 1321  
1304 1322  /*
1305 1323   * ==========================================================================
1306 1324   * Dynamic block allocator -
1307 1325   * Uses the first fit allocation scheme until space get low and then
1308 1326   * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1309 1327   * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1310 1328   * ==========================================================================
1311 1329   */
1312 1330  static uint64_t
1313 1331  metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1314 1332  {
1315 1333          /*
1316 1334           * Find the largest power of 2 block size that evenly divides the
1317 1335           * requested size. This is used to try to allocate blocks with similar
1318 1336           * alignment from the same area of the metaslab (i.e. same cursor
1319 1337           * bucket) but it does not guarantee that other allocations sizes
1320 1338           * may exist in the same region.
1321 1339           */
1322 1340          uint64_t align = size & -size;
1323 1341          uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1324 1342          range_tree_t *rt = msp->ms_allocatable;
1325 1343          avl_tree_t *t = &rt->rt_root;
1326 1344          uint64_t max_size = metaslab_block_maxsize(msp);
1327 1345          int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1328 1346  
1329 1347          ASSERT(MUTEX_HELD(&msp->ms_lock));
1330 1348          ASSERT3U(avl_numnodes(t), ==,
1331 1349              avl_numnodes(&msp->ms_allocatable_by_size));
1332 1350  
1333 1351          if (max_size < size)
1334 1352                  return (-1ULL);
1335 1353  
1336 1354          /*
1337 1355           * If we're running low on space switch to using the size
1338 1356           * sorted AVL tree (best-fit).
1339 1357           */
1340 1358          if (max_size < metaslab_df_alloc_threshold ||
1341 1359              free_pct < metaslab_df_free_pct) {
1342 1360                  t = &msp->ms_allocatable_by_size;
1343 1361                  *cursor = 0;
1344 1362          }
1345 1363  
1346 1364          return (metaslab_block_picker(t, cursor, size, 1ULL));
1347 1365  }
1348 1366  
1349 1367  static metaslab_ops_t metaslab_df_ops = {
1350 1368          metaslab_df_alloc
1351 1369  };
1352 1370  
1353 1371  /*
1354 1372   * ==========================================================================
1355 1373   * Cursor fit block allocator -
1356 1374   * Select the largest region in the metaslab, set the cursor to the beginning
1357 1375   * of the range and the cursor_end to the end of the range. As allocations
1358 1376   * are made advance the cursor. Continue allocating from the cursor until
1359 1377   * the range is exhausted and then find a new range.
1360 1378   * ==========================================================================
1361 1379   */
1362 1380  static uint64_t
1363 1381  metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1364 1382  {
1365 1383          range_tree_t *rt = msp->ms_allocatable;
1366 1384          avl_tree_t *t = &msp->ms_allocatable_by_size;
1367 1385          uint64_t *cursor = &msp->ms_lbas[0];
1368 1386          uint64_t *cursor_end = &msp->ms_lbas[1];
1369 1387          uint64_t offset = 0;
1370 1388  
1371 1389          ASSERT(MUTEX_HELD(&msp->ms_lock));
1372 1390          ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1373 1391  
1374 1392          ASSERT3U(*cursor_end, >=, *cursor);
1375 1393  
1376 1394          if ((*cursor + size) > *cursor_end) {
1377 1395                  range_seg_t *rs;
1378 1396  
1379 1397                  rs = avl_last(&msp->ms_allocatable_by_size);
1380 1398                  if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1381 1399                          return (-1ULL);
1382 1400  
1383 1401                  *cursor = rs->rs_start;
1384 1402                  *cursor_end = rs->rs_end;
1385 1403          }
1386 1404  
1387 1405          offset = *cursor;
1388 1406          *cursor += size;
1389 1407  
1390 1408          return (offset);
1391 1409  }
1392 1410  
1393 1411  static metaslab_ops_t metaslab_cf_ops = {
1394 1412          metaslab_cf_alloc
1395 1413  };
1396 1414  
1397 1415  /*
1398 1416   * ==========================================================================
1399 1417   * New dynamic fit allocator -
1400 1418   * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1401 1419   * contiguous blocks. If no region is found then just use the largest segment
1402 1420   * that remains.
1403 1421   * ==========================================================================
1404 1422   */
1405 1423  
1406 1424  /*
1407 1425   * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1408 1426   * to request from the allocator.
1409 1427   */
1410 1428  uint64_t metaslab_ndf_clump_shift = 4;
1411 1429  
1412 1430  static uint64_t
1413 1431  metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1414 1432  {
1415 1433          avl_tree_t *t = &msp->ms_allocatable->rt_root;
1416 1434          avl_index_t where;
1417 1435          range_seg_t *rs, rsearch;
1418 1436          uint64_t hbit = highbit64(size);
1419 1437          uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1420 1438          uint64_t max_size = metaslab_block_maxsize(msp);
1421 1439  
1422 1440          ASSERT(MUTEX_HELD(&msp->ms_lock));
1423 1441          ASSERT3U(avl_numnodes(t), ==,
1424 1442              avl_numnodes(&msp->ms_allocatable_by_size));
1425 1443  
1426 1444          if (max_size < size)
1427 1445                  return (-1ULL);
1428 1446  
1429 1447          rsearch.rs_start = *cursor;
1430 1448          rsearch.rs_end = *cursor + size;
1431 1449  
1432 1450          rs = avl_find(t, &rsearch, &where);
1433 1451          if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1434 1452                  t = &msp->ms_allocatable_by_size;
1435 1453  
1436 1454                  rsearch.rs_start = 0;
1437 1455                  rsearch.rs_end = MIN(max_size,
1438 1456                      1ULL << (hbit + metaslab_ndf_clump_shift));
1439 1457                  rs = avl_find(t, &rsearch, &where);
1440 1458                  if (rs == NULL)
1441 1459                          rs = avl_nearest(t, where, AVL_AFTER);
1442 1460                  ASSERT(rs != NULL);
1443 1461          }
1444 1462  
1445 1463          if ((rs->rs_end - rs->rs_start) >= size) {
1446 1464                  *cursor = rs->rs_start + size;
1447 1465                  return (rs->rs_start);
1448 1466          }
1449 1467          return (-1ULL);
1450 1468  }
1451 1469  
1452 1470  static metaslab_ops_t metaslab_ndf_ops = {
1453 1471          metaslab_ndf_alloc

↓ open down ↓

600 lines elided

↑ open up ↑

1454 1472  };
1455 1473  
1456 1474  metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1457 1475  
1458 1476  /*
1459 1477   * ==========================================================================
1460 1478   * Metaslabs
1461 1479   * ==========================================================================
1462 1480   */
1463 1481  
     1482 +static void
     1483 +metaslab_aux_histograms_clear(metaslab_t *msp)
     1484 +{
     1485 +        /*
     1486 +         * Auxiliary histograms are only cleared when resetting them,
     1487 +         * which can only happen while the metaslab is loaded.
     1488 +         */
     1489 +        ASSERT(msp->ms_loaded);
     1490 +
     1491 +        bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
     1492 +        for (int t = 0; t < TXG_DEFER_SIZE; t++)
     1493 +                bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
     1494 +}
     1495 +
     1496 +static void
     1497 +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     1498 +    range_tree_t *rt)
     1499 +{
     1500 +        /*
     1501 +         * This is modeled after space_map_histogram_add(), so refer to that
     1502 +         * function for implementation details. We want this to work like
     1503 +         * the space map histogram, and not the range tree histogram, as we
     1504 +         * are essentially constructing a delta that will be later subtracted
     1505 +         * from the space map histogram.
     1506 +         */
     1507 +        int idx = 0;
     1508 +        for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
     1509 +                ASSERT3U(i, >=, idx + shift);
     1510 +                histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
     1511 +
     1512 +                if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
     1513 +                        ASSERT3U(idx + shift, ==, i);
     1514 +                        idx++;
     1515 +                        ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
     1516 +                }
     1517 +        }
     1518 +}
     1519 +
1464 1520  /*
     1521 + * Called at every sync pass that the metaslab gets synced.
     1522 + *
     1523 + * The reason is that we want our auxiliary histograms to be updated
     1524 + * wherever the metaslab's space map histogram is updated. This way
     1525 + * we stay consistent on which parts of the metaslab space map's
     1526 + * histogram are currently not available for allocations (e.g. because
     1527 + * they are in the defer, freed, and freeing trees).
     1528 + */
     1529 +static void
     1530 +metaslab_aux_histograms_update(metaslab_t *msp)
     1531 +{
     1532 +        space_map_t *sm = msp->ms_sm;
     1533 +        ASSERT(sm != NULL);
     1534 +
     1535 +        /*
     1536 +         * This is similar to the metaslab's space map histogram updates
     1537 +         * that take place in metaslab_sync(). The only difference is that
     1538 +         * we only care about segments that haven't made it into the
     1539 +         * ms_allocatable tree yet.
     1540 +         */
     1541 +        if (msp->ms_loaded) {
     1542 +                metaslab_aux_histograms_clear(msp);
     1543 +
     1544 +                metaslab_aux_histogram_add(msp->ms_synchist,
     1545 +                    sm->sm_shift, msp->ms_freed);
     1546 +
     1547 +                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     1548 +                        metaslab_aux_histogram_add(msp->ms_deferhist[t],
     1549 +                            sm->sm_shift, msp->ms_defer[t]);
     1550 +                }
     1551 +        }
     1552 +
     1553 +        metaslab_aux_histogram_add(msp->ms_synchist,
     1554 +            sm->sm_shift, msp->ms_freeing);
     1555 +}
     1556 +
     1557 +/*
     1558 + * Called every time we are done syncing (writing to) the metaslab,
     1559 + * i.e. at the end of each sync pass.
     1560 + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
     1561 + */
     1562 +static void
     1563 +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
     1564 +{
     1565 +        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
     1566 +        space_map_t *sm = msp->ms_sm;
     1567 +
     1568 +        if (sm == NULL) {
     1569 +                /*
     1570 +                 * We came here from metaslab_init() when creating/opening a
     1571 +                 * pool, looking at a metaslab that hasn't had any allocations
     1572 +                 * yet.
     1573 +                 */
     1574 +                return;
     1575 +        }
     1576 +
     1577 +        /*
     1578 +         * This is similar to the actions that we take for the ms_freed
     1579 +         * and ms_defer trees in metaslab_sync_done().
     1580 +         */
     1581 +        uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
     1582 +        if (defer_allowed) {
     1583 +                bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
     1584 +                    sizeof (msp->ms_synchist));
     1585 +        } else {
     1586 +                bzero(msp->ms_deferhist[hist_index],
     1587 +                    sizeof (msp->ms_deferhist[hist_index]));
     1588 +        }
     1589 +        bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
     1590 +}
     1591 +
     1592 +/*
     1593 + * Ensure that the metaslab's weight and fragmentation are consistent
     1594 + * with the contents of the histogram (either the range tree's histogram
     1595 + * or the space map's depending whether the metaslab is loaded).
     1596 + */
     1597 +static void
     1598 +metaslab_verify_weight_and_frag(metaslab_t *msp)
     1599 +{
     1600 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
     1601 +
     1602 +        if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
     1603 +                return;
     1604 +
     1605 +        /* see comment in metaslab_verify_unflushed_changes() */
     1606 +        if (msp->ms_group == NULL)
     1607 +                return;
     1608 +
     1609 +        /*
     1610 +         * Devices being removed always return a weight of 0 and leave
     1611 +         * fragmentation and ms_max_size as is - there is nothing for
     1612 +         * us to verify here.
     1613 +         */
     1614 +        vdev_t *vd = msp->ms_group->mg_vd;
     1615 +        if (vd->vdev_removing)
     1616 +                return;
     1617 +
     1618 +        /*
     1619 +         * If the metaslab is dirty it probably means that we've done
     1620 +         * some allocations or frees that have changed our histograms
     1621 +         * and thus the weight.
     1622 +         */
     1623 +        for (int t = 0; t < TXG_SIZE; t++) {
     1624 +                if (txg_list_member(&vd->vdev_ms_list, msp, t))
     1625 +                        return;
     1626 +        }
     1627 +
     1628 +        /*
     1629 +         * This verification checks that our in-memory state is consistent
     1630 +         * with what's on disk. If the pool is read-only then there aren't
     1631 +         * any changes and we just have the initially-loaded state.
     1632 +         */
     1633 +        if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
     1634 +                return;
     1635 +
     1636 +        /* some extra verification for in-core tree if you can */
     1637 +        if (msp->ms_loaded) {
     1638 +                range_tree_stat_verify(msp->ms_allocatable);
     1639 +                VERIFY(space_map_histogram_verify(msp->ms_sm,
     1640 +                    msp->ms_allocatable));
     1641 +        }
     1642 +
     1643 +        uint64_t weight = msp->ms_weight;
     1644 +        uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
     1645 +        boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
     1646 +        uint64_t frag = msp->ms_fragmentation;
     1647 +        uint64_t max_segsize = msp->ms_max_size;
     1648 +
     1649 +        msp->ms_weight = 0;
     1650 +        msp->ms_fragmentation = 0;
     1651 +        msp->ms_max_size = 0;
     1652 +
     1653 +        /*
     1654 +         * This function is used for verification purposes. Regardless of
     1655 +         * whether metaslab_weight() thinks this metaslab should be active or
     1656 +         * not, we want to ensure that the actual weight (and therefore the
     1657 +         * value of ms_weight) would be the same if it was to be recalculated
     1658 +         * at this point.
     1659 +         */
     1660 +        msp->ms_weight = metaslab_weight(msp) | was_active;
     1661 +
     1662 +        VERIFY3U(max_segsize, ==, msp->ms_max_size);
     1663 +
     1664 +        /*
     1665 +         * If the weight type changed then there is no point in doing
     1666 +         * verification. Revert fields to their original values.
     1667 +         */
     1668 +        if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
     1669 +            (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
     1670 +                msp->ms_fragmentation = frag;
     1671 +                msp->ms_weight = weight;
     1672 +                return;
     1673 +        }
     1674 +
     1675 +        VERIFY3U(msp->ms_fragmentation, ==, frag);
     1676 +        VERIFY3U(msp->ms_weight, ==, weight);
     1677 +}
     1678 +
     1679 +/*
1465 1680   * Wait for any in-progress metaslab loads to complete.
1466 1681   */
1467 1682  static void
1468 1683  metaslab_load_wait(metaslab_t *msp)
1469 1684  {
1470 1685          ASSERT(MUTEX_HELD(&msp->ms_lock));
1471 1686  
1472 1687          while (msp->ms_loading) {
1473 1688                  ASSERT(!msp->ms_loaded);
1474 1689                  cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1475 1690          }
1476 1691  }
1477 1692  
1478 1693  static int
1479 1694  metaslab_load_impl(metaslab_t *msp)
1480 1695  {
1481 1696          int error = 0;
1482 1697  
1483 1698          ASSERT(MUTEX_HELD(&msp->ms_lock));
1484 1699          ASSERT(msp->ms_loading);
     1700 +        ASSERT(!msp->ms_condensing);
1485 1701  
1486 1702          /*
1487      -         * Nobody else can manipulate a loading metaslab, so it's now safe
1488      -         * to drop the lock. This way we don't have to hold the lock while
1489      -         * reading the spacemap from disk.
     1703 +         * We temporarily drop the lock to unblock other operations while we
     1704 +         * are reading the space map. Therefore, metaslab_sync() and
     1705 +         * metaslab_sync_done() can run at the same time as we do.
     1706 +         *
     1707 +         * metaslab_sync() can append to the space map while we are loading.
     1708 +         * Therefore we load only entries that existed when we started the
     1709 +         * load. Additionally, metaslab_sync_done() has to wait for the load
     1710 +         * to complete because there are potential races like metaslab_load()
     1711 +         * loading parts of the space map that are currently being appended
     1712 +         * by metaslab_sync(). If we didn't, the ms_allocatable would have
     1713 +         * entries that metaslab_sync_done() would try to re-add later.
     1714 +         *
     1715 +         * That's why before dropping the lock we remember the synced length
     1716 +         * of the metaslab and read up to that point of the space map,
     1717 +         * ignoring entries appended by metaslab_sync() that happen after we
     1718 +         * drop the lock.
1490 1719           */
     1720 +        uint64_t length = msp->ms_synced_length;
1491 1721          mutex_exit(&msp->ms_lock);
1492 1722  
1493      -        /*
1494      -         * If the space map has not been allocated yet, then treat
1495      -         * all the space in the metaslab as free and add it to ms_allocatable.
1496      -         */
1497 1723          if (msp->ms_sm != NULL) {
1498      -                error = space_map_load(msp->ms_sm, msp->ms_allocatable,
1499      -                    SM_FREE);
     1724 +                error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
     1725 +                    SM_FREE, length);
1500 1726          } else {
     1727 +                /*
     1728 +                 * The space map has not been allocated yet, so treat
     1729 +                 * all the space in the metaslab as free and add it to the
     1730 +                 * ms_allocatable tree.
     1731 +                 */
1501 1732                  range_tree_add(msp->ms_allocatable,
1502 1733                      msp->ms_start, msp->ms_size);
1503 1734          }
1504 1735  
     1736 +        /*
     1737 +         * We need to grab the ms_sync_lock to prevent metaslab_sync() from
     1738 +         * changing the ms_sm and the metaslab's range trees while we are
     1739 +         * about to use them and populate the ms_allocatable. The ms_lock
     1740 +         * is insufficient for this because metaslab_sync() doesn't hold
     1741 +         * the ms_lock while writing the ms_checkpointing tree to disk.
     1742 +         */
     1743 +        mutex_enter(&msp->ms_sync_lock);
1505 1744          mutex_enter(&msp->ms_lock);
     1745 +        ASSERT(!msp->ms_condensing);
1506 1746  
1507      -        if (error != 0)
     1747 +        if (error != 0) {
     1748 +                mutex_exit(&msp->ms_sync_lock);
1508 1749                  return (error);
     1750 +        }
1509 1751  
1510 1752          ASSERT3P(msp->ms_group, !=, NULL);
1511 1753          msp->ms_loaded = B_TRUE;
1512 1754  
1513 1755          /*
1514      -         * If the metaslab already has a spacemap, then we need to
1515      -         * remove all segments from the defer tree; otherwise, the
1516      -         * metaslab is completely empty and we can skip this.
     1756 +         * The ms_allocatable contains the segments that exist in the
     1757 +         * ms_defer trees [see ms_synced_length]. Thus we need to remove
     1758 +         * them from ms_allocatable as they will be added again in
     1759 +         * metaslab_sync_done().
1517 1760           */
1518      -        if (msp->ms_sm != NULL) {
1519      -                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1520      -                        range_tree_walk(msp->ms_defer[t],
1521      -                            range_tree_remove, msp->ms_allocatable);
1522      -                }
     1761 +        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     1762 +                range_tree_walk(msp->ms_defer[t],
     1763 +                    range_tree_remove, msp->ms_allocatable);
1523 1764          }
     1765 +
     1766 +        /*
     1767 +         * Call metaslab_recalculate_weight_and_sort() now that the
     1768 +         * metaslab is loaded so we get the metaslab's real weight.
     1769 +         *
     1770 +         * Unless this metaslab was created with older software and
     1771 +         * has not yet been converted to use segment-based weight, we
     1772 +         * expect the new weight to be better or equal to the weight
     1773 +         * that the metaslab had while it was not loaded. This is
     1774 +         * because the old weight does not take into account the
     1775 +         * consolidation of adjacent segments between TXGs. [see
     1776 +         * comment for ms_synchist and ms_deferhist[] for more info]
     1777 +         */
     1778 +        uint64_t weight = msp->ms_weight;
     1779 +        metaslab_recalculate_weight_and_sort(msp);
     1780 +        if (!WEIGHT_IS_SPACEBASED(weight))
     1781 +                ASSERT3U(weight, <=, msp->ms_weight);
1524 1782          msp->ms_max_size = metaslab_block_maxsize(msp);
1525 1783  
     1784 +        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
     1785 +        metaslab_verify_space(msp, spa_syncing_txg(spa));
     1786 +        mutex_exit(&msp->ms_sync_lock);
     1787 +
1526 1788          return (0);
1527 1789  }
1528 1790  
1529 1791  int
1530 1792  metaslab_load(metaslab_t *msp)
1531 1793  {
1532 1794          ASSERT(MUTEX_HELD(&msp->ms_lock));
1533 1795  
1534 1796          /*
1535 1797           * There may be another thread loading the same metaslab, if that's
1536 1798           * the case just wait until the other thread is done and return.
1537 1799           */
1538 1800          metaslab_load_wait(msp);
1539 1801          if (msp->ms_loaded)
1540 1802                  return (0);
1541 1803          VERIFY(!msp->ms_loading);
     1804 +        ASSERT(!msp->ms_condensing);
1542 1805  
1543 1806          msp->ms_loading = B_TRUE;
1544 1807          int error = metaslab_load_impl(msp);
1545 1808          msp->ms_loading = B_FALSE;
1546 1809          cv_broadcast(&msp->ms_load_cv);
1547 1810  
1548 1811          return (error);
1549 1812  }
1550 1813  
1551 1814  void
1552 1815  metaslab_unload(metaslab_t *msp)
1553 1816  {
1554 1817          ASSERT(MUTEX_HELD(&msp->ms_lock));
     1818 +
     1819 +        metaslab_verify_weight_and_frag(msp);
     1820 +
1555 1821          range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1556 1822          msp->ms_loaded = B_FALSE;
     1823 +
1557 1824          msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1558 1825          msp->ms_max_size = 0;
     1826 +
     1827 +        /*
     1828 +         * We explicitly recalculate the metaslab's weight based on its space
     1829 +         * map (as it is now not loaded). We want unloaded metaslabs to always
     1830 +         * have their weights calculated from the space map histograms, while
     1831 +         * loaded ones have it calculated from their in-core range tree
     1832 +         * [see metaslab_load()]. This way, the weight reflects the information
     1833 +         * available in-core, whether it is loaded or not.
     1834 +         *
     1835 +         * ms_group == NULL means that we came here from metaslab_fini(),
     1836 +         * at which point it doesn't make sense for us to do the recalculation
     1837 +         * and the sorting.
     1838 +         */
     1839 +        if (msp->ms_group != NULL)
     1840 +                metaslab_recalculate_weight_and_sort(msp);
1559 1841  }
1560 1842  
1561 1843  static void
1562 1844  metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1563 1845      int64_t defer_delta, int64_t space_delta)
1564 1846  {
1565 1847          vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1566 1848  
1567 1849          ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1568 1850          ASSERT(vd->vdev_ms_count != 0);

1569 1851  
1570 1852          metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1571 1853              vdev_deflated_space(vd, space_delta));
1572 1854  }
1573 1855  
1574 1856  int
1575 1857  metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1576 1858      metaslab_t **msp)
1577 1859  {
1578 1860          vdev_t *vd = mg->mg_vd;
1579 1861          spa_t *spa = vd->vdev_spa;
1580 1862          objset_t *mos = spa->spa_meta_objset;
1581 1863          metaslab_t *ms;
1582 1864          int error;
1583 1865  
1584 1866          ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1585 1867          mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1586 1868          mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1587 1869          cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);

↓ open down ↓

19 lines elided

↑ open up ↑

1588 1870  
1589 1871          ms->ms_id = id;
1590 1872          ms->ms_start = id << vd->vdev_ms_shift;
1591 1873          ms->ms_size = 1ULL << vd->vdev_ms_shift;
1592 1874          ms->ms_allocator = -1;
1593 1875          ms->ms_new = B_TRUE;
1594 1876  
1595 1877          /*
1596 1878           * We only open space map objects that already exist. All others
1597 1879           * will be opened when we finally allocate an object for it.
     1880 +         *
     1881 +         * Note:
     1882 +         * When called from vdev_expand(), we can't call into the DMU as
     1883 +         * we are holding the spa_config_lock as a writer and we would
     1884 +         * deadlock [see relevant comment in vdev_metaslab_init()]. In
     1885 +         * that case, the object parameter is zero though, so we won't
     1886 +         * call into the DMU.
1598 1887           */
1599 1888          if (object != 0) {
1600 1889                  error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1601 1890                      ms->ms_size, vd->vdev_ashift);
1602 1891  
1603 1892                  if (error != 0) {
1604 1893                          kmem_free(ms, sizeof (metaslab_t));
1605 1894                          return (error);
1606 1895                  }
1607 1896  
1608 1897                  ASSERT(ms->ms_sm != NULL);
     1898 +                ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
     1899 +                ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
1609 1900          }
1610 1901  
1611 1902          /*
1612      -         * We create the main range tree here, but we don't create the
     1903 +         * We create the ms_allocatable here, but we don't create the
1613 1904           * other range trees until metaslab_sync_done().  This serves
1614 1905           * two purposes: it allows metaslab_sync_done() to detect the
1615      -         * addition of new space; and for debugging, it ensures that we'd
1616      -         * data fault on any attempt to use this metaslab before it's ready.
     1906 +         * addition of new space; and for debugging, it ensures that
     1907 +         * we'd data fault on any attempt to use this metaslab before
     1908 +         * it's ready.
1617 1909           */
1618 1910          ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1619 1911          metaslab_group_add(mg, ms);
1620 1912  
1621 1913          metaslab_set_fragmentation(ms);
1622 1914  
1623 1915          /*
1624 1916           * If we're opening an existing pool (txg == 0) or creating
1625 1917           * a new one (txg == TXG_INITIAL), all space is available now.
1626 1918           * If we're adding space to an existing pool, the new space
1627 1919           * does not become available until after this txg has synced.
1628 1920           * The metaslab's weight will also be initialized when we sync
1629 1921           * out this txg. This ensures that we don't attempt to allocate
1630 1922           * from it before we have initialized it completely.
1631 1923           */
1632      -        if (txg <= TXG_INITIAL)
     1924 +        if (txg <= TXG_INITIAL) {
1633 1925                  metaslab_sync_done(ms, 0);
     1926 +                metaslab_space_update(vd, mg->mg_class,
     1927 +                    metaslab_allocated_space(ms), 0, 0);
     1928 +        }
1634 1929  
1635 1930          /*
1636 1931           * If metaslab_debug_load is set and we're initializing a metaslab
1637 1932           * that has an allocated space map object then load the space map
1638 1933           * so that we can verify frees.
1639 1934           */
1640 1935          if (metaslab_debug_load && ms->ms_sm != NULL) {
1641 1936                  mutex_enter(&ms->ms_lock);
1642 1937                  VERIFY0(metaslab_load(ms));
1643 1938                  mutex_exit(&ms->ms_lock);

1644 1939          }
1645 1940  
1646 1941          if (txg != 0) {
1647 1942                  vdev_dirty(vd, 0, NULL, txg);
1648 1943                  vdev_dirty(vd, VDD_METASLAB, ms, txg);
1649 1944          }
1650 1945  
1651 1946          *msp = ms;
1652 1947  
1653 1948          return (0);
1654 1949  }
1655 1950  
1656 1951  void

↓ open down ↓

13 lines elided

↑ open up ↑

1657 1952  metaslab_fini(metaslab_t *msp)
1658 1953  {
1659 1954          metaslab_group_t *mg = msp->ms_group;
1660 1955          vdev_t *vd = mg->mg_vd;
1661 1956  
1662 1957          metaslab_group_remove(mg, msp);
1663 1958  
1664 1959          mutex_enter(&msp->ms_lock);
1665 1960          VERIFY(msp->ms_group == NULL);
1666 1961          metaslab_space_update(vd, mg->mg_class,
1667      -            -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
     1962 +            -metaslab_allocated_space(msp), 0, -msp->ms_size);
1668 1963  
1669 1964          space_map_close(msp->ms_sm);
1670 1965  
1671 1966          metaslab_unload(msp);
1672 1967  
1673 1968          range_tree_destroy(msp->ms_allocatable);
1674 1969          range_tree_destroy(msp->ms_freeing);
1675 1970          range_tree_destroy(msp->ms_freed);
1676 1971  
1677 1972          for (int t = 0; t < TXG_SIZE; t++) {
1678 1973                  range_tree_destroy(msp->ms_allocating[t]);
1679 1974          }
1680 1975  
1681 1976          for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1682 1977                  range_tree_destroy(msp->ms_defer[t]);
1683 1978          }
1684 1979          ASSERT0(msp->ms_deferspace);
1685 1980  
1686 1981          range_tree_destroy(msp->ms_checkpointing);
1687 1982  
     1983 +        for (int t = 0; t < TXG_SIZE; t++)
     1984 +                ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
     1985 +
1688 1986          mutex_exit(&msp->ms_lock);
1689 1987          cv_destroy(&msp->ms_load_cv);
1690 1988          mutex_destroy(&msp->ms_lock);
1691 1989          mutex_destroy(&msp->ms_sync_lock);
1692 1990          ASSERT3U(msp->ms_allocator, ==, -1);
1693 1991  
1694 1992          kmem_free(msp, sizeof (metaslab_t));
1695 1993  }
1696 1994  
1697 1995  #define FRAGMENTATION_TABLE_SIZE        17
1698 1996  
1699 1997  /*
1700 1998   * This table defines a segment size based fragmentation metric that will
1701 1999   * allow each metaslab to derive its own fragmentation value. This is done
1702 2000   * by calculating the space in each bucket of the spacemap histogram and
1703      - * multiplying that by the fragmetation metric in this table. Doing
     2001 + * multiplying that by the fragmentation metric in this table. Doing
1704 2002   * this for all buckets and dividing it by the total amount of free
1705 2003   * space in this metaslab (i.e. the total free space in all buckets) gives
1706 2004   * us the fragmentation metric. This means that a high fragmentation metric
1707 2005   * equates to most of the free space being comprised of small segments.
1708 2006   * Conversely, if the metric is low, then most of the free space is in
1709 2007   * large segments. A 10% change in fragmentation equates to approximately
1710 2008   * double the number of segments.
1711 2009   *
1712 2010   * This table defines 0% fragmented space using 16MB segments. Testing has
1713 2011   * shown that segments that are greater than or equal to 16MB do not suffer

1714 2012   * from drastic performance problems. Using this value, we derive the rest
1715 2013   * of the table. Since the fragmentation value is never stored on disk, it
1716 2014   * is possible to change these calculations in the future.
1717 2015   */
1718 2016  int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1719 2017          100,    /* 512B */
1720 2018          100,    /* 1K   */
1721 2019          98,     /* 2K   */
1722 2020          95,     /* 4K   */
1723 2021          90,     /* 8K   */
1724 2022          80,     /* 16K  */
1725 2023          70,     /* 32K  */
1726 2024          60,     /* 64K  */
1727 2025          50,     /* 128K */

↓ open down ↓

14 lines elided

↑ open up ↑

1728 2026          40,     /* 256K */
1729 2027          30,     /* 512K */
1730 2028          20,     /* 1M   */
1731 2029          15,     /* 2M   */
1732 2030          10,     /* 4M   */
1733 2031          5,      /* 8M   */
1734 2032          0       /* 16M  */
1735 2033  };
1736 2034  
1737 2035  /*
1738      - * Calclate the metaslab's fragmentation metric. A return value
1739      - * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1740      - * not support this metric. Otherwise, the return value should be in the
1741      - * range [0, 100].
     2036 + * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
     2037 + * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
     2038 + * been upgraded and does not support this metric. Otherwise, the
     2039 + * stored value should be in the range [0, 100].
1742 2040   */
1743 2041  static void
1744 2042  metaslab_set_fragmentation(metaslab_t *msp)
1745 2043  {
1746 2044          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1747 2045          uint64_t fragmentation = 0;
1748 2046          uint64_t total = 0;
1749 2047          boolean_t feature_enabled = spa_feature_is_enabled(spa,
1750 2048              SPA_FEATURE_SPACEMAP_HISTOGRAM);
1751 2049

1752 2050          if (!feature_enabled) {
1753 2051                  msp->ms_fragmentation = ZFS_FRAG_INVALID;
1754 2052                  return;
1755 2053          }
1756 2054  
1757 2055          /*
1758 2056           * A null space map means that the entire metaslab is free
1759 2057           * and thus is not fragmented.
1760 2058           */
1761 2059          if (msp->ms_sm == NULL) {
1762 2060                  msp->ms_fragmentation = 0;
1763 2061                  return;
1764 2062          }
1765 2063  
1766 2064          /*
1767 2065           * If this metaslab's space map has not been upgraded, flag it
1768 2066           * so that we upgrade next time we encounter it.
1769 2067           */
1770 2068          if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1771 2069                  uint64_t txg = spa_syncing_txg(spa);
1772 2070                  vdev_t *vd = msp->ms_group->mg_vd;
1773 2071  
1774 2072                  /*
1775 2073                   * If we've reached the final dirty txg, then we must
1776 2074                   * be shutting down the pool. We don't want to dirty
1777 2075                   * any data past this point so skip setting the condense
1778 2076                   * flag. We can retry this action the next time the pool
1779 2077                   * is imported.
1780 2078                   */
1781 2079                  if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1782 2080                          msp->ms_condense_wanted = B_TRUE;
1783 2081                          vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1784 2082                          zfs_dbgmsg("txg %llu, requesting force condense: "
1785 2083                              "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1786 2084                              vd->vdev_id);
1787 2085                  }
1788 2086                  msp->ms_fragmentation = ZFS_FRAG_INVALID;
1789 2087                  return;
1790 2088          }
1791 2089  
1792 2090          for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1793 2091                  uint64_t space = 0;
1794 2092                  uint8_t shift = msp->ms_sm->sm_shift;
1795 2093  
1796 2094                  int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1797 2095                      FRAGMENTATION_TABLE_SIZE - 1);
1798 2096  
1799 2097                  if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1800 2098                          continue;
1801 2099  
1802 2100                  space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1803 2101                  total += space;
1804 2102  
1805 2103                  ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1806 2104                  fragmentation += space * zfs_frag_table[idx];
1807 2105          }
1808 2106  
1809 2107          if (total > 0)
1810 2108                  fragmentation /= total;
1811 2109          ASSERT3U(fragmentation, <=, 100);
1812 2110  
1813 2111          msp->ms_fragmentation = fragmentation;
1814 2112  }
1815 2113  
1816 2114  /*
1817 2115   * Compute a weight -- a selection preference value -- for the given metaslab.
1818 2116   * This is based on the amount of free space, the level of fragmentation,
1819 2117   * the LBA range, and whether the metaslab is loaded.
1820 2118   */
1821 2119  static uint64_t
1822 2120  metaslab_space_weight(metaslab_t *msp)
1823 2121  {

↓ open down ↓

72 lines elided

↑ open up ↑

1824 2122          metaslab_group_t *mg = msp->ms_group;
1825 2123          vdev_t *vd = mg->mg_vd;
1826 2124          uint64_t weight, space;
1827 2125  
1828 2126          ASSERT(MUTEX_HELD(&msp->ms_lock));
1829 2127          ASSERT(!vd->vdev_removing);
1830 2128  
1831 2129          /*
1832 2130           * The baseline weight is the metaslab's free space.
1833 2131           */
1834      -        space = msp->ms_size - space_map_allocated(msp->ms_sm);
     2132 +        space = msp->ms_size - metaslab_allocated_space(msp);
1835 2133  
1836 2134          if (metaslab_fragmentation_factor_enabled &&
1837 2135              msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1838 2136                  /*
1839 2137                   * Use the fragmentation information to inversely scale
1840 2138                   * down the baseline weight. We need to ensure that we
1841 2139                   * don't exclude this metaslab completely when it's 100%
1842 2140                   * fragmented. To avoid this we reduce the fragmented value
1843 2141                   * by 1.
1844 2142                   */

1845 2143                  space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1846 2144  
1847 2145                  /*
1848 2146                   * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1849 2147                   * this metaslab again. The fragmentation metric may have
1850 2148                   * decreased the space to something smaller than
1851 2149                   * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1852 2150                   * so that we can consume any remaining space.
1853 2151                   */
1854 2152                  if (space > 0 && space < SPA_MINBLOCKSIZE)
1855 2153                          space = SPA_MINBLOCKSIZE;
1856 2154          }
1857 2155          weight = space;
1858 2156  
1859 2157          /*
1860 2158           * Modern disks have uniform bit density and constant angular velocity.
1861 2159           * Therefore, the outer recording zones are faster (higher bandwidth)
1862 2160           * than the inner zones by the ratio of outer to inner track diameter,
1863 2161           * which is typically around 2:1.  We account for this by assigning
1864 2162           * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1865 2163           * In effect, this means that we'll select the metaslab with the most
1866 2164           * free bandwidth rather than simply the one with the most free space.
1867 2165           */
1868 2166          if (metaslab_lba_weighting_enabled) {
1869 2167                  weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1870 2168                  ASSERT(weight >= space && weight <= 2 * space);
1871 2169          }
1872 2170  
1873 2171          /*
1874 2172           * If this metaslab is one we're actively using, adjust its
1875 2173           * weight to make it preferable to any inactive metaslab so
1876 2174           * we'll polish it off. If the fragmentation on this metaslab
1877 2175           * has exceeded our threshold, then don't mark it active.
1878 2176           */
1879 2177          if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1880 2178              msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1881 2179                  weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1882 2180          }
1883 2181  
1884 2182          WEIGHT_SET_SPACEBASED(weight);
1885 2183          return (weight);
1886 2184  }
1887 2185  
1888 2186  /*
1889 2187   * Return the weight of the specified metaslab, according to the segment-based
1890 2188   * weighting algorithm. The metaslab must be loaded. This function can
1891 2189   * be called within a sync pass since it relies only on the metaslab's
1892 2190   * range tree which is always accurate when the metaslab is loaded.
1893 2191   */
1894 2192  static uint64_t
1895 2193  metaslab_weight_from_range_tree(metaslab_t *msp)
1896 2194  {
1897 2195          uint64_t weight = 0;
1898 2196          uint32_t segments = 0;
1899 2197  
1900 2198          ASSERT(msp->ms_loaded);
1901 2199  
1902 2200          for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1903 2201              i--) {
1904 2202                  uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1905 2203                  int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1906 2204  
1907 2205                  segments <<= 1;
1908 2206                  segments += msp->ms_allocatable->rt_histogram[i];
1909 2207  
1910 2208                  /*
1911 2209                   * The range tree provides more precision than the space map
1912 2210                   * and must be downgraded so that all values fit within the
1913 2211                   * space map's histogram. This allows us to compare loaded
1914 2212                   * vs. unloaded metaslabs to determine which metaslab is
1915 2213                   * considered "best".
1916 2214                   */
1917 2215                  if (i > max_idx)
1918 2216                          continue;
1919 2217  
1920 2218                  if (segments != 0) {
1921 2219                          WEIGHT_SET_COUNT(weight, segments);
1922 2220                          WEIGHT_SET_INDEX(weight, i);
1923 2221                          WEIGHT_SET_ACTIVE(weight, 0);
1924 2222                          break;
1925 2223                  }
1926 2224          }
1927 2225          return (weight);

↓ open down ↓

83 lines elided

↑ open up ↑

1928 2226  }
1929 2227  
1930 2228  /*
1931 2229   * Calculate the weight based on the on-disk histogram. This should only
1932 2230   * be called after a sync pass has completely finished since the on-disk
1933 2231   * information is updated in metaslab_sync().
1934 2232   */
1935 2233  static uint64_t
1936 2234  metaslab_weight_from_spacemap(metaslab_t *msp)
1937 2235  {
1938      -        uint64_t weight = 0;
     2236 +        space_map_t *sm = msp->ms_sm;
     2237 +        ASSERT(!msp->ms_loaded);
     2238 +        ASSERT(sm != NULL);
     2239 +        ASSERT3U(space_map_object(sm), !=, 0);
     2240 +        ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1939 2241  
     2242 +        /*
     2243 +         * Create a joint histogram from all the segments that have made
     2244 +         * it to the metaslab's space map histogram, that are not yet
     2245 +         * available for allocation because they are still in the freeing
     2246 +         * pipeline (e.g. freeing, freed, and defer trees). Then subtract
     2247 +         * these segments from the space map's histogram to get a more
     2248 +         * accurate weight.
     2249 +         */
     2250 +        uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
     2251 +        for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
     2252 +                deferspace_histogram[i] += msp->ms_synchist[i];
     2253 +        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     2254 +                for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
     2255 +                        deferspace_histogram[i] += msp->ms_deferhist[t][i];
     2256 +                }
     2257 +        }
     2258 +
     2259 +        uint64_t weight = 0;
1940 2260          for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1941      -                if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1942      -                        WEIGHT_SET_COUNT(weight,
1943      -                            msp->ms_sm->sm_phys->smp_histogram[i]);
1944      -                        WEIGHT_SET_INDEX(weight, i +
1945      -                            msp->ms_sm->sm_shift);
     2261 +                ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
     2262 +                    deferspace_histogram[i]);
     2263 +                uint64_t count =
     2264 +                    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
     2265 +                if (count != 0) {
     2266 +                        WEIGHT_SET_COUNT(weight, count);
     2267 +                        WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
1946 2268                          WEIGHT_SET_ACTIVE(weight, 0);
1947 2269                          break;
1948 2270                  }
1949 2271          }
1950 2272          return (weight);
1951 2273  }
1952 2274  
1953 2275  /*
1954 2276   * Compute a segment-based weight for the specified metaslab. The weight
1955 2277   * is determined by the highest bucket in the histogram. The information

1956 2278   * for the highest bucket is encoded into the weight value.
1957 2279   */
1958 2280  static uint64_t
1959 2281  metaslab_segment_weight(metaslab_t *msp)

↓ open down ↓

4 lines elided

↑ open up ↑

1960 2282  {
1961 2283          metaslab_group_t *mg = msp->ms_group;
1962 2284          uint64_t weight = 0;
1963 2285          uint8_t shift = mg->mg_vd->vdev_ashift;
1964 2286  
1965 2287          ASSERT(MUTEX_HELD(&msp->ms_lock));
1966 2288  
1967 2289          /*
1968 2290           * The metaslab is completely free.
1969 2291           */
1970      -        if (space_map_allocated(msp->ms_sm) == 0) {
     2292 +        if (metaslab_allocated_space(msp) == 0) {
1971 2293                  int idx = highbit64(msp->ms_size) - 1;
1972 2294                  int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1973 2295  
1974 2296                  if (idx < max_idx) {
1975 2297                          WEIGHT_SET_COUNT(weight, 1ULL);
1976 2298                          WEIGHT_SET_INDEX(weight, idx);
1977 2299                  } else {
1978 2300                          WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1979 2301                          WEIGHT_SET_INDEX(weight, max_idx);
1980 2302                  }

1981 2303                  WEIGHT_SET_ACTIVE(weight, 0);

↓ open down ↓

1 lines elided

↑ open up ↑

1982 2304                  ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1983 2305  
1984 2306                  return (weight);
1985 2307          }
1986 2308  
1987 2309          ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1988 2310  
1989 2311          /*
1990 2312           * If the metaslab is fully allocated then just make the weight 0.
1991 2313           */
1992      -        if (space_map_allocated(msp->ms_sm) == msp->ms_size)
     2314 +        if (metaslab_allocated_space(msp) == msp->ms_size)
1993 2315                  return (0);
1994 2316          /*
1995 2317           * If the metaslab is already loaded, then use the range tree to
1996 2318           * determine the weight. Otherwise, we rely on the space map information
1997 2319           * to generate the weight.
1998 2320           */
1999 2321          if (msp->ms_loaded) {
2000 2322                  weight = metaslab_weight_from_range_tree(msp);
2001 2323          } else {
2002 2324                  weight = metaslab_weight_from_spacemap(msp);

2003 2325          }
2004 2326  
2005 2327          /*
2006 2328           * If the metaslab was active the last time we calculated its weight
2007 2329           * then keep it active. We want to consume the entire region that
2008 2330           * is associated with this weight.
2009 2331           */
2010 2332          if (msp->ms_activation_weight != 0 && weight != 0)
2011 2333                  WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2012 2334          return (weight);
2013 2335  }
2014 2336  
2015 2337  /*
2016 2338   * Determine if we should attempt to allocate from this metaslab. If the
2017 2339   * metaslab has a maximum size then we can quickly determine if the desired
2018 2340   * allocation size can be satisfied. Otherwise, if we're using segment-based
2019 2341   * weighting then we can determine the maximum allocation that this metaslab
2020 2342   * can accommodate based on the index encoded in the weight. If we're using
2021 2343   * space-based weights then rely on the entire weight (excluding the weight
2022 2344   * type bit).
2023 2345   */
2024 2346  boolean_t
2025 2347  metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
2026 2348  {
2027 2349          boolean_t should_allocate;
2028 2350  
2029 2351          if (msp->ms_max_size != 0)
2030 2352                  return (msp->ms_max_size >= asize);
2031 2353  
2032 2354          if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2033 2355                  /*
2034 2356                   * The metaslab segment weight indicates segments in the
2035 2357                   * range [2^i, 2^(i+1)), where i is the index in the weight.
2036 2358                   * Since the asize might be in the middle of the range, we
2037 2359                   * should attempt the allocation if asize < 2^(i+1).
2038 2360                   */
2039 2361                  should_allocate = (asize <
2040 2362                      1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2041 2363          } else {
2042 2364                  should_allocate = (asize <=
2043 2365                      (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2044 2366          }
2045 2367          return (should_allocate);
2046 2368  }
2047 2369  
2048 2370  static uint64_t
2049 2371  metaslab_weight(metaslab_t *msp)
2050 2372  {
2051 2373          vdev_t *vd = msp->ms_group->mg_vd;
2052 2374          spa_t *spa = vd->vdev_spa;
2053 2375          uint64_t weight;
2054 2376  
2055 2377          ASSERT(MUTEX_HELD(&msp->ms_lock));
2056 2378  
2057 2379          /*
2058 2380           * If this vdev is in the process of being removed, there is nothing
2059 2381           * for us to do here.
2060 2382           */
2061 2383          if (vd->vdev_removing)
2062 2384                  return (0);

↓ open down ↓

60 lines elided

↑ open up ↑

2063 2385  
2064 2386          metaslab_set_fragmentation(msp);
2065 2387  
2066 2388          /*
2067 2389           * Update the maximum size if the metaslab is loaded. This will
2068 2390           * ensure that we get an accurate maximum size if newly freed space
2069 2391           * has been added back into the free tree.
2070 2392           */
2071 2393          if (msp->ms_loaded)
2072 2394                  msp->ms_max_size = metaslab_block_maxsize(msp);
     2395 +        else
     2396 +                ASSERT0(msp->ms_max_size);
2073 2397  
2074 2398          /*
2075 2399           * Segment-based weighting requires space map histogram support.
2076 2400           */
2077 2401          if (zfs_metaslab_segment_weight_enabled &&
2078 2402              spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2079 2403              (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2080 2404              sizeof (space_map_phys_t))) {
2081 2405                  weight = metaslab_segment_weight(msp);
2082 2406          } else {
2083 2407                  weight = metaslab_space_weight(msp);
2084 2408          }
2085 2409          return (weight);
2086 2410  }
2087 2411  
     2412 +void
     2413 +metaslab_recalculate_weight_and_sort(metaslab_t *msp)
     2414 +{
     2415 +        /* note: we preserve the mask (e.g. indication of primary, etc.) */
     2416 +        uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
     2417 +        metaslab_group_sort(msp->ms_group, msp,
     2418 +            metaslab_weight(msp) | was_active);
     2419 +}
     2420 +
2088 2421  static int
2089 2422  metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2090 2423      int allocator, uint64_t activation_weight)
2091 2424  {
2092 2425          /*
2093 2426           * If we're activating for the claim code, we don't want to actually
2094 2427           * set the metaslab up for a specific allocator.
2095 2428           */
2096 2429          if (activation_weight == METASLAB_WEIGHT_CLAIM)
2097 2430                  return (0);

2098 2431          metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2099 2432              mg->mg_primaries : mg->mg_secondaries);
2100 2433  
2101 2434          ASSERT(MUTEX_HELD(&msp->ms_lock));
2102 2435          mutex_enter(&mg->mg_lock);
2103 2436          if (arr[allocator] != NULL) {
2104 2437                  mutex_exit(&mg->mg_lock);
2105 2438                  return (EEXIST);
2106 2439          }
2107 2440  
2108 2441          arr[allocator] = msp;
2109 2442          ASSERT3S(msp->ms_allocator, ==, -1);
2110 2443          msp->ms_allocator = allocator;
2111 2444          msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2112 2445          mutex_exit(&mg->mg_lock);
2113 2446  
2114 2447          return (0);
2115 2448  }
2116 2449  
2117 2450  static int
2118 2451  metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2119 2452  {
2120 2453          ASSERT(MUTEX_HELD(&msp->ms_lock));
2121 2454  
2122 2455          if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2123 2456                  int error = metaslab_load(msp);
2124 2457                  if (error != 0) {
2125 2458                          metaslab_group_sort(msp->ms_group, msp, 0);
2126 2459                          return (error);
2127 2460                  }
2128 2461                  if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2129 2462                          /*
2130 2463                           * The metaslab was activated for another allocator
2131 2464                           * while we were waiting, we should reselect.
2132 2465                           */
2133 2466                          return (EBUSY);
2134 2467                  }
2135 2468                  if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2136 2469                      allocator, activation_weight)) != 0) {
2137 2470                          return (error);
2138 2471                  }
2139 2472  
2140 2473                  msp->ms_activation_weight = msp->ms_weight;
2141 2474                  metaslab_group_sort(msp->ms_group, msp,
2142 2475                      msp->ms_weight | activation_weight);
2143 2476          }
2144 2477          ASSERT(msp->ms_loaded);
2145 2478          ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2146 2479  
2147 2480          return (0);
2148 2481  }
2149 2482  
2150 2483  static void
2151 2484  metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2152 2485      uint64_t weight)
2153 2486  {
2154 2487          ASSERT(MUTEX_HELD(&msp->ms_lock));
2155 2488          if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2156 2489                  metaslab_group_sort(mg, msp, weight);
2157 2490                  return;
2158 2491          }
2159 2492  
2160 2493          mutex_enter(&mg->mg_lock);
2161 2494          ASSERT3P(msp->ms_group, ==, mg);
2162 2495          if (msp->ms_primary) {
2163 2496                  ASSERT3U(0, <=, msp->ms_allocator);
2164 2497                  ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2165 2498                  ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2166 2499                  ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2167 2500                  mg->mg_primaries[msp->ms_allocator] = NULL;
2168 2501          } else {
2169 2502                  ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2170 2503                  ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2171 2504                  mg->mg_secondaries[msp->ms_allocator] = NULL;
2172 2505          }
2173 2506          msp->ms_allocator = -1;
2174 2507          metaslab_group_sort_impl(mg, msp, weight);
2175 2508          mutex_exit(&mg->mg_lock);
2176 2509  }
2177 2510  
2178 2511  static void
2179 2512  metaslab_passivate(metaslab_t *msp, uint64_t weight)
2180 2513  {
2181 2514          uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2182 2515  
2183 2516          /*
2184 2517           * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2185 2518           * this metaslab again.  In that case, it had better be empty,
2186 2519           * or we would be leaving space on the table.
2187 2520           */
2188 2521          ASSERT(size >= SPA_MINBLOCKSIZE ||
2189 2522              range_tree_is_empty(msp->ms_allocatable));
2190 2523          ASSERT0(weight & METASLAB_ACTIVE_MASK);
2191 2524  
2192 2525          msp->ms_activation_weight = 0;
2193 2526          metaslab_passivate_allocator(msp->ms_group, msp, weight);
2194 2527          ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2195 2528  }
2196 2529  
2197 2530  /*
2198 2531   * Segment-based metaslabs are activated once and remain active until
2199 2532   * we either fail an allocation attempt (similar to space-based metaslabs)
2200 2533   * or have exhausted the free space in zfs_metaslab_switch_threshold
2201 2534   * buckets since the metaslab was activated. This function checks to see
2202 2535   * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
2203 2536   * metaslab and passivates it proactively. This will allow us to select a
2204 2537   * metaslab with a larger contiguous region, if any remain within this
2205 2538   * metaslab group. If we're in sync pass > 1, then we continue using this
2206 2539   * metaslab so that we don't dirty more blocks and cause more sync passes.
2207 2540   */
2208 2541  void
2209 2542  metaslab_segment_may_passivate(metaslab_t *msp)
2210 2543  {
2211 2544          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2212 2545  
2213 2546          if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2214 2547                  return;
2215 2548  
2216 2549          /*
2217 2550           * Since we are in the middle of a sync pass, the most accurate
2218 2551           * information that is accessible to us is the in-core range tree
2219 2552           * histogram; calculate the new weight based on that information.
2220 2553           */
2221 2554          uint64_t weight = metaslab_weight_from_range_tree(msp);
2222 2555          int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2223 2556          int current_idx = WEIGHT_GET_INDEX(weight);
2224 2557  
2225 2558          if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2226 2559                  metaslab_passivate(msp, weight);
2227 2560  }
2228 2561  
2229 2562  static void
2230 2563  metaslab_preload(void *arg)
2231 2564  {
2232 2565          metaslab_t *msp = arg;
2233 2566          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2234 2567  
2235 2568          ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2236 2569  
2237 2570          mutex_enter(&msp->ms_lock);
2238 2571          (void) metaslab_load(msp);
2239 2572          msp->ms_selected_txg = spa_syncing_txg(spa);
2240 2573          mutex_exit(&msp->ms_lock);
2241 2574  }
2242 2575  
2243 2576  static void
2244 2577  metaslab_group_preload(metaslab_group_t *mg)
2245 2578  {
2246 2579          spa_t *spa = mg->mg_vd->vdev_spa;
2247 2580          metaslab_t *msp;
2248 2581          avl_tree_t *t = &mg->mg_metaslab_tree;
2249 2582          int m = 0;
2250 2583  
2251 2584          if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2252 2585                  taskq_wait(mg->mg_taskq);
2253 2586                  return;
2254 2587          }
2255 2588  
2256 2589          mutex_enter(&mg->mg_lock);
2257 2590  
2258 2591          /*
2259 2592           * Load the next potential metaslabs
2260 2593           */
2261 2594          for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2262 2595                  ASSERT3P(msp->ms_group, ==, mg);
2263 2596  
2264 2597                  /*
2265 2598                   * We preload only the maximum number of metaslabs specified
2266 2599                   * by metaslab_preload_limit. If a metaslab is being forced
2267 2600                   * to condense then we preload it too. This will ensure
2268 2601                   * that force condensing happens in the next txg.
2269 2602                   */
2270 2603                  if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2271 2604                          continue;
2272 2605                  }
2273 2606  
2274 2607                  VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2275 2608                      msp, TQ_SLEEP) != TASKQID_INVALID);
2276 2609          }
2277 2610          mutex_exit(&mg->mg_lock);
2278 2611  }
2279 2612  
2280 2613  /*
2281 2614   * Determine if the space map's on-disk footprint is past our tolerance
2282 2615   * for inefficiency. We would like to use the following criteria to make
2283 2616   * our decision:
2284 2617   *
2285 2618   * 1. The size of the space map object should not dramatically increase as a
2286 2619   * result of writing out the free space range tree.
2287 2620   *
2288 2621   * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2289 2622   * times the size of the free space range tree representation
2290 2623   * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2291 2624   *
2292 2625   * 3. The on-disk size of the space map should actually decrease.
2293 2626   *
2294 2627   * Unfortunately, we cannot compute the on-disk size of the space map in this
2295 2628   * context because we cannot accurately compute the effects of compression, etc.
2296 2629   * Instead, we apply the heuristic described in the block comment for
2297 2630   * zfs_metaslab_condense_block_threshold - we only condense if the space used
2298 2631   * is greater than a threshold number of blocks.
2299 2632   */
2300 2633  static boolean_t
2301 2634  metaslab_should_condense(metaslab_t *msp)
2302 2635  {
2303 2636          space_map_t *sm = msp->ms_sm;
2304 2637          vdev_t *vd = msp->ms_group->mg_vd;
2305 2638          uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
2306 2639          uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
2307 2640  
2308 2641          ASSERT(MUTEX_HELD(&msp->ms_lock));
2309 2642          ASSERT(msp->ms_loaded);
2310 2643  
2311 2644          /*
2312 2645           * Allocations and frees in early passes are generally more space
2313 2646           * efficient (in terms of blocks described in space map entries)
2314 2647           * than the ones in later passes (e.g. we don't compress after
2315 2648           * sync pass 5) and condensing a metaslab multiple times in a txg
2316 2649           * could degrade performance.
2317 2650           *
2318 2651           * Thus we prefer condensing each metaslab at most once every txg at
2319 2652           * the earliest sync pass possible. If a metaslab is eligible for
2320 2653           * condensing again after being considered for condensing within the
2321 2654           * same txg, it will hopefully be dirty in the next txg where it will
2322 2655           * be condensed at an earlier pass.
2323 2656           */
2324 2657          if (msp->ms_condense_checked_txg == current_txg)
2325 2658                  return (B_FALSE);
2326 2659          msp->ms_condense_checked_txg = current_txg;
2327 2660  
2328 2661          /*
2329 2662           * We always condense metaslabs that are empty and metaslabs for
2330 2663           * which a condense request has been made.
2331 2664           */
2332 2665          if (avl_is_empty(&msp->ms_allocatable_by_size) ||
2333 2666              msp->ms_condense_wanted)
2334 2667                  return (B_TRUE);
2335 2668  
2336 2669          uint64_t object_size = space_map_length(msp->ms_sm);
2337 2670          uint64_t optimal_size = space_map_estimate_optimal_size(sm,
2338 2671              msp->ms_allocatable, SM_NO_VDEVID);
2339 2672  
2340 2673          dmu_object_info_t doi;
2341 2674          dmu_object_info_from_db(sm->sm_dbuf, &doi);
2342 2675          uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2343 2676  
2344 2677          return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
2345 2678              object_size > zfs_metaslab_condense_block_threshold * record_size);
2346 2679  }
2347 2680  
2348 2681  /*
2349 2682   * Condense the on-disk space map representation to its minimized form.
2350 2683   * The minimized form consists of a small number of allocations followed by
2351 2684   * the entries of the free range tree.
2352 2685   */
2353 2686  static void
2354 2687  metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2355 2688  {
2356 2689          range_tree_t *condense_tree;
2357 2690          space_map_t *sm = msp->ms_sm;
2358 2691  
2359 2692          ASSERT(MUTEX_HELD(&msp->ms_lock));
2360 2693          ASSERT(msp->ms_loaded);
2361 2694  
2362 2695          zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2363 2696              "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2364 2697              msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2365 2698              msp->ms_group->mg_vd->vdev_spa->spa_name,
2366 2699              space_map_length(msp->ms_sm),
2367 2700              avl_numnodes(&msp->ms_allocatable->rt_root),
2368 2701              msp->ms_condense_wanted ? "TRUE" : "FALSE");
2369 2702  
2370 2703          msp->ms_condense_wanted = B_FALSE;
2371 2704  
2372 2705          /*
2373 2706           * Create a range tree that is 100% allocated. We remove segments
2374 2707           * that have been freed in this txg, any deferred frees that exist,
2375 2708           * and any allocation in the future. Removing segments should be
2376 2709           * a relatively inexpensive operation since we expect these trees to
2377 2710           * have a small number of nodes.
2378 2711           */
2379 2712          condense_tree = range_tree_create(NULL, NULL);
2380 2713          range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2381 2714  
2382 2715          range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
2383 2716          range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
2384 2717  
2385 2718          for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2386 2719                  range_tree_walk(msp->ms_defer[t],
2387 2720                      range_tree_remove, condense_tree);
2388 2721          }
2389 2722  
2390 2723          for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2391 2724                  range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
2392 2725                      range_tree_remove, condense_tree);
2393 2726          }
2394 2727  
2395 2728          /*
2396 2729           * We're about to drop the metaslab's lock thus allowing
2397 2730           * other consumers to change its content. Set the
2398 2731           * metaslab's ms_condensing flag to ensure that
2399 2732           * allocations on this metaslab do not occur while we're
2400 2733           * in the middle of committing it to disk. This is only critical
2401 2734           * for ms_allocatable as all other range trees use per txg
2402 2735           * views of their content.
2403 2736           */
2404 2737          msp->ms_condensing = B_TRUE;
2405 2738  
2406 2739          mutex_exit(&msp->ms_lock);
2407 2740          space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
2408 2741  
2409 2742          /*
2410 2743           * While we would ideally like to create a space map representation
2411 2744           * that consists only of allocation records, doing so can be
2412 2745           * prohibitively expensive because the in-core free tree can be
2413 2746           * large, and therefore computationally expensive to subtract
2414 2747           * from the condense_tree. Instead we sync out two trees, a cheap
2415 2748           * allocation only tree followed by the in-core free tree. While not
2416 2749           * optimal, this is typically close to optimal, and much cheaper to
2417 2750           * compute.
2418 2751           */
2419 2752          space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
2420 2753          range_tree_vacate(condense_tree, NULL, NULL);
2421 2754          range_tree_destroy(condense_tree);
2422 2755  
2423 2756          space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
2424 2757          mutex_enter(&msp->ms_lock);
2425 2758          msp->ms_condensing = B_FALSE;
2426 2759  }
2427 2760  
2428 2761  /*
2429 2762   * Write a metaslab to disk in the context of the specified transaction group.
2430 2763   */
2431 2764  void
2432 2765  metaslab_sync(metaslab_t *msp, uint64_t txg)
2433 2766  {
2434 2767          metaslab_group_t *mg = msp->ms_group;
2435 2768          vdev_t *vd = mg->mg_vd;
2436 2769          spa_t *spa = vd->vdev_spa;
2437 2770          objset_t *mos = spa_meta_objset(spa);
2438 2771          range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
2439 2772          dmu_tx_t *tx;
2440 2773          uint64_t object = space_map_object(msp->ms_sm);
2441 2774  
2442 2775          ASSERT(!vd->vdev_ishole);
2443 2776  
2444 2777          /*
2445 2778           * This metaslab has just been added so there's no work to do now.
2446 2779           */
2447 2780          if (msp->ms_freeing == NULL) {
2448 2781                  ASSERT3P(alloctree, ==, NULL);
2449 2782                  return;
2450 2783          }
2451 2784  
2452 2785          ASSERT3P(alloctree, !=, NULL);
2453 2786          ASSERT3P(msp->ms_freeing, !=, NULL);
2454 2787          ASSERT3P(msp->ms_freed, !=, NULL);
2455 2788          ASSERT3P(msp->ms_checkpointing, !=, NULL);
2456 2789  
2457 2790          /*
2458 2791           * Normally, we don't want to process a metaslab if there are no
2459 2792           * allocations or frees to perform. However, if the metaslab is being
2460 2793           * forced to condense and it's loaded, we need to let it through.
2461 2794           */

↓ open down ↓

364 lines elided

↑ open up ↑

2462 2795          if (range_tree_is_empty(alloctree) &&
2463 2796              range_tree_is_empty(msp->ms_freeing) &&
2464 2797              range_tree_is_empty(msp->ms_checkpointing) &&
2465 2798              !(msp->ms_loaded && msp->ms_condense_wanted))
2466 2799                  return;
2467 2800  
2468 2801  
2469 2802          VERIFY(txg <= spa_final_dirty_txg(spa));
2470 2803  
2471 2804          /*
2472      -         * The only state that can actually be changing concurrently with
2473      -         * metaslab_sync() is the metaslab's ms_allocatable.  No other
2474      -         * thread can be modifying this txg's alloc, freeing,
     2805 +         * The only state that can actually be changing concurrently
     2806 +         * with metaslab_sync() is the metaslab's ms_allocatable. No
     2807 +         * other thread can be modifying this txg's alloc, freeing,
2475 2808           * freed, or space_map_phys_t.  We drop ms_lock whenever we
2476      -         * could call into the DMU, because the DMU can call down to us
2477      -         * (e.g. via zio_free()) at any time.
     2809 +         * could call into the DMU, because the DMU can call down to
     2810 +         * us (e.g. via zio_free()) at any time.
2478 2811           *
2479 2812           * The spa_vdev_remove_thread() can be reading metaslab state
2480      -         * concurrently, and it is locked out by the ms_sync_lock.  Note
2481      -         * that the ms_lock is insufficient for this, because it is dropped
2482      -         * by space_map_write().
     2813 +         * concurrently, and it is locked out by the ms_sync_lock.
     2814 +         * Note that the ms_lock is insufficient for this, because it
     2815 +         * is dropped by space_map_write().
2483 2816           */
2484 2817          tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2485 2818  
2486 2819          if (msp->ms_sm == NULL) {
2487 2820                  uint64_t new_object;
2488 2821  
2489 2822                  new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2490 2823                  VERIFY3U(new_object, !=, 0);
2491 2824  
2492 2825                  VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2493 2826                      msp->ms_start, msp->ms_size, vd->vdev_ashift));
     2827 +
2494 2828                  ASSERT(msp->ms_sm != NULL);
     2829 +                ASSERT0(metaslab_allocated_space(msp));
2495 2830          }
2496 2831  
2497 2832          if (!range_tree_is_empty(msp->ms_checkpointing) &&
2498 2833              vd->vdev_checkpoint_sm == NULL) {
2499 2834                  ASSERT(spa_has_checkpoint(spa));
2500 2835  
2501 2836                  uint64_t new_object = space_map_alloc(mos,
2502 2837                      vdev_standard_sm_blksz, tx);
2503 2838                  VERIFY3U(new_object, !=, 0);
2504 2839

2505 2840                  VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2506 2841                      mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2507 2842                  ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2508 2843  
2509 2844                  /*
2510 2845                   * We save the space map object as an entry in vdev_top_zap
2511 2846                   * so it can be retrieved when the pool is reopened after an
2512 2847                   * export or through zdb.
2513 2848                   */
2514 2849                  VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2515 2850                      vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
2516 2851                      sizeof (new_object), 1, &new_object, tx));
2517 2852          }
2518 2853  
2519 2854          mutex_enter(&msp->ms_sync_lock);
2520 2855          mutex_enter(&msp->ms_lock);
2521 2856  
2522 2857          /*
2523 2858           * Note: metaslab_condense() clears the space map's histogram.
2524 2859           * Therefore we must verify and remove this histogram before
2525 2860           * condensing.
2526 2861           */
2527 2862          metaslab_group_histogram_verify(mg);
2528 2863          metaslab_class_histogram_verify(mg->mg_class);
2529 2864          metaslab_group_histogram_remove(mg, msp);
2530 2865  
2531 2866          if (msp->ms_loaded && metaslab_should_condense(msp)) {

↓ open down ↓

27 lines elided

↑ open up ↑

2532 2867                  metaslab_condense(msp, txg, tx);
2533 2868          } else {
2534 2869                  mutex_exit(&msp->ms_lock);
2535 2870                  space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2536 2871                      SM_NO_VDEVID, tx);
2537 2872                  space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2538 2873                      SM_NO_VDEVID, tx);
2539 2874                  mutex_enter(&msp->ms_lock);
2540 2875          }
2541 2876  
     2877 +        msp->ms_allocated_space += range_tree_space(alloctree);
     2878 +        ASSERT3U(msp->ms_allocated_space, >=,
     2879 +            range_tree_space(msp->ms_freeing));
     2880 +        msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
     2881 +
2542 2882          if (!range_tree_is_empty(msp->ms_checkpointing)) {
2543 2883                  ASSERT(spa_has_checkpoint(spa));
2544 2884                  ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2545 2885  
2546 2886                  /*
2547 2887                   * Since we are doing writes to disk and the ms_checkpointing
2548 2888                   * tree won't be changing during that time, we drop the
2549 2889                   * ms_lock while writing to the checkpoint space map.
2550 2890                   */
2551 2891                  mutex_exit(&msp->ms_lock);
2552 2892                  space_map_write(vd->vdev_checkpoint_sm,
2553 2893                      msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2554 2894                  mutex_enter(&msp->ms_lock);
2555      -                space_map_update(vd->vdev_checkpoint_sm);
2556 2895  
2557 2896                  spa->spa_checkpoint_info.sci_dspace +=
2558 2897                      range_tree_space(msp->ms_checkpointing);
2559 2898                  vd->vdev_stat.vs_checkpoint_space +=
2560 2899                      range_tree_space(msp->ms_checkpointing);
2561 2900                  ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2562      -                    -vd->vdev_checkpoint_sm->sm_alloc);
     2901 +                    -space_map_allocated(vd->vdev_checkpoint_sm));
2563 2902  
2564 2903                  range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2565 2904          }
2566 2905  
2567 2906          if (msp->ms_loaded) {
2568 2907                  /*
2569 2908                   * When the space map is loaded, we have an accurate
2570 2909                   * histogram in the range tree. This gives us an opportunity
2571 2910                   * to bring the space map's histogram up-to-date so we clear
2572 2911                   * it first before updating it.

2573 2912                   */
2574 2913                  space_map_histogram_clear(msp->ms_sm);
2575 2914                  space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2576 2915  
2577 2916                  /*
2578 2917                   * Since we've cleared the histogram we need to add back
2579 2918                   * any free space that has already been processed, plus
2580 2919                   * any deferred space. This allows the on-disk histogram
2581 2920                   * to accurately reflect all free space even if some space
2582 2921                   * is not yet available for allocation (i.e. deferred).
2583 2922                   */
2584 2923                  space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
2585 2924  
2586 2925                  /*
2587 2926                   * Add back any deferred free space that has not been
2588 2927                   * added back into the in-core free tree yet. This will
2589 2928                   * ensure that we don't end up with a space map histogram
2590 2929                   * that is completely empty unless the metaslab is fully
2591 2930                   * allocated.
2592 2931                   */
2593 2932                  for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2594 2933                          space_map_histogram_add(msp->ms_sm,
2595 2934                              msp->ms_defer[t], tx);
2596 2935                  }

↓ open down ↓

24 lines elided

↑ open up ↑

2597 2936          }
2598 2937  
2599 2938          /*
2600 2939           * Always add the free space from this sync pass to the space
2601 2940           * map histogram. We want to make sure that the on-disk histogram
2602 2941           * accounts for all free space. If the space map is not loaded,
2603 2942           * then we will lose some accuracy but will correct it the next
2604 2943           * time we load the space map.
2605 2944           */
2606 2945          space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
     2946 +        metaslab_aux_histograms_update(msp);
2607 2947  
2608 2948          metaslab_group_histogram_add(mg, msp);
2609 2949          metaslab_group_histogram_verify(mg);
2610 2950          metaslab_class_histogram_verify(mg->mg_class);
2611 2951  
2612 2952          /*
2613 2953           * For sync pass 1, we avoid traversing this txg's free range tree
2614      -         * and instead will just swap the pointers for freeing and
2615      -         * freed. We can safely do this since the freed_tree is
2616      -         * guaranteed to be empty on the initial pass.
     2954 +         * and instead will just swap the pointers for freeing and freed.
     2955 +         * We can safely do this since the freed_tree is guaranteed to be
     2956 +         * empty on the initial pass.
2617 2957           */
2618 2958          if (spa_sync_pass(spa) == 1) {
2619 2959                  range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
     2960 +                ASSERT0(msp->ms_allocated_this_txg);
2620 2961          } else {
2621 2962                  range_tree_vacate(msp->ms_freeing,
2622 2963                      range_tree_add, msp->ms_freed);
2623 2964          }
     2965 +        msp->ms_allocated_this_txg += range_tree_space(alloctree);
2624 2966          range_tree_vacate(alloctree, NULL, NULL);
2625 2967  
2626 2968          ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2627 2969          ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2628 2970              & TXG_MASK]));
2629 2971          ASSERT0(range_tree_space(msp->ms_freeing));
2630 2972          ASSERT0(range_tree_space(msp->ms_checkpointing));
2631 2973  
2632 2974          mutex_exit(&msp->ms_lock);
2633 2975

2634 2976          if (object != space_map_object(msp->ms_sm)) {
2635 2977                  object = space_map_object(msp->ms_sm);
2636 2978                  dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2637 2979                      msp->ms_id, sizeof (uint64_t), &object, tx);
2638 2980          }
2639 2981          mutex_exit(&msp->ms_sync_lock);
2640 2982          dmu_tx_commit(tx);
2641 2983  }
2642 2984  
2643 2985  /*
2644 2986   * Called after a transaction group has completely synced to mark
2645 2987   * all of the metaslab's free space as usable.
2646 2988   */
2647 2989  void
2648 2990  metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2649 2991  {
2650 2992          metaslab_group_t *mg = msp->ms_group;
2651 2993          vdev_t *vd = mg->mg_vd;
2652 2994          spa_t *spa = vd->vdev_spa;
2653 2995          range_tree_t **defer_tree;
2654 2996          int64_t alloc_delta, defer_delta;
2655 2997          boolean_t defer_allowed = B_TRUE;
2656 2998  
2657 2999          ASSERT(!vd->vdev_ishole);
2658 3000  
2659 3001          mutex_enter(&msp->ms_lock);
2660 3002  
2661 3003          /*
2662 3004           * If this metaslab is just becoming available, initialize its
2663 3005           * range trees and add its capacity to the vdev.
2664 3006           */
2665 3007          if (msp->ms_freed == NULL) {
2666 3008                  for (int t = 0; t < TXG_SIZE; t++) {
2667 3009                          ASSERT(msp->ms_allocating[t] == NULL);
2668 3010  
2669 3011                          msp->ms_allocating[t] = range_tree_create(NULL, NULL);
2670 3012                  }
2671 3013  
2672 3014                  ASSERT3P(msp->ms_freeing, ==, NULL);
2673 3015                  msp->ms_freeing = range_tree_create(NULL, NULL);
2674 3016  
2675 3017                  ASSERT3P(msp->ms_freed, ==, NULL);
2676 3018                  msp->ms_freed = range_tree_create(NULL, NULL);
2677 3019  
2678 3020                  for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2679 3021                          ASSERT(msp->ms_defer[t] == NULL);
2680 3022  
2681 3023                          msp->ms_defer[t] = range_tree_create(NULL, NULL);
2682 3024                  }
2683 3025  
2684 3026                  ASSERT3P(msp->ms_checkpointing, ==, NULL);
2685 3027                  msp->ms_checkpointing = range_tree_create(NULL, NULL);
2686 3028  
2687 3029                  metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
2688 3030          }
2689 3031          ASSERT0(range_tree_space(msp->ms_freeing));
2690 3032          ASSERT0(range_tree_space(msp->ms_checkpointing));

↓ open down ↓

57 lines elided

↑ open up ↑

2691 3033  
2692 3034          defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
2693 3035  
2694 3036          uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2695 3037              metaslab_class_get_alloc(spa_normal_class(spa));
2696 3038          if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2697 3039                  defer_allowed = B_FALSE;
2698 3040          }
2699 3041  
2700 3042          defer_delta = 0;
2701      -        alloc_delta = space_map_alloc_delta(msp->ms_sm);
     3043 +        alloc_delta = msp->ms_allocated_this_txg -
     3044 +            range_tree_space(msp->ms_freed);
2702 3045          if (defer_allowed) {
2703 3046                  defer_delta = range_tree_space(msp->ms_freed) -
2704 3047                      range_tree_space(*defer_tree);
2705 3048          } else {
2706 3049                  defer_delta -= range_tree_space(*defer_tree);
2707 3050          }
2708 3051  
2709 3052          metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
2710 3053              defer_delta, 0);
2711 3054

2712 3055          /*
2713 3056           * If there's a metaslab_load() in progress, wait for it to complete
2714 3057           * so that we have a consistent view of the in-core space map.
2715 3058           */
2716 3059          metaslab_load_wait(msp);
2717 3060  
2718 3061          /*
2719 3062           * Move the frees from the defer_tree back to the free
2720 3063           * range tree (if it's loaded). Swap the freed_tree and
2721 3064           * the defer_tree -- this is safe to do because we've
2722 3065           * just emptied out the defer_tree.

↓ open down ↓

11 lines elided

↑ open up ↑

2723 3066           */
2724 3067          range_tree_vacate(*defer_tree,
2725 3068              msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
2726 3069          if (defer_allowed) {
2727 3070                  range_tree_swap(&msp->ms_freed, defer_tree);
2728 3071          } else {
2729 3072                  range_tree_vacate(msp->ms_freed,
2730 3073                      msp->ms_loaded ? range_tree_add : NULL,
2731 3074                      msp->ms_allocatable);
2732 3075          }
2733      -        space_map_update(msp->ms_sm);
2734 3076  
     3077 +        msp->ms_synced_length = space_map_length(msp->ms_sm);
     3078 +
2735 3079          msp->ms_deferspace += defer_delta;
2736 3080          ASSERT3S(msp->ms_deferspace, >=, 0);
2737 3081          ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2738 3082          if (msp->ms_deferspace != 0) {
2739 3083                  /*
2740 3084                   * Keep syncing this metaslab until all deferred frees
2741 3085                   * are back in circulation.
2742 3086                   */
2743 3087                  vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2744 3088          }
     3089 +        metaslab_aux_histograms_update_done(msp, defer_allowed);
2745 3090  
2746 3091          if (msp->ms_new) {
2747 3092                  msp->ms_new = B_FALSE;
2748 3093                  mutex_enter(&mg->mg_lock);
2749 3094                  mg->mg_ms_ready++;
2750 3095                  mutex_exit(&mg->mg_lock);
2751 3096          }
     3097 +
2752 3098          /*
2753      -         * Calculate the new weights before unloading any metaslabs.
2754      -         * This will give us the most accurate weighting.
     3099 +         * Re-sort metaslab within its group now that we've adjusted
     3100 +         * its allocatable space.
2755 3101           */
2756      -        metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2757      -            (msp->ms_weight & METASLAB_ACTIVE_MASK));
     3102 +        metaslab_recalculate_weight_and_sort(msp);
2758 3103  
2759 3104          /*
2760 3105           * If the metaslab is loaded and we've not tried to load or allocate
2761 3106           * from it in 'metaslab_unload_delay' txgs, then unload it.
2762 3107           */
2763 3108          if (msp->ms_loaded &&
2764 3109              msp->ms_initializing == 0 &&
2765 3110              msp->ms_selected_txg + metaslab_unload_delay < txg) {
2766 3111                  for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2767 3112                          VERIFY0(range_tree_space(

2768 3113                              msp->ms_allocating[(txg + t) & TXG_MASK]));
2769 3114                  }
2770 3115                  if (msp->ms_allocator != -1) {
2771 3116                          metaslab_passivate(msp, msp->ms_weight &
2772 3117                              ~METASLAB_ACTIVE_MASK);
2773 3118                  }

↓ open down ↓

6 lines elided

↑ open up ↑

2774 3119  
2775 3120                  if (!metaslab_debug_unload)
2776 3121                          metaslab_unload(msp);
2777 3122          }
2778 3123  
2779 3124          ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2780 3125          ASSERT0(range_tree_space(msp->ms_freeing));
2781 3126          ASSERT0(range_tree_space(msp->ms_freed));
2782 3127          ASSERT0(range_tree_space(msp->ms_checkpointing));
2783 3128  
     3129 +        msp->ms_allocated_this_txg = 0;
2784 3130          mutex_exit(&msp->ms_lock);
2785 3131  }
2786 3132  
2787 3133  void
2788 3134  metaslab_sync_reassess(metaslab_group_t *mg)
2789 3135  {
2790 3136          spa_t *spa = mg->mg_class->mc_spa;
2791 3137  
2792 3138          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2793 3139          metaslab_group_alloc_update(mg);

2794 3140          mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2795 3141  
2796 3142          /*
2797 3143           * Preload the next potential metaslabs but only on active
2798 3144           * metaslab groups. We can get into a state where the metaslab
2799 3145           * is no longer active since we dirty metaslabs as we remove a
2800 3146           * a device, thus potentially making the metaslab group eligible
2801 3147           * for preloading.
2802 3148           */
2803 3149          if (mg->mg_activation_count > 0) {
2804 3150                  metaslab_group_preload(mg);
2805 3151          }
2806 3152          spa_config_exit(spa, SCL_ALLOC, FTAG);
2807 3153  }
2808 3154  
2809 3155  /*
2810 3156   * When writing a ditto block (i.e. more than one DVA for a given BP) on
2811 3157   * the same vdev as an existing DVA of this BP, then try to allocate it
2812 3158   * on a different metaslab than existing DVAs (i.e. a unique metaslab).
2813 3159   */
2814 3160  static boolean_t
2815 3161  metaslab_is_unique(metaslab_t *msp, dva_t *dva)
2816 3162  {
2817 3163          uint64_t dva_ms_id;
2818 3164  
2819 3165          if (DVA_GET_ASIZE(dva) == 0)
2820 3166                  return (B_TRUE);
2821 3167  
2822 3168          if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2823 3169                  return (B_TRUE);
2824 3170  
2825 3171          dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
2826 3172  
2827 3173          return (msp->ms_id != dva_ms_id);
2828 3174  }
2829 3175  
2830 3176  /*
2831 3177   * ==========================================================================
2832 3178   * Metaslab allocation tracing facility
2833 3179   * ==========================================================================
2834 3180   */
2835 3181  kstat_t *metaslab_trace_ksp;
2836 3182  kstat_named_t metaslab_trace_over_limit;
2837 3183  
2838 3184  void
2839 3185  metaslab_alloc_trace_init(void)
2840 3186  {
2841 3187          ASSERT(metaslab_alloc_trace_cache == NULL);
2842 3188          metaslab_alloc_trace_cache = kmem_cache_create(
2843 3189              "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2844 3190              0, NULL, NULL, NULL, NULL, NULL, 0);
2845 3191          metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2846 3192              "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2847 3193          if (metaslab_trace_ksp != NULL) {
2848 3194                  metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2849 3195                  kstat_named_init(&metaslab_trace_over_limit,
2850 3196                      "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2851 3197                  kstat_install(metaslab_trace_ksp);
2852 3198          }
2853 3199  }
2854 3200  
2855 3201  void
2856 3202  metaslab_alloc_trace_fini(void)
2857 3203  {
2858 3204          if (metaslab_trace_ksp != NULL) {
2859 3205                  kstat_delete(metaslab_trace_ksp);
2860 3206                  metaslab_trace_ksp = NULL;
2861 3207          }
2862 3208          kmem_cache_destroy(metaslab_alloc_trace_cache);
2863 3209          metaslab_alloc_trace_cache = NULL;
2864 3210  }
2865 3211  
2866 3212  /*
2867 3213   * Add an allocation trace element to the allocation tracing list.
2868 3214   */
2869 3215  static void
2870 3216  metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2871 3217      metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
2872 3218      int allocator)
2873 3219  {
2874 3220          if (!metaslab_trace_enabled)
2875 3221                  return;
2876 3222  
2877 3223          /*
2878 3224           * When the tracing list reaches its maximum we remove
2879 3225           * the second element in the list before adding a new one.
2880 3226           * By removing the second element we preserve the original
2881 3227           * entry as a clue to what allocation steps have already been
2882 3228           * performed.
2883 3229           */
2884 3230          if (zal->zal_size == metaslab_trace_max_entries) {
2885 3231                  metaslab_alloc_trace_t *mat_next;
2886 3232  #ifdef DEBUG
2887 3233                  panic("too many entries in allocation list");
2888 3234  #endif
2889 3235                  atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2890 3236                  zal->zal_size--;
2891 3237                  mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2892 3238                  list_remove(&zal->zal_list, mat_next);
2893 3239                  kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2894 3240          }
2895 3241  
2896 3242          metaslab_alloc_trace_t *mat =
2897 3243              kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2898 3244          list_link_init(&mat->mat_list_node);
2899 3245          mat->mat_mg = mg;
2900 3246          mat->mat_msp = msp;
2901 3247          mat->mat_size = psize;
2902 3248          mat->mat_dva_id = dva_id;
2903 3249          mat->mat_offset = offset;
2904 3250          mat->mat_weight = 0;
2905 3251          mat->mat_allocator = allocator;
2906 3252  
2907 3253          if (msp != NULL)
2908 3254                  mat->mat_weight = msp->ms_weight;
2909 3255  
2910 3256          /*
2911 3257           * The list is part of the zio so locking is not required. Only
2912 3258           * a single thread will perform allocations for a given zio.
2913 3259           */
2914 3260          list_insert_tail(&zal->zal_list, mat);
2915 3261          zal->zal_size++;
2916 3262  
2917 3263          ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2918 3264  }
2919 3265  
2920 3266  void
2921 3267  metaslab_trace_init(zio_alloc_list_t *zal)
2922 3268  {
2923 3269          list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2924 3270              offsetof(metaslab_alloc_trace_t, mat_list_node));
2925 3271          zal->zal_size = 0;
2926 3272  }
2927 3273  
2928 3274  void
2929 3275  metaslab_trace_fini(zio_alloc_list_t *zal)
2930 3276  {
                   /*
                    * Tear down an allocation trace: return every entry still on
                    * the list to metaslab_alloc_trace_cache, then destroy the
                    * list and reset the entry count.
                    */
2931 3277          metaslab_alloc_trace_t *mat;
2932 3278  
2933 3279          while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2934 3280                  kmem_cache_free(metaslab_alloc_trace_cache, mat);
2935 3281          list_destroy(&zal->zal_list);
2936 3282          zal->zal_size = 0;
2937 3283  }
2938 3284  
2939 3285  /*
2940 3286   * ==========================================================================
2941 3287   * Metaslab block operations
2942 3288   * ==========================================================================
2943 3289   */
2944 3290  
2945 3291  static void
2946 3292  metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
2947 3293      int allocator)
2948 3294  {
                   /*
                    * Add a reference, held by tag, to the per-allocator queue
                    * depth refcount of the metaslab group belonging to top-level
                    * vdev "vdev".
                    *
                    * Only allocations flagged METASLAB_ASYNC_ALLOC and not
                    * METASLAB_DONT_THROTTLE are counted.
                    */
2949 3295          if (!(flags & METASLAB_ASYNC_ALLOC) ||
2950 3296              (flags & METASLAB_DONT_THROTTLE))
2951 3297                  return;
2952 3298  
2953 3299          metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
                   /* Nothing to track when the class has throttling disabled. */
2954 3300          if (!mg->mg_class->mc_alloc_throttle_enabled)
2955 3301                  return;
2956 3302  
2957 3303          (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
2958 3304  }
2959 3305  
2960 3306  static void
2961 3307  metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
2962 3308  {
                   /*
                    * Raise this allocator's current maximum queue depth by one,
                    * as long as it is still below mg_max_alloc_queue_depth, and
                    * bump the class-wide mc_alloc_max_slots[allocator] to match.
                    */
2963 3309          uint64_t max = mg->mg_max_alloc_queue_depth;
2964 3310          uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2965 3311          while (cur < max) {
                           /*
                            * Only the caller whose compare-and-swap succeeds
                            * increments the class counter; on failure re-read
                            * the value and retry until it reaches max.
                            */
2966 3312                  if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
2967 3313                      cur, cur + 1) == cur) {
2968 3314                          atomic_inc_64(
2969 3315                              &mg->mg_class->mc_alloc_max_slots[allocator]);
2970 3316                          return;
2971 3317                  }
2972 3318                  cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2973 3319          }
2974 3320  }
2975 3321  
2976 3322  void
2977 3323  metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
2978 3324      int allocator, boolean_t io_complete)
2979 3325  {
                   /*
                    * Drop the reference held by tag on the per-allocator queue
                    * depth refcount of the metaslab group for top-level vdev
                    * "vdev".  The same filters apply as in
                    * metaslab_group_alloc_increment(): the flags must include
                    * METASLAB_ASYNC_ALLOC without METASLAB_DONT_THROTTLE, and
                    * the class must have throttling enabled.
                    */
2980 3326          if (!(flags & METASLAB_ASYNC_ALLOC) ||
2981 3327              (flags & METASLAB_DONT_THROTTLE))
2982 3328                  return;
2983 3329  
2984 3330          metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2985 3331          if (!mg->mg_class->mc_alloc_throttle_enabled)
2986 3332                  return;
2987 3333  
2988 3334          (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
                   /* When io_complete is set, also try to raise the queue depth. */
2989 3335          if (io_complete)
2990 3336                  metaslab_group_increment_qdepth(mg, allocator);
2991 3337  }
2992 3338  
2993 3339  void
2994 3340  metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
2995 3341      int allocator)
2996 3342  {
                   /*
                    * Debug check: for every DVA in bp, VERIFY that tag holds no
                    * reference on the queue depth refcount of that DVA's
                    * metaslab group for this allocator.  The body is compiled
                    * only when ZFS_DEBUG is defined; otherwise this is a no-op.
                    */
2997 3343  #ifdef ZFS_DEBUG
2998 3344          const dva_t *dva = bp->blk_dva;
2999 3345          int ndvas = BP_GET_NDVAS(bp);
3000 3346  
3001 3347          for (int d = 0; d < ndvas; d++) {
3002 3348                  uint64_t vdev = DVA_GET_VDEV(&dva[d]);
3003 3349                  metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3004 3350                  VERIFY(zfs_refcount_not_held(
3005 3351                      &mg->mg_alloc_queue_depth[allocator], tag));
3006 3352          }
3007 3353  #endif
3008 3354  }
3009 3355  
3010 3356  static uint64_t
3011 3357  metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
3012 3358  {
                   /*
                    * Ask the class's msop_alloc op for "size" bytes from msp.
                    * On success the range moves from ms_allocatable to
                    * ms_allocating[txg & TXG_MASK] and its start offset is
                    * returned.  Returns -1ULL when msop_alloc finds no space.
                    * ms_max_size is refreshed in both cases.
                    */
3013 3359          uint64_t start;
3014 3360          range_tree_t *rt = msp->ms_allocatable;
3015 3361          metaslab_class_t *mc = msp->ms_group->mg_class;
3016 3362  
3017 3363          VERIFY(!msp->ms_condensing);
3018 3364          VERIFY0(msp->ms_initializing);
3019 3365  
3020 3366          start = mc->mc_ops->msop_alloc(msp, size);
3021 3367          if (start != -1ULL) {
3022 3368                  metaslab_group_t *mg = msp->ms_group;
3023 3369                  vdev_t *vd = mg->mg_vd;
3024 3370  
                           /* Offset and size must both be ashift-aligned. */
3025 3371                  VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
3026 3372                  VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3027 3373                  VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
3028 3374                  range_tree_remove(rt, start, size);
3029 3375  
                           /*
                            * First allocation recorded for this txg slot: mark
                            * the metaslab dirty on its vdev for this txg.
                            */
3030 3376                  if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
3031 3377                          vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
3032 3378  
3033 3379                  range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
3034 3380  
3035 3381                  /* Track the last successful allocation */
3036 3382                  msp->ms_alloc_txg = txg;
3037 3383                  metaslab_verify_space(msp, txg);
3038 3384          }
3039 3385  
3040 3386          /*
3041 3387           * Now that we've attempted the allocation we need to update the
3042 3388           * metaslab's maximum block size since it may have changed.
3043 3389           */
3044 3390          msp->ms_max_size = metaslab_block_maxsize(msp);
3045 3391          return (start);
3046 3392  }
3047 3393  
3048 3394  /*
3049 3395   * Find the metaslab with the highest weight that is less than what we've
3050 3396   * already tried.  In the common case, this means that we will examine each
3051 3397   * metaslab at most once. Note that concurrent callers could reorder metaslabs
3052 3398   * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3053 3399   * activated by another thread, and we fail to allocate from the metaslab we
3054 3400   * have selected, we may not try the newly-activated metaslab, and instead
3055 3401   * activate another metaslab.  This is not optimal, but generally does not cause
3056 3402   * any problems (a possible exception being if every metaslab is completely full
3057 3403   * except for the newly-activated metaslab which we fail to examine).
            *
            * "search" is the caller's cursor into mg_metaslab_tree; it is updated
            * to sit just past the returned metaslab.  *was_active is set to whether
            * the last metaslab examined already had an allocator assigned.
            * Returns NULL once the tree has been exhausted.
3058 3404   */
3059 3405  static metaslab_t *
3060 3406  find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3061 3407      dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
3062 3408      zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3063 3409  {
3064 3410          avl_index_t idx;
3065 3411          avl_tree_t *t = &mg->mg_metaslab_tree;
                   /* No exact match for the cursor: start at the next entry after it. */
3066 3412          metaslab_t *msp = avl_find(t, search, &idx);
3067 3413          if (msp == NULL)
3068 3414                  msp = avl_nearest(t, idx, AVL_AFTER);
3069 3415  
3070 3416          for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3071 3417                  int i;
3072 3418                  if (!metaslab_should_allocate(msp, asize)) {
3073 3419                          metaslab_trace_add(zal, mg, msp, asize, d,
3074 3420                              TRACE_TOO_SMALL, allocator);
3075 3421                          continue;
3076 3422                  }
3077 3423  
3078 3424                  /*
3079 3425                   * If the selected metaslab is condensing or being
3080 3426                   * initialized, skip it.
3081 3427                   */
3082 3428                  if (msp->ms_condensing || msp->ms_initializing > 0)
3083 3429                          continue;
3084 3430  
3085 3431                  *was_active = msp->ms_allocator != -1;
3086 3432                  /*
3087 3433                   * If we're activating as primary, this is our first allocation
3088 3434                   * from this disk, so we don't need to check how close we are.
3089 3435                   * If the metaslab under consideration was already active,
3090 3436                   * we're getting desperate enough to steal another allocator's
3091 3437                   * metaslab, so we still don't care about distances.
3092 3438                   */
3093 3439                  if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3094 3440                          break;
3095 3441  
                           /*
                            * Accept this metaslab only if the loop runs to i == d,
                            * i.e. no earlier DVA failed the uniqueness check.
                            */
3096 3442                  for (i = 0; i < d; i++) {
3097 3443                          if (want_unique &&
3098 3444                              !metaslab_is_unique(msp, &dva[i]))
3099 3445                                  break;  /* try another metaslab */
3100 3446                  }
3101 3447                  if (i == d)
3102 3448                          break;
3103 3449          }
3104 3450  
                   /*
                    * Advance the cursor: copy the chosen metaslab's sort fields
                    * with ms_start bumped by one, so the next lookup resumes
                    * right after it.
                    */
3105 3451          if (msp != NULL) {
3106 3452                  search->ms_weight = msp->ms_weight;
3107 3453                  search->ms_start = msp->ms_start + 1;
3108 3454                  search->ms_allocator = msp->ms_allocator;
3109 3455                  search->ms_primary = msp->ms_primary;
3110 3456          }
3111 3457          return (msp);
3112 3458  }
3113 3459  
3114 3460  /* ARGSUSED */
3115 3461  static uint64_t
3116 3462  metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
3117 3463      uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
3118 3464      int d, int allocator)
3119 3465  {
                   /*
                    * Allocate asize bytes from some metaslab in mg.  Metaslabs are
                    * selected, locked, activated and tried one at a time until an
                    * allocation succeeds or no candidate is left.  Returns the
                    * allocated offset, or -1ULL if no metaslab could be selected.
                    */
3120 3466          metaslab_t *msp = NULL;
3121 3467          uint64_t offset = -1ULL;
3122 3468          uint64_t activation_weight;
3123 3469  
                   /*
                    * Choose the activation weight from how many of the earlier
                    * DVAs (dva[0..d-1]) already live on this vdev: none gives
                    * PRIMARY, one gives SECONDARY, two or more give CLAIM.
                    */
3124 3470          activation_weight = METASLAB_WEIGHT_PRIMARY;
3125 3471          for (int i = 0; i < d; i++) {
3126 3472                  if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3127 3473                      DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3128 3474                          activation_weight = METASLAB_WEIGHT_SECONDARY;
3129 3475                  } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3130 3476                      DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3131 3477                          activation_weight = METASLAB_WEIGHT_CLAIM;
3132 3478                          break;
3133 3479                  }
3134 3480          }
3135 3481  
3136 3482          /*
3137 3483           * If we don't have enough metaslabs active to fill the entire array, we
3138 3484           * just use the 0th slot.
3139 3485           */
3140 3486          if (mg->mg_ms_ready < mg->mg_allocators * 3)
3141 3487                  allocator = 0;
3142 3488  
3143 3489          ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
3144 3490  
                   /*
                    * "search" is a scratch metaslab_t used only as a cursor for
                    * find_valid_metaslab(); it is freed on every return path.
                    */
3145 3491          metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
3146 3492          search->ms_weight = UINT64_MAX;
3147 3493          search->ms_start = 0;
3148 3494          /*
3149 3495           * At the end of the metaslab tree are the already-active metaslabs,
3150 3496           * first the primaries, then the secondaries. When we resume searching
3151 3497           * through the tree, we need to consider ms_allocator and ms_primary so
3152 3498           * we start in the location right after where we left off, and don't
3153 3499           * accidentally loop forever considering the same metaslabs.
3154 3500           */
3155 3501          search->ms_allocator = -1;
3156 3502          search->ms_primary = B_TRUE;
3157 3503          for (;;) {
3158 3504                  boolean_t was_active = B_FALSE;
3159 3505  
3160 3506                  mutex_enter(&mg->mg_lock);
3161 3507  
                           /*
                            * Prefer this allocator's already-active primary or
                            * secondary metaslab; otherwise search the tree.
                            */
3162 3508                  if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3163 3509                      mg->mg_primaries[allocator] != NULL) {
3164 3510                          msp = mg->mg_primaries[allocator];
3165 3511                          was_active = B_TRUE;
3166 3512                  } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3167 3513                      mg->mg_secondaries[allocator] != NULL) {
3168 3514                          msp = mg->mg_secondaries[allocator];
3169 3515                          was_active = B_TRUE;
3170 3516                  } else {
3171 3517                          msp = find_valid_metaslab(mg, activation_weight, dva, d,
3172 3518                              want_unique, asize, allocator, zal, search,
3173 3519                              &was_active);
3174 3520                  }
3175 3521  
3176 3522                  mutex_exit(&mg->mg_lock);
3177 3523                  if (msp == NULL) {
3178 3524                          kmem_free(search, sizeof (*search));
3179 3525                          return (-1ULL);
3180 3526                  }
3181 3527  
3182 3528                  mutex_enter(&msp->ms_lock);
3183 3529                  /*
3184 3530                   * Ensure that the metaslab we have selected is still
3185 3531                   * capable of handling our request. It's possible that
3186 3532                   * another thread may have changed the weight while we
3187 3533                   * were blocked on the metaslab lock. We check the
3188 3534                   * active status first to see if we need to reselect
3189 3535                   * a new metaslab.
3190 3536                   */
3191 3537                  if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3192 3538                          mutex_exit(&msp->ms_lock);
3193 3539                          continue;
3194 3540                  }
3195 3541  
3196 3542                  /*
3197 3543                   * If the metaslab is freshly activated for an allocator that
3198 3544                   * isn't the one we're allocating from, or if it's a primary and
3199 3545                   * we're seeking a secondary (or vice versa), we go back and
3200 3546                   * select a new metaslab.
3201 3547                   */
3202 3548                  if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3203 3549                      (msp->ms_allocator != -1) &&
3204 3550                      (msp->ms_allocator != allocator || ((activation_weight ==
3205 3551                      METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
3206 3552                          mutex_exit(&msp->ms_lock);
3207 3553                          continue;
3208 3554                  }
3209 3555  
                           /*
                            * The metaslab carries the CLAIM weight but we are not
                            * activating with CLAIM: passivate it with that bit
                            * cleared and pick again.
                            */
3210 3556                  if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
3211 3557                      activation_weight != METASLAB_WEIGHT_CLAIM) {
3212 3558                          metaslab_passivate(msp, msp->ms_weight &
3213 3559                              ~METASLAB_WEIGHT_CLAIM);
3214 3560                          mutex_exit(&msp->ms_lock);
3215 3561                          continue;
3216 3562                  }
3217 3563  
3218 3564                  if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3219 3565                          mutex_exit(&msp->ms_lock);
3220 3566                          continue;
3221 3567                  }
3222 3568  
3223 3569                  msp->ms_selected_txg = txg;
3224 3570  
3225 3571                  /*
3226 3572                   * Now that we have the lock, recheck to see if we should
3227 3573                   * continue to use this metaslab for this allocation. The
3228 3574                   * metaslab is now loaded so metaslab_should_allocate() can
3229 3575                   * accurately determine if the allocation attempt should
3230 3576                   * proceed.
3231 3577                   */
3232 3578                  if (!metaslab_should_allocate(msp, asize)) {
3233 3579                          /* Passivate this metaslab and select a new one. */
3234 3580                          metaslab_trace_add(zal, mg, msp, asize, d,
3235 3581                              TRACE_TOO_SMALL, allocator);
3236 3582                          goto next;
3237 3583                  }
3238 3584  
3239 3585                  /*
3240 3586                   * If this metaslab is currently condensing then pick again as
3241 3587                   * we can't manipulate this metaslab until it's committed
3242 3588                   * to disk. If this metaslab is being initialized, we shouldn't
3243 3589                   * allocate from it since the allocated region might be
3244 3590                   * overwritten after allocation.
3245 3591                   */
3246 3592                  if (msp->ms_condensing) {
3247 3593                          metaslab_trace_add(zal, mg, msp, asize, d,
3248 3594                              TRACE_CONDENSING, allocator);
3249 3595                          metaslab_passivate(msp, msp->ms_weight &
3250 3596                              ~METASLAB_ACTIVE_MASK);
3251 3597                          mutex_exit(&msp->ms_lock);
3252 3598                          continue;
3253 3599                  } else if (msp->ms_initializing > 0) {
3254 3600                          metaslab_trace_add(zal, mg, msp, asize, d,
3255 3601                              TRACE_INITIALIZING, allocator);
3256 3602                          metaslab_passivate(msp, msp->ms_weight &
3257 3603                              ~METASLAB_ACTIVE_MASK);
3258 3604                          mutex_exit(&msp->ms_lock);
3259 3605                          continue;
3260 3606                  }
3261 3607  
3262 3608                  offset = metaslab_block_alloc(msp, asize, txg);
3263 3609                  metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
3264 3610  
3265 3611                  if (offset != -1ULL) {
3266 3612                          /* Proactively passivate the metaslab, if needed */
3267 3613                          metaslab_segment_may_passivate(msp);
3268 3614                          break;
3269 3615                  }
3270 3616  next:
3271 3617                  ASSERT(msp->ms_loaded);
3272 3618  
3273 3619                  /*
3274 3620                   * We were unable to allocate from this metaslab so determine
3275 3621                   * a new weight for this metaslab. Now that we have loaded
3276 3622                   * the metaslab we can provide a better hint to the metaslab
3277 3623                   * selector.
3278 3624                   *
3279 3625                   * For space-based metaslabs, we use the maximum block size.
3280 3626                   * This information is only available when the metaslab
3281 3627                   * is loaded and is more accurate than the generic free
3282 3628                   * space weight that was calculated by metaslab_weight().
3283 3629                   * This information allows us to quickly compare the maximum
3284 3630                   * available allocation in the metaslab to the allocation
3285 3631                   * size being requested.
3286 3632                   *
3287 3633                   * For segment-based metaslabs, determine the new weight
3288 3634                   * based on the highest bucket in the range tree. We
3289 3635                   * explicitly use the loaded segment weight (i.e. the range
3290 3636                   * tree histogram) since it contains the space that is
3291 3637                   * currently available for allocation and is accurate
3292 3638                   * even within a sync pass.
3293 3639                   */
3294 3640                  if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3295 3641                          uint64_t weight = metaslab_block_maxsize(msp);
3296 3642                          WEIGHT_SET_SPACEBASED(weight);
3297 3643                          metaslab_passivate(msp, weight);
3298 3644                  } else {
3299 3645                          metaslab_passivate(msp,
3300 3646                              metaslab_weight_from_range_tree(msp));
3301 3647                  }
3302 3648  
3303 3649                  /*
3304 3650                   * We have just failed an allocation attempt, check
3305 3651                   * that metaslab_should_allocate() agrees. Otherwise,
3306 3652                   * we may end up in an infinite loop retrying the same
3307 3653                   * metaslab.
3308 3654                   */
3309 3655                  ASSERT(!metaslab_should_allocate(msp, asize));
3310 3656  
3311 3657                  mutex_exit(&msp->ms_lock);
3312 3658          }
                   /*
                    * The loop is left only through the break after a successful
                    * allocation, so ms_lock is still held here.
                    */
3313 3659          mutex_exit(&msp->ms_lock);
3314 3660          kmem_free(search, sizeof (*search));
3315 3661          return (offset);
3316 3662  }
3317 3663  
3318 3664  static uint64_t
3319 3665  metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3320 3666      uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
3321 3667      int d, int allocator)
3322 3668  {
                   /*
                    * Wrapper around metaslab_group_alloc_normal() that keeps the
                    * group's counters up to date under mg_lock.  mg_allocations is
                    * bumped on every call.  On failure mg_failed_allocations is
                    * bumped too and a TRACE_GROUP_FAILURE entry is recorded.
                    * Returns the allocated offset, or -1ULL on failure.
                    */
3323 3669          uint64_t offset;
3324 3670          ASSERT(mg->mg_initialized);
3325 3671  
3326 3672          offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
3327 3673              dva, d, allocator);
3328 3674  
3329 3675          mutex_enter(&mg->mg_lock);
3330 3676          if (offset == -1ULL) {
3331 3677                  mg->mg_failed_allocations++;
3332 3678                  metaslab_trace_add(zal, mg, NULL, asize, d,
3333 3679                      TRACE_GROUP_FAILURE, allocator);
3334 3680                  if (asize == SPA_GANGBLOCKSIZE) {
3335 3681                          /*
3336 3682                           * This metaslab group was unable to allocate
3337 3683                           * the minimum gang block size so it must be out of
3338 3684                           * space. We must notify the allocation throttle
3339 3685                           * to start skipping allocation attempts to this
3340 3686                           * metaslab group until more space becomes available.
3341 3687                           * Note: this failure cannot be caused by the
3342 3688                           * allocation throttle since the allocation throttle
3343 3689                           * is only responsible for skipping devices and
3344 3690                           * not failing block allocations.
3345 3691                           */
3346 3692                          mg->mg_no_free_space = B_TRUE;
3347 3693                  }
3348 3694          }
3349 3695          mg->mg_allocations++;
3350 3696          mutex_exit(&mg->mg_lock);
3351 3697          return (offset);
3352 3698  }
3353 3699  
3354 3700  /*
3355 3701   * Allocate a block for the specified i/o.
            *
            * On success, dva[d] is filled in (vdev, offset, gang bit, asize) and
            * 0 is returned.  ENOSPC is returned when the forced-ganging test path
            * fires, or when no metaslab group yields space even after a second
            * pass with try_hard set; in the latter case dva[d] is zeroed.
3356 3702   */
3357 3703  int
3358 3704  metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3359 3705      dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3360 3706      zio_alloc_list_t *zal, int allocator)
3361 3707  {
3362 3708          metaslab_group_t *mg, *rotor;
3363 3709          vdev_t *vd;
3364 3710          boolean_t try_hard = B_FALSE;
3365 3711  
3366 3712          ASSERT(!DVA_IS_VALID(&dva[d]));
3367 3713  
3368 3714          /*
3369 3715           * For testing, make some blocks above a certain size be gang blocks.
3370 3716           * This will also test spilling from special to normal.
3371 3717           */
3372 3718          if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3373 3719                  metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
3374 3720                      allocator);
3375 3721                  return (SET_ERROR(ENOSPC));
3376 3722          }
3377 3723  
3378 3724          /*
3379 3725           * Start at the rotor and loop through all mgs until we find something.
3380 3726           * Note that there's no locking on mc_rotor or mc_aliquot because
3381 3727           * nothing actually breaks if we miss a few updates -- we just won't
3382 3728           * allocate quite as evenly.  It all balances out over time.
3383 3729           *
3384 3730           * If we are doing ditto or log blocks, try to spread them across
3385 3731           * consecutive vdevs.  If we're forced to reuse a vdev before we've
3386 3732           * allocated all of our ditto blocks, then try and spread them out on
3387 3733           * that vdev as much as possible.  If it turns out to not be possible,
3388 3734           * gradually lower our standards until anything becomes acceptable.
3389 3735           * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3390 3736           * gives us hope of containing our fault domains to something we're
3391 3737           * able to reason about.  Otherwise, any two top-level vdev failures
3392 3738           * will guarantee the loss of data.  With consecutive allocation,
3393 3739           * only two adjacent top-level vdev failures will result in data loss.
3394 3740           *
3395 3741           * If we are doing gang blocks (hintdva is non-NULL), try to keep
3396 3742           * ourselves on the same vdev as our gang block header.  That
3397 3743           * way, we can hope for locality in vdev_cache, plus it makes our
3398 3744           * fault domains something tractable.
3399 3745           */
3400 3746          if (hintdva) {
3401 3747                  vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3402 3748  
3403 3749                  /*
3404 3750                   * It's possible the vdev we're using as the hint no
3405 3751                   * longer exists or its mg has been closed (e.g. by
3406 3752                   * device removal).  Consult the rotor when
3407 3753                   * all else fails.
3408 3754                   */
3409 3755                  if (vd != NULL && vd->vdev_mg != NULL) {
3410 3756                          mg = vd->vdev_mg;
3411 3757  
3412 3758                          if (flags & METASLAB_HINTBP_AVOID &&
3413 3759                              mg->mg_next != NULL)
3414 3760                                  mg = mg->mg_next;
3415 3761                  } else {
3416 3762                          mg = mc->mc_rotor;
3417 3763                  }
3418 3764          } else if (d != 0) {
                           /* Start at the group after the previous DVA's group. */
3419 3765                  vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3420 3766                  mg = vd->vdev_mg->mg_next;
3421 3767          } else {
3422 3768                  ASSERT(mc->mc_rotor != NULL);
3423 3769                  mg = mc->mc_rotor;
3424 3770          }
3425 3771  
3426 3772          /*
3427 3773           * If the hint put us into the wrong metaslab class, or into a
3428 3774           * metaslab group that has been passivated, just follow the rotor.
3429 3775           */
3430 3776          if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3431 3777                  mg = mc->mc_rotor;
3432 3778  
                   /* Remember the starting group so the walk stops after one lap. */
3433 3779          rotor = mg;
3434 3780  top:
3435 3781          do {
3436 3782                  boolean_t allocatable;
3437 3783  
3438 3784                  ASSERT(mg->mg_activation_count == 1);
3439 3785                  vd = mg->mg_vd;
3440 3786  
3441 3787                  /*
3442 3788                   * Don't allocate from faulted devices.
3443 3789                   */
3444 3790                  if (try_hard) {
3445 3791                          spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3446 3792                          allocatable = vdev_allocatable(vd);
3447 3793                          spa_config_exit(spa, SCL_ZIO, FTAG);
3448 3794                  } else {
3449 3795                          allocatable = vdev_allocatable(vd);
3450 3796                  }
3451 3797  
3452 3798                  /*
3453 3799                   * Determine if the selected metaslab group is eligible
3454 3800                   * for allocations. If we're ganging then don't allow
3455 3801                   * this metaslab group to skip allocations since that would
3456 3802                   * inadvertently return ENOSPC and suspend the pool
3457 3803                   * even though space is still available.
3458 3804                   */
3459 3805                  if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3460 3806                          allocatable = metaslab_group_allocatable(mg, rotor,
3461 3807                              psize, allocator);
3462 3808                  }
3463 3809  
3464 3810                  if (!allocatable) {
3465 3811                          metaslab_trace_add(zal, mg, NULL, psize, d,
3466 3812                              TRACE_NOT_ALLOCATABLE, allocator);
3467 3813                          goto next;
3468 3814                  }
3469 3815  
3470 3816                  ASSERT(mg->mg_initialized);
3471 3817  
3472 3818                  /*
3473 3819                   * Avoid writing single-copy data to a failing,
3474 3820                   * non-redundant vdev, unless we've already tried all
3475 3821                   * other vdevs.
3476 3822                   */
3477 3823                  if ((vd->vdev_stat.vs_write_errors > 0 ||
3478 3824                      vd->vdev_state < VDEV_STATE_HEALTHY) &&
3479 3825                      d == 0 && !try_hard && vd->vdev_children == 0) {
3480 3826                          metaslab_trace_add(zal, mg, NULL, psize, d,
3481 3827                              TRACE_VDEV_ERROR, allocator);
3482 3828                          goto next;
3483 3829                  }
3484 3830  
3485 3831                  ASSERT(mg->mg_class == mc);
3486 3832  
3487 3833                  uint64_t asize = vdev_psize_to_asize(vd, psize);
3488 3834                  ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3489 3835  
3490 3836                  /*
3491 3837                   * If we don't need to try hard, then require that the
3492 3838                   * block be on a different metaslab from any other DVAs
3493 3839                   * in this BP (unique=true).  If we are trying hard, then
3494 3840                   * allow any metaslab to be used (unique=false).
3495 3841                   */
3496 3842                  uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3497 3843                      !try_hard, dva, d, allocator);
3498 3844  
3499 3845                  if (offset != -1ULL) {
3500 3846                          /*
3501 3847                           * If we've just selected this metaslab group,
3502 3848                           * figure out whether the corresponding vdev is
3503 3849                           * over- or under-used relative to the pool,
3504 3850                           * and set an allocation bias to even it out.
3505 3851                           */
3506 3852                          if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3507 3853                                  vdev_stat_t *vs = &vd->vdev_stat;
3508 3854                                  int64_t vu, cu;
3509 3855  
                                           /* Percent used; the +1 avoids dividing by zero. */
3510 3856                                  vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3511 3857                                  cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3512 3858  
3513 3859                                  /*
3514 3860                                   * Calculate how much more or less we should
3515 3861                                   * try to allocate from this device during
3516 3862                                   * this iteration around the rotor.
3517 3863                                   * For example, if a device is 80% full
3518 3864                                   * and the pool is 20% full then we should
3519 3865                                   * reduce allocations by 60% on this device.
3520 3866                                   *
3521 3867                                   * mg_bias = (20 - 80) * 512K / 100 = -307K
3522 3868                                   *
3523 3869                                   * This reduces allocations by 307K for this
3524 3870                                   * iteration.
3525 3871                                   */
3526 3872                                  mg->mg_bias = ((cu - vu) *
3527 3873                                      (int64_t)mg->mg_aliquot) / 100;
3528 3874                          } else if (!metaslab_bias_enabled) {
3529 3875                                  mg->mg_bias = 0;
3530 3876                          }
3531 3877  
                                   /*
                                    * Once this group has taken its biased aliquot,
                                    * advance the rotor to the next group.
                                    */
3532 3878                          if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3533 3879                              mg->mg_aliquot + mg->mg_bias) {
3534 3880                                  mc->mc_rotor = mg->mg_next;
3535 3881                                  mc->mc_aliquot = 0;
3536 3882                          }
3537 3883  
3538 3884                          DVA_SET_VDEV(&dva[d], vd->vdev_id);
3539 3885                          DVA_SET_OFFSET(&dva[d], offset);
3540 3886                          DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3541 3887                          DVA_SET_ASIZE(&dva[d], asize);
3542 3888  
3543 3889                          return (0);
3544 3890                  }
3545 3891  next:
3546 3892                  mc->mc_rotor = mg->mg_next;
3547 3893                  mc->mc_aliquot = 0;
3548 3894          } while ((mg = mg->mg_next) != rotor);
3549 3895  
3550 3896          /*
3551 3897           * If we haven't tried hard, do so now.
3552 3898           */
3553 3899          if (!try_hard) {
3554 3900                  try_hard = B_TRUE;
3555 3901                  goto top;
3556 3902          }
3557 3903  
3558 3904          bzero(&dva[d], sizeof (dva_t));
3559 3905  
3560 3906          metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
3561 3907          return (SET_ERROR(ENOSPC));
3562 3908  }
3563 3909  
3564 3910  void
3565 3911  metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3566 3912      boolean_t checkpoint)
3567 3913  {
3568 3914          metaslab_t *msp;
3569 3915          spa_t *spa = vd->vdev_spa;
3570 3916  
3571 3917          ASSERT(vdev_is_concrete(vd));
3572 3918          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3573 3919          ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3574 3920  
3575 3921          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3576 3922  
3577 3923          VERIFY(!msp->ms_condensing);
3578 3924          VERIFY3U(offset, >=, msp->ms_start);
3579 3925          VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3580 3926          VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3581 3927          VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3582 3928  
3583 3929          metaslab_check_free_impl(vd, offset, asize);
3584 3930  
3585 3931          mutex_enter(&msp->ms_lock);
3586 3932          if (range_tree_is_empty(msp->ms_freeing) &&
3587 3933              range_tree_is_empty(msp->ms_checkpointing)) {
3588 3934                  vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
3589 3935          }
3590 3936  
3591 3937          if (checkpoint) {
3592 3938                  ASSERT(spa_has_checkpoint(spa));
3593 3939                  range_tree_add(msp->ms_checkpointing, offset, asize);
3594 3940          } else {
3595 3941                  range_tree_add(msp->ms_freeing, offset, asize);
3596 3942          }
3597 3943          mutex_exit(&msp->ms_lock);
3598 3944  }
3599 3945  
3600 3946  /* ARGSUSED */
3601 3947  void
3602 3948  metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3603 3949      uint64_t size, void *arg)
3604 3950  {
3605 3951          boolean_t *checkpoint = arg;
3606 3952  
3607 3953          ASSERT3P(checkpoint, !=, NULL);
3608 3954  
3609 3955          if (vd->vdev_ops->vdev_op_remap != NULL)
3610 3956                  vdev_indirect_mark_obsolete(vd, offset, size);
3611 3957          else
3612 3958                  metaslab_free_impl(vd, offset, size, *checkpoint);
3613 3959  }
3614 3960  
3615 3961  static void
3616 3962  metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3617 3963      boolean_t checkpoint)
3618 3964  {
3619 3965          spa_t *spa = vd->vdev_spa;
3620 3966  
3621 3967          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3622 3968  
3623 3969          if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
3624 3970                  return;
3625 3971  
3626 3972          if (spa->spa_vdev_removal != NULL &&
3627 3973              spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
3628 3974              vdev_is_concrete(vd)) {
3629 3975                  /*
3630 3976                   * Note: we check if the vdev is concrete because when
3631 3977                   * we complete the removal, we first change the vdev to be
3632 3978                   * an indirect vdev (in open context), and then (in syncing
3633 3979                   * context) clear spa_vdev_removal.
3634 3980                   */
3635 3981                  free_from_removing_vdev(vd, offset, size);
3636 3982          } else if (vd->vdev_ops->vdev_op_remap != NULL) {
3637 3983                  vdev_indirect_mark_obsolete(vd, offset, size);
3638 3984                  vd->vdev_ops->vdev_op_remap(vd, offset, size,
3639 3985                      metaslab_free_impl_cb, &checkpoint);
3640 3986          } else {
3641 3987                  metaslab_free_concrete(vd, offset, size, checkpoint);
3642 3988          }
3643 3989  }
3644 3990  
3645 3991  typedef struct remap_blkptr_cb_arg {
3646 3992          blkptr_t *rbca_bp;
3647 3993          spa_remap_cb_t rbca_cb;
3648 3994          vdev_t *rbca_remap_vd;
3649 3995          uint64_t rbca_remap_offset;
3650 3996          void *rbca_cb_arg;
3651 3997  } remap_blkptr_cb_arg_t;
3652 3998  
3653 3999  void
3654 4000  remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3655 4001      uint64_t size, void *arg)
3656 4002  {
3657 4003          remap_blkptr_cb_arg_t *rbca = arg;
3658 4004          blkptr_t *bp = rbca->rbca_bp;
3659 4005  
3660 4006          /* We can not remap split blocks. */
3661 4007          if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3662 4008                  return;
3663 4009          ASSERT0(inner_offset);
3664 4010  
3665 4011          if (rbca->rbca_cb != NULL) {
3666 4012                  /*
3667 4013                   * At this point we know that we are not handling split
3668 4014                   * blocks and we invoke the callback on the previous
3669 4015                   * vdev which must be indirect.
3670 4016                   */
3671 4017                  ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3672 4018  
3673 4019                  rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3674 4020                      rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3675 4021  
3676 4022                  /* set up remap_blkptr_cb_arg for the next call */
3677 4023                  rbca->rbca_remap_vd = vd;
3678 4024                  rbca->rbca_remap_offset = offset;
3679 4025          }
3680 4026  
3681 4027          /*
3682 4028           * The phys birth time is that of dva[0].  This ensures that we know
3683 4029           * when each dva was written, so that resilver can determine which
3684 4030           * blocks need to be scrubbed (i.e. those written during the time
3685 4031           * the vdev was offline).  It also ensures that the key used in
3686 4032           * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
3687 4033           * we didn't change the phys_birth, a lookup in the ARC for a
3688 4034           * remapped BP could find the data that was previously stored at
3689 4035           * this vdev + offset.
3690 4036           */
3691 4037          vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3692 4038              DVA_GET_VDEV(&bp->blk_dva[0]));
3693 4039          vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3694 4040          bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3695 4041              DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3696 4042  
3697 4043          DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3698 4044          DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3699 4045  }
3700 4046  
3701 4047  /*
3702 4048   * If the block pointer contains any indirect DVAs, modify them to refer to
3703 4049   * concrete DVAs.  Note that this will sometimes not be possible, leaving
3704 4050   * the indirect DVA in place.  This happens if the indirect DVA spans multiple
3705 4051   * segments in the mapping (i.e. it is a "split block").
3706 4052   *
3707 4053   * If the BP was remapped, calls the callback on the original dva (note the
3708 4054   * callback can be called multiple times if the original indirect DVA refers
3709 4055   * to another indirect DVA, etc).
3710 4056   *
3711 4057   * Returns TRUE if the BP was remapped.
3712 4058   */
3713 4059  boolean_t
3714 4060  spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3715 4061  {
3716 4062          remap_blkptr_cb_arg_t rbca;
3717 4063  
3718 4064          if (!zfs_remap_blkptr_enable)
3719 4065                  return (B_FALSE);
3720 4066  
3721 4067          if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3722 4068                  return (B_FALSE);
3723 4069  
3724 4070          /*
3725 4071           * Dedup BP's can not be remapped, because ddt_phys_select() depends
3726 4072           * on DVA[0] being the same in the BP as in the DDT (dedup table).
3727 4073           */
3728 4074          if (BP_GET_DEDUP(bp))
3729 4075                  return (B_FALSE);
3730 4076  
3731 4077          /*
3732 4078           * Gang blocks can not be remapped, because
3733 4079           * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3734 4080           * the BP used to read the gang block header (GBH) being the same
3735 4081           * as the DVA[0] that we allocated for the GBH.
3736 4082           */
3737 4083          if (BP_IS_GANG(bp))
3738 4084                  return (B_FALSE);
3739 4085  
3740 4086          /*
3741 4087           * Embedded BP's have no DVA to remap.
3742 4088           */
3743 4089          if (BP_GET_NDVAS(bp) < 1)
3744 4090                  return (B_FALSE);
3745 4091  
3746 4092          /*
3747 4093           * Note: we only remap dva[0].  If we remapped other dvas, we
3748 4094           * would no longer know what their phys birth txg is.
3749 4095           */
3750 4096          dva_t *dva = &bp->blk_dva[0];
3751 4097  
3752 4098          uint64_t offset = DVA_GET_OFFSET(dva);
3753 4099          uint64_t size = DVA_GET_ASIZE(dva);
3754 4100          vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3755 4101  
3756 4102          if (vd->vdev_ops->vdev_op_remap == NULL)
3757 4103                  return (B_FALSE);
3758 4104  
3759 4105          rbca.rbca_bp = bp;
3760 4106          rbca.rbca_cb = callback;
3761 4107          rbca.rbca_remap_vd = vd;
3762 4108          rbca.rbca_remap_offset = offset;
3763 4109          rbca.rbca_cb_arg = arg;
3764 4110  
3765 4111          /*
3766 4112           * remap_blkptr_cb() will be called in order for each level of
3767 4113           * indirection, until a concrete vdev is reached or a split block is
3768 4114           * encountered. old_vd and old_offset are updated within the callback
3769 4115           * as we go from the one indirect vdev to the next one (either concrete
3770 4116           * or indirect again) in that order.
3771 4117           */
3772 4118          vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3773 4119  
3774 4120          /* Check if the DVA wasn't remapped because it is a split block */
3775 4121          if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3776 4122                  return (B_FALSE);
3777 4123  
3778 4124          return (B_TRUE);
3779 4125  }
3780 4126  
3781 4127  /*
3782 4128   * Undo the allocation of a DVA which happened in the given transaction group.
3783 4129   */
3784 4130  void
3785 4131  metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3786 4132  {
3787 4133          metaslab_t *msp;
3788 4134          vdev_t *vd;
3789 4135          uint64_t vdev = DVA_GET_VDEV(dva);
3790 4136          uint64_t offset = DVA_GET_OFFSET(dva);
3791 4137          uint64_t size = DVA_GET_ASIZE(dva);
3792 4138  
3793 4139          ASSERT(DVA_IS_VALID(dva));
3794 4140          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3795 4141  
3796 4142          if (txg > spa_freeze_txg(spa))
3797 4143                  return;
3798 4144  
3799 4145          if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3800 4146              (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3801 4147                  cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3802 4148                      (u_longlong_t)vdev, (u_longlong_t)offset);
3803 4149                  ASSERT(0);
3804 4150                  return;
3805 4151          }
3806 4152  
3807 4153          ASSERT(!vd->vdev_removing);
3808 4154          ASSERT(vdev_is_concrete(vd));
3809 4155          ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3810 4156          ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3811 4157  
3812 4158          if (DVA_GET_GANG(dva))
3813 4159                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3814 4160  
3815 4161          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3816 4162  
3817 4163          mutex_enter(&msp->ms_lock);
3818 4164          range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
3819 4165              offset, size);
3820 4166  
3821 4167          VERIFY(!msp->ms_condensing);
3822 4168          VERIFY3U(offset, >=, msp->ms_start);
3823 4169          VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3824 4170          VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
3825 4171              msp->ms_size);
3826 4172          VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3827 4173          VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3828 4174          range_tree_add(msp->ms_allocatable, offset, size);
3829 4175          mutex_exit(&msp->ms_lock);
3830 4176  }
3831 4177  
3832 4178  /*
3833 4179   * Free the block represented by the given DVA.
3834 4180   */
3835 4181  void
3836 4182  metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
3837 4183  {
3838 4184          uint64_t vdev = DVA_GET_VDEV(dva);
3839 4185          uint64_t offset = DVA_GET_OFFSET(dva);
3840 4186          uint64_t size = DVA_GET_ASIZE(dva);
3841 4187          vdev_t *vd = vdev_lookup_top(spa, vdev);
3842 4188  
3843 4189          ASSERT(DVA_IS_VALID(dva));
3844 4190          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3845 4191  
3846 4192          if (DVA_GET_GANG(dva)) {
3847 4193                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3848 4194          }
3849 4195  
3850 4196          metaslab_free_impl(vd, offset, size, checkpoint);
3851 4197  }
3852 4198  
3853 4199  /*
3854 4200   * Reserve some allocation slots. The reservation system must be called
3855 4201   * before we call into the allocator. If there aren't any available slots
3856 4202   * then the I/O will be throttled until an I/O completes and its slots are
3857 4203   * freed up. The function returns true if it was successful in placing
3858 4204   * the reservation.
3859 4205   */
3860 4206  boolean_t
3861 4207  metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
3862 4208      zio_t *zio, int flags)
3863 4209  {
3864 4210          uint64_t available_slots = 0;
3865 4211          boolean_t slot_reserved = B_FALSE;
3866 4212          uint64_t max = mc->mc_alloc_max_slots[allocator];
3867 4213  
3868 4214          ASSERT(mc->mc_alloc_throttle_enabled);
3869 4215          mutex_enter(&mc->mc_lock);
3870 4216  
3871 4217          uint64_t reserved_slots =
3872 4218              zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
3873 4219          if (reserved_slots < max)
3874 4220                  available_slots = max - reserved_slots;
3875 4221  
3876 4222          if (slots <= available_slots || GANG_ALLOCATION(flags) ||
3877 4223              flags & METASLAB_MUST_RESERVE) {
3878 4224                  /*
3879 4225                   * We reserve the slots individually so that we can unreserve
3880 4226                   * them individually when an I/O completes.
3881 4227                   */
3882 4228                  for (int d = 0; d < slots; d++) {
3883 4229                          reserved_slots =
3884 4230                              zfs_refcount_add(&mc->mc_alloc_slots[allocator],
3885 4231                              zio);
3886 4232                  }
3887 4233                  zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3888 4234                  slot_reserved = B_TRUE;
3889 4235          }
3890 4236  
3891 4237          mutex_exit(&mc->mc_lock);
3892 4238          return (slot_reserved);
3893 4239  }
3894 4240  
3895 4241  void
3896 4242  metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
3897 4243      int allocator, zio_t *zio)
3898 4244  {
3899 4245          ASSERT(mc->mc_alloc_throttle_enabled);
3900 4246          mutex_enter(&mc->mc_lock);
3901 4247          for (int d = 0; d < slots; d++) {
3902 4248                  (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
3903 4249                      zio);
3904 4250          }
3905 4251          mutex_exit(&mc->mc_lock);
3906 4252  }
3907 4253  
3908 4254  static int
3909 4255  metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3910 4256      uint64_t txg)
3911 4257  {
3912 4258          metaslab_t *msp;
3913 4259          spa_t *spa = vd->vdev_spa;
3914 4260          int error = 0;
3915 4261  
3916 4262          if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3917 4263                  return (ENXIO);
3918 4264  
3919 4265          ASSERT3P(vd->vdev_ms, !=, NULL);
3920 4266          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3921 4267  
3922 4268          mutex_enter(&msp->ms_lock);
3923 4269  
3924 4270          if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3925 4271                  error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
3926 4272          /*
3927 4273           * No need to fail in that case; someone else has activated the
3928 4274           * metaslab, but that doesn't preclude us from using it.
3929 4275           */
3930 4276          if (error == EBUSY)
3931 4277                  error = 0;
3932 4278  
3933 4279          if (error == 0 &&
3934 4280              !range_tree_contains(msp->ms_allocatable, offset, size))
3935 4281                  error = SET_ERROR(ENOENT);
3936 4282  
3937 4283          if (error || txg == 0) {        /* txg == 0 indicates dry run */
3938 4284                  mutex_exit(&msp->ms_lock);
3939 4285                  return (error);
3940 4286          }
3941 4287  
3942 4288          VERIFY(!msp->ms_condensing);
3943 4289          VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3944 4290          VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3945 4291          VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
3946 4292              msp->ms_size);
3947 4293          range_tree_remove(msp->ms_allocatable, offset, size);
3948 4294  
3949 4295          if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
3950 4296                  if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
3951 4297                          vdev_dirty(vd, VDD_METASLAB, msp, txg);
3952 4298                  range_tree_add(msp->ms_allocating[txg & TXG_MASK],
3953 4299                      offset, size);
3954 4300          }
3955 4301  
3956 4302          mutex_exit(&msp->ms_lock);
3957 4303  
3958 4304          return (0);
3959 4305  }
3960 4306  
3961 4307  typedef struct metaslab_claim_cb_arg_t {
3962 4308          uint64_t        mcca_txg;
3963 4309          int             mcca_error;
3964 4310  } metaslab_claim_cb_arg_t;
3965 4311  
3966 4312  /* ARGSUSED */
3967 4313  static void
3968 4314  metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3969 4315      uint64_t size, void *arg)
3970 4316  {
3971 4317          metaslab_claim_cb_arg_t *mcca_arg = arg;
3972 4318  
3973 4319          if (mcca_arg->mcca_error == 0) {
3974 4320                  mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3975 4321                      size, mcca_arg->mcca_txg);
3976 4322          }
3977 4323  }
3978 4324  
3979 4325  int
3980 4326  metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3981 4327  {
3982 4328          if (vd->vdev_ops->vdev_op_remap != NULL) {
3983 4329                  metaslab_claim_cb_arg_t arg;
3984 4330  
3985 4331                  /*
3986 4332                   * Only zdb(1M) can claim on indirect vdevs.  This is used
3987 4333                   * to detect leaks of mapped space (that are not accounted
3988 4334                   * for in the obsolete counts, spacemap, or bpobj).
3989 4335                   */
3990 4336                  ASSERT(!spa_writeable(vd->vdev_spa));
3991 4337                  arg.mcca_error = 0;
3992 4338                  arg.mcca_txg = txg;
3993 4339  
3994 4340                  vd->vdev_ops->vdev_op_remap(vd, offset, size,
3995 4341                      metaslab_claim_impl_cb, &arg);
3996 4342  
3997 4343                  if (arg.mcca_error == 0) {
3998 4344                          arg.mcca_error = metaslab_claim_concrete(vd,
3999 4345                              offset, size, txg);
4000 4346                  }
4001 4347                  return (arg.mcca_error);
4002 4348          } else {
4003 4349                  return (metaslab_claim_concrete(vd, offset, size, txg));
4004 4350          }
4005 4351  }
4006 4352  
4007 4353  /*
4008 4354   * Intent log support: upon opening the pool after a crash, notify the SPA
4009 4355   * of blocks that the intent log has allocated for immediate write, but
4010 4356   * which are still considered free by the SPA because the last transaction
4011 4357   * group didn't commit yet.
4012 4358   */
4013 4359  static int
4014 4360  metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
4015 4361  {
4016 4362          uint64_t vdev = DVA_GET_VDEV(dva);
4017 4363          uint64_t offset = DVA_GET_OFFSET(dva);
4018 4364          uint64_t size = DVA_GET_ASIZE(dva);
4019 4365          vdev_t *vd;
4020 4366  
4021 4367          if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4022 4368                  return (SET_ERROR(ENXIO));
4023 4369          }
4024 4370  
4025 4371          ASSERT(DVA_IS_VALID(dva));
4026 4372  
4027 4373          if (DVA_GET_GANG(dva))
4028 4374                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

↓ open down ↓

1235 lines elided

↑ open up ↑

4029 4375  
4030 4376          return (metaslab_claim_impl(vd, offset, size, txg));
4031 4377  }
4032 4378  
4033 4379  int
4034 4380  metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4035 4381      int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4036 4382      zio_alloc_list_t *zal, zio_t *zio, int allocator)
4037 4383  {
4038 4384          dva_t *dva = bp->blk_dva;
4039      -        dva_t *hintdva = hintbp->blk_dva;
     4385 +        dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
4040 4386          int error = 0;
4041 4387  
4042 4388          ASSERT(bp->blk_birth == 0);
4043 4389          ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4044 4390  
4045 4391          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4046 4392  
4047 4393          if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
4048 4394                  spa_config_exit(spa, SCL_ALLOC, FTAG);
4049 4395                  return (SET_ERROR(ENOSPC));

4050 4396          }
4051 4397  
4052 4398          ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4053 4399          ASSERT(BP_GET_NDVAS(bp) == 0);
4054 4400          ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4055 4401          ASSERT3P(zal, !=, NULL);
4056 4402  
4057 4403          for (int d = 0; d < ndvas; d++) {
4058 4404                  error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4059 4405                      txg, flags, zal, allocator);
4060 4406                  if (error != 0) {
4061 4407                          for (d--; d >= 0; d--) {
4062 4408                                  metaslab_unalloc_dva(spa, &dva[d], txg);
4063 4409                                  metaslab_group_alloc_decrement(spa,
4064 4410                                      DVA_GET_VDEV(&dva[d]), zio, flags,
4065 4411                                      allocator, B_FALSE);
4066 4412                                  bzero(&dva[d], sizeof (dva_t));
4067 4413                          }
4068 4414                          spa_config_exit(spa, SCL_ALLOC, FTAG);
4069 4415                          return (error);
4070 4416                  } else {
4071 4417                          /*
4072 4418                           * Update the metaslab group's queue depth
4073 4419                           * based on the newly allocated dva.
4074 4420                           */
4075 4421                          metaslab_group_alloc_increment(spa,
4076 4422                              DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
4077 4423                  }
4078 4424  
4079 4425          }
4080 4426          ASSERT(error == 0);
4081 4427          ASSERT(BP_GET_NDVAS(bp) == ndvas);
4082 4428  
4083 4429          spa_config_exit(spa, SCL_ALLOC, FTAG);
4084 4430  
4085 4431          BP_SET_BIRTH(bp, txg, txg);
4086 4432  
4087 4433          return (0);
4088 4434  }
4089 4435  
4090 4436  void
4091 4437  metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
4092 4438  {
4093 4439          const dva_t *dva = bp->blk_dva;
4094 4440          int ndvas = BP_GET_NDVAS(bp);
4095 4441  
4096 4442          ASSERT(!BP_IS_HOLE(bp));
4097 4443          ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
4098 4444  
4099 4445          /*
4100 4446           * If we have a checkpoint for the pool we need to make sure that
4101 4447           * the blocks that we free that are part of the checkpoint won't be
4102 4448           * reused until the checkpoint is discarded or we revert to it.
4103 4449           *
4104 4450           * The checkpoint flag is passed down the metaslab_free code path
4105 4451           * and is set whenever we want to add a block to the checkpoint's
4106 4452           * accounting. That is, we "checkpoint" blocks that existed at the
4107 4453           * time the checkpoint was created and are therefore referenced by
4108 4454           * the checkpointed uberblock.
4109 4455           *
4110 4456           * Note that, we don't checkpoint any blocks if the current
4111 4457           * syncing txg <= spa_checkpoint_txg. We want these frees to sync
4112 4458           * normally as they will be referenced by the checkpointed uberblock.
4113 4459           */
4114 4460          boolean_t checkpoint = B_FALSE;
4115 4461          if (bp->blk_birth <= spa->spa_checkpoint_txg &&
4116 4462              spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
4117 4463                  /*
4118 4464                   * At this point, if the block is part of the checkpoint
4119 4465                   * there is no way it was created in the current txg.
4120 4466                   */
4121 4467                  ASSERT(!now);
4122 4468                  ASSERT3U(spa_syncing_txg(spa), ==, txg);
4123 4469                  checkpoint = B_TRUE;
4124 4470          }
4125 4471  
4126 4472          spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
4127 4473  
4128 4474          for (int d = 0; d < ndvas; d++) {
4129 4475                  if (now) {
4130 4476                          metaslab_unalloc_dva(spa, &dva[d], txg);
4131 4477                  } else {
4132 4478                          ASSERT3U(txg, ==, spa_syncing_txg(spa));
4133 4479                          metaslab_free_dva(spa, &dva[d], checkpoint);
4134 4480                  }
4135 4481          }
4136 4482  
4137 4483          spa_config_exit(spa, SCL_FREE, FTAG);
4138 4484  }
4139 4485  
4140 4486  int
4141 4487  metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
4142 4488  {
4143 4489          const dva_t *dva = bp->blk_dva;
4144 4490          int ndvas = BP_GET_NDVAS(bp);
4145 4491          int error = 0;
4146 4492  
4147 4493          ASSERT(!BP_IS_HOLE(bp));
4148 4494  
4149 4495          if (txg != 0) {
4150 4496                  /*
4151 4497                   * First do a dry run to make sure all DVAs are claimable,
4152 4498                   * so we don't have to unwind from partial failures below.
4153 4499                   */
4154 4500                  if ((error = metaslab_claim(spa, bp, 0)) != 0)
4155 4501                          return (error);
4156 4502          }
4157 4503  
4158 4504          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4159 4505  
4160 4506          for (int d = 0; d < ndvas; d++) {
4161 4507                  error = metaslab_claim_dva(spa, &dva[d], txg);
4162 4508                  if (error != 0)
4163 4509                          break;
4164 4510          }
4165 4511  
4166 4512          spa_config_exit(spa, SCL_ALLOC, FTAG);
4167 4513  
4168 4514          ASSERT(error == 0 || txg == 0);
4169 4515  
4170 4516          return (error);
4171 4517  }
4172 4518  
4173 4519  /* ARGSUSED */
4174 4520  static void
4175 4521  metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
4176 4522      uint64_t size, void *arg)
4177 4523  {
4178 4524          if (vd->vdev_ops == &vdev_indirect_ops)
4179 4525                  return;
4180 4526  
4181 4527          metaslab_check_free_impl(vd, offset, size);
4182 4528  }
4183 4529  
4184 4530  static void
4185 4531  metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
4186 4532  {
4187 4533          metaslab_t *msp;
4188 4534          spa_t *spa = vd->vdev_spa;
4189 4535  
4190 4536          if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4191 4537                  return;
4192 4538  
4193 4539          if (vd->vdev_ops->vdev_op_remap != NULL) {
4194 4540                  vd->vdev_ops->vdev_op_remap(vd, offset, size,
4195 4541                      metaslab_check_free_impl_cb, NULL);

↓ open down ↓

146 lines elided

↑ open up ↑

4196 4542                  return;
4197 4543          }
4198 4544  
4199 4545          ASSERT(vdev_is_concrete(vd));
4200 4546          ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4201 4547          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4202 4548  
4203 4549          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4204 4550  
4205 4551          mutex_enter(&msp->ms_lock);
4206      -        if (msp->ms_loaded)
4207      -                range_tree_verify(msp->ms_allocatable, offset, size);
     4552 +        if (msp->ms_loaded) {
     4553 +                range_tree_verify_not_present(msp->ms_allocatable,
     4554 +                    offset, size);
     4555 +        }
4208 4556  
4209      -        range_tree_verify(msp->ms_freeing, offset, size);
4210      -        range_tree_verify(msp->ms_checkpointing, offset, size);
4211      -        range_tree_verify(msp->ms_freed, offset, size);
     4557 +        range_tree_verify_not_present(msp->ms_freeing, offset, size);
     4558 +        range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
     4559 +        range_tree_verify_not_present(msp->ms_freed, offset, size);
4212 4560          for (int j = 0; j < TXG_DEFER_SIZE; j++)
4213      -                range_tree_verify(msp->ms_defer[j], offset, size);
     4561 +                range_tree_verify_not_present(msp->ms_defer[j], offset, size);
4214 4562          mutex_exit(&msp->ms_lock);
4215 4563  }
4216 4564  
4217 4565  void
4218 4566  metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4219 4567  {
4220 4568          if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4221 4569                  return;
4222 4570  
4223 4571          spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

4224 4572          for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4225 4573                  uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4226 4574                  vdev_t *vd = vdev_lookup_top(spa, vdev);
4227 4575                  uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4228 4576                  uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4229 4577  
4230 4578                  if (DVA_GET_GANG(&bp->blk_dva[i]))
4231 4579                          size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4232 4580  
4233 4581                  ASSERT3P(vd, !=, NULL);
4234 4582  
4235 4583                  metaslab_check_free_impl(vd, offset, size);
4236 4584          }
4237 4585          spa_config_exit(spa, SCL_VDEV, FTAG);
4238 4586  }

↓ open down ↓

15 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX