4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright (c) 2013 by Delphix. All rights reserved.
23 + * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/dmu.h>
29 29 #include <sys/dmu_tx.h>
30 30 #include <sys/space_map.h>
31 31 #include <sys/metaslab_impl.h>
32 32 #include <sys/vdev_impl.h>
33 33 #include <sys/zio.h>
34 34 #include <sys/spa_impl.h>
35 35
36 36 /*
37 37 * Allow allocations to switch to gang blocks quickly. We do this to
38 38 * avoid having to load lots of space_maps in a given txg. There are,
39 39 * however, some cases where we want to avoid "fast" ganging and instead
40 40 * we want to do an exhaustive search of all metaslabs on this device.
41 41 * Currently we don't allow any gang, zil, or dump device related allocations
42 42 * to "fast" gang.
43 43 */
44 44 #define CAN_FASTGANG(flags) \
45 45 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 46 METASLAB_GANG_AVOID)))
47 47
48 48 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
49 49 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
50 50 #define METASLAB_ACTIVE_MASK \
51 51 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 52
53 53 uint64_t metaslab_aliquot = 512ULL << 10;
54 54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
55 55
56 56 /*
57 57 * The in-core space map representation is more compact than its on-disk form.
58 58 * The zfs_condense_pct determines how much more compact the in-core
59 59 * space_map representation must be before we compact it on-disk.
60 60 * Values should be greater than or equal to 100.
61 61 */
62 62 int zfs_condense_pct = 200;
63 63
64 64 /*
65 65 * This value defines the number of allowed allocation failures per vdev.
66 66 * If a device reaches this threshold in a given txg then we consider skipping
67 67 * allocations on that device. The value of zfs_mg_alloc_failures is computed
68 68 * in zio_init() unless it has been overridden in /etc/system.
69 69 */
70 70 int zfs_mg_alloc_failures = 0;
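/*
 * Illustrative only (not part of this changeset): on illumos, tunables such
 * as the one above are normally overridden with an /etc/system entry of the
 * form
 *
 *	set zfs:zfs_mg_alloc_failures = 10
 *
 * where the value 10 is a purely hypothetical example.
 */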
71 71
72 72 /*
73 73 * The zfs_mg_noalloc_threshold defines which metaslab groups should
74 74 * be eligible for allocation. The value is defined as a percentage of
75 75 * a free space. Metaslab groups that have more free space than
76 76 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
77 77 * a metaslab group's free space is less than or equal to the
78 78 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
79 79 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
80 80 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
81 81 * groups are allowed to accept allocations. Gang blocks are always
82 82 * eligible to allocate on any metaslab group. The default value of 0 means
83 83 * no metaslab group will be excluded based on this criterion.
84 84 */
85 85 int zfs_mg_noalloc_threshold = 0;
86 86
87 87 /*
88 88 * When set, we load all metaslabs when the pool is first opened.
89 89 */
90 90 int metaslab_debug_load = 0;
91 91
92 92 /*
93 93 * When set, we prevent metaslabs from being unloaded.
94 94 */
95 95 int metaslab_debug_unload = 0;
96 96
97 97 /*
98 98 * Minimum size which forces the dynamic allocator to change
99 99 * its allocation strategy. Once the space map cannot satisfy
100 100 * an allocation of this size, it switches to using a more
101 101 * aggressive strategy (i.e. search by size rather than offset).
102 102 */
103 103 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
104 104
105 105 /*
106 106 * The minimum free space, in percent, which must be available
107 107 * in a space map to continue allocations in a first-fit fashion.
108 108 * Once the space_map's free space drops below this level we dynamically
109 109 * switch to using best-fit allocations.
110 110 */
111 111 int metaslab_df_free_pct = 4;
112 112
113 113 /*
114 114 * A metaslab is considered "free" if it contains a contiguous
115 115 * segment which is greater than metaslab_min_alloc_size.
116 116 */
117 117 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
118 118
119 119 /*
120 120 * Percentage of all cpus that can be used by the metaslab taskq.
121 121 */
122 122 int metaslab_load_pct = 50;
123 123
124 124 /*
125 125 * Determines how many txgs a metaslab may remain loaded without having any
126 126 * allocations from it. As long as a metaslab continues to be used we will
127 127 * keep it loaded.
128 128 */
129 129 int metaslab_unload_delay = TXG_SIZE * 2;
130 130
131 131 /*
132 132 * Should we be willing to write data to degraded vdevs?
133 133 */
134 134 boolean_t zfs_write_to_degraded = B_FALSE;
135 135
136 136 /*
137 137 * Max number of metaslabs per group to preload.
138 138 */
139 139 int metaslab_preload_limit = SPA_DVAS_PER_BP;
140 140
141 141 /*
142 142 * Enable/disable preloading of metaslab.
143 143 */
144 144 boolean_t metaslab_preload_enabled = B_TRUE;
145 145
146 146 /*
147 147 * Enable/disable additional weight factor for each metaslab.
148 148 */
149 149 boolean_t metaslab_weight_factor_enable = B_FALSE;
150 150
151 151
152 152 /*
153 153 * ==========================================================================
154 154 * Metaslab classes
155 155 * ==========================================================================
156 156 */
157 157 metaslab_class_t *
158 158 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
159 159 {
160 160 metaslab_class_t *mc;
161 161
162 162 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
163 163
164 164 mc->mc_spa = spa;
165 165 mc->mc_rotor = NULL;
166 166 mc->mc_ops = ops;
167 167
168 168 return (mc);
169 169 }
170 170
171 171 void
172 172 metaslab_class_destroy(metaslab_class_t *mc)
173 173 {
174 174 ASSERT(mc->mc_rotor == NULL);
175 175 ASSERT(mc->mc_alloc == 0);
176 176 ASSERT(mc->mc_deferred == 0);
177 177 ASSERT(mc->mc_space == 0);
178 178 ASSERT(mc->mc_dspace == 0);
179 179
180 180 kmem_free(mc, sizeof (metaslab_class_t));
181 181 }
182 182
183 183 int
184 184 metaslab_class_validate(metaslab_class_t *mc)
185 185 {
186 186 metaslab_group_t *mg;
187 187 vdev_t *vd;
188 188
189 189 /*
190 190 * Must hold one of the spa_config locks.
191 191 */
192 192 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
193 193 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
194 194
195 195 if ((mg = mc->mc_rotor) == NULL)
196 196 return (0);
197 197
198 198 do {
199 199 vd = mg->mg_vd;
200 200 ASSERT(vd->vdev_mg != NULL);
201 201 ASSERT3P(vd->vdev_top, ==, vd);
202 202 ASSERT3P(mg->mg_class, ==, mc);
203 203 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
204 204 } while ((mg = mg->mg_next) != mc->mc_rotor);
205 205
206 206 return (0);
207 207 }
208 208
209 209 void
210 210 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
211 211 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
212 212 {
213 213 atomic_add_64(&mc->mc_alloc, alloc_delta);
214 214 atomic_add_64(&mc->mc_deferred, defer_delta);
215 215 atomic_add_64(&mc->mc_space, space_delta);
216 216 atomic_add_64(&mc->mc_dspace, dspace_delta);
217 217 }
218 218
219 219 uint64_t
220 220 metaslab_class_get_alloc(metaslab_class_t *mc)
221 221 {
222 222 return (mc->mc_alloc);
223 223 }
224 224
225 225 uint64_t
226 226 metaslab_class_get_deferred(metaslab_class_t *mc)
227 227 {
228 228 return (mc->mc_deferred);
229 229 }
230 230
231 231 uint64_t
232 232 metaslab_class_get_space(metaslab_class_t *mc)
233 233 {
234 234 return (mc->mc_space);
235 235 }
236 236
237 237 uint64_t
238 238 metaslab_class_get_dspace(metaslab_class_t *mc)
239 239 {
240 240 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
241 241 }
242 242
243 243 /*
244 244 * ==========================================================================
245 245 * Metaslab groups
246 246 * ==========================================================================
247 247 */
248 248 static int
249 249 metaslab_compare(const void *x1, const void *x2)
250 250 {
251 251 const metaslab_t *m1 = x1;
252 252 const metaslab_t *m2 = x2;
253 253
254 254 if (m1->ms_weight < m2->ms_weight)
255 255 return (1);
256 256 if (m1->ms_weight > m2->ms_weight)
257 257 return (-1);
258 258
259 259 /*
260 260 * If the weights are identical, use the offset to force uniqueness.
261 261 */
262 262 if (m1->ms_start < m2->ms_start)
263 263 return (-1);
264 264 if (m1->ms_start > m2->ms_start)
265 265 return (1);
266 266
267 267 ASSERT3P(m1, ==, m2);
268 268
269 269 return (0);
270 270 }
271 271
272 272 /*
273 273 * Update the allocatable flag and the metaslab group's capacity.
274 274 * The allocatable flag is set to true if the group's free capacity
275 275 * is above the zfs_mg_noalloc_threshold. If a metaslab group transitions
276 276 * from allocatable to non-allocatable or vice versa then the metaslab
277 277 * group's class is updated to reflect the transition.
278 278 */
279 279 static void
280 280 metaslab_group_alloc_update(metaslab_group_t *mg)
281 281 {
282 282 vdev_t *vd = mg->mg_vd;
283 283 metaslab_class_t *mc = mg->mg_class;
284 284 vdev_stat_t *vs = &vd->vdev_stat;
285 285 boolean_t was_allocatable;
286 286
287 287 ASSERT(vd == vd->vdev_top);
288 288
289 289 mutex_enter(&mg->mg_lock);
290 290 was_allocatable = mg->mg_allocatable;
291 291
292 292 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
293 293 (vs->vs_space + 1);
294 294
295 295 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
296 296
297 297 /*
298 298 * The mc_alloc_groups maintains a count of the number of
299 299 * groups in this metaslab class that are still above the
300 300 * zfs_mg_noalloc_threshold. This is used by the allocating
301 301 * threads to determine if they should avoid allocations to
302 302 * a given group. The allocator will avoid allocations to a group
303 303 * if that group has reached or is below the zfs_mg_noalloc_threshold
304 304 * and there are still other groups that are above the threshold.
305 305 * When a group transitions from allocatable to non-allocatable or
306 306 * vice versa we update the metaslab class to reflect that change.
307 307 * When the mc_alloc_groups value drops to 0 that means that all
308 308 * groups have reached the zfs_mg_noalloc_threshold making all groups
309 309 * eligible for allocations. This effectively means that all devices
310 310 * are balanced again.
311 311 */
312 312 if (was_allocatable && !mg->mg_allocatable)
313 313 mc->mc_alloc_groups--;
314 314 else if (!was_allocatable && mg->mg_allocatable)
315 315 mc->mc_alloc_groups++;
316 316 mutex_exit(&mg->mg_lock);
317 317 }
318 318
319 319 metaslab_group_t *
320 320 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
321 321 {
322 322 metaslab_group_t *mg;
323 323
324 324 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
325 325 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
326 326 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
327 327 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
328 328 mg->mg_vd = vd;
329 329 mg->mg_class = mc;
330 330 mg->mg_activation_count = 0;
331 331
332 332 mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
333 333 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
334 334
335 335 return (mg);
336 336 }
337 337
338 338 void
339 339 metaslab_group_destroy(metaslab_group_t *mg)
340 340 {
341 341 ASSERT(mg->mg_prev == NULL);
342 342 ASSERT(mg->mg_next == NULL);
343 343 /*
344 344 * We may have gone below zero with the activation count
345 345 * either because we never activated in the first place or
346 346 * because we're done, and possibly removing the vdev.
347 347 */
348 348 ASSERT(mg->mg_activation_count <= 0);
349 349
350 350 avl_destroy(&mg->mg_metaslab_tree);
351 351 mutex_destroy(&mg->mg_lock);
352 352 kmem_free(mg, sizeof (metaslab_group_t));
353 353 }
354 354
355 355 void
356 356 metaslab_group_activate(metaslab_group_t *mg)
357 357 {
358 358 metaslab_class_t *mc = mg->mg_class;
359 359 metaslab_group_t *mgprev, *mgnext;
360 360
361 361 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
362 362
363 363 ASSERT(mc->mc_rotor != mg);
364 364 ASSERT(mg->mg_prev == NULL);
365 365 ASSERT(mg->mg_next == NULL);
366 366 ASSERT(mg->mg_activation_count <= 0);
367 367
368 368 if (++mg->mg_activation_count <= 0)
369 369 return;
370 370
371 371 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
372 372 metaslab_group_alloc_update(mg);
373 373
374 374 if ((mgprev = mc->mc_rotor) == NULL) {
375 375 mg->mg_prev = mg;
376 376 mg->mg_next = mg;
377 377 } else {
378 378 mgnext = mgprev->mg_next;
379 379 mg->mg_prev = mgprev;
380 380 mg->mg_next = mgnext;
381 381 mgprev->mg_next = mg;
382 382 mgnext->mg_prev = mg;
383 383 }
384 384 mc->mc_rotor = mg;
385 385 }
386 386
387 387 void
388 388 metaslab_group_passivate(metaslab_group_t *mg)
389 389 {
390 390 metaslab_class_t *mc = mg->mg_class;
391 391 metaslab_group_t *mgprev, *mgnext;
392 392
393 393 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
394 394
395 395 if (--mg->mg_activation_count != 0) {
396 396 ASSERT(mc->mc_rotor != mg);
397 397 ASSERT(mg->mg_prev == NULL);
398 398 ASSERT(mg->mg_next == NULL);
399 399 ASSERT(mg->mg_activation_count < 0);
400 400 return;
401 401 }
402 402
403 403 taskq_wait(mg->mg_taskq);
404 404
405 405 mgprev = mg->mg_prev;
406 406 mgnext = mg->mg_next;
407 407
408 408 if (mg == mgnext) {
409 409 mc->mc_rotor = NULL;
410 410 } else {
411 411 mc->mc_rotor = mgnext;
412 412 mgprev->mg_next = mgnext;
413 413 mgnext->mg_prev = mgprev;
414 414 }
415 415
416 416 mg->mg_prev = NULL;
417 417 mg->mg_next = NULL;
418 418 }
419 419
420 420 static void
421 421 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
422 422 {
423 423 mutex_enter(&mg->mg_lock);
424 424 ASSERT(msp->ms_group == NULL);
425 425 msp->ms_group = mg;
426 426 msp->ms_weight = 0;
427 427 avl_add(&mg->mg_metaslab_tree, msp);
428 428 mutex_exit(&mg->mg_lock);
429 429 }
430 430
431 431 static void
432 432 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
433 433 {
434 434 mutex_enter(&mg->mg_lock);
435 435 ASSERT(msp->ms_group == mg);
436 436 avl_remove(&mg->mg_metaslab_tree, msp);
437 437 msp->ms_group = NULL;
438 438 mutex_exit(&mg->mg_lock);
439 439 }
440 440
441 441 static void
442 442 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
443 443 {
444 444 /*
445 445 * Although in principle the weight can be any value, in
446 446 * practice we do not use values in the range [1, 510].
447 447 */
448 448 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
449 449 ASSERT(MUTEX_HELD(&msp->ms_lock));
450 450
451 451 mutex_enter(&mg->mg_lock);
452 452 ASSERT(msp->ms_group == mg);
453 453 avl_remove(&mg->mg_metaslab_tree, msp);
454 454 msp->ms_weight = weight;
455 455 avl_add(&mg->mg_metaslab_tree, msp);
456 456 mutex_exit(&mg->mg_lock);
457 457 }
458 458
459 459 /*
460 460 * Determine if a given metaslab group should skip allocations. A metaslab
461 461 * group should avoid allocations if its free capacity has dropped to or
462 462 * below the zfs_mg_noalloc_threshold and there is at least one metaslab group
463 463 * that can still handle allocations.
464 464 */
465 465 static boolean_t
466 466 metaslab_group_allocatable(metaslab_group_t *mg)
467 467 {
468 468 vdev_t *vd = mg->mg_vd;
469 469 spa_t *spa = vd->vdev_spa;
470 470 metaslab_class_t *mc = mg->mg_class;
471 471
472 472 /*
473 473 * A metaslab group is considered allocatable if its free capacity
474 474 * is greater than the set value of zfs_mg_noalloc_threshold, it's
475 475 * associated with a slog, or there are no other metaslab groups
476 476 * with free capacity greater than zfs_mg_noalloc_threshold.
477 477 */
478 478 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
479 479 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
480 480 }
481 481
482 482 /*
483 483 * ==========================================================================
484 484 * Range tree callbacks
485 485 * ==========================================================================
486 486 */
487 487
488 488 /*
489 489 * Comparison function for the private size-ordered tree. Tree is sorted
490 490 * by size, larger sizes at the end of the tree.
491 491 */
492 492 static int
493 493 metaslab_rangesize_compare(const void *x1, const void *x2)
494 494 {
495 495 const range_seg_t *r1 = x1;
496 496 const range_seg_t *r2 = x2;
497 497 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
498 498 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
499 499
500 500 if (rs_size1 < rs_size2)
501 501 return (-1);
502 502 if (rs_size1 > rs_size2)
503 503 return (1);
504 504
505 505 if (r1->rs_start < r2->rs_start)
506 506 return (-1);
507 507
508 508 if (r1->rs_start > r2->rs_start)
509 509 return (1);
510 510
511 511 return (0);
512 512 }
513 513
514 514 /*
515 515 * Create any block allocator specific components. The current allocators
516 516 * rely on using both a size-ordered AVL tree and an array of uint64_t's.
517 517 */
518 518 static void
519 519 metaslab_rt_create(range_tree_t *rt, void *arg)
520 520 {
521 521 metaslab_t *msp = arg;
522 522
523 523 ASSERT3P(rt->rt_arg, ==, msp);
524 524 ASSERT(msp->ms_tree == NULL);
525 525
526 526 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
527 527 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
528 528 }
529 529
530 530 /*
531 531 * Destroy the block allocator specific components.
532 532 */
533 533 static void
534 534 metaslab_rt_destroy(range_tree_t *rt, void *arg)
535 535 {
536 536 metaslab_t *msp = arg;
537 537
538 538 ASSERT3P(rt->rt_arg, ==, msp);
539 539 ASSERT3P(msp->ms_tree, ==, rt);
540 540 ASSERT0(avl_numnodes(&msp->ms_size_tree));
541 541
542 542 avl_destroy(&msp->ms_size_tree);
543 543 }
544 544
545 545 static void
546 546 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
547 547 {
548 548 metaslab_t *msp = arg;
549 549
550 550 ASSERT3P(rt->rt_arg, ==, msp);
551 551 ASSERT3P(msp->ms_tree, ==, rt);
552 552 VERIFY(!msp->ms_condensing);
553 553 avl_add(&msp->ms_size_tree, rs);
554 554 }
555 555
556 556 static void
557 557 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
558 558 {
559 559 metaslab_t *msp = arg;
560 560
561 561 ASSERT3P(rt->rt_arg, ==, msp);
562 562 ASSERT3P(msp->ms_tree, ==, rt);
563 563 VERIFY(!msp->ms_condensing);
564 564 avl_remove(&msp->ms_size_tree, rs);
565 565 }
566 566
567 567 static void
568 568 metaslab_rt_vacate(range_tree_t *rt, void *arg)
569 569 {
570 570 metaslab_t *msp = arg;
571 571
572 572 ASSERT3P(rt->rt_arg, ==, msp);
573 573 ASSERT3P(msp->ms_tree, ==, rt);
574 574
575 575 /*
576 576 * Normally one would walk the tree freeing nodes along the way.
577 577 * Since the nodes are shared with the range trees we can avoid
578 578 * walking all nodes and just reinitialize the avl tree. The nodes
579 579 * will be freed by the range tree, so we don't want to free them here.
580 580 */
581 581 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
582 582 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
583 583 }
584 584
585 585 static range_tree_ops_t metaslab_rt_ops = {
586 586 metaslab_rt_create,
587 587 metaslab_rt_destroy,
588 588 metaslab_rt_add,
589 589 metaslab_rt_remove,
590 590 metaslab_rt_vacate
591 591 };
592 592
593 593 /*
594 594 * ==========================================================================
595 595 * Metaslab block operations
596 596 * ==========================================================================
597 597 */
598 598
599 599 /*
600 600 * Return the maximum contiguous segment within the metaslab.
601 601 */
602 602 uint64_t
603 603 metaslab_block_maxsize(metaslab_t *msp)
604 604 {
605 605 avl_tree_t *t = &msp->ms_size_tree;
606 606 range_seg_t *rs;
607 607
608 608 if (t == NULL || (rs = avl_last(t)) == NULL)
609 609 return (0ULL);
610 610
611 611 return (rs->rs_end - rs->rs_start);
612 612 }
613 613
614 614 uint64_t
615 615 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
616 616 {
617 617 uint64_t start;
618 618 range_tree_t *rt = msp->ms_tree;
619 619
620 620 VERIFY(!msp->ms_condensing);
621 621
622 622 start = msp->ms_ops->msop_alloc(msp, size);
623 623 if (start != -1ULL) {
624 624 vdev_t *vd = msp->ms_group->mg_vd;
625 625
626 626 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
627 627 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
628 628 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
629 629 range_tree_remove(rt, start, size);
630 630 }
631 631 return (start);
632 632 }
633 633
634 634 /*
635 635 * ==========================================================================
636 636 * Common allocator routines
637 637 * ==========================================================================
638 638 */
639 639
640 640 /*
641 641 * This is a helper function that can be used by the allocator to find
642 642 * a suitable block to allocate. This will search the specified AVL
643 643 * tree looking for a block that matches the specified criteria.
644 644 */
645 645 static uint64_t
646 646 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
647 647 uint64_t align)
648 648 {
649 649 range_seg_t *rs, rsearch;
650 650 avl_index_t where;
651 651
652 652 rsearch.rs_start = *cursor;
653 653 rsearch.rs_end = *cursor + size;
654 654
655 655 rs = avl_find(t, &rsearch, &where);
656 656 if (rs == NULL)
657 657 rs = avl_nearest(t, where, AVL_AFTER);
658 658
659 659 while (rs != NULL) {
660 660 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
661 661
662 662 if (offset + size <= rs->rs_end) {
663 663 *cursor = offset + size;
664 664 return (offset);
665 665 }
666 666 rs = AVL_NEXT(t, rs);
667 667 }
668 668
669 669 /*
670 670 * If we know we've searched the whole map (*cursor == 0), give up.
671 671 * Otherwise, reset the cursor to the beginning and try again.
672 672 */
673 673 if (*cursor == 0)
674 674 return (-1ULL);
675 675
676 676 *cursor = 0;
677 677 return (metaslab_block_picker(t, cursor, size, align));
678 678 }
679 679
680 680 /*
681 681 * ==========================================================================
682 682 * The first-fit block allocator
683 683 * ==========================================================================
684 684 */
685 685 static uint64_t
686 686 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
687 687 {
688 688 /*
689 689 * Find the largest power of 2 block size that evenly divides the
690 690 * requested size. This is used to try to allocate blocks with similar
691 691 * alignment from the same area of the metaslab (i.e. same cursor
692 692 * bucket) but it does not guarantee that other allocations sizes
693 693 * bucket), but it does not guarantee that other allocation sizes
694 694 * will not occur in the same region.
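	/*
	 * A worked example with hypothetical numbers (not from the original
	 * source): for a 24K request, size = 0x6000, so align = size & -size
	 * = 0x2000 (8K) and the cursor used below is
	 * ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13]; every 8K-aligned
	 * request therefore shares the same cursor bucket. The same scheme
	 * is used by metaslab_df_alloc() below.
	 */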
695 695 uint64_t align = size & -size;
696 - uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
696 + uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
697 697 avl_tree_t *t = &msp->ms_tree->rt_root;
698 698
699 699 return (metaslab_block_picker(t, cursor, size, align));
700 700 }
701 701
702 702 /* ARGSUSED */
703 703 static boolean_t
704 704 metaslab_ff_fragmented(metaslab_t *msp)
705 705 {
706 706 return (B_TRUE);
707 707 }
708 708
709 709 static metaslab_ops_t metaslab_ff_ops = {
710 710 metaslab_ff_alloc,
711 711 metaslab_ff_fragmented
712 712 };
713 713
714 714 /*
715 715 * ==========================================================================
716 716 * Dynamic block allocator -
717 717 * Uses the first fit allocation scheme until space gets low and then
718 718 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
719 719 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
720 720 * ==========================================================================
721 721 */
722 722 static uint64_t
723 723 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
724 724 {
725 725 /*
726 726 * Find the largest power of 2 block size that evenly divides the
727 727 * requested size. This is used to try to allocate blocks with similar
728 728 * alignment from the same area of the metaslab (i.e. same cursor
729 729 * bucket) but it does not guarantee that other allocations sizes
730 730 * may exist in the same region.
731 731 */
732 732 uint64_t align = size & -size;
733 - uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
733 + uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
734 734 range_tree_t *rt = msp->ms_tree;
735 735 avl_tree_t *t = &rt->rt_root;
736 736 uint64_t max_size = metaslab_block_maxsize(msp);
737 737 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
738 738
739 739 ASSERT(MUTEX_HELD(&msp->ms_lock));
740 740 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
741 741
742 742 if (max_size < size)
743 743 return (-1ULL);
744 744
745 745 /*
746 746 * If we're running low on space switch to using the size
747 747 * sorted AVL tree (best-fit).
748 748 */
749 749 if (max_size < metaslab_df_alloc_threshold ||
750 750 free_pct < metaslab_df_free_pct) {
751 751 t = &msp->ms_size_tree;
752 752 *cursor = 0;
753 753 }
754 754
755 755 return (metaslab_block_picker(t, cursor, size, 1ULL));
756 756 }
757 757
758 758 static boolean_t
759 759 metaslab_df_fragmented(metaslab_t *msp)
760 760 {
761 761 range_tree_t *rt = msp->ms_tree;
762 762 uint64_t max_size = metaslab_block_maxsize(msp);
763 763 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
764 764
765 765 if (max_size >= metaslab_df_alloc_threshold &&
766 766 free_pct >= metaslab_df_free_pct)
767 767 return (B_FALSE);
768 768
769 769 return (B_TRUE);
770 770 }
771 771
772 772 static metaslab_ops_t metaslab_df_ops = {
773 773 metaslab_df_alloc,
774 774 metaslab_df_fragmented
775 775 };
776 776
777 777 /*
778 778 * ==========================================================================
779 779 * Cursor fit block allocator -
780 780 * Select the largest region in the metaslab, set the cursor to the beginning
781 781 * of the range and the cursor_end to the end of the range. As allocations
782 782 * are made advance the cursor. Continue allocating from the cursor until
783 783 * the range is exhausted and then find a new range.
784 784 * ==========================================================================
785 785 */
786 786 static uint64_t
787 787 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
788 788 {
789 789 range_tree_t *rt = msp->ms_tree;
790 790 avl_tree_t *t = &msp->ms_size_tree;
791 791 uint64_t *cursor = &msp->ms_lbas[0];
792 792 uint64_t *cursor_end = &msp->ms_lbas[1];
793 793 uint64_t offset = 0;
794 794
795 795 ASSERT(MUTEX_HELD(&msp->ms_lock));
796 796 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
797 797
798 798 ASSERT3U(*cursor_end, >=, *cursor);
799 799
800 800 if ((*cursor + size) > *cursor_end) {
801 801 range_seg_t *rs;
802 802
803 803 rs = avl_last(&msp->ms_size_tree);
804 804 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
805 805 return (-1ULL);
806 806
807 807 *cursor = rs->rs_start;
808 808 *cursor_end = rs->rs_end;
809 809 }
810 810
811 811 offset = *cursor;
812 812 *cursor += size;
813 813
814 814 return (offset);
815 815 }
816 816
817 817 static boolean_t
818 818 metaslab_cf_fragmented(metaslab_t *msp)
819 819 {
820 820 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
821 821 }
822 822
823 823 static metaslab_ops_t metaslab_cf_ops = {
824 824 metaslab_cf_alloc,
825 825 metaslab_cf_fragmented
826 826 };
827 827
828 828 /*
829 829 * ==========================================================================
830 830 * New dynamic fit allocator -
831 831 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
832 832 * contiguous blocks. If no region is found then just use the largest segment
833 833 * that remains.
834 834 * ==========================================================================
835 835 */
836 836
837 837 /*
838 838 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
839 839 * to request from the allocator.
840 840 */
841 841 uint64_t metaslab_ndf_clump_shift = 4;
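/*
 * A rough illustration with assumed numbers (not part of the original
 * comment): for a 4K request, highbit64(4096) = 13, so with the default
 * clump shift of 4 the allocator asks the size-ordered tree for a free
 * region of about 2^(13 + 4) = 128K before settling for the largest
 * remaining segment.
 */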
842 842
843 843 static uint64_t
844 844 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
845 845 {
846 846 avl_tree_t *t = &msp->ms_tree->rt_root;
847 847 avl_index_t where;
848 848 range_seg_t *rs, rsearch;
849 - uint64_t hbit = highbit(size);
849 + uint64_t hbit = highbit64(size);
850 850 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
851 851 uint64_t max_size = metaslab_block_maxsize(msp);
852 852
853 853 ASSERT(MUTEX_HELD(&msp->ms_lock));
854 854 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
855 855
856 856 if (max_size < size)
857 857 return (-1ULL);
858 858
859 859 rsearch.rs_start = *cursor;
860 860 rsearch.rs_end = *cursor + size;
861 861
862 862 rs = avl_find(t, &rsearch, &where);
863 863 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
864 864 t = &msp->ms_size_tree;
865 865
866 866 rsearch.rs_start = 0;
867 867 rsearch.rs_end = MIN(max_size,
868 868 1ULL << (hbit + metaslab_ndf_clump_shift));
869 869 rs = avl_find(t, &rsearch, &where);
870 870 if (rs == NULL)
871 871 rs = avl_nearest(t, where, AVL_AFTER);
872 872 ASSERT(rs != NULL);
873 873 }
874 874
875 875 if ((rs->rs_end - rs->rs_start) >= size) {
876 876 *cursor = rs->rs_start + size;
877 877 return (rs->rs_start);
878 878 }
879 879 return (-1ULL);
880 880 }
881 881
882 882 static boolean_t
883 883 metaslab_ndf_fragmented(metaslab_t *msp)
884 884 {
885 885 return (metaslab_block_maxsize(msp) <=
886 886 (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
887 887 }
888 888
889 889 static metaslab_ops_t metaslab_ndf_ops = {
890 890 metaslab_ndf_alloc,
891 891 metaslab_ndf_fragmented
892 892 };
893 893
894 894 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
895 895
896 896 /*
897 897 * ==========================================================================
898 898 * Metaslabs
899 899 * ==========================================================================
900 900 */
901 901
902 902 /*
903 903 * Wait for any in-progress metaslab loads to complete.
904 904 */
905 905 void
906 906 metaslab_load_wait(metaslab_t *msp)
907 907 {
908 908 ASSERT(MUTEX_HELD(&msp->ms_lock));
909 909
910 910 while (msp->ms_loading) {
911 911 ASSERT(!msp->ms_loaded);
912 912 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
913 913 }
914 914 }
915 915
916 916 int
917 917 metaslab_load(metaslab_t *msp)
918 918 {
919 919 int error = 0;
920 920
921 921 ASSERT(MUTEX_HELD(&msp->ms_lock));
922 922 ASSERT(!msp->ms_loaded);
923 923 ASSERT(!msp->ms_loading);
924 924
925 925 msp->ms_loading = B_TRUE;
926 926
927 927 /*
928 928 * If the space map has not been allocated yet, then treat
929 929 * all the space in the metaslab as free and add it to the
930 930 * ms_tree.
931 931 */
932 932 if (msp->ms_sm != NULL)
933 933 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
934 934 else
935 935 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
936 936
937 937 msp->ms_loaded = (error == 0);
938 938 msp->ms_loading = B_FALSE;
939 939
940 940 if (msp->ms_loaded) {
941 941 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
942 942 range_tree_walk(msp->ms_defertree[t],
943 943 range_tree_remove, msp->ms_tree);
944 944 }
945 945 }
946 946 cv_broadcast(&msp->ms_load_cv);
947 947 return (error);
948 948 }
949 949
950 950 void
951 951 metaslab_unload(metaslab_t *msp)
952 952 {
953 953 ASSERT(MUTEX_HELD(&msp->ms_lock));
954 954 range_tree_vacate(msp->ms_tree, NULL, NULL);
955 955 msp->ms_loaded = B_FALSE;
956 956 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
957 957 }
958 958
959 959 metaslab_t *
960 960 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
961 961 {
962 962 vdev_t *vd = mg->mg_vd;
963 963 objset_t *mos = vd->vdev_spa->spa_meta_objset;
964 964 metaslab_t *msp;
965 965
966 966 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
967 967 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
968 968 cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
969 969 msp->ms_id = id;
970 970 msp->ms_start = id << vd->vdev_ms_shift;
971 971 msp->ms_size = 1ULL << vd->vdev_ms_shift;
972 972
973 973 /*
974 974 * We only open space map objects that already exist. All others
975 975 * will be opened when we finally allocate an object for them.
976 976 */
977 977 if (object != 0) {
978 978 VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
979 979 msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
980 980 ASSERT(msp->ms_sm != NULL);
981 981 }
982 982
983 983 /*
984 984 * We create the main range tree here, but we don't create the
985 985 * alloctree and freetree until metaslab_sync_done(). This serves
986 986 * two purposes: it allows metaslab_sync_done() to detect the
987 987 * addition of new space; and for debugging, it ensures that we'd
988 988 * data fault on any attempt to use this metaslab before it's ready.
989 989 */
990 990 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
991 991 metaslab_group_add(mg, msp);
992 992
993 993 msp->ms_ops = mg->mg_class->mc_ops;
994 994
995 995 /*
996 996 * If we're opening an existing pool (txg == 0) or creating
997 997 * a new one (txg == TXG_INITIAL), all space is available now.
998 998 * If we're adding space to an existing pool, the new space
999 999 * does not become available until after this txg has synced.
1000 1000 */
1001 1001 if (txg <= TXG_INITIAL)
1002 1002 metaslab_sync_done(msp, 0);
1003 1003
1004 1004 /*
1005 1005 * If metaslab_debug_load is set and we're initializing a metaslab
1006 1006 * that has an allocated space_map object then load its space
1007 1007 * map so that we can verify frees.
1008 1008 */
1009 1009 if (metaslab_debug_load && msp->ms_sm != NULL) {
1010 1010 mutex_enter(&msp->ms_lock);
1011 1011 VERIFY0(metaslab_load(msp));
1012 1012 mutex_exit(&msp->ms_lock);
1013 1013 }
1014 1014
1015 1015 if (txg != 0) {
1016 1016 vdev_dirty(vd, 0, NULL, txg);
1017 1017 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1018 1018 }
1019 1019
1020 1020 return (msp);
1021 1021 }
1022 1022
1023 1023 void
1024 1024 metaslab_fini(metaslab_t *msp)
1025 1025 {
1026 1026 metaslab_group_t *mg = msp->ms_group;
1027 1027
1028 1028 metaslab_group_remove(mg, msp);
1029 1029
1030 1030 mutex_enter(&msp->ms_lock);
1031 1031
1032 1032 VERIFY(msp->ms_group == NULL);
1033 1033 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1034 1034 0, -msp->ms_size);
1035 1035 space_map_close(msp->ms_sm);
1036 1036
1037 1037 metaslab_unload(msp);
1038 1038 range_tree_destroy(msp->ms_tree);
1039 1039
1040 1040 for (int t = 0; t < TXG_SIZE; t++) {
1041 1041 range_tree_destroy(msp->ms_alloctree[t]);
1042 1042 range_tree_destroy(msp->ms_freetree[t]);
1043 1043 }
1044 1044
1045 1045 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1046 1046 range_tree_destroy(msp->ms_defertree[t]);
1047 1047 }
1048 1048
1049 1049 ASSERT0(msp->ms_deferspace);
1050 1050
1051 1051 mutex_exit(&msp->ms_lock);
1052 1052 cv_destroy(&msp->ms_load_cv);
1053 1053 mutex_destroy(&msp->ms_lock);
1054 1054
1055 1055 kmem_free(msp, sizeof (metaslab_t));
1056 1056 }
1057 1057
1058 1058 /*
1059 1059 * Apply a weighting factor based on the histogram information for this
1060 1060 * metaslab. The current weighting factor is somewhat arbitrary and requires
1061 1061 * additional investigation. The implementation provides a measure of
1062 1062 * "weighted" free space and gives a higher weighting for larger contiguous
1063 1063 * regions. The weighting factor is determined by counting the number of
1064 1064 * sm_shift sectors that exist in each region represented by the histogram.
1065 1065 * That value is then multiplied by the power of 2 exponent and the sm_shift
1066 1066 * value.
1067 1067 *
1068 1068 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1069 1069 * metaslab has an sm_shift value of 9 (512B):
1070 1070 *
1071 1071 * 1) calculate the number of sm_shift sectors in the region:
1072 1072 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073 1073 * 2) multiply by the power of 2 exponent and the sm_shift value:
1074 1074 * 16384 * 21 * 9 = 3096576
1075 1075 * This value will be added to the weighting of the metaslab.
1076 1076 */
1077 1077 static uint64_t
1078 1078 metaslab_weight_factor(metaslab_t *msp)
1079 1079 {
1080 1080 uint64_t factor = 0;
1081 1081 uint64_t sectors;
1082 1082 int i;
1083 1083
1084 1084 /*
1085 1085 * A null space map means that the entire metaslab is free, so
1086 1086 * calculate a weight factor that spans the entire size of the
1087 1087 * metaslab.
1088 1088 */
1089 1089 if (msp->ms_sm == NULL) {
1090 1090 vdev_t *vd = msp->ms_group->mg_vd;
1091 1091
1092 - i = highbit(msp->ms_size) - 1;
1092 + i = highbit64(msp->ms_size) - 1;
1093 1093 sectors = msp->ms_size >> vd->vdev_ashift;
1094 1094 return (sectors * i * vd->vdev_ashift);
1095 1095 }
1096 1096
1097 1097 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098 1098 return (0);
1099 1099
1100 1100 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101 1101 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102 1102 continue;
1103 1103
1104 1104 /*
1105 1105 * Determine the number of sm_shift sectors in the region
1106 1106 * indicated by the histogram. For example, given an
1107 1107 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108 1108 * that we're looking at an 8K region in the histogram
1109 1109 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110 1110 * number of sm_shift sectors (512 bytes in this example),
1111 1111 * we would take 8192 / 512 = 16. Since the histogram
1112 1112 * is offset by sm_shift we can simply use the value
1113 1113 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1114 1114 */
1115 1115 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1116 1116 factor += (i + msp->ms_sm->sm_shift) * sectors;
1117 1117 }
1118 1118 return (factor * msp->ms_sm->sm_shift);
1119 1119 }
1120 1120
1121 1121 static uint64_t
1122 1122 metaslab_weight(metaslab_t *msp)
1123 1123 {
1124 1124 metaslab_group_t *mg = msp->ms_group;
1125 1125 vdev_t *vd = mg->mg_vd;
1126 1126 uint64_t weight, space;
1127 1127
1128 1128 ASSERT(MUTEX_HELD(&msp->ms_lock));
1129 1129
1130 1130 /*
1131 1131 * This vdev is in the process of being removed so there is nothing
1132 1132 * for us to do here.
1133 1133 */
1134 1134 if (vd->vdev_removing) {
1135 1135 ASSERT0(space_map_allocated(msp->ms_sm));
1136 1136 ASSERT0(vd->vdev_ms_shift);
1137 1137 return (0);
1138 1138 }
1139 1139
1140 1140 /*
1141 1141 * The baseline weight is the metaslab's free space.
1142 1142 */
1143 1143 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1144 1144 weight = space;
1145 1145
1146 1146 /*
1147 1147 * Modern disks have uniform bit density and constant angular velocity.
1148 1148 * Therefore, the outer recording zones are faster (higher bandwidth)
1149 1149 * than the inner zones by the ratio of outer to inner track diameter,
1150 1150 * which is typically around 2:1. We account for this by assigning
1151 1151 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1152 1152 * In effect, this means that we'll select the metaslab with the most
1153 1153 * free bandwidth rather than simply the one with the most free space.
1154 1154 */
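	/*
	 * A hypothetical example of the multiplier computed below: on a vdev
	 * with vdev_ms_count = 200, metaslab 0 gets weight = 2 * space,
	 * metaslab 50 gets 2 * space - (50 * space) / 200 = 1.75 * space,
	 * and the last metaslab gets just over 1 * space.
	 */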
1155 1155 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1156 1156 ASSERT(weight >= space && weight <= 2 * space);
1157 1157
1158 1158 msp->ms_factor = metaslab_weight_factor(msp);
1159 1159 if (metaslab_weight_factor_enable)
1160 1160 weight += msp->ms_factor;
1161 1161
1162 1162 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1163 1163 /*
1164 1164 * If this metaslab is one we're actively using, adjust its
1165 1165 * weight to make it preferable to any inactive metaslab so
1166 1166 * we'll polish it off.
1167 1167 */
1168 1168 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1169 1169 }
1170 1170
1171 1171 return (weight);
1172 1172 }
1173 1173
1174 1174 static int
1175 1175 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1176 1176 {
1177 1177 ASSERT(MUTEX_HELD(&msp->ms_lock));
1178 1178
1179 1179 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1180 1180 metaslab_load_wait(msp);
1181 1181 if (!msp->ms_loaded) {
1182 1182 int error = metaslab_load(msp);
1183 1183 if (error) {
1184 1184 metaslab_group_sort(msp->ms_group, msp, 0);
1185 1185 return (error);
1186 1186 }
1187 1187 }
1188 1188
1189 1189 metaslab_group_sort(msp->ms_group, msp,
1190 1190 msp->ms_weight | activation_weight);
1191 1191 }
1192 1192 ASSERT(msp->ms_loaded);
1193 1193 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1194 1194
1195 1195 return (0);
1196 1196 }
1197 1197
1198 1198 static void
1199 1199 metaslab_passivate(metaslab_t *msp, uint64_t size)
1200 1200 {
1201 1201 /*
1202 1202 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1203 1203 * this metaslab again. In that case, it had better be empty,
1204 1204 * or we would be leaving space on the table.
1205 1205 */
1206 1206 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1207 1207 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1208 1208 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1209 1209 }
1210 1210
1211 1211 static void
1212 1212 metaslab_preload(void *arg)
1213 1213 {
1214 1214 metaslab_t *msp = arg;
1215 1215 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1216 1216
1217 1217 mutex_enter(&msp->ms_lock);
1218 1218 metaslab_load_wait(msp);
1219 1219 if (!msp->ms_loaded)
1220 1220 (void) metaslab_load(msp);
1221 1221
1222 1222 /*
1223 1223 * Set the ms_access_txg value so that we don't unload it right away.
1224 1224 */
1225 1225 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1226 1226 mutex_exit(&msp->ms_lock);
1227 1227 }
1228 1228
1229 1229 static void
1230 1230 metaslab_group_preload(metaslab_group_t *mg)
1231 1231 {
1232 1232 spa_t *spa = mg->mg_vd->vdev_spa;
1233 1233 metaslab_t *msp;
1234 1234 avl_tree_t *t = &mg->mg_metaslab_tree;
1235 1235 int m = 0;
1236 1236
1237 1237 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1238 1238 taskq_wait(mg->mg_taskq);
1239 1239 return;
1240 1240 }
1241 1241 mutex_enter(&mg->mg_lock);
1242 1242
1243 1243 /*
1244 1244 * Prefetch the next potential metaslabs
1245 1245 */
1246 1246 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1247 1247
1248 1248 /* If we have reached our preload limit then we're done */
1249 1249 if (++m > metaslab_preload_limit)
1250 1250 break;
1251 1251
1252 1252 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1253 1253 msp, TQ_SLEEP) != NULL);
1254 1254 }
1255 1255 mutex_exit(&mg->mg_lock);
1256 1256 }
1257 1257
1258 1258 /*
1259 1259 * Determine if the space map's on-disk footprint is past our tolerance
1260 1260 * for inefficiency. We would like to use the following criteria to make
1261 1261 * our decision:
1262 1262 *
1263 1263 * 1. The size of the space map object should not dramatically increase as a
1264 1264 * result of writing out the free space range tree.
1265 1265 *
1266 1266 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1267 1267 * times the size of the free space range tree representation
1268 1268 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1269 1269 *
1270 1270 * Checking the first condition is tricky since we don't want to walk
1271 1271 * the entire AVL tree calculating the estimated on-disk size. Instead we
1272 1272 * use the size-ordered range tree in the metaslab and calculate the
1273 1273 * size required to write out the largest segment in our free tree. If the
1274 1274 * size required to represent that segment on disk is larger than the space
1275 1275 * map object then we avoid condensing this map.
1276 1276 *
1277 1277 * To determine the second criterion we use a best-case estimate and assume
1278 1278 * each segment can be represented on-disk as a single 64-bit entry. We refer
1279 1279 * to this best-case estimate as the space map's minimal form.
1280 1280 */
1281 1281 static boolean_t
1282 1282 metaslab_should_condense(metaslab_t *msp)
1283 1283 {
1284 1284 space_map_t *sm = msp->ms_sm;
1285 1285 range_seg_t *rs;
1286 1286 uint64_t size, entries, segsz;
1287 1287
1288 1288 ASSERT(MUTEX_HELD(&msp->ms_lock));
1289 1289 ASSERT(msp->ms_loaded);
1290 1290
1291 1291 /*
1292 1292 * Use the ms_size_tree range tree, which is ordered by size, to
1293 1293 * obtain the largest segment in the free tree. If the tree is empty
1294 1294 * then we should condense the map.
1295 1295 */
1296 1296 rs = avl_last(&msp->ms_size_tree);
1297 1297 if (rs == NULL)
1298 1298 return (B_TRUE);
1299 1299
1300 1300 /*
1301 1301 * Calculate the number of 64-bit entries this segment would
1302 1302 * require when written to disk. If this single segment would be
1303 1303 * larger on-disk than the entire current on-disk structure, then
1304 1304 * clearly condensing will increase the on-disk structure size.
1305 1305 */
1306 1306 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1307 1307 entries = size / (MIN(size, SM_RUN_MAX));
1308 1308 segsz = entries * sizeof (uint64_t);
1309 1309
1310 1310 return (segsz <= space_map_length(msp->ms_sm) &&
1311 1311 space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1312 1312 sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1313 1313 }
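/*
 * A sketch of the test above with assumed numbers (illustrative only): with
 * sm_shift = 9 and a largest free segment of 1M, size = 2048 sectors and,
 * assuming SM_RUN_MAX is at least that large, entries = 1, so segsz = 8
 * bytes. With the default zfs_condense_pct of 200 and, say, 10000 segments
 * in the in-core tree, the metaslab is condensed only if the on-disk space
 * map is at least 200 * 8 * 10000 / 100 = 160000 bytes (and at least segsz).
 */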
1314 1314
1315 1315 /*
1316 1316 * Condense the on-disk space map representation to its minimized form.
1317 1317 * The minimized form consists of a small number of allocations followed by
1318 1318 * the entries of the free range tree.
1319 1319 */
1320 1320 static void
1321 1321 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1322 1322 {
1323 1323 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1324 1324 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1325 1325 range_tree_t *condense_tree;
1326 1326 space_map_t *sm = msp->ms_sm;
1327 1327
1328 1328 ASSERT(MUTEX_HELD(&msp->ms_lock));
1329 1329 ASSERT3U(spa_sync_pass(spa), ==, 1);
1330 1330 ASSERT(msp->ms_loaded);
1331 1331
1332 1332 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1333 1333 "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1334 1334 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1335 1335
1336 1336 /*
1337 1337 * Create a range tree that is 100% allocated. We remove segments
1338 1338 * that have been freed in this txg, any deferred frees that exist,
1339 1339 * and any allocation in the future. Removing segments should be
1340 1340 * a relatively inexpensive operation since we expect these trees to
1341 1341 * have a small number of nodes.
1342 1342 */
1343 1343 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1344 1344 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1345 1345
1346 1346 /*
1347 1347 * Remove what's been freed in this txg from the condense_tree.
1348 1348 * Since we're in sync_pass 1, we know that all the frees from
1349 1349 * this txg are in the freetree.
1350 1350 */
1351 1351 range_tree_walk(freetree, range_tree_remove, condense_tree);
1352 1352
1353 1353 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1354 1354 range_tree_walk(msp->ms_defertree[t],
1355 1355 range_tree_remove, condense_tree);
1356 1356 }
1357 1357
1358 1358 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1359 1359 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1360 1360 range_tree_remove, condense_tree);
1361 1361 }
1362 1362
1363 1363 /*
1364 1364 * We're about to drop the metaslab's lock thus allowing
1365 1365 * other consumers to change its content. Set the
1366 1366 * metaslab's ms_condensing flag to ensure that
1367 1367 * allocations on this metaslab do not occur while we're
1368 1368 * in the middle of committing it to disk. This is only critical
1369 1369 * for the ms_tree as all other range trees use per txg
1370 1370 * views of their content.
1371 1371 */
1372 1372 msp->ms_condensing = B_TRUE;
1373 1373
1374 1374 mutex_exit(&msp->ms_lock);
1375 1375 space_map_truncate(sm, tx);
1376 1376 mutex_enter(&msp->ms_lock);
1377 1377
1378 1378 /*
1379 1379 * While we would ideally like to create a space_map representation
1380 1380 * that consists only of allocation records, doing so can be
1381 1381 * prohibitively expensive because the in-core free tree can be
1382 1382 * large, and therefore computationally expensive to subtract
1383 1383 * from the condense_tree. Instead we sync out two trees, a cheap
1384 1384 * allocation only tree followed by the in-core free tree. While not
1385 1385 * optimal, this is typically close to optimal, and much cheaper to
1386 1386 * compute.
1387 1387 */
1388 1388 space_map_write(sm, condense_tree, SM_ALLOC, tx);
1389 1389 range_tree_vacate(condense_tree, NULL, NULL);
1390 1390 range_tree_destroy(condense_tree);
1391 1391
1392 1392 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1393 1393 msp->ms_condensing = B_FALSE;
1394 1394 }
1395 1395
1396 1396 /*
1397 1397 * Write a metaslab to disk in the context of the specified transaction group.
1398 1398 */
1399 1399 void
1400 1400 metaslab_sync(metaslab_t *msp, uint64_t txg)
1401 1401 {
1402 1402 metaslab_group_t *mg = msp->ms_group;
1403 1403 vdev_t *vd = mg->mg_vd;
1404 1404 spa_t *spa = vd->vdev_spa;
1405 1405 objset_t *mos = spa_meta_objset(spa);
1406 1406 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1407 1407 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1408 1408 range_tree_t **freed_tree =
1409 1409 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1410 1410 dmu_tx_t *tx;
1411 1411 uint64_t object = space_map_object(msp->ms_sm);
1412 1412
1413 1413 ASSERT(!vd->vdev_ishole);
1414 1414
1415 1415 /*
1416 1416 * This metaslab has just been added so there's no work to do now.
1417 1417 */
1418 1418 if (*freetree == NULL) {
1419 1419 ASSERT3P(alloctree, ==, NULL);
1420 1420 return;
1421 1421 }
1422 1422
1423 1423 ASSERT3P(alloctree, !=, NULL);
1424 1424 ASSERT3P(*freetree, !=, NULL);
1425 1425 ASSERT3P(*freed_tree, !=, NULL);
1426 1426
1427 1427 if (range_tree_space(alloctree) == 0 &&
1428 1428 range_tree_space(*freetree) == 0)
1429 1429 return;
1430 1430
1431 1431 /*
1432 1432 * The only state that can actually be changing concurrently with
1433 1433 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1434 1434 * be modifying this txg's alloctree, freetree, freed_tree, or
1435 1435 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1436 1436 * space_map ASSERTs. We drop it whenever we call into the DMU,
1437 1437 * because the DMU can call down to us (e.g. via zio_free()) at
1438 1438 * any time.
1439 1439 */
1440 1440
1441 1441 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1442 1442
1443 1443 if (msp->ms_sm == NULL) {
1444 1444 uint64_t new_object;
1445 1445
1446 1446 new_object = space_map_alloc(mos, tx);
1447 1447 VERIFY3U(new_object, !=, 0);
1448 1448
1449 1449 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1450 1450 msp->ms_start, msp->ms_size, vd->vdev_ashift,
1451 1451 &msp->ms_lock));
1452 1452 ASSERT(msp->ms_sm != NULL);
1453 1453 }
1454 1454
1455 1455 mutex_enter(&msp->ms_lock);
1456 1456
1457 1457 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1458 1458 metaslab_should_condense(msp)) {
1459 1459 metaslab_condense(msp, txg, tx);
1460 1460 } else {
1461 1461 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1462 1462 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1463 1463 }
1464 1464
1465 1465 range_tree_vacate(alloctree, NULL, NULL);
1466 1466
1467 1467 if (msp->ms_loaded) {
1468 1468 /*
1469 1469 * When the space map is loaded, we have an accurate
1470 1470 * histogram in the range tree. This gives us an opportunity
1471 1471 * to bring the space map's histogram up-to-date so we clear
1472 1472 * it first before updating it.
1473 1473 */
1474 1474 space_map_histogram_clear(msp->ms_sm);
1475 1475 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1476 1476 } else {
1477 1477 /*
1478 1478 * Since the space map is not loaded we simply update the
1479 1479 * existing histogram with what was freed in this txg. This
1480 1480 * means that the on-disk histogram may not have an accurate
1481 1481 * view of the free space but it's close enough to allow
1482 1482 * us to make allocation decisions.
1483 1483 */
1484 1484 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1485 1485 }
1486 1486
1487 1487 /*
1488 1488 * For sync pass 1, we avoid traversing this txg's free range tree
1489 1489 * and instead will just swap the pointers for freetree and
1490 1490 * freed_tree. We can safely do this since the freed_tree is
1491 1491 * guaranteed to be empty on the initial pass.
1492 1492 */
1493 1493 if (spa_sync_pass(spa) == 1) {
1494 1494 range_tree_swap(freetree, freed_tree);
1495 1495 } else {
1496 1496 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1497 1497 }
1498 1498
1499 1499 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1500 1500 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1501 1501
1502 1502 mutex_exit(&msp->ms_lock);
1503 1503
1504 1504 if (object != space_map_object(msp->ms_sm)) {
1505 1505 object = space_map_object(msp->ms_sm);
1506 1506 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1507 1507 msp->ms_id, sizeof (uint64_t), &object, tx);
1508 1508 }
1509 1509 dmu_tx_commit(tx);
1510 1510 }
1511 1511
1512 1512 /*
1513 1513 * Called after a transaction group has completely synced to mark
1514 1514 * all of the metaslab's free space as usable.
1515 1515 */
1516 1516 void
1517 1517 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1518 1518 {
1519 1519 metaslab_group_t *mg = msp->ms_group;
1520 1520 vdev_t *vd = mg->mg_vd;
1521 1521 range_tree_t **freed_tree;
1522 1522 range_tree_t **defer_tree;
1523 1523 int64_t alloc_delta, defer_delta;
1524 1524
1525 1525 ASSERT(!vd->vdev_ishole);
1526 1526
1527 1527 mutex_enter(&msp->ms_lock);
1528 1528
1529 1529 /*
1530 1530 * If this metaslab is just becoming available, initialize its
1531 1531 * alloctrees, freetrees, and defertree and add its capacity to
1532 1532 * the vdev.
1533 1533 */
1534 1534 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1535 1535 for (int t = 0; t < TXG_SIZE; t++) {
1536 1536 ASSERT(msp->ms_alloctree[t] == NULL);
1537 1537 ASSERT(msp->ms_freetree[t] == NULL);
1538 1538
1539 1539 msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1540 1540 &msp->ms_lock);
1541 1541 msp->ms_freetree[t] = range_tree_create(NULL, msp,
1542 1542 &msp->ms_lock);
1543 1543 }
1544 1544
1545 1545 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1546 1546 ASSERT(msp->ms_defertree[t] == NULL);
1547 1547
1548 1548 msp->ms_defertree[t] = range_tree_create(NULL, msp,
1549 1549 &msp->ms_lock);
1550 1550 }
1551 1551
1552 1552 vdev_space_update(vd, 0, 0, msp->ms_size);
1553 1553 }
1554 1554
1555 1555 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1556 1556 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1557 1557
1558 1558 alloc_delta = space_map_alloc_delta(msp->ms_sm);
1559 1559 defer_delta = range_tree_space(*freed_tree) -
1560 1560 range_tree_space(*defer_tree);
1561 1561
1562 1562 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1563 1563
1564 1564 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1565 1565 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1566 1566
1567 1567 /*
1568 1568 * If there's a metaslab_load() in progress, wait for it to complete
1569 1569 * so that we have a consistent view of the in-core space map.
1570 1570 */
1571 1571 metaslab_load_wait(msp);
1572 1572
1573 1573 /*
1574 1574 * Move the frees from the defer_tree back to the free
1575 1575 * range tree (if it's loaded). Swap the freed_tree and the
1576 1576 * defer_tree -- this is safe to do because we've just emptied out
1577 1577 * the defer_tree.
1578 1578 */
1579 1579 range_tree_vacate(*defer_tree,
1580 1580 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1581 1581 range_tree_swap(freed_tree, defer_tree);
1582 1582
1583 1583 space_map_update(msp->ms_sm);
1584 1584
1585 1585 msp->ms_deferspace += defer_delta;
1586 1586 ASSERT3S(msp->ms_deferspace, >=, 0);
1587 1587 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1588 1588 if (msp->ms_deferspace != 0) {
1589 1589 /*
1590 1590 * Keep syncing this metaslab until all deferred frees
1591 1591 * are back in circulation.
1592 1592 */
1593 1593 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1594 1594 }
1595 1595
1596 1596 if (msp->ms_loaded && msp->ms_access_txg < txg) {
1597 1597 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1598 1598 VERIFY0(range_tree_space(
1599 1599 msp->ms_alloctree[(txg + t) & TXG_MASK]));
1600 1600 }
1601 1601
1602 1602 if (!metaslab_debug_unload)
1603 1603 metaslab_unload(msp);
1604 1604 }
1605 1605
1606 1606 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1607 1607 mutex_exit(&msp->ms_lock);
1608 1608
1609 1609 }
1610 1610
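The defer rotation at the end of metaslab_sync_done() can be sketched in isolation. Assuming the stock TXG_DEFER_SIZE of 2, a segment freed while txg N syncs only returns to ms_tree when txg N + 2 completes:

#include <stdio.h>

#define	TXG_DEFER_SIZE	2	/* assumed, matching stock ZFS */

int
main(void)
{
	for (unsigned long txg = 100; txg < 105; txg++) {
		printf("txg %lu done: defertree[%lu] releases frees from "
		    "txg %lu and takes over this txg's freed tree\n",
		    txg, txg % TXG_DEFER_SIZE, txg - TXG_DEFER_SIZE);
	}
	return (0);
}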
1611 1611 void
1612 1612 metaslab_sync_reassess(metaslab_group_t *mg)
1613 1613 {
1614 1614 int64_t failures = mg->mg_alloc_failures;
1615 1615
1616 1616 metaslab_group_alloc_update(mg);
1617 1617 atomic_add_64(&mg->mg_alloc_failures, -failures);
1618 1618
1619 1619 /*
1620 1620 * Preload the next potential metaslabs
1621 1621 */
1622 1622 metaslab_group_preload(mg);
1623 1623 }
1624 1624
1625 1625 static uint64_t
1626 1626 metaslab_distance(metaslab_t *msp, dva_t *dva)
1627 1627 {
1628 1628 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1629 1629 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1630 1630 uint64_t start = msp->ms_id;
1631 1631
1632 1632 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1633 1633 return (1ULL << 63);
1634 1634
1635 1635 if (offset < start)
1636 1636 return ((start - offset) << ms_shift);
1637 1637 if (offset > start)
1638 1638 return ((offset - start) << ms_shift);
1639 1639 return (0);
1640 1640 }
1641 1641
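The arithmetic in metaslab_distance() works in units of 1 << ms_shift and converts back to bytes on the way out. A standalone sketch with made-up values (not part of the change):

#include <stdio.h>
#include <stdint.h>

static uint64_t
distance_bytes(uint64_t dva_offset, uint64_t ms_id, uint64_t ms_shift)
{
	uint64_t offset = dva_offset >> ms_shift;	/* DVA's metaslab index */
	uint64_t start = ms_id;				/* candidate's index */

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

int
main(void)
{
	/* 512MB metaslabs (shift 29), DVA at 3GB, candidate metaslab #10 */
	printf("%llu bytes apart\n", (unsigned long long)
	    distance_bytes(3ULL << 30, 10, 29));	/* prints 2147483648 */
	return (0);
}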
1642 1642 static uint64_t
1643 1643 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1644 1644 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1645 1645 {
1646 1646 spa_t *spa = mg->mg_vd->vdev_spa;
1647 1647 metaslab_t *msp = NULL;
1648 1648 uint64_t offset = -1ULL;
1649 1649 avl_tree_t *t = &mg->mg_metaslab_tree;
1650 1650 uint64_t activation_weight;
1651 1651 uint64_t target_distance;
1652 1652 int i;
1653 1653
1654 1654 activation_weight = METASLAB_WEIGHT_PRIMARY;
1655 1655 for (i = 0; i < d; i++) {
1656 1656 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1657 1657 activation_weight = METASLAB_WEIGHT_SECONDARY;
1658 1658 break;
1659 1659 }
1660 1660 }
1661 1661
1662 1662 for (;;) {
1663 1663 boolean_t was_active;
1664 1664
1665 1665 mutex_enter(&mg->mg_lock);
1666 1666 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1667 1667 if (msp->ms_weight < asize) {
1668 1668 spa_dbgmsg(spa, "%s: failed to meet weight "
1669 1669 "requirement: vdev %llu, txg %llu, mg %p, "
1670 1670 "msp %p, psize %llu, asize %llu, "
1671 1671 "failures %llu, weight %llu",
1672 1672 spa_name(spa), mg->mg_vd->vdev_id, txg,
1673 1673 mg, msp, psize, asize,
1674 1674 mg->mg_alloc_failures, msp->ms_weight);
1675 1675 mutex_exit(&mg->mg_lock);
1676 1676 return (-1ULL);
1677 1677 }
1678 1678
1679 1679 /*
1680 1680 * If the selected metaslab is condensing, skip it.
1681 1681 */
1682 1682 if (msp->ms_condensing)
1683 1683 continue;
1684 1684
1685 1685 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1686 1686 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1687 1687 break;
1688 1688
1689 1689 target_distance = min_distance +
1690 1690 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1691 1691 min_distance >> 1);
1692 1692
1693 1693 for (i = 0; i < d; i++)
1694 1694 if (metaslab_distance(msp, &dva[i]) <
1695 1695 target_distance)
1696 1696 break;
1697 1697 if (i == d)
1698 1698 break;
1699 1699 }
1700 1700 mutex_exit(&mg->mg_lock);
1701 1701 if (msp == NULL)
1702 1702 return (-1ULL);
1703 1703
1704 1704 mutex_enter(&msp->ms_lock);
1705 1705
1706 1706 /*
1707 1707 * If we've already reached the allowable number of failed
1708 1708 * allocation attempts on this metaslab group then we
1709 1709 * consider skipping it. We skip it only if we're allowed
1710 1710 * to "fast" gang, the physical size is larger than
1711 1711 * a gang block, and we're attempting to allocate from
1712 1712 * the primary metaslab.
1713 1713 */
1714 1714 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1715 1715 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1716 1716 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1717 1717 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1718 1718 "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
1719 1719 "psize %llu, asize %llu, failures %llu",
1720 1720 spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
1721 1721 msp->ms_id, msp, psize, asize,
1722 1722 mg->mg_alloc_failures);
1723 1723 mutex_exit(&msp->ms_lock);
1724 1724 return (-1ULL);
1725 1725 }
1726 1726
1727 1727 /*
1728 1728 * Ensure that the metaslab we have selected is still
1729 1729 * capable of handling our request. It's possible that
1730 1730 * another thread may have changed the weight while we
1731 1731 * were blocked on the metaslab lock.
1732 1732 */
1733 1733 if (msp->ms_weight < asize || (was_active &&
1734 1734 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1735 1735 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1736 1736 mutex_exit(&msp->ms_lock);
1737 1737 continue;
1738 1738 }
1739 1739
1740 1740 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1741 1741 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1742 1742 metaslab_passivate(msp,
1743 1743 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1744 1744 mutex_exit(&msp->ms_lock);
1745 1745 continue;
1746 1746 }
1747 1747
1748 1748 if (metaslab_activate(msp, activation_weight) != 0) {
1749 1749 mutex_exit(&msp->ms_lock);
1750 1750 continue;
1751 1751 }
1752 1752
1753 1753 /*
1754 1754 * If this metaslab is currently condensing then pick again as
1755 1755 * we can't manipulate this metaslab until it's committed
1756 1756 * to disk.
1757 1757 */
1758 1758 if (msp->ms_condensing) {
1759 1759 mutex_exit(&msp->ms_lock);
1760 1760 continue;
1761 1761 }
1762 1762
1763 1763 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1764 1764 break;
1765 1765
1766 1766 atomic_inc_64(&mg->mg_alloc_failures);
1767 1767
1768 1768 metaslab_passivate(msp, metaslab_block_maxsize(msp));
1769 1769 mutex_exit(&msp->ms_lock);
1770 1770 }
1771 1771
1772 1772 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1773 1773 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1774 1774
1775 1775 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1776 1776 msp->ms_access_txg = txg + metaslab_unload_delay;
1777 1777
1778 1778 mutex_exit(&msp->ms_lock);
1779 1779
1780 1780 return (offset);
1781 1781 }
1782 1782
1783 1783 /*
1784 1784 * Allocate a block for the specified i/o.
1785 1785 */
1786 1786 static int
1787 1787 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1788 1788 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1789 1789 {
1790 1790 metaslab_group_t *mg, *rotor;
1791 1791 vdev_t *vd;
1792 1792 int dshift = 3;
1793 1793 int all_zero;
1794 1794 int zio_lock = B_FALSE;
1795 1795 boolean_t allocatable;
1796 1796 uint64_t offset = -1ULL;
1797 1797 uint64_t asize;
1798 1798 uint64_t distance;
1799 1799
1800 1800 ASSERT(!DVA_IS_VALID(&dva[d]));
1801 1801
1802 1802 /*
1803 1803 * For testing, make some blocks above a certain size be gang blocks.
1804 1804 */
1805 1805 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1806 1806 return (SET_ERROR(ENOSPC));
1807 1807
1808 1808 /*
1809 1809 * Start at the rotor and loop through all mgs until we find something.
1810 1810 * Note that there's no locking on mc_rotor or mc_aliquot because
1811 1811 * nothing actually breaks if we miss a few updates -- we just won't
1812 1812 * allocate quite as evenly. It all balances out over time.
1813 1813 *
1814 1814 * If we are doing ditto or log blocks, try to spread them across
1815 1815 * consecutive vdevs. If we're forced to reuse a vdev before we've
1816 1816 * allocated all of our ditto blocks, then try and spread them out on
1817 1817 * that vdev as much as possible. If it turns out to not be possible,
1818 1818 * gradually lower our standards until anything becomes acceptable.
1819 1819 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1820 1820 * gives us hope of containing our fault domains to something we're
1821 1821 * able to reason about. Otherwise, any two top-level vdev failures
1822 1822 * will guarantee the loss of data. With consecutive allocation,
1823 1823 * only two adjacent top-level vdev failures will result in data loss.
1824 1824 *
1825 1825 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1826 1826 * ourselves on the same vdev as our gang block header. That
1827 1827 * way, we can hope for locality in vdev_cache, plus it makes our
1828 1828 * fault domains something tractable.
1829 1829 */
1830 1830 if (hintdva) {
1831 1831 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1832 1832
1833 1833 /*
1834 1834 * It's possible the vdev we're using as the hint no
1835 1835 * longer exists (i.e. removed). Consult the rotor when
1836 1836 * all else fails.
1837 1837 */
1838 1838 if (vd != NULL) {
1839 1839 mg = vd->vdev_mg;
1840 1840
1841 1841 if (flags & METASLAB_HINTBP_AVOID &&
1842 1842 mg->mg_next != NULL)
1843 1843 mg = mg->mg_next;
1844 1844 } else {
1845 1845 mg = mc->mc_rotor;
1846 1846 }
1847 1847 } else if (d != 0) {
1848 1848 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1849 1849 mg = vd->vdev_mg->mg_next;
1850 1850 } else {
1851 1851 mg = mc->mc_rotor;
1852 1852 }
1853 1853
1854 1854 /*
1855 1855 * If the hint put us into the wrong metaslab class, or into a
1856 1856 * metaslab group that has been passivated, just follow the rotor.
1857 1857 */
1858 1858 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1859 1859 mg = mc->mc_rotor;
1860 1860
1861 1861 rotor = mg;
1862 1862 top:
1863 1863 all_zero = B_TRUE;
1864 1864 do {
1865 1865 ASSERT(mg->mg_activation_count == 1);
1866 1866
1867 1867 vd = mg->mg_vd;
1868 1868
1869 1869 /*
1870 1870 * Don't allocate from faulted devices.
1871 1871 */
1872 1872 if (zio_lock) {
1873 1873 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1874 1874 allocatable = vdev_allocatable(vd);
1875 1875 spa_config_exit(spa, SCL_ZIO, FTAG);
1876 1876 } else {
1877 1877 allocatable = vdev_allocatable(vd);
1878 1878 }
1879 1879
1880 1880 /*
1881 1881 * Determine if the selected metaslab group is eligible
1882 1882 * for allocations. If we're ganging or have requested
1883 1883 * an allocation for the smallest gang block size
1884 1884 * then we don't want to avoid allocating to this
1885 1885 * metaslab group. If we're in this condition we should
1886 1886 * try to allocate from any device possible so that we
1887 1887 * don't inadvertently return ENOSPC and suspend the pool
1888 1888 * even though space is still available.
1889 1889 */
1890 1890 if (allocatable && CAN_FASTGANG(flags) &&
1891 1891 psize > SPA_GANGBLOCKSIZE)
1892 1892 allocatable = metaslab_group_allocatable(mg);
1893 1893
1894 1894 if (!allocatable)
1895 1895 goto next;
1896 1896
1897 1897 /*
1898 1898 * Avoid writing single-copy data to a failing vdev
1899 1899 * unless the user instructs us that it is okay.
1900 1900 */
1901 1901 if ((vd->vdev_stat.vs_write_errors > 0 ||
1902 1902 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1903 1903 d == 0 && dshift == 3 &&
1904 1904 !(zfs_write_to_degraded && vd->vdev_state ==
1905 1905 VDEV_STATE_DEGRADED)) {
1906 1906 all_zero = B_FALSE;
1907 1907 goto next;
1908 1908 }
1909 1909
1910 1910 ASSERT(mg->mg_class == mc);
1911 1911
1912 1912 distance = vd->vdev_asize >> dshift;
1913 1913 if (distance <= (1ULL << vd->vdev_ms_shift))
1914 1914 distance = 0;
1915 1915 else
1916 1916 all_zero = B_FALSE;
1917 1917
1918 1918 asize = vdev_psize_to_asize(vd, psize);
1919 1919 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1920 1920
1921 1921 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1922 1922 dva, d, flags);
1923 1923 if (offset != -1ULL) {
1924 1924 /*
1925 1925 * If we've just selected this metaslab group,
1926 1926 * figure out whether the corresponding vdev is
1927 1927 * over- or under-used relative to the pool,
1928 1928 * and set an allocation bias to even it out.
1929 1929 */
1930 1930 if (mc->mc_aliquot == 0) {
1931 1931 vdev_stat_t *vs = &vd->vdev_stat;
1932 1932 int64_t vu, cu;
1933 1933
1934 1934 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1935 1935 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1936 1936
1937 1937 /*
1938 1938 * Calculate how much more or less we should
1939 1939 * try to allocate from this device during
1940 1940 * this iteration around the rotor.
1941 1941 * For example, if a device is 80% full
1942 1942 * and the pool is 20% full then we should
1943 1943 * reduce allocations by 60% on this device.
1944 1944 *
1945 1945 * mg_bias = (20 - 80) * 512K / 100 = -307K
1946 1946 *
1947 1947 * This reduces allocations by 307K for this
1948 1948 * iteration.
1949 1949 */
1950 1950 mg->mg_bias = ((cu - vu) *
1951 1951 (int64_t)mg->mg_aliquot) / 100;
1952 1952 }
1953 1953
1954 1954 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1955 1955 mg->mg_aliquot + mg->mg_bias) {
1956 1956 mc->mc_rotor = mg->mg_next;
1957 1957 mc->mc_aliquot = 0;
1958 1958 }
1959 1959
1960 1960 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1961 1961 DVA_SET_OFFSET(&dva[d], offset);
1962 1962 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1963 1963 DVA_SET_ASIZE(&dva[d], asize);
1964 1964
1965 1965 return (0);
1966 1966 }
1967 1967 next:
1968 1968 mc->mc_rotor = mg->mg_next;
1969 1969 mc->mc_aliquot = 0;
1970 1970 } while ((mg = mg->mg_next) != rotor);
1971 1971
1972 1972 if (!all_zero) {
1973 1973 dshift++;
1974 1974 ASSERT(dshift < 64);
1975 1975 goto top;
1976 1976 }
1977 1977
1978 1978 if (!allocatable && !zio_lock) {
1979 1979 dshift = 3;
1980 1980 zio_lock = B_TRUE;
1981 1981 goto top;
1982 1982 }
1983 1983
1984 1984 bzero(&dva[d], sizeof (dva_t));
1985 1985
1986 1986 return (SET_ERROR(ENOSPC));
1987 1987 }
1988 1988
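The do/while loop in metaslab_alloc_dva() walks the metaslab groups as a circular list, starting wherever the class rotor points and visiting each group exactly once. A toy sketch of that traversal idiom, with invented types and values:

#include <stdio.h>

typedef struct group {
	int		id;
	struct group	*next;
} group_t;

int
main(void)
{
	group_t g[3] = { { 0, &g[1] }, { 1, &g[2] }, { 2, &g[0] } };
	group_t *rotor = &g[1];		/* wherever the last pass left off */
	group_t *mg = rotor;

	do {
		printf("trying group %d\n", mg->id);	/* 1, 2, 0 */
	} while ((mg = mg->next) != rotor);
	return (0);
}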
1989 1989 /*
1990 1990 * Free the block represented by DVA in the context of the specified
1991 1991 * transaction group.
1992 1992 */
1993 1993 static void
1994 1994 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1995 1995 {
1996 1996 uint64_t vdev = DVA_GET_VDEV(dva);
1997 1997 uint64_t offset = DVA_GET_OFFSET(dva);
1998 1998 uint64_t size = DVA_GET_ASIZE(dva);
1999 1999 vdev_t *vd;
2000 2000 metaslab_t *msp;
2001 2001
2002 2002 ASSERT(DVA_IS_VALID(dva));
2003 2003
2004 2004 if (txg > spa_freeze_txg(spa))
2005 2005 return;
2006 2006
2007 2007 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2008 2008 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2009 2009 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2010 2010 (u_longlong_t)vdev, (u_longlong_t)offset);
2011 2011 ASSERT(0);
2012 2012 return;
2013 2013 }
2014 2014
2015 2015 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2016 2016
2017 2017 if (DVA_GET_GANG(dva))
2018 2018 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2019 2019
2020 2020 mutex_enter(&msp->ms_lock);
2021 2021
2022 2022 if (now) {
2023 2023 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2024 2024 offset, size);
2025 2025
2026 2026 VERIFY(!msp->ms_condensing);
2027 2027 VERIFY3U(offset, >=, msp->ms_start);
2028 2028 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2029 2029 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2030 2030 msp->ms_size);
2031 2031 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2032 2032 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2033 2033 range_tree_add(msp->ms_tree, offset, size);
2034 2034 } else {
2035 2035 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2036 2036 vdev_dirty(vd, VDD_METASLAB, msp, txg);
2037 2037 range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2038 2038 offset, size);
2039 2039 }
2040 2040
2041 2041 mutex_exit(&msp->ms_lock);
2042 2042 }
2043 2043
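The per-txg alloctrees and freetrees indexed with txg & TXG_MASK above form a small ring. Assuming the stock TXG_SIZE of 4, consecutive txgs simply reuse the four slots in rotation:

#include <stdio.h>

#define	TXG_SIZE	4	/* assumed, matching stock ZFS */
#define	TXG_MASK	(TXG_SIZE - 1)

int
main(void)
{
	for (unsigned long txg = 100; txg < 106; txg++)
		printf("txg %lu -> ms_freetree[%lu]\n", txg, txg & TXG_MASK);
	return (0);
}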
2044 2044 /*
2045 2045 * Intent log support: upon opening the pool after a crash, notify the SPA
2046 2046 * of blocks that the intent log has allocated for immediate write, but
2047 2047 * which are still considered free by the SPA because the last transaction
2048 2048 * group didn't commit yet.
2049 2049 */
2050 2050 static int
2051 2051 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2052 2052 {
2053 2053 uint64_t vdev = DVA_GET_VDEV(dva);
2054 2054 uint64_t offset = DVA_GET_OFFSET(dva);
2055 2055 uint64_t size = DVA_GET_ASIZE(dva);
2056 2056 vdev_t *vd;
2057 2057 metaslab_t *msp;
2058 2058 int error = 0;
2059 2059
2060 2060 ASSERT(DVA_IS_VALID(dva));
2061 2061
2062 2062 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2063 2063 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2064 2064 return (SET_ERROR(ENXIO));
2065 2065
2066 2066 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2067 2067
2068 2068 if (DVA_GET_GANG(dva))
2069 2069 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2070 2070
2071 2071 mutex_enter(&msp->ms_lock);
2072 2072
2073 2073 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2074 2074 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2075 2075
2076 2076 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2077 2077 error = SET_ERROR(ENOENT);
2078 2078
2079 2079 if (error || txg == 0) { /* txg == 0 indicates dry run */
2080 2080 mutex_exit(&msp->ms_lock);
2081 2081 return (error);
2082 2082 }
2083 2083
2084 2084 VERIFY(!msp->ms_condensing);
2085 2085 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2086 2086 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2087 2087 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2088 2088 range_tree_remove(msp->ms_tree, offset, size);
2089 2089
2090 2090 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
2091 2091 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2092 2092 vdev_dirty(vd, VDD_METASLAB, msp, txg);
2093 2093 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2094 2094 }
2095 2095
2096 2096 mutex_exit(&msp->ms_lock);
2097 2097
2098 2098 return (0);
2099 2099 }
2100 2100
2101 2101 int
2102 2102 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2103 2103 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2104 2104 {
2105 2105 dva_t *dva = bp->blk_dva;
2106 2106 dva_t *hintdva = hintbp->blk_dva;
2107 2107 int error = 0;
2108 2108
2109 2109 ASSERT(bp->blk_birth == 0);
2110 2110 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2111 2111
2112 2112 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2113 2113
2114 2114 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
2115 2115 spa_config_exit(spa, SCL_ALLOC, FTAG);
2116 2116 return (SET_ERROR(ENOSPC));
2117 2117 }
2118 2118
2119 2119 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2120 2120 ASSERT(BP_GET_NDVAS(bp) == 0);
2121 2121 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2122 2122
2123 2123 for (int d = 0; d < ndvas; d++) {
2124 2124 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2125 2125 txg, flags);
2126 2126 if (error != 0) {
2127 2127 for (d--; d >= 0; d--) {
2128 2128 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2129 2129 bzero(&dva[d], sizeof (dva_t));
2130 2130 }
2131 2131 spa_config_exit(spa, SCL_ALLOC, FTAG);
2132 2132 return (error);
2133 2133 }
2134 2134 }
2135 2135 ASSERT(error == 0);
2136 2136 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2137 2137
2138 2138 spa_config_exit(spa, SCL_ALLOC, FTAG);
2139 2139
2140 2140 BP_SET_BIRTH(bp, txg, txg);
2141 2141
2142 2142 return (0);
2143 2143 }
2144 2144
2145 2145 void
2146 2146 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2147 2147 {
2148 2148 const dva_t *dva = bp->blk_dva;
2149 2149 int ndvas = BP_GET_NDVAS(bp);
2150 2150
2151 2151 ASSERT(!BP_IS_HOLE(bp));
2152 2152 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2153 2153
2154 2154 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2155 2155
2156 2156 for (int d = 0; d < ndvas; d++)
2157 2157 metaslab_free_dva(spa, &dva[d], txg, now);
2158 2158
2159 2159 spa_config_exit(spa, SCL_FREE, FTAG);
2160 2160 }
2161 2161
2162 2162 int
2163 2163 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2164 2164 {
2165 2165 const dva_t *dva = bp->blk_dva;
2166 2166 int ndvas = BP_GET_NDVAS(bp);
2167 2167 int error = 0;
2168 2168
2169 2169 ASSERT(!BP_IS_HOLE(bp));
2170 2170
2171 2171 if (txg != 0) {
2172 2172 /*
2173 2173 * First do a dry run to make sure all DVAs are claimable,
2174 2174 * so we don't have to unwind from partial failures below.
2175 2175 */
2176 2176 if ((error = metaslab_claim(spa, bp, 0)) != 0)
2177 2177 return (error);
2178 2178 }
2179 2179
2180 2180 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2181 2181
2182 2182 for (int d = 0; d < ndvas; d++)
2183 2183 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2184 2184 break;
2185 2185
2186 2186 spa_config_exit(spa, SCL_ALLOC, FTAG);
2187 2187
2188 2188 ASSERT(error == 0 || txg == 0);
2189 2189
2190 2190 return (error);
2191 2191 }
2192 2192
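metaslab_claim() relies on a dry-run-then-commit idiom: the txg == 0 pass validates every DVA with no side effects, so the real pass never has to unwind a partial failure. A sketch of the same pattern, with names invented purely for illustration:

#include <stdio.h>

static int
claim_one(int dva, int commit)
{
	if (dva == 3)			/* pretend DVA 3 is not claimable */
		return (-1);
	if (commit)
		printf("claimed DVA %d\n", dva);
	return (0);
}

static int
claim_all(const int *dvas, int n, int commit)
{
	for (int i = 0; i < n; i++)
		if (claim_one(dvas[i], commit) != 0)
			return (-1);
	return (0);
}

int
main(void)
{
	int dvas[] = { 1, 2, 3 };

	if (claim_all(dvas, 3, 0) != 0) {	/* dry run fails on DVA 3 */
		printf("dry run failed; nothing was modified\n");
		return (1);
	}
	return (claim_all(dvas, 3, 1));
}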
2193 2193 void
2194 2194 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2195 2195 {
2196 2196 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2197 2197 return;
2198 2198
2199 2199 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2200 2200 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2201 2201 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2202 2202 vdev_t *vd = vdev_lookup_top(spa, vdev);
2203 2203 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2204 2204 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2205 2205 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2206 2206
2207 2207 if (msp->ms_loaded)
2208 2208 range_tree_verify(msp->ms_tree, offset, size);
2209 2209
2210 2210 for (int j = 0; j < TXG_SIZE; j++)
2211 2211 range_tree_verify(msp->ms_freetree[j], offset, size);
2212 2212 for (int j = 0; j < TXG_DEFER_SIZE; j++)
2213 2213 range_tree_verify(msp->ms_defertree[j], offset, size);
2214 2214 }
2215 2215 spa_config_exit(spa, SCL_VDEV, FTAG);
2216 2216 }
|
↓ open down ↓ |
1114 lines elided |
↑ open up ↑ |