10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
    
      
    
          --- old/usr/src/uts/common/fs/zfs/space_map.c
          +++ new/usr/src/uts/common/fs/zfs/space_map.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  /*
  26      - * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
       26 + * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  #include <sys/zfs_context.h>
  30   30  #include <sys/spa.h>
  31   31  #include <sys/dmu.h>
  32   32  #include <sys/dmu_tx.h>
  33   33  #include <sys/dnode.h>
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zio.h>
  36   36  #include <sys/space_map.h>
  37   37  #include <sys/refcount.h>
  38   38  #include <sys/zfeature.h>
  39   39  
  40   40  /*
  41   41   * Note on space map block size:
  42   42   *
  43   43   * The data for a given space map can be kept on blocks of any size.
  44   44   * Larger blocks entail fewer I/O operations, but they also cause the
  45   45   * DMU to keep more data in-core, and also to waste more I/O bandwidth
  46   46   * when only a few blocks have changed since the last transaction group.
  47   47   */
  48   48  
  49   49  /*
  50   50   * Enabled whenever we want to stress test the use of double-word
  51   51   * space map entries.
  52   52   */
  53   53  boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
  54   54  
  55   55  /*
  56   56   * Override the default indirect block size of 128K, instead using 16K for
  57   57   * spacemaps (2^14 bytes).  This dramatically reduces write inflation since
  58   58   * appending to a spacemap typically has to write one data block (4KB) and one
  59   59   * or two indirect blocks (16K-32K, rather than 128K).
  60   60   */
  61   61  int space_map_ibs = 14;
  62   62  
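Note for reviewers, putting rough numbers on the inflation claim above:
with the default 128K indirect blocks, appending one 4K data block to a
spacemap dirties about 4K of data plus 128K-256K of indirect blocks per
txg; with 2^14 = 16K indirect blocks, the same append dirties about 4K
plus 16K-32K, roughly a sixfold reduction in bytes written.
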
  63   63  boolean_t
  64   64  sm_entry_is_debug(uint64_t e)
  65   65  {
  66   66          return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
  67   67  }
  68   68  
  69   69  boolean_t
  70   70  sm_entry_is_single_word(uint64_t e)
  71   71  {
  72   72          uint8_t prefix = SM_PREFIX_DECODE(e);
  73   73          return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
  
  74   74  }
  75   75  
  76   76  boolean_t
  77   77  sm_entry_is_double_word(uint64_t e)
  78   78  {
  79   79          return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
  80   80  }
  81   81  
  82   82  /*
  83   83   * Iterate through the space map, invoking the callback on each (non-debug)
  84      - * space map entry.
       84 + * space map entry. Stop after reading 'end' bytes of the space map.
  85   85   */
  86   86  int
  87      -space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
       87 +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
  88   88  {
  89      -        uint64_t sm_len = space_map_length(sm);
  90      -        ASSERT3U(sm->sm_blksz, !=, 0);
       89 +        uint64_t blksz = sm->sm_blksz;
  91   90  
  92      -        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
       91 +        ASSERT3U(blksz, !=, 0);
       92 +        ASSERT3U(end, <=, space_map_length(sm));
       93 +        ASSERT0(P2PHASE(end, sizeof (uint64_t)));
       94 +
       95 +        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
  93   96              ZIO_PRIORITY_SYNC_READ);
  94   97  
  95      -        uint64_t blksz = sm->sm_blksz;
  96   98          int error = 0;
  97      -        for (uint64_t block_base = 0; block_base < sm_len && error == 0;
       99 +        for (uint64_t block_base = 0; block_base < end && error == 0;
  98  100              block_base += blksz) {
  99  101                  dmu_buf_t *db;
 100  102                  error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
 101  103                      block_base, FTAG, &db, DMU_READ_PREFETCH);
 102  104                  if (error != 0)
 103  105                          return (error);
 104  106  
 105  107                  uint64_t *block_start = db->db_data;
 106      -                uint64_t block_length = MIN(sm_len - block_base, blksz);
      108 +                uint64_t block_length = MIN(end - block_base, blksz);
 107  109                  uint64_t *block_end = block_start +
 108  110                      (block_length / sizeof (uint64_t));
 109  111  
 110  112                  VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
 111  113                  VERIFY3U(block_length, !=, 0);
 112  114                  ASSERT3U(blksz, ==, db->db_size);
 113  115  
 114  116                  for (uint64_t *block_cursor = block_start;
 115  117                      block_cursor < block_end && error == 0; block_cursor++) {
 116  118                          uint64_t e = *block_cursor;
 117  119  
 118  120                          if (sm_entry_is_debug(e)) /* Skip debug entries */
 119  121                                  continue;
 120  122  
 121  123                          uint64_t raw_offset, raw_run, vdev_id;
 122  124                          maptype_t type;
 123  125                          if (sm_entry_is_single_word(e)) {
 124  126                                  type = SM_TYPE_DECODE(e);
 125  127                                  vdev_id = SM_NO_VDEVID;
 126  128                                  raw_offset = SM_OFFSET_DECODE(e);
 127  129                                  raw_run = SM_RUN_DECODE(e);
 128  130                          } else {
 129  131                                  /* it is a two-word entry */
 130  132                                  ASSERT(sm_entry_is_double_word(e));
 131  133                                  raw_run = SM2_RUN_DECODE(e);
 132  134                                  vdev_id = SM2_VDEV_DECODE(e);
 133  135  
 134  136                                  /* move on to the second word */
 135  137                                  block_cursor++;
 136  138                                  e = *block_cursor;
 137  139                                  VERIFY3P(block_cursor, <=, block_end);
 138  140  
 139  141                                  type = SM2_TYPE_DECODE(e);
 140  142                                  raw_offset = SM2_OFFSET_DECODE(e);
 141  143                          }
 142  144  
 143  145                          uint64_t entry_offset = (raw_offset << sm->sm_shift) +
 144  146                              sm->sm_start;
 145  147                          uint64_t entry_run = raw_run << sm->sm_shift;
 146  148  
 147  149                          VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
 148  150                          VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 149  151                          ASSERT3U(entry_offset, >=, sm->sm_start);
 150  152                          ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
 151  153                          ASSERT3U(entry_run, <=, sm->sm_size);
 152  154                          ASSERT3U(entry_offset + entry_run, <=,
 153  155                              sm->sm_start + sm->sm_size);
 154  156  
 155  157                          space_map_entry_t sme = {
 156  158                              .sme_type = type,
 157  159                              .sme_vdev = vdev_id,
 158  160                              .sme_offset = entry_offset,
 159  161                              .sme_run = entry_run
 160  162                          };
 161  163                          error = callback(&sme, arg);
 162  164                  }
 163  165                  dmu_buf_rele(db, FTAG);
 164  166          }
 165  167          return (error);
 166  168  }
 167  169  
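Note for reviewers: the new 'end' parameter lets a caller replay only a
prefix of the map, e.g. only the words known to be synced. A minimal
sketch of a consumer of the updated interface (sum_alloc_cb and
space_map_net_alloc are hypothetical names, not part of this change):

    /*
     * Sketch: tally the net allocated space recorded in the first
     * space_map_length(sm) bytes of a space map.
     */
    static int
    sum_alloc_cb(space_map_entry_t *sme, void *arg)
    {
            int64_t *sum = arg;

            if (sme->sme_type == SM_ALLOC)
                    *sum += sme->sme_run;
            else
                    *sum -= sme->sme_run;
            return (0);
    }

    static int
    space_map_net_alloc(space_map_t *sm, int64_t *sump)
    {
            *sump = 0;
            return (space_map_iterate(sm, space_map_length(sm),
                sum_alloc_cb, sump));
    }
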
 168  170  /*
 169  171   * Reads the entries from the last block of the space map into
 170  172   * buf in reverse order. Populates nwords with the number of words
 171  173   * in the last block.
 172  174   *
 173  175   * Refer to block comment within space_map_incremental_destroy()
 174  176   * to understand why this function is needed.
 175  177   */
 176  178  static int
 177  179  space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
 178  180      uint64_t bufsz, uint64_t *nwords)
  
 179  181  {
 180  182          int error = 0;
 181  183          dmu_buf_t *db;
 182  184  
 183  185          /*
 184  186           * Find the offset of the last word in the space map and use
 185  187           * that to read the last block of the space map with
 186  188           * dmu_buf_hold().
 187  189           */
 188  190          uint64_t last_word_offset =
 189      -            sm->sm_phys->smp_objsize - sizeof (uint64_t);
      191 +            sm->sm_phys->smp_length - sizeof (uint64_t);
 190  192          error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
 191  193              FTAG, &db, DMU_READ_NO_PREFETCH);
 192  194          if (error != 0)
 193  195                  return (error);
 194  196  
 195  197          ASSERT3U(sm->sm_object, ==, db->db_object);
 196  198          ASSERT3U(sm->sm_blksz, ==, db->db_size);
 197  199          ASSERT3U(bufsz, >=, db->db_size);
 198  200          ASSERT(nwords != NULL);
 199  201  
 200  202          uint64_t *words = db->db_data;
 201  203          *nwords =
 202      -            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
      204 +            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 203  205  
 204  206          ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 205  207  
 206  208          uint64_t n = *nwords;
 207  209          uint64_t j = n - 1;
 208  210          for (uint64_t i = 0; i < n; i++) {
 209  211                  uint64_t entry = words[i];
 210  212                  if (sm_entry_is_double_word(entry)) {
 211  213                          /*
 212  214                           * Since we are populating the buffer backwards
 213  215                           * we have to be extra careful and add the two
 214  216                           * words of the double-word entry in the right
 215  217                           * order.
 216  218                           */
 217  219                          ASSERT3U(j, >, 0);
 218  220                          buf[j - 1] = entry;
 219  221  
 220  222                          i++;
 221  223                          ASSERT3U(i, <, n);
 222  224                          entry = words[i];
 223  225                          buf[j] = entry;
 224  226                          j -= 2;
 225  227                  } else {
 226  228                          ASSERT(sm_entry_is_debug(entry) ||
 227  229                              sm_entry_is_single_word(entry));
 228  230                          buf[j] = entry;
 229  231                          j--;
 230  232                  }
 231  233          }
 232  234  
 233  235          /*
 234  236           * Assert that we wrote backwards all the
 235  237           * way to the beginning of the buffer.
 236  238           */
 237  239          ASSERT3S(j, ==, -1);
 238  240  
 239  241          dmu_buf_rele(db, FTAG);
 240  242          return (error);
 241  243  }
 242  244  
 243  245  /*
 244  246   * Note: This function performs destructive actions - specifically
 245  247   * it deletes entries from the end of the space map. Thus, callers
 246  248   * should ensure that they are holding the appropriate locks for
 247  249   * the space map that they provide.
 248  250   */
 249  251  int
 250  252  space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
 251  253      dmu_tx_t *tx)
 252  254  {
 253  255          uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
 254  256          uint64_t *buf = zio_buf_alloc(bufsz);
 255  257  
 256  258          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 257  259  
 258  260          /*
 259  261           * Ideally we would want to iterate from the beginning of the
 260  262           * space map to the end in incremental steps. The issue with this
 261  263           * approach is that we don't have any field on-disk that points
 262  264           * us where to start between each step. We could try zeroing out
 263  265           * entries that we've destroyed, but this doesn't work either as
 264  266           * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
 265  267           *
 266  268           * As a result, we destroy its entries incrementally starting from
 267  269           * the end after applying the callback to each of them.
 268  270           *
 269  271           * The problem with this approach is that we cannot literally
 270  272           * iterate through the words in the space map backwards as we
 271  273           * can't distinguish two-word space map entries from their second
 272  274           * word. Thus we do the following:
 273  275           *
 274  276           * 1] We get all the entries from the last block of the space map
 275  277           *    and put them into a buffer in reverse order. This way the
 276  278           *    last entry comes first in the buffer, the second to last is
 277  279           *    second, etc.
 278  280           * 2] We iterate through the entries in the buffer and we apply
  279  281           *    the callback to each one. As we move from entry to entry
  280  282           *    we decrease the size of the space map, effectively deleting
  281  283           *    each entry.
 282  284           * 3] If there are no more entries in the space map or the callback
 283  285           *    returns a value other than 0, we stop iterating over the
 284  286           *    space map. If there are entries remaining and the callback
 285  287           *    returned 0, we go back to step [1].
 286  288           */
 287  289          int error = 0;
 288  290          while (space_map_length(sm) > 0 && error == 0) {
 289  291                  uint64_t nwords = 0;
 290  292                  error = space_map_reversed_last_block_entries(sm, buf, bufsz,
  
 291  293                      &nwords);
 292  294                  if (error != 0)
 293  295                          break;
 294  296  
 295  297                  ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 296  298  
 297  299                  for (uint64_t i = 0; i < nwords; i++) {
 298  300                          uint64_t e = buf[i];
 299  301  
 300  302                          if (sm_entry_is_debug(e)) {
 301      -                                sm->sm_phys->smp_objsize -= sizeof (uint64_t);
 302      -                                space_map_update(sm);
      303 +                                sm->sm_phys->smp_length -= sizeof (uint64_t);
 303  304                                  continue;
 304  305                          }
 305  306  
 306  307                          int words = 1;
 307  308                          uint64_t raw_offset, raw_run, vdev_id;
 308  309                          maptype_t type;
 309  310                          if (sm_entry_is_single_word(e)) {
 310  311                                  type = SM_TYPE_DECODE(e);
 311  312                                  vdev_id = SM_NO_VDEVID;
 312  313                                  raw_offset = SM_OFFSET_DECODE(e);
 313  314                                  raw_run = SM_RUN_DECODE(e);
 314  315                          } else {
 315  316                                  ASSERT(sm_entry_is_double_word(e));
 316  317                                  words = 2;
 317  318  
 318  319                                  raw_run = SM2_RUN_DECODE(e);
 319  320                                  vdev_id = SM2_VDEV_DECODE(e);
 320  321  
 321  322                                  /* move to the second word */
 322  323                                  i++;
 323  324                                  e = buf[i];
 324  325  
 325  326                                  ASSERT3P(i, <=, nwords);
 326  327  
 327  328                                  type = SM2_TYPE_DECODE(e);
 328  329                                  raw_offset = SM2_OFFSET_DECODE(e);
 329  330                          }
 330  331  
 331  332                          uint64_t entry_offset =
 332  333                              (raw_offset << sm->sm_shift) + sm->sm_start;
 333  334                          uint64_t entry_run = raw_run << sm->sm_shift;
 334  335  
 335  336                          VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
 336  337                          VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 337  338                          VERIFY3U(entry_offset, >=, sm->sm_start);
 338  339                          VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
 339  340                          VERIFY3U(entry_run, <=, sm->sm_size);
 340  341                          VERIFY3U(entry_offset + entry_run, <=,
 341  342                              sm->sm_start + sm->sm_size);
 342  343  
 343  344                          space_map_entry_t sme = {
 344  345                              .sme_type = type,
 345  346                              .sme_vdev = vdev_id,
 346  347                              .sme_offset = entry_offset,
  
 347  348                              .sme_run = entry_run
 348  349                          };
 349  350                          error = callback(&sme, arg);
 350  351                          if (error != 0)
 351  352                                  break;
 352  353  
 353  354                          if (type == SM_ALLOC)
 354  355                                  sm->sm_phys->smp_alloc -= entry_run;
 355  356                          else
 356  357                                  sm->sm_phys->smp_alloc += entry_run;
 357      -                        sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
 358      -                        space_map_update(sm);
      358 +                        sm->sm_phys->smp_length -= words * sizeof (uint64_t);
 359  359                  }
 360  360          }
 361  361  
 362  362          if (space_map_length(sm) == 0) {
 363  363                  ASSERT0(error);
 364      -                ASSERT0(sm->sm_phys->smp_objsize);
 365      -                ASSERT0(sm->sm_alloc);
      364 +                ASSERT0(space_map_allocated(sm));
 366  365          }
 367  366  
 368  367          zio_buf_free(buf, bufsz);
 369  368          return (error);
 370  369  }
 371  370  
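Note for reviewers, making step [1] above concrete: if the last block
holds the words [S1, D1a, D1b, S2], where S1 and S2 are single-word
entries and D1a/D1b are the two words of one double-word entry, then
space_map_reversed_last_block_entries() fills buf as [S2, D1a, D1b, S1].
Entries come out in reverse order, but the two words of the double-word
entry keep their on-disk order, so the forward scan in step [2] can
still decode that entry first-word-first.
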
 372  371  typedef struct space_map_load_arg {
 373  372          space_map_t     *smla_sm;
 374  373          range_tree_t    *smla_rt;
 375  374          maptype_t       smla_type;
 376  375  } space_map_load_arg_t;
 377  376  
 378  377  static int
 379  378  space_map_load_callback(space_map_entry_t *sme, void *arg)
 380  379  {
 381  380          space_map_load_arg_t *smla = arg;
 382  381          if (sme->sme_type == smla->smla_type) {
 383  382                  VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
  
 384  383                      smla->smla_sm->sm_size);
 385  384                  range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 386  385          } else {
 387  386                  range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
 388  387          }
 389  388  
 390  389          return (0);
 391  390  }
 392  391  
 393  392  /*
 394      - * Load the space map disk into the specified range tree. Segments of maptype
 395      - * are added to the range tree, other segment types are removed.
       393 + * Load the space map into the range tree, as in space_map_load, but
       394 + * only read the first 'length' bytes of the space map.
 396  395   */
 397  396  int
 398      -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
      397 +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
      398 +    uint64_t length)
 399  399  {
 400      -        uint64_t space;
 401      -        int err;
 402  400          space_map_load_arg_t smla;
 403  401  
 404  402          VERIFY0(range_tree_space(rt));
 405      -        space = space_map_allocated(sm);
 406  403  
 407      -        if (maptype == SM_FREE) {
      404 +        if (maptype == SM_FREE)
 408  405                  range_tree_add(rt, sm->sm_start, sm->sm_size);
 409      -                space = sm->sm_size - space;
 410      -        }
 411  406  
 412  407          smla.smla_rt = rt;
 413  408          smla.smla_sm = sm;
 414  409          smla.smla_type = maptype;
 415      -        err = space_map_iterate(sm, space_map_load_callback, &smla);
      410 +        int err = space_map_iterate(sm, length,
      411 +            space_map_load_callback, &smla);
 416  412  
 417      -        if (err == 0) {
 418      -                VERIFY3U(range_tree_space(rt), ==, space);
 419      -        } else {
      413 +        if (err != 0)
 420  414                  range_tree_vacate(rt, NULL, NULL);
 421      -        }
 422  415  
 423  416          return (err);
 424  417  }
 425  418  
      419 +/*
      420 + * Load the space map disk into the specified range tree. Segments of maptype
      421 + * are added to the range tree, other segment types are removed.
      422 + */
      423 +int
      424 +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
      425 +{
      426 +        return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
      427 +}
      428 +
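Note for reviewers: a sketch of how a caller might drive the new
function, loading only a prefix of the map. The range-tree plumbing is
schematic, and 'synced_len' is a hypothetical prefix length; per the
asserts in space_map_iterate(), it must be a multiple of 8 bytes and no
larger than space_map_length(sm):

    range_tree_t *rt = range_tree_create(NULL, NULL);
    int err = space_map_load_length(sm, rt, SM_ALLOC, synced_len);
    if (err == 0) {
            /* rt now holds the segments allocated within that prefix. */
            range_tree_vacate(rt, NULL, NULL);
    }
    range_tree_destroy(rt);
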
 426  429  void
 427  430  space_map_histogram_clear(space_map_t *sm)
 428  431  {
 429  432          if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 430  433                  return;
 431  434  
 432  435          bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
 433  436  }
 434  437  
 435  438  boolean_t
 436  439  space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
 437  440  {
 438  441          /*
 439  442           * Verify that the in-core range tree does not have any
 440  443           * ranges smaller than our sm_shift size.
 441  444           */
 442  445          for (int i = 0; i < sm->sm_shift; i++) {
 443  446                  if (rt->rt_histogram[i] != 0)
 444  447                          return (B_FALSE);
 445  448          }
 446  449          return (B_TRUE);
 447  450  }
 448  451  
 449  452  void
 450  453  space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
 451  454  {
 452  455          int idx = 0;
 453  456  
 454  457          ASSERT(dmu_tx_is_syncing(tx));
 455  458          VERIFY3U(space_map_object(sm), !=, 0);
 456  459  
 457  460          if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 458  461                  return;
 459  462  
 460  463          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 461  464  
 462  465          ASSERT(space_map_histogram_verify(sm, rt));
 463  466          /*
 464  467           * Transfer the content of the range tree histogram to the space
 465  468           * map histogram. The space map histogram contains 32 buckets ranging
 466  469           * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
 467  470           * however, can represent ranges from 2^0 to 2^63. Since the space
 468  471           * map only cares about allocatable blocks (minimum of sm_shift) we
 469  472           * can safely ignore all ranges in the range tree smaller than sm_shift.
 470  473           */
 471  474          for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 472  475  
 473  476                  /*
 474  477                   * Since the largest histogram bucket in the space map is
 475  478                   * 2^(32+sm_shift-1), we need to normalize the values in
 476  479                   * the range tree for any bucket larger than that size. For
 477  480                   * example given an sm_shift of 9, ranges larger than 2^40
 478  481                   * would get normalized as if they were 1TB ranges. Assume
 479  482                   * the range tree had a count of 5 in the 2^44 (16TB) bucket,
 480  483                   * the calculation below would normalize this to 5 * 2^4 (16).
 481  484                   */
 482  485                  ASSERT3U(i, >=, idx + sm->sm_shift);
 483  486                  sm->sm_phys->smp_histogram[idx] +=
 484  487                      rt->rt_histogram[i] << (i - idx - sm->sm_shift);
 485  488  
 486  489                  /*
 487  490                   * Increment the space map's index as long as we haven't
 488  491                   * reached the maximum bucket size. Accumulate all ranges
 489  492                   * larger than the max bucket size into the last bucket.
 490  493                   */
 491  494                  if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 492  495                          ASSERT3U(idx + sm->sm_shift, ==, i);
 493  496                          idx++;
 494  497                          ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 495  498                  }
 496  499          }
 497  500  }
 498  501  
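Note for reviewers, tracing the 16TB example through the loop above:
with sm_shift = 9, idx saturates at SPACE_MAP_HISTOGRAM_SIZE - 1 = 31,
the bucket for 2^(31+9) = 2^40 (1TB) ranges. A count of 5 in the range
tree's 2^44 bucket is added as 5 << (44 - 31 - 9) = 5 << 4 = 80, i.e.
each 16TB segment is accounted as sixteen 1TB segments in the space
map's last bucket.
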
  
 499  502  static void
 500  503  space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
 501  504  {
 502  505          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 503  506  
 504  507          uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
 505  508              SM_DEBUG_ACTION_ENCODE(maptype) |
 506  509              SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
 507  510              SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 508  511  
 509      -        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
      512 +        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
 510  513              sizeof (dentry), &dentry, tx);
 511  514  
 512      -        sm->sm_phys->smp_objsize += sizeof (dentry);
      515 +        sm->sm_phys->smp_length += sizeof (dentry);
 513  516  }
 514  517  
 515  518  /*
 516  519   * Writes one or more entries given a segment.
 517  520   *
 518  521   * Note: The function may release the dbuf from the pointer initially
 519  522   * passed to it, and return a different dbuf. Also, the space map's
 520  523   * dbuf must be dirty for the changes in sm_phys to take effect.
 521  524   */
 522  525  static void
 523  526  space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
 524  527      uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
 525  528  {
 526  529          ASSERT3U(words, !=, 0);
 527  530          ASSERT3U(words, <=, 2);
 528  531  
 529  532          /* ensure the vdev_id can be represented by the space map */
 530  533          ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
 531  534  
 532  535          /*
 533  536           * if this is a single word entry, ensure that no vdev was
  
 534  537           * specified.
 535  538           */
 536  539          IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
 537  540  
 538  541          dmu_buf_t *db = *dbp;
 539  542          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 540  543  
 541  544          uint64_t *block_base = db->db_data;
 542  545          uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
 543  546          uint64_t *block_cursor = block_base +
 544      -            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
      547 +            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 545  548  
 546  549          ASSERT3P(block_cursor, <=, block_end);
 547  550  
 548  551          uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
 549  552          uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
 550  553          uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
 551  554  
 552  555          ASSERT3U(rs->rs_start, >=, sm->sm_start);
 553  556          ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
 554  557          ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
 555  558          ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
 556  559  
  
 557  560          while (size != 0) {
 558  561                  ASSERT3P(block_cursor, <=, block_end);
 559  562  
 560  563                  /*
 561  564                   * If we are at the end of this block, flush it and start
 562  565                   * writing again from the beginning.
 563  566                   */
 564  567                  if (block_cursor == block_end) {
 565  568                          dmu_buf_rele(db, tag);
 566  569  
 567      -                        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
      570 +                        uint64_t next_word_offset = sm->sm_phys->smp_length;
 568  571                          VERIFY0(dmu_buf_hold(sm->sm_os,
 569  572                              space_map_object(sm), next_word_offset,
 570  573                              tag, &db, DMU_READ_PREFETCH));
 571  574                          dmu_buf_will_dirty(db, tx);
 572  575  
 573  576                          /* update caller's dbuf */
 574  577                          *dbp = db;
 575  578  
 576  579                          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 577  580  
 578  581                          block_base = db->db_data;
 579  582                          block_cursor = block_base;
 580  583                          block_end = block_base +
 581  584                              (db->db_size / sizeof (uint64_t));
 582  585                  }
 583  586  
 584  587                  /*
 585  588                   * If we are writing a two-word entry and we only have one
 586  589                   * word left on this block, just pad it with an empty debug
  
 587  590                   * entry and write the two-word entry in the next block.
 588  591                   */
 589  592                  uint64_t *next_entry = block_cursor + 1;
 590  593                  if (next_entry == block_end && words > 1) {
 591  594                          ASSERT3U(words, ==, 2);
 592  595                          *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
 593  596                              SM_DEBUG_ACTION_ENCODE(0) |
 594  597                              SM_DEBUG_SYNCPASS_ENCODE(0) |
 595  598                              SM_DEBUG_TXG_ENCODE(0);
 596  599                          block_cursor++;
 597      -                        sm->sm_phys->smp_objsize += sizeof (uint64_t);
      600 +                        sm->sm_phys->smp_length += sizeof (uint64_t);
 598  601                          ASSERT3P(block_cursor, ==, block_end);
 599  602                          continue;
 600  603                  }
 601  604  
 602  605                  uint64_t run_len = MIN(size, run_max);
 603  606                  switch (words) {
 604  607                  case 1:
 605  608                          *block_cursor = SM_OFFSET_ENCODE(start) |
 606  609                              SM_TYPE_ENCODE(maptype) |
 607  610                              SM_RUN_ENCODE(run_len);
 608  611                          block_cursor++;
 609  612                          break;
 610  613                  case 2:
 611  614                          /* write the first word of the entry */
 612  615                          *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
 613  616                              SM2_RUN_ENCODE(run_len) |
 614  617                              SM2_VDEV_ENCODE(vdev_id);
 615  618                          block_cursor++;
 616  619  
 617  620                          /* move on to the second word of the entry */
  
 618  621                          ASSERT3P(block_cursor, <, block_end);
 619  622                          *block_cursor = SM2_TYPE_ENCODE(maptype) |
 620  623                              SM2_OFFSET_ENCODE(start);
 621  624                          block_cursor++;
 622  625                          break;
 623  626                  default:
 624  627                          panic("%d-word space map entries are not supported",
 625  628                              words);
 626  629                          break;
 627  630                  }
 628      -                sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
      631 +                sm->sm_phys->smp_length += words * sizeof (uint64_t);
 629  632  
 630  633                  start += run_len;
 631  634                  size -= run_len;
 632  635          }
 633  636          ASSERT0(size);
 634  637  
 635  638  }
 636  639  
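Note for reviewers, a sketch of the one-word encode/decode round-trip
behind the switch above, using the macros from sys/space_map.h (the
start and run_len values are illustrative):

    uint64_t start = 0x1234;
    uint64_t run_len = 16;          /* 1 <= run_len <= SM_RUN_MAX */
    uint64_t e = SM_OFFSET_ENCODE(start) | SM_TYPE_ENCODE(SM_ALLOC) |
        SM_RUN_ENCODE(run_len);

    ASSERT(sm_entry_is_single_word(e));
    ASSERT3U(SM_OFFSET_DECODE(e), ==, start);
    ASSERT3U(SM_RUN_DECODE(e), ==, run_len);
    ASSERT3U(SM_TYPE_DECODE(e), ==, SM_ALLOC);
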
 637  640  /*
 638  641   * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 639  642   * take effect.
 640  643   */
 641  644  static void
 642  645  space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
 643  646      uint64_t vdev_id, dmu_tx_t *tx)
 644  647  {
  
 645  648          spa_t *spa = tx->tx_pool->dp_spa;
 646  649          dmu_buf_t *db;
 647  650  
 648  651          space_map_write_intro_debug(sm, maptype, tx);
 649  652  
 650  653  #ifdef DEBUG
 651  654          /*
 652  655           * We do this right after we write the intro debug entry
 653  656           * because the estimate does not take it into account.
 654  657           */
 655      -        uint64_t initial_objsize = sm->sm_phys->smp_objsize;
      658 +        uint64_t initial_objsize = sm->sm_phys->smp_length;
 656  659          uint64_t estimated_growth =
 657  660              space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
 658  661          uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
 659  662  #endif
 660  663  
 661  664          /*
 662  665           * Find the offset right after the last word in the space map
 663  666           * and use that to get a hold of the last block, so we can
 664  667           * start appending to it.
 665  668           */
 666      -        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
      669 +        uint64_t next_word_offset = sm->sm_phys->smp_length;
 667  670          VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
 668  671              next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
 669  672          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 670  673  
 671  674          dmu_buf_will_dirty(db, tx);
 672  675  
 673  676          avl_tree_t *t = &rt->rt_root;
 674  677          for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
 675  678                  uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
 676  679                  uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
 677  680                  uint8_t words = 1;
 678  681  
 679  682                  /*
 680  683                   * We only write two-word entries when both of the following
 681  684                   * are true:
 682  685                   *
 683  686                   * [1] The feature is enabled.
 684  687                   * [2] The offset or run is too big for a single-word entry,
 685  688                   *      or the vdev_id is set (meaning not equal to
 686  689                   *      SM_NO_VDEVID).
 687  690                   *
 688  691                   * Note that for purposes of testing we've added the case that
 689  692                   * we write two-word entries occasionally when the feature is
 690  693                   * enabled and zfs_force_some_double_word_sm_entries has been
 691  694                   * set.
 692  695                   */
 693  696                  if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
 694  697                      (offset >= (1ULL << SM_OFFSET_BITS) ||
 695  698                      length > SM_RUN_MAX ||
 696  699                      vdev_id != SM_NO_VDEVID ||
 697  700                      (zfs_force_some_double_word_sm_entries &&
 698  701                      spa_get_random(100) == 0)))
 699  702                          words = 2;
 700  703  
 701  704                  space_map_write_seg(sm, rs, maptype, vdev_id, words,
 702  705                      &db, FTAG, tx);
 703  706          }
  
 704  707  
 705  708          dmu_buf_rele(db, FTAG);
 706  709  
 707  710  #ifdef DEBUG
 708  711          /*
 709  712           * We expect our estimation to be based on the worst case
 710  713           * scenario [see comment in space_map_estimate_optimal_size()].
 711  714           * Therefore we expect the actual objsize to be equal or less
 712  715           * than whatever we estimated it to be.
 713  716           */
 714      -        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
      717 +        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
 715  718  #endif
 716  719  }
 717  720  
 718  721  /*
 719  722   * Note: This function manipulates the state of the given space map but
 720  723   * does not hold any locks implicitly. Thus the caller is responsible
 721  724   * for synchronizing writes to the space map.
 722  725   */
 723  726  void
 724  727  space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
 725  728      uint64_t vdev_id, dmu_tx_t *tx)
 726  729  {
 727  730          objset_t *os = sm->sm_os;
 728  731  
 729  732          ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 730  733          VERIFY3U(space_map_object(sm), !=, 0);
 731  734  
 732  735          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 733  736  
 734  737          /*
 735  738           * This field is no longer necessary since the in-core space map
 736  739           * now contains the object number but is maintained for backwards
 737  740           * compatibility.
 738  741           */
 739  742          sm->sm_phys->smp_object = sm->sm_object;
 740  743  
 741  744          if (range_tree_is_empty(rt)) {
 742  745                  VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
 743  746                  return;
 744  747          }
 745  748  
 746  749          if (maptype == SM_ALLOC)
 747  750                  sm->sm_phys->smp_alloc += range_tree_space(rt);
 748  751          else
 749  752                  sm->sm_phys->smp_alloc -= range_tree_space(rt);
 750  753  
 751  754          uint64_t nodes = avl_numnodes(&rt->rt_root);
 752  755          uint64_t rt_space = range_tree_space(rt);
 753  756  
 754  757          space_map_write_impl(sm, rt, maptype, vdev_id, tx);
 755  758  
 756  759          /*
 757  760           * Ensure that the space_map's accounting wasn't changed
 758  761           * while we were in the middle of writing it out.
 759  762           */
 760  763          VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
 761  764          VERIFY3U(range_tree_space(rt), ==, rt_space);
 762  765  }
 763  766  
 764  767  static int
 765  768  space_map_open_impl(space_map_t *sm)
 766  769  {
 767  770          int error;
 768  771          u_longlong_t blocks;
 769  772  
 770  773          error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
 771  774          if (error)
 772  775                  return (error);
 773  776  
 774  777          dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
 775  778          sm->sm_phys = sm->sm_dbuf->db_data;
 776  779          return (0);
 777  780  }
 778  781  
 779  782  int
 780  783  space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
 781  784      uint64_t start, uint64_t size, uint8_t shift)
 782  785  {
 783  786          space_map_t *sm;
 784  787          int error;
 785  788  
 786  789          ASSERT(*smp == NULL);
 787  790          ASSERT(os != NULL);
 788  791          ASSERT(object != 0);
 789  792  
 790  793          sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
 791  794  
 792  795          sm->sm_start = start;
 793  796          sm->sm_size = size;
 794  797          sm->sm_shift = shift;
 795  798          sm->sm_os = os;
 796  799          sm->sm_object = object;
 797  800  
 798  801          error = space_map_open_impl(sm);
 799  802          if (error != 0) {
 800  803                  space_map_close(sm);
 801  804                  return (error);
 802  805          }
 803  806          *smp = sm;
 804  807  
 805  808          return (0);
 806  809  }
 807  810  
 808  811  void
 809  812  space_map_close(space_map_t *sm)
 810  813  {
 811  814          if (sm == NULL)
 812  815                  return;
 813  816  
 814  817          if (sm->sm_dbuf != NULL)
 815  818                  dmu_buf_rele(sm->sm_dbuf, sm);
 816  819          sm->sm_dbuf = NULL;
 817  820          sm->sm_phys = NULL;
 818  821  
 819  822          kmem_free(sm, sizeof (*sm));
 820  823  }
 821  824  
 822  825  void
 823  826  space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
 824  827  {
 825  828          objset_t *os = sm->sm_os;
 826  829          spa_t *spa = dmu_objset_spa(os);
 827  830          dmu_object_info_t doi;
 828  831  
 829  832          ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 830  833          ASSERT(dmu_tx_is_syncing(tx));
 831  834          VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
 832  835  
 833  836          dmu_object_info_from_db(sm->sm_dbuf, &doi);
 834  837  
 835  838          /*
 836  839           * If the space map has the wrong bonus size (because
 837  840           * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
 838  841           * the wrong block size (because space_map_blksz has changed),
 839  842           * free and re-allocate its object with the updated sizes.
 840  843           *
 841  844           * Otherwise, just truncate the current object.
 842  845           */
 843  846          if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 844  847              doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
 845  848              doi.doi_data_block_size != blocksize ||
 846  849              doi.doi_metadata_block_size != 1 << space_map_ibs) {
 847  850                  zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
 848  851                      "object[%llu]: old bonus %u, old blocksz %u",
 849  852                      dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
 850  853                      doi.doi_bonus_size, doi.doi_data_block_size);
 851  854  
 852  855                  space_map_free(sm, tx);
 853  856                  dmu_buf_rele(sm->sm_dbuf, sm);
 854  857  
 855  858                  sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
 856  859                  VERIFY0(space_map_open_impl(sm));
 857  860          } else {
 858  861                  VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
 859  862  
  
 860  863                  /*
 861  864                   * If the spacemap is reallocated, its histogram
 862  865                   * will be reset.  Do the same in the common case so that
 863  866                   * bugs related to the uncommon case do not go unnoticed.
 864  867                   */
 865  868                  bzero(sm->sm_phys->smp_histogram,
 866  869                      sizeof (sm->sm_phys->smp_histogram));
 867  870          }
 868  871  
 869  872          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 870      -        sm->sm_phys->smp_objsize = 0;
      873 +        sm->sm_phys->smp_length = 0;
 871  874          sm->sm_phys->smp_alloc = 0;
 872  875  }
 873  876  
 874      -/*
 875      - * Update the in-core space_map allocation and length values.
 876      - */
 877      -void
 878      -space_map_update(space_map_t *sm)
 879      -{
 880      -        if (sm == NULL)
 881      -                return;
 882      -
 883      -        sm->sm_alloc = sm->sm_phys->smp_alloc;
 884      -        sm->sm_length = sm->sm_phys->smp_objsize;
 885      -}
 886      -
 887  877  uint64_t
 888  878  space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 889  879  {
 890  880          spa_t *spa = dmu_objset_spa(os);
 891  881          uint64_t object;
 892  882          int bonuslen;
 893  883  
 894  884          if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 895  885                  spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 896  886                  bonuslen = sizeof (space_map_phys_t);
 897  887                  ASSERT3U(bonuslen, <=, dmu_bonus_max());
 898  888          } else {
 899  889                  bonuslen = SPACE_MAP_SIZE_V0;
 900  890          }
 901  891  
 902  892          object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
 903  893              space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
 904  894  
 905  895          return (object);
 906  896  }
 907  897  
 908  898  void
 909  899  space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
 910  900  {
 911  901          spa_t *spa = dmu_objset_spa(os);
 912  902          if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 913  903                  dmu_object_info_t doi;
 914  904  
 915  905                  VERIFY0(dmu_object_info(os, smobj, &doi));
 916  906                  if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
 917  907                          spa_feature_decr(spa,
 918  908                              SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 919  909                  }
 920  910          }
 921  911  
 922  912          VERIFY0(dmu_object_free(os, smobj, tx));
 923  913  }
 924  914  
 925  915  void
 926  916  space_map_free(space_map_t *sm, dmu_tx_t *tx)
 927  917  {
 928  918          if (sm == NULL)
 929  919                  return;
 930  920  
 931  921          space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
 932  922          sm->sm_object = 0;
 933  923  }
 934  924  
 935  925  /*
  936  926   * Given a range tree, make a worst-case estimate of how much
  937  927   * space the tree's segments would take if they were written to
 938  928   * the given space map.
 939  929   */
 940  930  uint64_t
 941  931  space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
 942  932      uint64_t vdev_id)
 943  933  {
 944  934          spa_t *spa = dmu_objset_spa(sm->sm_os);
 945  935          uint64_t shift = sm->sm_shift;
 946  936          uint64_t *histogram = rt->rt_histogram;
 947  937          uint64_t entries_for_seg = 0;
 948  938  
 949  939          /*
 950  940           * In order to get a quick estimate of the optimal size that this
 951  941           * range tree would have on-disk as a space map, we iterate through
 952  942           * its histogram buckets instead of iterating through its nodes.
 953  943           *
 954  944           * Note that this is a highest-bound/worst-case estimate for the
 955  945           * following reasons:
 956  946           *
 957  947           * 1] We assume that we always add a debug padding for each block
 958  948           *    we write and we also assume that we start at the last word
 959  949           *    of a block attempting to write a two-word entry.
 960  950           * 2] Rounding up errors due to the way segments are distributed
 961  951           *    in the buckets of the range tree's histogram.
 962  952           * 3] The activation of zfs_force_some_double_word_sm_entries
 963  953           *    (tunable) when testing.
 964  954           *
 965  955           * = Math and Rounding Errors =
 966  956           *
 967  957           * rt_histogram[i] bucket of a range tree represents the number
 968  958           * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
 969  959           * that, we want to divide the buckets into groups: Buckets that
 970  960           * can be represented using a single-word entry, ones that can
 971  961           * be represented with a double-word entry, and ones that can
 972  962           * only be represented with multiple two-word entries.
 973  963           *
 974  964           * [Note that if the new encoding feature is not enabled there
 975  965           * are only two groups: single-word entry buckets and multiple
 976  966           * single-word entry buckets. The information below assumes
  977  967   * two-word entries enabled, but it can easily be applied when
 978  968           * the feature is not enabled]
 979  969           *
 980  970           * To find the highest bucket that can be represented with a
 981  971           * single-word entry we look at the maximum run that such entry
 982  972           * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
 983  973           * the run of a space map entry is shifted by sm_shift, thus we
 984  974           * add it to the exponent]. This way, excluding the value of the
 985  975           * maximum run that can be represented by a single-word entry,
 986  976           * all runs that are smaller exist in buckets 0 to
 987  977           * SM_RUN_BITS + shift - 1.
 988  978           *
 989  979           * To find the highest bucket that can be represented with a
 990  980           * double-word entry, we follow the same approach. Finally, any
 991  981           * bucket higher than that are represented with multiple two-word
 992  982           * entries. To be more specific, if the highest bucket whose
 993  983           * segments can be represented with a single two-word entry is X,
 994  984           * then bucket X+1 will need 2 two-word entries for each of its
 995  985           * segments, X+2 will need 4, X+3 will need 8, ...etc.
 996  986           *
 997  987           * With all of the above we make our estimation based on bucket
 998  988           * groups. There is a rounding error though. As we mentioned in
 999  989           * the example with the one-word entry, the maximum run that can
1000  990           * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
1001  991           * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
1002  992           * that length fall into the next bucket (and bucket group) where
1003  993           * we start counting two-word entries and this is one more reason
1004  994           * why the estimated size may end up being bigger than the actual
1005  995           * size written.
1006  996           */
1007  997          uint64_t size = 0;
1008  998          uint64_t idx = 0;
1009  999  
1010 1000          if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
1011 1001              (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
1012 1002  
1013 1003                  /*
1014 1004                   * If we are trying to force some double word entries just
1015 1005                   * assume the worst-case of every single word entry being
1016 1006                   * written as a double word entry.
1017 1007                   */
1018 1008                  uint64_t entry_size =
1019 1009                      (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
1020 1010                      zfs_force_some_double_word_sm_entries) ?
1021 1011                      (2 * sizeof (uint64_t)) : sizeof (uint64_t);
1022 1012  
1023 1013                  uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
1024 1014                  for (; idx <= single_entry_max_bucket; idx++)
1025 1015                          size += histogram[idx] * entry_size;
1026 1016  
1027 1017                  if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
1028 1018                          for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
1029 1019                                  ASSERT3U(idx, >=, single_entry_max_bucket);
1030 1020                                  entries_for_seg =
1031 1021                                      1ULL << (idx - single_entry_max_bucket);
1032 1022                                  size += histogram[idx] *
1033 1023                                      entries_for_seg * entry_size;
1034 1024                          }
1035 1025                          return (size);
1036 1026                  }
1037 1027          }
1038 1028  
1039 1029          ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
1040 1030  
1041 1031          uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
1042 1032          for (; idx <= double_entry_max_bucket; idx++)
1043 1033                  size += histogram[idx] * 2 * sizeof (uint64_t);
1044 1034  
1045 1035          for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
1046 1036                  ASSERT3U(idx, >=, double_entry_max_bucket);
1047 1037                  entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
1048 1038                  size += histogram[idx] *
1049 1039                      entries_for_seg * 2 * sizeof (uint64_t);
1050 1040          }
1051 1041  
1052 1042          /*
1053 1043           * Assume the worst case where we start with the padding at the end
1054 1044           * of the current block and we add an extra padding entry at the end
1055 1045           * of all subsequent blocks.
1056 1046           */
1057 1047          size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
  
1058 1048  
1059 1049          return (size);
1060 1050  }
1061 1051  
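Note for reviewers, a worked instance of the estimate (assuming
sm_shift = 9 and the sys/space_map.h values SM_RUN_BITS = 15 and
SM2_RUN_BITS = 36): single_entry_max_bucket = 15 + 9 - 1 = 23, so each
segment in buckets 9 through 23 costs one 8-byte word;
double_entry_max_bucket = 36 + 9 - 1 = 44, so each segment in buckets
24 through 44 costs two words; a segment in bucket 45 costs
2^(45-44) = 2 two-word entries (32 bytes), one in bucket 46 costs 4,
and so on. The final adjustment charges one 8-byte padding word per
block the entries may span, plus one.
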
1062 1052  uint64_t
1063 1053  space_map_object(space_map_t *sm)
1064 1054  {
1065 1055          return (sm != NULL ? sm->sm_object : 0);
1066 1056  }
1067 1057  
1068      -/*
1069      - * Returns the already synced, on-disk allocated space.
1070      - */
1071      -uint64_t
     1058 +int64_t
1072 1059  space_map_allocated(space_map_t *sm)
1073 1060  {
1074      -        return (sm != NULL ? sm->sm_alloc : 0);
     1061 +        return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
1075 1062  }
1076 1063  
1077      -/*
1078      - * Returns the already synced, on-disk length;
1079      - */
1080 1064  uint64_t
1081 1065  space_map_length(space_map_t *sm)
1082 1066  {
1083      -        return (sm != NULL ? sm->sm_length : 0);
1084      -}
1085      -
1086      -/*
1087      - * Returns the allocated space that is currently syncing.
1088      - */
1089      -int64_t
1090      -space_map_alloc_delta(space_map_t *sm)
1091      -{
1092      -        if (sm == NULL)
1093      -                return (0);
1094      -        ASSERT(sm->sm_dbuf != NULL);
1095      -        return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
     1067 +        return (sm != NULL ? sm->sm_phys->smp_length : 0);
1096 1068  }
    