Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/space_map.c
          +++ new/usr/src/uts/common/fs/zfs/space_map.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  /*
  26      - * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
       26 + * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  27   27   */
  28   28  
  29   29  #include <sys/zfs_context.h>
  30   30  #include <sys/spa.h>
  31   31  #include <sys/dmu.h>
  32   32  #include <sys/dmu_tx.h>
  33   33  #include <sys/dnode.h>
  34   34  #include <sys/dsl_pool.h>
  35   35  #include <sys/zio.h>
  36   36  #include <sys/space_map.h>
↓ open down ↓ 37 lines elided ↑ open up ↑
  74   74  }
  75   75  
  76   76  boolean_t
  77   77  sm_entry_is_double_word(uint64_t e)
  78   78  {
  79   79          return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
  80   80  }
  81   81  
  82   82  /*
  83   83   * Iterate through the space map, invoking the callback on each (non-debug)
  84      - * space map entry.
       84 + * space map entry. Stop after reading 'end' bytes of the space map.
  85   85   */
  86   86  int
  87      -space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
       87 +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
  88   88  {
  89      -        uint64_t sm_len = space_map_length(sm);
  90      -        ASSERT3U(sm->sm_blksz, !=, 0);
       89 +        uint64_t blksz = sm->sm_blksz;
  91   90  
  92      -        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
       91 +        ASSERT3U(blksz, !=, 0);
       92 +        ASSERT3U(end, <=, space_map_length(sm));
       93 +        ASSERT0(P2PHASE(end, sizeof (uint64_t)));
       94 +
       95 +        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
  93   96              ZIO_PRIORITY_SYNC_READ);
  94   97  
  95      -        uint64_t blksz = sm->sm_blksz;
  96   98          int error = 0;
  97      -        for (uint64_t block_base = 0; block_base < sm_len && error == 0;
       99 +        for (uint64_t block_base = 0; block_base < end && error == 0;
  98  100              block_base += blksz) {
  99  101                  dmu_buf_t *db;
 100  102                  error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
 101  103                      block_base, FTAG, &db, DMU_READ_PREFETCH);
 102  104                  if (error != 0)
 103  105                          return (error);
 104  106  
 105  107                  uint64_t *block_start = db->db_data;
 106      -                uint64_t block_length = MIN(sm_len - block_base, blksz);
      108 +                uint64_t block_length = MIN(end - block_base, blksz);
 107  109                  uint64_t *block_end = block_start +
 108  110                      (block_length / sizeof (uint64_t));
 109  111  
 110  112                  VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
 111  113                  VERIFY3U(block_length, !=, 0);
 112  114                  ASSERT3U(blksz, ==, db->db_size);
 113  115  
 114  116                  for (uint64_t *block_cursor = block_start;
 115  117                      block_cursor < block_end && error == 0; block_cursor++) {
 116  118                          uint64_t e = *block_cursor;
↓ open down ↓ 62 lines elided ↑ open up ↑
 179  181  {
 180  182          int error = 0;
 181  183          dmu_buf_t *db;
 182  184  
 183  185          /*
 184  186           * Find the offset of the last word in the space map and use
 185  187           * that to read the last block of the space map with
 186  188           * dmu_buf_hold().
 187  189           */
 188  190          uint64_t last_word_offset =
 189      -            sm->sm_phys->smp_objsize - sizeof (uint64_t);
      191 +            sm->sm_phys->smp_length - sizeof (uint64_t);
 190  192          error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
 191  193              FTAG, &db, DMU_READ_NO_PREFETCH);
 192  194          if (error != 0)
 193  195                  return (error);
 194  196  
 195  197          ASSERT3U(sm->sm_object, ==, db->db_object);
 196  198          ASSERT3U(sm->sm_blksz, ==, db->db_size);
 197  199          ASSERT3U(bufsz, >=, db->db_size);
 198  200          ASSERT(nwords != NULL);
 199  201  
 200  202          uint64_t *words = db->db_data;
 201  203          *nwords =
 202      -            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
      204 +            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 203  205  
 204  206          ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 205  207  
 206  208          uint64_t n = *nwords;
 207  209          uint64_t j = n - 1;
 208  210          for (uint64_t i = 0; i < n; i++) {
 209  211                  uint64_t entry = words[i];
 210  212                  if (sm_entry_is_double_word(entry)) {
 211  213                          /*
 212  214                           * Since we are populating the buffer backwards
↓ open down ↓ 78 lines elided ↑ open up ↑
 291  293                      &nwords);
 292  294                  if (error != 0)
 293  295                          break;
 294  296  
 295  297                  ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 296  298  
 297  299                  for (uint64_t i = 0; i < nwords; i++) {
 298  300                          uint64_t e = buf[i];
 299  301  
 300  302                          if (sm_entry_is_debug(e)) {
 301      -                                sm->sm_phys->smp_objsize -= sizeof (uint64_t);
 302      -                                space_map_update(sm);
      303 +                                sm->sm_phys->smp_length -= sizeof (uint64_t);
 303  304                                  continue;
 304  305                          }
 305  306  
 306  307                          int words = 1;
 307  308                          uint64_t raw_offset, raw_run, vdev_id;
 308  309                          maptype_t type;
 309  310                          if (sm_entry_is_single_word(e)) {
 310  311                                  type = SM_TYPE_DECODE(e);
 311  312                                  vdev_id = SM_NO_VDEVID;
 312  313                                  raw_offset = SM_OFFSET_DECODE(e);
↓ open down ↓ 34 lines elided ↑ open up ↑
 347  348                              .sme_run = entry_run
 348  349                          };
 349  350                          error = callback(&sme, arg);
 350  351                          if (error != 0)
 351  352                                  break;
 352  353  
 353  354                          if (type == SM_ALLOC)
 354  355                                  sm->sm_phys->smp_alloc -= entry_run;
 355  356                          else
 356  357                                  sm->sm_phys->smp_alloc += entry_run;
 357      -                        sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
 358      -                        space_map_update(sm);
      358 +                        sm->sm_phys->smp_length -= words * sizeof (uint64_t);
 359  359                  }
 360  360          }
 361  361  
 362  362          if (space_map_length(sm) == 0) {
 363  363                  ASSERT0(error);
 364      -                ASSERT0(sm->sm_phys->smp_objsize);
 365      -                ASSERT0(sm->sm_alloc);
      364 +                ASSERT0(space_map_allocated(sm));
 366  365          }
 367  366  
 368  367          zio_buf_free(buf, bufsz);
 369  368          return (error);
 370  369  }
 371  370  
 372  371  typedef struct space_map_load_arg {
 373  372          space_map_t     *smla_sm;
 374  373          range_tree_t    *smla_rt;
 375  374          maptype_t       smla_type;
↓ open down ↓ 8 lines elided ↑ open up ↑
 384  383                      smla->smla_sm->sm_size);
 385  384                  range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 386  385          } else {
 387  386                  range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
 388  387          }
 389  388  
 390  389          return (0);
 391  390  }
 392  391  
 393  392  /*
 394      - * Load the space map disk into the specified range tree. Segments of maptype
 395      - * are added to the range tree, other segment types are removed.
       393 + * Load the space map into the range tree, like space_map_load(), but only
       394 + * read the first 'length' bytes of the space map.
 396  395   */
 397  396  int
 398      -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
      397 +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
      398 +    uint64_t length)
 399  399  {
 400      -        uint64_t space;
 401      -        int err;
 402  400          space_map_load_arg_t smla;
 403  401  
 404  402          VERIFY0(range_tree_space(rt));
 405      -        space = space_map_allocated(sm);
 406  403  
 407      -        if (maptype == SM_FREE) {
      404 +        if (maptype == SM_FREE)
 408  405                  range_tree_add(rt, sm->sm_start, sm->sm_size);
 409      -                space = sm->sm_size - space;
 410      -        }
 411  406  
 412  407          smla.smla_rt = rt;
 413  408          smla.smla_sm = sm;
 414  409          smla.smla_type = maptype;
 415      -        err = space_map_iterate(sm, space_map_load_callback, &smla);
      410 +        int err = space_map_iterate(sm, length,
      411 +            space_map_load_callback, &smla);
 416  412  
 417      -        if (err == 0) {
 418      -                VERIFY3U(range_tree_space(rt), ==, space);
 419      -        } else {
      413 +        if (err != 0)
 420  414                  range_tree_vacate(rt, NULL, NULL);
 421      -        }
 422  415  
 423  416          return (err);
 424  417  }
 425  418  
      419 +/*
       420 + * Load the space map from disk into the specified range tree. Segments of
       421 + * maptype are added to the range tree; other segment types are removed.
      422 + */
      423 +int
      424 +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
      425 +{
      426 +        return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
      427 +}
      428 +
 426  429  void
 427  430  space_map_histogram_clear(space_map_t *sm)
 428  431  {
 429  432          if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 430  433                  return;
 431  434  
 432  435          bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
 433  436  }
 434  437  
 435  438  boolean_t
↓ open down ↓ 63 lines elided ↑ open up ↑
 499  502  static void
 500  503  space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
 501  504  {
 502  505          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 503  506  
 504  507          uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
 505  508              SM_DEBUG_ACTION_ENCODE(maptype) |
 506  509              SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
 507  510              SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 508  511  
 509      -        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
      512 +        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
 510  513              sizeof (dentry), &dentry, tx);
 511  514  
 512      -        sm->sm_phys->smp_objsize += sizeof (dentry);
      515 +        sm->sm_phys->smp_length += sizeof (dentry);
 513  516  }
 514  517  
 515  518  /*
 516  519   * Writes one or more entries given a segment.
 517  520   *
 518  521   * Note: The function may release the dbuf from the pointer initially
 519  522   * passed to it, and return a different dbuf. Also, the space map's
 520  523   * dbuf must be dirty for the changes in sm_phys to take effect.
 521  524   */
 522  525  static void
↓ open down ↓ 11 lines elided ↑ open up ↑
 534  537           * specified.
 535  538           */
 536  539          IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
 537  540  
 538  541          dmu_buf_t *db = *dbp;
 539  542          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 540  543  
 541  544          uint64_t *block_base = db->db_data;
 542  545          uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
 543  546          uint64_t *block_cursor = block_base +
 544      -            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
      547 +            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 545  548  
 546  549          ASSERT3P(block_cursor, <=, block_end);
 547  550  
 548  551          uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
 549  552          uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
 550  553          uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
 551  554  
 552  555          ASSERT3U(rs->rs_start, >=, sm->sm_start);
 553  556          ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
 554  557          ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
↓ open down ↓ 2 lines elided ↑ open up ↑
 557  560          while (size != 0) {
 558  561                  ASSERT3P(block_cursor, <=, block_end);
 559  562  
 560  563                  /*
 561  564                   * If we are at the end of this block, flush it and start
 562  565                   * writing again from the beginning.
 563  566                   */
 564  567                  if (block_cursor == block_end) {
 565  568                          dmu_buf_rele(db, tag);
 566  569  
 567      -                        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
      570 +                        uint64_t next_word_offset = sm->sm_phys->smp_length;
 568  571                          VERIFY0(dmu_buf_hold(sm->sm_os,
 569  572                              space_map_object(sm), next_word_offset,
 570  573                              tag, &db, DMU_READ_PREFETCH));
 571  574                          dmu_buf_will_dirty(db, tx);
 572  575  
 573  576                          /* update caller's dbuf */
 574  577                          *dbp = db;
 575  578  
 576  579                          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 577  580  
↓ open down ↓ 9 lines elided ↑ open up ↑
 587  590                   * entry and write the two-word entry in the next block.
 588  591                   */
 589  592                  uint64_t *next_entry = block_cursor + 1;
 590  593                  if (next_entry == block_end && words > 1) {
 591  594                          ASSERT3U(words, ==, 2);
 592  595                          *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
 593  596                              SM_DEBUG_ACTION_ENCODE(0) |
 594  597                              SM_DEBUG_SYNCPASS_ENCODE(0) |
 595  598                              SM_DEBUG_TXG_ENCODE(0);
 596  599                          block_cursor++;
 597      -                        sm->sm_phys->smp_objsize += sizeof (uint64_t);
      600 +                        sm->sm_phys->smp_length += sizeof (uint64_t);
 598  601                          ASSERT3P(block_cursor, ==, block_end);
 599  602                          continue;
 600  603                  }
 601  604  
 602  605                  uint64_t run_len = MIN(size, run_max);
 603  606                  switch (words) {
 604  607                  case 1:
 605  608                          *block_cursor = SM_OFFSET_ENCODE(start) |
 606  609                              SM_TYPE_ENCODE(maptype) |
 607  610                              SM_RUN_ENCODE(run_len);
↓ open down ↓ 10 lines elided ↑ open up ↑
 618  621                          ASSERT3P(block_cursor, <, block_end);
 619  622                          *block_cursor = SM2_TYPE_ENCODE(maptype) |
 620  623                              SM2_OFFSET_ENCODE(start);
 621  624                          block_cursor++;
 622  625                          break;
 623  626                  default:
 624  627                          panic("%d-word space map entries are not supported",
 625  628                              words);
 626  629                          break;
 627  630                  }
 628      -                sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
      631 +                sm->sm_phys->smp_length += words * sizeof (uint64_t);
 629  632  
 630  633                  start += run_len;
 631  634                  size -= run_len;
 632  635          }
 633  636          ASSERT0(size);
 634  637  
 635  638  }
 636  639  
 637  640  /*
 638  641   * Note: The space map's dbuf must be dirty for the changes in sm_phys to
↓ open down ↓ 6 lines elided ↑ open up ↑
 645  648          spa_t *spa = tx->tx_pool->dp_spa;
 646  649          dmu_buf_t *db;
 647  650  
 648  651          space_map_write_intro_debug(sm, maptype, tx);
 649  652  
 650  653  #ifdef DEBUG
 651  654          /*
 652  655           * We do this right after we write the intro debug entry
 653  656           * because the estimate does not take it into account.
 654  657           */
 655      -        uint64_t initial_objsize = sm->sm_phys->smp_objsize;
      658 +        uint64_t initial_objsize = sm->sm_phys->smp_length;
 656  659          uint64_t estimated_growth =
 657  660              space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
 658  661          uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
 659  662  #endif
 660  663  
 661  664          /*
 662  665           * Find the offset right after the last word in the space map
 663  666           * and use that to get a hold of the last block, so we can
 664  667           * start appending to it.
 665  668           */
 666      -        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
      669 +        uint64_t next_word_offset = sm->sm_phys->smp_length;
 667  670          VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
 668  671              next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
 669  672          ASSERT3U(db->db_size, ==, sm->sm_blksz);
 670  673  
 671  674          dmu_buf_will_dirty(db, tx);
 672  675  
 673  676          avl_tree_t *t = &rt->rt_root;
 674  677          for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
 675  678                  uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
 676  679                  uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
↓ open down ↓ 27 lines elided ↑ open up ↑
 704  707  
 705  708          dmu_buf_rele(db, FTAG);
 706  709  
 707  710  #ifdef DEBUG
 708  711          /*
 709  712           * We expect our estimation to be based on the worst case
 710  713           * scenario [see comment in space_map_estimate_optimal_size()].
  711  714           * Therefore we expect the actual objsize to be equal to or less
 712  715           * than whatever we estimated it to be.
 713  716           */
 714      -        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
      717 +        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
 715  718  #endif
 716  719  }
 717  720  
 718  721  /*
 719  722   * Note: This function manipulates the state of the given space map but
 720  723   * does not hold any locks implicitly. Thus the caller is responsible
 721  724   * for synchronizing writes to the space map.
 722  725   */
 723  726  void
 724  727  space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
↓ open down ↓ 135 lines elided ↑ open up ↑
 860  863                  /*
 861  864                   * If the spacemap is reallocated, its histogram
 862  865                   * will be reset.  Do the same in the common case so that
 863  866                   * bugs related to the uncommon case do not go unnoticed.
 864  867                   */
 865  868                  bzero(sm->sm_phys->smp_histogram,
 866  869                      sizeof (sm->sm_phys->smp_histogram));
 867  870          }
 868  871  
 869  872          dmu_buf_will_dirty(sm->sm_dbuf, tx);
 870      -        sm->sm_phys->smp_objsize = 0;
      873 +        sm->sm_phys->smp_length = 0;
 871  874          sm->sm_phys->smp_alloc = 0;
 872  875  }
 873  876  
 874      -/*
 875      - * Update the in-core space_map allocation and length values.
 876      - */
 877      -void
 878      -space_map_update(space_map_t *sm)
 879      -{
 880      -        if (sm == NULL)
 881      -                return;
 882      -
 883      -        sm->sm_alloc = sm->sm_phys->smp_alloc;
 884      -        sm->sm_length = sm->sm_phys->smp_objsize;
 885      -}
 886      -
 887  877  uint64_t
 888  878  space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 889  879  {
 890  880          spa_t *spa = dmu_objset_spa(os);
 891  881          uint64_t object;
 892  882          int bonuslen;
 893  883  
 894  884          if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 895  885                  spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 896  886                  bonuslen = sizeof (space_map_phys_t);
↓ open down ↓ 161 lines elided ↑ open up ↑
1058 1048  
1059 1049          return (size);
1060 1050  }
1061 1051  
1062 1052  uint64_t
1063 1053  space_map_object(space_map_t *sm)
1064 1054  {
1065 1055          return (sm != NULL ? sm->sm_object : 0);
1066 1056  }
1067 1057  
1068      -/*
1069      - * Returns the already synced, on-disk allocated space.
1070      - */
1071      -uint64_t
     1058 +int64_t
1072 1059  space_map_allocated(space_map_t *sm)
1073 1060  {
1074      -        return (sm != NULL ? sm->sm_alloc : 0);
     1061 +        return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
1075 1062  }
1076 1063  
1077      -/*
1078      - * Returns the already synced, on-disk length;
1079      - */
1080 1064  uint64_t
1081 1065  space_map_length(space_map_t *sm)
1082 1066  {
1083      -        return (sm != NULL ? sm->sm_length : 0);
1084      -}
1085      -
1086      -/*
1087      - * Returns the allocated space that is currently syncing.
1088      - */
1089      -int64_t
1090      -space_map_alloc_delta(space_map_t *sm)
1091      -{
1092      -        if (sm == NULL)
1093      -                return (0);
1094      -        ASSERT(sm->sm_dbuf != NULL);
1095      -        return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
     1067 +        return (sm != NULL ? sm->sm_phys->smp_length : 0);
1096 1068  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX