Print this page
    
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zap_micro.c
          +++ new/usr/src/uts/common/fs/zfs/zap_micro.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/zio.h>
  27   28  #include <sys/spa.h>
  28   29  #include <sys/dmu.h>
  29   30  #include <sys/zfs_context.h>
  30   31  #include <sys/zap.h>
  31   32  #include <sys/refcount.h>
  32   33  #include <sys/zap_impl.h>
  33   34  #include <sys/zap_leaf.h>
  34   35  #include <sys/avl.h>
  35   36  #include <sys/arc.h>
  36   37  #include <sys/dmu_objset.h>
  37   38  
  38   39  #ifdef _KERNEL
  39   40  #include <sys/sunddi.h>
  40   41  #endif
  41   42  
  42   43  extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
  43   44  
  44   45  static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
  45   46  
  46   47  uint64_t
  47   48  zap_getflags(zap_t *zap)
  48   49  {
  49   50          if (zap->zap_ismicro)
  50   51                  return (0);
  51   52          return (zap_f_phys(zap)->zap_flags);
  52   53  }
  53   54  
  54   55  int
  55   56  zap_hashbits(zap_t *zap)
  56   57  {
  57   58          if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  58   59                  return (48);
  59   60          else
  60   61                  return (28);
  61   62  }
  62   63  
  63   64  uint32_t
  64   65  zap_maxcd(zap_t *zap)
  65   66  {
  66   67          if (zap_getflags(zap) & ZAP_FLAG_HASH64)
  67   68                  return ((1<<16)-1);
  68   69          else
  69   70                  return (-1U);
  70   71  }
  71   72  
  72   73  static uint64_t
  73   74  zap_hash(zap_name_t *zn)
  74   75  {
  75   76          zap_t *zap = zn->zn_zap;
  76   77          uint64_t h = 0;
  77   78  
  78   79          if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
  79   80                  ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
  80   81                  h = *(uint64_t *)zn->zn_key_orig;
  81   82          } else {
  82   83                  h = zap->zap_salt;
  83   84                  ASSERT(h != 0);
  84   85                  ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
  85   86  
  86   87                  if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
  87   88                          int i;
  88   89                          const uint64_t *wp = zn->zn_key_norm;
  89   90  
  90   91                          ASSERT(zn->zn_key_intlen == 8);
  91   92                          for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
  92   93                                  int j;
  93   94                                  uint64_t word = *wp;
  94   95  
  95   96                                  for (j = 0; j < zn->zn_key_intlen; j++) {
  96   97                                          h = (h >> 8) ^
  97   98                                              zfs_crc64_table[(h ^ word) & 0xFF];
  98   99                                          word >>= NBBY;
  99  100                                  }
 100  101                          }
 101  102                  } else {
 102  103                          int i, len;
 103  104                          const uint8_t *cp = zn->zn_key_norm;
 104  105  
 105  106                          /*
 106  107                           * We previously stored the terminating null on
 107  108                           * disk, but didn't hash it, so we need to
 108  109                           * continue to not hash it.  (The
 109  110                           * zn_key_*_numints includes the terminating
 110  111                           * null for non-binary keys.)
 111  112                           */
 112  113                          len = zn->zn_key_norm_numints - 1;
 113  114  
 114  115                          ASSERT(zn->zn_key_intlen == 1);
 115  116                          for (i = 0; i < len; cp++, i++) {
 116  117                                  h = (h >> 8) ^
 117  118                                      zfs_crc64_table[(h ^ *cp) & 0xFF];
 118  119                          }
 119  120                  }
 120  121          }
 121  122          /*
 122  123           * Don't use all 64 bits, since we need some in the cookie for
 123  124           * the collision differentiator.  We MUST use the high bits,
 124  125           * since those are the ones that we first pay attention to when
 125  126           * chosing the bucket.
 126  127           */
 127  128          h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 128  129  
 129  130          return (h);
 130  131  }
 131  132  
 132  133  static int
 133  134  zap_normalize(zap_t *zap, const char *name, char *namenorm)
 134  135  {
 135  136          size_t inlen, outlen;
 136  137          int err;
 137  138  
 138  139          ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
 139  140  
 140  141          inlen = strlen(name) + 1;
 141  142          outlen = ZAP_MAXNAMELEN;
 142  143  
 143  144          err = 0;
 144  145          (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
 145  146              zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
 146  147              U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
 147  148  
 148  149          return (err);
 149  150  }
 150  151  
 151  152  boolean_t
 152  153  zap_match(zap_name_t *zn, const char *matchname)
 153  154  {
 154  155          ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
 155  156  
 156  157          if (zn->zn_matchtype == MT_FIRST) {
 157  158                  char norm[ZAP_MAXNAMELEN];
 158  159  
 159  160                  if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
 160  161                          return (B_FALSE);
 161  162  
 162  163                  return (strcmp(zn->zn_key_norm, norm) == 0);
 163  164          } else {
 164  165                  /* MT_BEST or MT_EXACT */
 165  166                  return (strcmp(zn->zn_key_orig, matchname) == 0);
 166  167          }
 167  168  }
 168  169  
 169  170  void
 170  171  zap_name_free(zap_name_t *zn)
 171  172  {
 172  173          kmem_free(zn, sizeof (zap_name_t));
 173  174  }
 174  175  
 175  176  zap_name_t *
 176  177  zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
 177  178  {
 178  179          zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 179  180  
 180  181          zn->zn_zap = zap;
 181  182          zn->zn_key_intlen = sizeof (*key);
 182  183          zn->zn_key_orig = key;
 183  184          zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
 184  185          zn->zn_matchtype = mt;
 185  186          if (zap->zap_normflags) {
 186  187                  if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
 187  188                          zap_name_free(zn);
 188  189                          return (NULL);
 189  190                  }
 190  191                  zn->zn_key_norm = zn->zn_normbuf;
 191  192                  zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
 192  193          } else {
 193  194                  if (mt != MT_EXACT) {
 194  195                          zap_name_free(zn);
 195  196                          return (NULL);
 196  197                  }
 197  198                  zn->zn_key_norm = zn->zn_key_orig;
 198  199                  zn->zn_key_norm_numints = zn->zn_key_orig_numints;
 199  200          }
 200  201  
 201  202          zn->zn_hash = zap_hash(zn);
 202  203          return (zn);
 203  204  }
 204  205  
 205  206  zap_name_t *
 206  207  zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
 207  208  {
 208  209          zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
 209  210  
 210  211          ASSERT(zap->zap_normflags == 0);
 211  212          zn->zn_zap = zap;
 212  213          zn->zn_key_intlen = sizeof (*key);
 213  214          zn->zn_key_orig = zn->zn_key_norm = key;
 214  215          zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
 215  216          zn->zn_matchtype = MT_EXACT;
 216  217  
 217  218          zn->zn_hash = zap_hash(zn);
 218  219          return (zn);
 219  220  }
 220  221  
 221  222  static void
 222  223  mzap_byteswap(mzap_phys_t *buf, size_t size)
 223  224  {
 224  225          int i, max;
 225  226          buf->mz_block_type = BSWAP_64(buf->mz_block_type);
 226  227          buf->mz_salt = BSWAP_64(buf->mz_salt);
 227  228          buf->mz_normflags = BSWAP_64(buf->mz_normflags);
 228  229          max = (size / MZAP_ENT_LEN) - 1;
 229  230          for (i = 0; i < max; i++) {
 230  231                  buf->mz_chunk[i].mze_value =
 231  232                      BSWAP_64(buf->mz_chunk[i].mze_value);
 232  233                  buf->mz_chunk[i].mze_cd =
 233  234                      BSWAP_32(buf->mz_chunk[i].mze_cd);
 234  235          }
 235  236  }
 236  237  
 237  238  void
 238  239  zap_byteswap(void *buf, size_t size)
 239  240  {
 240  241          uint64_t block_type;
 241  242  
 242  243          block_type = *(uint64_t *)buf;
 243  244  
 244  245          if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
 245  246                  /* ASSERT(magic == ZAP_LEAF_MAGIC); */
 246  247                  mzap_byteswap(buf, size);
 247  248          } else {
 248  249                  fzap_byteswap(buf, size);
 249  250          }
 250  251  }
 251  252  
 252  253  static int
 253  254  mze_compare(const void *arg1, const void *arg2)
 254  255  {
 255  256          const mzap_ent_t *mze1 = arg1;
 256  257          const mzap_ent_t *mze2 = arg2;
 257  258  
 258  259          if (mze1->mze_hash > mze2->mze_hash)
 259  260                  return (+1);
 260  261          if (mze1->mze_hash < mze2->mze_hash)
 261  262                  return (-1);
 262  263          if (mze1->mze_cd > mze2->mze_cd)
 263  264                  return (+1);
 264  265          if (mze1->mze_cd < mze2->mze_cd)
 265  266                  return (-1);
 266  267          return (0);
 267  268  }
 268  269  
 269  270  static void
 270  271  mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 271  272  {
 272  273          mzap_ent_t *mze;
 273  274  
 274  275          ASSERT(zap->zap_ismicro);
 275  276          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 276  277  
 277  278          mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
 278  279          mze->mze_chunkid = chunkid;
 279  280          mze->mze_hash = hash;
 280  281          mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
 281  282          ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
 282  283          avl_add(&zap->zap_m.zap_avl, mze);
 283  284  }
 284  285  
 285  286  static mzap_ent_t *
 286  287  mze_find(zap_name_t *zn)
 287  288  {
 288  289          mzap_ent_t mze_tofind;
 289  290          mzap_ent_t *mze;
 290  291          avl_index_t idx;
 291  292          avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
 292  293  
 293  294          ASSERT(zn->zn_zap->zap_ismicro);
 294  295          ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 295  296  
 296  297          mze_tofind.mze_hash = zn->zn_hash;
 297  298          mze_tofind.mze_cd = 0;
 298  299  
 299  300  again:
 300  301          mze = avl_find(avl, &mze_tofind, &idx);
 301  302          if (mze == NULL)
 302  303                  mze = avl_nearest(avl, idx, AVL_AFTER);
 303  304          for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
 304  305                  ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
 305  306                  if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 306  307                          return (mze);
 307  308          }
 308  309          if (zn->zn_matchtype == MT_BEST) {
 309  310                  zn->zn_matchtype = MT_FIRST;
 310  311                  goto again;
 311  312          }
 312  313          return (NULL);
 313  314  }
 314  315  
 315  316  static uint32_t
 316  317  mze_find_unused_cd(zap_t *zap, uint64_t hash)
 317  318  {
 318  319          mzap_ent_t mze_tofind;
 319  320          mzap_ent_t *mze;
 320  321          avl_index_t idx;
 321  322          avl_tree_t *avl = &zap->zap_m.zap_avl;
 322  323          uint32_t cd;
 323  324  
 324  325          ASSERT(zap->zap_ismicro);
 325  326          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 326  327  
 327  328          mze_tofind.mze_hash = hash;
 328  329          mze_tofind.mze_cd = 0;
 329  330  
 330  331          cd = 0;
 331  332          for (mze = avl_find(avl, &mze_tofind, &idx);
 332  333              mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
 333  334                  if (mze->mze_cd != cd)
 334  335                          break;
 335  336                  cd++;
 336  337          }
 337  338  
 338  339          return (cd);
 339  340  }
 340  341  
 341  342  static void
 342  343  mze_remove(zap_t *zap, mzap_ent_t *mze)
 343  344  {
 344  345          ASSERT(zap->zap_ismicro);
 345  346          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 346  347  
 347  348          avl_remove(&zap->zap_m.zap_avl, mze);
 348  349          kmem_free(mze, sizeof (mzap_ent_t));
 349  350  }
 350  351  
 351  352  static void
 352  353  mze_destroy(zap_t *zap)
 353  354  {
 354  355          mzap_ent_t *mze;
 355  356          void *avlcookie = NULL;
 356  357  
 357  358          while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
 358  359                  kmem_free(mze, sizeof (mzap_ent_t));
 359  360          avl_destroy(&zap->zap_m.zap_avl);
 360  361  }
 361  362  
 362  363  static zap_t *
 363  364  mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 364  365  {
 365  366          zap_t *winner;
 366  367          zap_t *zap;
 367  368          int i;
 368  369  
 369  370          ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
 370  371  
 371  372          zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 372  373          rw_init(&zap->zap_rwlock, 0, 0, 0);
 373  374          rw_enter(&zap->zap_rwlock, RW_WRITER);
 374  375          zap->zap_objset = os;
 375  376          zap->zap_object = obj;
 376  377          zap->zap_dbuf = db;
 377  378  
 378  379          if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 379  380                  mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
  
    | 
      ↓ open down ↓ | 
    346 lines elided | 
    
      ↑ open up ↑ | 
  
 380  381                  zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
 381  382          } else {
 382  383                  zap->zap_ismicro = TRUE;
 383  384          }
 384  385  
 385  386          /*
 386  387           * Make sure that zap_ismicro is set before we let others see
 387  388           * it, because zap_lockdir() checks zap_ismicro without the lock
 388  389           * held.
 389  390           */
 390      -        winner = dmu_buf_set_user(db, zap, zap_evict);
      391 +        dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
      392 +        winner = dmu_buf_set_user(db, &zap->zap_dbu);
 391  393  
 392  394          if (winner != NULL) {
 393  395                  rw_exit(&zap->zap_rwlock);
 394  396                  rw_destroy(&zap->zap_rwlock);
 395  397                  if (!zap->zap_ismicro)
 396  398                          mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 397  399                  kmem_free(zap, sizeof (zap_t));
 398  400                  return (winner);
 399  401          }
 400  402  
 401  403          if (zap->zap_ismicro) {
 402  404                  zap->zap_salt = zap_m_phys(zap)->mz_salt;
 403  405                  zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 404  406                  zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 405  407                  avl_create(&zap->zap_m.zap_avl, mze_compare,
 406  408                      sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 407  409  
 408  410                  for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 409  411                          mzap_ent_phys_t *mze =
 410  412                              &zap_m_phys(zap)->mz_chunk[i];
 411  413                          if (mze->mze_name[0]) {
 412  414                                  zap_name_t *zn;
 413  415  
 414  416                                  zap->zap_m.zap_num_entries++;
 415  417                                  zn = zap_name_alloc(zap, mze->mze_name,
 416  418                                      MT_EXACT);
 417  419                                  mze_insert(zap, i, zn->zn_hash);
 418  420                                  zap_name_free(zn);
 419  421                          }
 420  422                  }
 421  423          } else {
 422  424                  zap->zap_salt = zap_f_phys(zap)->zap_salt;
 423  425                  zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 424  426  
 425  427                  ASSERT3U(sizeof (struct zap_leaf_header), ==,
 426  428                      2*ZAP_LEAF_CHUNKSIZE);
 427  429  
 428  430                  /*
 429  431                   * The embedded pointer table should not overlap the
 430  432                   * other members.
 431  433                   */
 432  434                  ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
 433  435                      &zap_f_phys(zap)->zap_salt);
 434  436  
 435  437                  /*
 436  438                   * The embedded pointer table should end at the end of
 437  439                   * the block
 438  440                   */
 439  441                  ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 440  442                      1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
 441  443                      (uintptr_t)zap_f_phys(zap), ==,
 442  444                      zap->zap_dbuf->db_size);
 443  445          }
 444  446          rw_exit(&zap->zap_rwlock);
 445  447          return (zap);
 446  448  }
 447  449  
 448  450  int
 449  451  zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
 450  452      krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 451  453  {
 452  454          zap_t *zap;
 453  455          dmu_buf_t *db;
 454  456          krw_t lt;
 455  457          int err;
 456  458  
 457  459          *zapp = NULL;
 458  460  
 459  461          err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
 460  462          if (err)
 461  463                  return (err);
 462  464  
 463  465  #ifdef ZFS_DEBUG
 464  466          {
 465  467                  dmu_object_info_t doi;
 466  468                  dmu_object_info_from_db(db, &doi);
 467  469                  ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 468  470          }
 469  471  #endif
 470  472  
 471  473          zap = dmu_buf_get_user(db);
 472  474          if (zap == NULL)
 473  475                  zap = mzap_open(os, obj, db);
 474  476  
 475  477          /*
 476  478           * We're checking zap_ismicro without the lock held, in order to
 477  479           * tell what type of lock we want.  Once we have some sort of
 478  480           * lock, see if it really is the right type.  In practice this
 479  481           * can only be different if it was upgraded from micro to fat,
 480  482           * and micro wanted WRITER but fat only needs READER.
 481  483           */
 482  484          lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 483  485          rw_enter(&zap->zap_rwlock, lt);
 484  486          if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 485  487                  /* it was upgraded, now we only need reader */
 486  488                  ASSERT(lt == RW_WRITER);
 487  489                  ASSERT(RW_READER ==
 488  490                      (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
 489  491                  rw_downgrade(&zap->zap_rwlock);
 490  492                  lt = RW_READER;
 491  493          }
 492  494  
 493  495          zap->zap_objset = os;
 494  496  
 495  497          if (lt == RW_WRITER)
 496  498                  dmu_buf_will_dirty(db, tx);
 497  499  
 498  500          ASSERT3P(zap->zap_dbuf, ==, db);
 499  501  
 500  502          ASSERT(!zap->zap_ismicro ||
 501  503              zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
 502  504          if (zap->zap_ismicro && tx && adding &&
 503  505              zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
 504  506                  uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
 505  507                  if (newsz > MZAP_MAX_BLKSZ) {
 506  508                          dprintf("upgrading obj %llu: num_entries=%u\n",
 507  509                              obj, zap->zap_m.zap_num_entries);
 508  510                          *zapp = zap;
 509  511                          return (mzap_upgrade(zapp, tx, 0));
 510  512                  }
 511  513                  err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
 512  514                  ASSERT0(err);
 513  515                  zap->zap_m.zap_num_chunks =
 514  516                      db->db_size / MZAP_ENT_LEN - 1;
 515  517          }
 516  518  
 517  519          *zapp = zap;
 518  520          return (0);
 519  521  }
 520  522  
 521  523  void
 522  524  zap_unlockdir(zap_t *zap)
 523  525  {
 524  526          rw_exit(&zap->zap_rwlock);
 525  527          dmu_buf_rele(zap->zap_dbuf, NULL);
 526  528  }
 527  529  
 528  530  static int
 529  531  mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 530  532  {
 531  533          mzap_phys_t *mzp;
 532  534          int i, sz, nchunks;
 533  535          int err = 0;
 534  536          zap_t *zap = *zapp;
 535  537  
 536  538          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 537  539  
 538  540          sz = zap->zap_dbuf->db_size;
 539  541          mzp = kmem_alloc(sz, KM_SLEEP);
 540  542          bcopy(zap->zap_dbuf->db_data, mzp, sz);
 541  543          nchunks = zap->zap_m.zap_num_chunks;
 542  544  
 543  545          if (!flags) {
 544  546                  err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 545  547                      1ULL << fzap_default_block_shift, 0, tx);
 546  548                  if (err) {
 547  549                          kmem_free(mzp, sz);
 548  550                          return (err);
 549  551                  }
 550  552          }
 551  553  
 552  554          dprintf("upgrading obj=%llu with %u chunks\n",
 553  555              zap->zap_object, nchunks);
 554  556          /* XXX destroy the avl later, so we can use the stored hash value */
 555  557          mze_destroy(zap);
 556  558  
 557  559          fzap_upgrade(zap, tx, flags);
 558  560  
 559  561          for (i = 0; i < nchunks; i++) {
 560  562                  mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
 561  563                  zap_name_t *zn;
 562  564                  if (mze->mze_name[0] == 0)
 563  565                          continue;
 564  566                  dprintf("adding %s=%llu\n",
 565  567                      mze->mze_name, mze->mze_value);
 566  568                  zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
 567  569                  err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
 568  570                  zap = zn->zn_zap;       /* fzap_add_cd() may change zap */
 569  571                  zap_name_free(zn);
 570  572                  if (err)
 571  573                          break;
 572  574          }
 573  575          kmem_free(mzp, sz);
 574  576          *zapp = zap;
 575  577          return (err);
 576  578  }
 577  579  
 578  580  void
 579  581  mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
 580  582      dmu_tx_t *tx)
 581  583  {
 582  584          dmu_buf_t *db;
 583  585          mzap_phys_t *zp;
 584  586  
 585  587          VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 586  588  
 587  589  #ifdef ZFS_DEBUG
 588  590          {
 589  591                  dmu_object_info_t doi;
 590  592                  dmu_object_info_from_db(db, &doi);
 591  593                  ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 592  594          }
 593  595  #endif
 594  596  
 595  597          dmu_buf_will_dirty(db, tx);
 596  598          zp = db->db_data;
 597  599          zp->mz_block_type = ZBT_MICRO;
 598  600          zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
 599  601          zp->mz_normflags = normflags;
 600  602          dmu_buf_rele(db, FTAG);
 601  603  
 602  604          if (flags != 0) {
 603  605                  zap_t *zap;
 604  606                  /* Only fat zap supports flags; upgrade immediately. */
 605  607                  VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
 606  608                      B_FALSE, B_FALSE, &zap));
 607  609                  VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
 608  610                  zap_unlockdir(zap);
 609  611          }
 610  612  }
 611  613  
 612  614  int
 613  615  zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
 614  616      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 615  617  {
 616  618          return (zap_create_claim_norm(os, obj,
 617  619              0, ot, bonustype, bonuslen, tx));
 618  620  }
 619  621  
 620  622  int
 621  623  zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
 622  624      dmu_object_type_t ot,
 623  625      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 624  626  {
 625  627          int err;
 626  628  
 627  629          err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
 628  630          if (err != 0)
 629  631                  return (err);
 630  632          mzap_create_impl(os, obj, normflags, 0, tx);
 631  633          return (0);
 632  634  }
 633  635  
 634  636  uint64_t
 635  637  zap_create(objset_t *os, dmu_object_type_t ot,
 636  638      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 637  639  {
 638  640          return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
 639  641  }
 640  642  
 641  643  uint64_t
 642  644  zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
 643  645      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 644  646  {
 645  647          uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 646  648  
 647  649          mzap_create_impl(os, obj, normflags, 0, tx);
 648  650          return (obj);
 649  651  }
 650  652  
 651  653  uint64_t
 652  654  zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
 653  655      dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
 654  656      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 655  657  {
 656  658          uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 657  659  
 658  660          ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
 659  661              leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
 660  662              indirect_blockshift >= SPA_MINBLOCKSHIFT &&
 661  663              indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 662  664  
 663  665          VERIFY(dmu_object_set_blocksize(os, obj,
 664  666              1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
 665  667  
 666  668          mzap_create_impl(os, obj, normflags, flags, tx);
 667  669          return (obj);
 668  670  }
 669  671  
 670  672  int
 671  673  zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
  
    | 
      ↓ open down ↓ | 
    271 lines elided | 
    
      ↑ open up ↑ | 
  
 672  674  {
 673  675          /*
 674  676           * dmu_object_free will free the object number and free the
 675  677           * data.  Freeing the data will cause our pageout function to be
 676  678           * called, which will destroy our data (zap_leaf_t's and zap_t).
 677  679           */
 678  680  
 679  681          return (dmu_object_free(os, zapobj, tx));
 680  682  }
 681  683  
 682      -_NOTE(ARGSUSED(0))
 683  684  void
 684      -zap_evict(dmu_buf_t *db, void *vzap)
      685 +zap_evict(void *dbu)
 685  686  {
 686      -        zap_t *zap = vzap;
      687 +        zap_t *zap = dbu;
 687  688  
 688  689          rw_destroy(&zap->zap_rwlock);
 689  690  
 690  691          if (zap->zap_ismicro)
 691  692                  mze_destroy(zap);
 692  693          else
 693  694                  mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 694  695  
 695  696          kmem_free(zap, sizeof (zap_t));
 696  697  }
 697  698  
 698  699  int
 699  700  zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 700  701  {
 701  702          zap_t *zap;
 702  703          int err;
 703  704  
 704  705          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 705  706          if (err)
 706  707                  return (err);
 707  708          if (!zap->zap_ismicro) {
 708  709                  err = fzap_count(zap, count);
 709  710          } else {
 710  711                  *count = zap->zap_m.zap_num_entries;
 711  712          }
 712  713          zap_unlockdir(zap);
 713  714          return (err);
 714  715  }
 715  716  
 716  717  /*
 717  718   * zn may be NULL; if not specified, it will be computed if needed.
 718  719   * See also the comment above zap_entry_normalization_conflict().
 719  720   */
 720  721  static boolean_t
 721  722  mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
 722  723  {
 723  724          mzap_ent_t *other;
 724  725          int direction = AVL_BEFORE;
 725  726          boolean_t allocdzn = B_FALSE;
 726  727  
 727  728          if (zap->zap_normflags == 0)
 728  729                  return (B_FALSE);
 729  730  
 730  731  again:
 731  732          for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
 732  733              other && other->mze_hash == mze->mze_hash;
 733  734              other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
 734  735  
 735  736                  if (zn == NULL) {
 736  737                          zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
 737  738                              MT_FIRST);
 738  739                          allocdzn = B_TRUE;
 739  740                  }
 740  741                  if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
 741  742                          if (allocdzn)
 742  743                                  zap_name_free(zn);
 743  744                          return (B_TRUE);
 744  745                  }
 745  746          }
 746  747  
 747  748          if (direction == AVL_BEFORE) {
 748  749                  direction = AVL_AFTER;
 749  750                  goto again;
 750  751          }
 751  752  
 752  753          if (allocdzn)
 753  754                  zap_name_free(zn);
 754  755          return (B_FALSE);
 755  756  }
 756  757  
 757  758  /*
 758  759   * Routines for manipulating attributes.
 759  760   */
 760  761  
 761  762  int
 762  763  zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
 763  764      uint64_t integer_size, uint64_t num_integers, void *buf)
 764  765  {
 765  766          return (zap_lookup_norm(os, zapobj, name, integer_size,
 766  767              num_integers, buf, MT_EXACT, NULL, 0, NULL));
 767  768  }
 768  769  
 769  770  int
 770  771  zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
 771  772      uint64_t integer_size, uint64_t num_integers, void *buf,
 772  773      matchtype_t mt, char *realname, int rn_len,
 773  774      boolean_t *ncp)
 774  775  {
 775  776          zap_t *zap;
 776  777          int err;
 777  778          mzap_ent_t *mze;
 778  779          zap_name_t *zn;
 779  780  
 780  781          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 781  782          if (err)
 782  783                  return (err);
 783  784          zn = zap_name_alloc(zap, name, mt);
 784  785          if (zn == NULL) {
 785  786                  zap_unlockdir(zap);
 786  787                  return (SET_ERROR(ENOTSUP));
 787  788          }
 788  789  
 789  790          if (!zap->zap_ismicro) {
 790  791                  err = fzap_lookup(zn, integer_size, num_integers, buf,
 791  792                      realname, rn_len, ncp);
 792  793          } else {
 793  794                  mze = mze_find(zn);
 794  795                  if (mze == NULL) {
 795  796                          err = SET_ERROR(ENOENT);
 796  797                  } else {
 797  798                          if (num_integers < 1) {
 798  799                                  err = SET_ERROR(EOVERFLOW);
 799  800                          } else if (integer_size != 8) {
 800  801                                  err = SET_ERROR(EINVAL);
 801  802                          } else {
 802  803                                  *(uint64_t *)buf =
 803  804                                      MZE_PHYS(zap, mze)->mze_value;
 804  805                                  (void) strlcpy(realname,
 805  806                                      MZE_PHYS(zap, mze)->mze_name, rn_len);
 806  807                                  if (ncp) {
 807  808                                          *ncp = mzap_normalization_conflict(zap,
 808  809                                              zn, mze);
 809  810                                  }
 810  811                          }
 811  812                  }
 812  813          }
 813  814          zap_name_free(zn);
 814  815          zap_unlockdir(zap);
 815  816          return (err);
 816  817  }
 817  818  
 818  819  int
 819  820  zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 820  821      int key_numints)
 821  822  {
 822  823          zap_t *zap;
 823  824          int err;
 824  825          zap_name_t *zn;
 825  826  
 826  827          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 827  828          if (err)
 828  829                  return (err);
 829  830          zn = zap_name_alloc_uint64(zap, key, key_numints);
 830  831          if (zn == NULL) {
 831  832                  zap_unlockdir(zap);
 832  833                  return (SET_ERROR(ENOTSUP));
 833  834          }
 834  835  
 835  836          fzap_prefetch(zn);
 836  837          zap_name_free(zn);
 837  838          zap_unlockdir(zap);
 838  839          return (err);
 839  840  }
 840  841  
 841  842  int
 842  843  zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 843  844      int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
 844  845  {
 845  846          zap_t *zap;
 846  847          int err;
 847  848          zap_name_t *zn;
 848  849  
 849  850          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 850  851          if (err)
 851  852                  return (err);
 852  853          zn = zap_name_alloc_uint64(zap, key, key_numints);
 853  854          if (zn == NULL) {
 854  855                  zap_unlockdir(zap);
 855  856                  return (SET_ERROR(ENOTSUP));
 856  857          }
 857  858  
 858  859          err = fzap_lookup(zn, integer_size, num_integers, buf,
 859  860              NULL, 0, NULL);
 860  861          zap_name_free(zn);
 861  862          zap_unlockdir(zap);
 862  863          return (err);
 863  864  }
 864  865  
 865  866  int
 866  867  zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 867  868  {
 868  869          int err = zap_lookup_norm(os, zapobj, name, 0,
 869  870              0, NULL, MT_EXACT, NULL, 0, NULL);
 870  871          if (err == EOVERFLOW || err == EINVAL)
 871  872                  err = 0; /* found, but skipped reading the value */
 872  873          return (err);
 873  874  }
 874  875  
 875  876  int
 876  877  zap_length(objset_t *os, uint64_t zapobj, const char *name,
 877  878      uint64_t *integer_size, uint64_t *num_integers)
 878  879  {
 879  880          zap_t *zap;
 880  881          int err;
 881  882          mzap_ent_t *mze;
 882  883          zap_name_t *zn;
 883  884  
 884  885          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 885  886          if (err)
 886  887                  return (err);
 887  888          zn = zap_name_alloc(zap, name, MT_EXACT);
 888  889          if (zn == NULL) {
 889  890                  zap_unlockdir(zap);
 890  891                  return (SET_ERROR(ENOTSUP));
 891  892          }
 892  893          if (!zap->zap_ismicro) {
 893  894                  err = fzap_length(zn, integer_size, num_integers);
 894  895          } else {
 895  896                  mze = mze_find(zn);
 896  897                  if (mze == NULL) {
 897  898                          err = SET_ERROR(ENOENT);
 898  899                  } else {
 899  900                          if (integer_size)
 900  901                                  *integer_size = 8;
 901  902                          if (num_integers)
 902  903                                  *num_integers = 1;
 903  904                  }
 904  905          }
 905  906          zap_name_free(zn);
 906  907          zap_unlockdir(zap);
 907  908          return (err);
 908  909  }
 909  910  
 910  911  int
 911  912  zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 912  913      int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 913  914  {
 914  915          zap_t *zap;
 915  916          int err;
 916  917          zap_name_t *zn;
 917  918  
 918  919          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
 919  920          if (err)
 920  921                  return (err);
 921  922          zn = zap_name_alloc_uint64(zap, key, key_numints);
 922  923          if (zn == NULL) {
 923  924                  zap_unlockdir(zap);
 924  925                  return (SET_ERROR(ENOTSUP));
 925  926          }
 926  927          err = fzap_length(zn, integer_size, num_integers);
 927  928          zap_name_free(zn);
 928  929          zap_unlockdir(zap);
 929  930          return (err);
 930  931  }
 931  932  
 932  933  static void
 933  934  mzap_addent(zap_name_t *zn, uint64_t value)
 934  935  {
 935  936          int i;
 936  937          zap_t *zap = zn->zn_zap;
 937  938          int start = zap->zap_m.zap_alloc_next;
 938  939          uint32_t cd;
 939  940  
 940  941          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 941  942  
 942  943  #ifdef ZFS_DEBUG
 943  944          for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 944  945                  mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 945  946                  ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 946  947          }
 947  948  #endif
 948  949  
 949  950          cd = mze_find_unused_cd(zap, zn->zn_hash);
 950  951          /* given the limited size of the microzap, this can't happen */
 951  952          ASSERT(cd < zap_maxcd(zap));
 952  953  
 953  954  again:
 954  955          for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
 955  956                  mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 956  957                  if (mze->mze_name[0] == 0) {
 957  958                          mze->mze_value = value;
 958  959                          mze->mze_cd = cd;
 959  960                          (void) strcpy(mze->mze_name, zn->zn_key_orig);
 960  961                          zap->zap_m.zap_num_entries++;
 961  962                          zap->zap_m.zap_alloc_next = i+1;
 962  963                          if (zap->zap_m.zap_alloc_next ==
 963  964                              zap->zap_m.zap_num_chunks)
 964  965                                  zap->zap_m.zap_alloc_next = 0;
 965  966                          mze_insert(zap, i, zn->zn_hash);
 966  967                          return;
 967  968                  }
 968  969          }
 969  970          if (start != 0) {
 970  971                  start = 0;
 971  972                  goto again;
 972  973          }
 973  974          ASSERT(!"out of entries!");
 974  975  }
 975  976  
 976  977  int
 977  978  zap_add(objset_t *os, uint64_t zapobj, const char *key,
 978  979      int integer_size, uint64_t num_integers,
 979  980      const void *val, dmu_tx_t *tx)
 980  981  {
 981  982          zap_t *zap;
 982  983          int err;
 983  984          mzap_ent_t *mze;
 984  985          const uint64_t *intval = val;
 985  986          zap_name_t *zn;
 986  987  
 987  988          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 988  989          if (err)
 989  990                  return (err);
 990  991          zn = zap_name_alloc(zap, key, MT_EXACT);
 991  992          if (zn == NULL) {
 992  993                  zap_unlockdir(zap);
 993  994                  return (SET_ERROR(ENOTSUP));
 994  995          }
 995  996          if (!zap->zap_ismicro) {
 996  997                  err = fzap_add(zn, integer_size, num_integers, val, tx);
 997  998                  zap = zn->zn_zap;       /* fzap_add() may change zap */
 998  999          } else if (integer_size != 8 || num_integers != 1 ||
 999 1000              strlen(key) >= MZAP_NAME_LEN) {
1000 1001                  err = mzap_upgrade(&zn->zn_zap, tx, 0);
1001 1002                  if (err == 0)
1002 1003                          err = fzap_add(zn, integer_size, num_integers, val, tx);
1003 1004                  zap = zn->zn_zap;       /* fzap_add() may change zap */
1004 1005          } else {
1005 1006                  mze = mze_find(zn);
1006 1007                  if (mze != NULL) {
1007 1008                          err = SET_ERROR(EEXIST);
1008 1009                  } else {
1009 1010                          mzap_addent(zn, *intval);
1010 1011                  }
1011 1012          }
1012 1013          ASSERT(zap == zn->zn_zap);
1013 1014          zap_name_free(zn);
1014 1015          if (zap != NULL)        /* may be NULL if fzap_add() failed */
1015 1016                  zap_unlockdir(zap);
1016 1017          return (err);
1017 1018  }
1018 1019  
1019 1020  int
1020 1021  zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1021 1022      int key_numints, int integer_size, uint64_t num_integers,
1022 1023      const void *val, dmu_tx_t *tx)
1023 1024  {
1024 1025          zap_t *zap;
1025 1026          int err;
1026 1027          zap_name_t *zn;
1027 1028  
1028 1029          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
1029 1030          if (err)
1030 1031                  return (err);
1031 1032          zn = zap_name_alloc_uint64(zap, key, key_numints);
1032 1033          if (zn == NULL) {
1033 1034                  zap_unlockdir(zap);
1034 1035                  return (SET_ERROR(ENOTSUP));
1035 1036          }
1036 1037          err = fzap_add(zn, integer_size, num_integers, val, tx);
1037 1038          zap = zn->zn_zap;       /* fzap_add() may change zap */
1038 1039          zap_name_free(zn);
1039 1040          if (zap != NULL)        /* may be NULL if fzap_add() failed */
1040 1041                  zap_unlockdir(zap);
1041 1042          return (err);
1042 1043  }
1043 1044  
1044 1045  int
1045 1046  zap_update(objset_t *os, uint64_t zapobj, const char *name,
1046 1047      int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1047 1048  {
1048 1049          zap_t *zap;
1049 1050          mzap_ent_t *mze;
1050 1051          uint64_t oldval;
1051 1052          const uint64_t *intval = val;
1052 1053          zap_name_t *zn;
1053 1054          int err;
1054 1055  
1055 1056  #ifdef ZFS_DEBUG
1056 1057          /*
1057 1058           * If there is an old value, it shouldn't change across the
1058 1059           * lockdir (eg, due to bprewrite's xlation).
1059 1060           */
1060 1061          if (integer_size == 8 && num_integers == 1)
1061 1062                  (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
1062 1063  #endif
1063 1064  
1064 1065          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
1065 1066          if (err)
1066 1067                  return (err);
1067 1068          zn = zap_name_alloc(zap, name, MT_EXACT);
1068 1069          if (zn == NULL) {
1069 1070                  zap_unlockdir(zap);
1070 1071                  return (SET_ERROR(ENOTSUP));
1071 1072          }
1072 1073          if (!zap->zap_ismicro) {
1073 1074                  err = fzap_update(zn, integer_size, num_integers, val, tx);
1074 1075                  zap = zn->zn_zap;       /* fzap_update() may change zap */
1075 1076          } else if (integer_size != 8 || num_integers != 1 ||
1076 1077              strlen(name) >= MZAP_NAME_LEN) {
1077 1078                  dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1078 1079                      zapobj, integer_size, num_integers, name);
1079 1080                  err = mzap_upgrade(&zn->zn_zap, tx, 0);
1080 1081                  if (err == 0)
1081 1082                          err = fzap_update(zn, integer_size, num_integers,
1082 1083                              val, tx);
1083 1084                  zap = zn->zn_zap;       /* fzap_update() may change zap */
1084 1085          } else {
1085 1086                  mze = mze_find(zn);
1086 1087                  if (mze != NULL) {
1087 1088                          ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
1088 1089                          MZE_PHYS(zap, mze)->mze_value = *intval;
1089 1090                  } else {
1090 1091                          mzap_addent(zn, *intval);
1091 1092                  }
1092 1093          }
1093 1094          ASSERT(zap == zn->zn_zap);
1094 1095          zap_name_free(zn);
1095 1096          if (zap != NULL)        /* may be NULL if fzap_upgrade() failed */
1096 1097                  zap_unlockdir(zap);
1097 1098          return (err);
1098 1099  }
1099 1100  
1100 1101  int
1101 1102  zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1102 1103      int key_numints,
1103 1104      int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1104 1105  {
1105 1106          zap_t *zap;
1106 1107          zap_name_t *zn;
1107 1108          int err;
1108 1109  
1109 1110          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
1110 1111          if (err)
1111 1112                  return (err);
1112 1113          zn = zap_name_alloc_uint64(zap, key, key_numints);
1113 1114          if (zn == NULL) {
1114 1115                  zap_unlockdir(zap);
1115 1116                  return (SET_ERROR(ENOTSUP));
1116 1117          }
1117 1118          err = fzap_update(zn, integer_size, num_integers, val, tx);
1118 1119          zap = zn->zn_zap;       /* fzap_update() may change zap */
1119 1120          zap_name_free(zn);
1120 1121          if (zap != NULL)        /* may be NULL if fzap_upgrade() failed */
1121 1122                  zap_unlockdir(zap);
1122 1123          return (err);
1123 1124  }
1124 1125  
1125 1126  int
1126 1127  zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1127 1128  {
1128 1129          return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
1129 1130  }
1130 1131  
1131 1132  int
1132 1133  zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1133 1134      matchtype_t mt, dmu_tx_t *tx)
1134 1135  {
1135 1136          zap_t *zap;
1136 1137          int err;
1137 1138          mzap_ent_t *mze;
1138 1139          zap_name_t *zn;
1139 1140  
1140 1141          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1141 1142          if (err)
1142 1143                  return (err);
1143 1144          zn = zap_name_alloc(zap, name, mt);
1144 1145          if (zn == NULL) {
1145 1146                  zap_unlockdir(zap);
1146 1147                  return (SET_ERROR(ENOTSUP));
1147 1148          }
1148 1149          if (!zap->zap_ismicro) {
1149 1150                  err = fzap_remove(zn, tx);
1150 1151          } else {
1151 1152                  mze = mze_find(zn);
1152 1153                  if (mze == NULL) {
1153 1154                          err = SET_ERROR(ENOENT);
1154 1155                  } else {
1155 1156                          zap->zap_m.zap_num_entries--;
1156 1157                          bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
1157 1158                              sizeof (mzap_ent_phys_t));
1158 1159                          mze_remove(zap, mze);
1159 1160                  }
1160 1161          }
1161 1162          zap_name_free(zn);
1162 1163          zap_unlockdir(zap);
1163 1164          return (err);
1164 1165  }
1165 1166  
1166 1167  int
1167 1168  zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1168 1169      int key_numints, dmu_tx_t *tx)
1169 1170  {
1170 1171          zap_t *zap;
1171 1172          int err;
1172 1173          zap_name_t *zn;
1173 1174  
1174 1175          err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
1175 1176          if (err)
1176 1177                  return (err);
1177 1178          zn = zap_name_alloc_uint64(zap, key, key_numints);
1178 1179          if (zn == NULL) {
1179 1180                  zap_unlockdir(zap);
1180 1181                  return (SET_ERROR(ENOTSUP));
1181 1182          }
1182 1183          err = fzap_remove(zn, tx);
1183 1184          zap_name_free(zn);
1184 1185          zap_unlockdir(zap);
1185 1186          return (err);
1186 1187  }
1187 1188  
1188 1189  /*
1189 1190   * Routines for iterating over the attributes.
1190 1191   */
1191 1192  
1192 1193  void
1193 1194  zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1194 1195      uint64_t serialized)
1195 1196  {
1196 1197          zc->zc_objset = os;
1197 1198          zc->zc_zap = NULL;
1198 1199          zc->zc_leaf = NULL;
1199 1200          zc->zc_zapobj = zapobj;
1200 1201          zc->zc_serialized = serialized;
1201 1202          zc->zc_hash = 0;
1202 1203          zc->zc_cd = 0;
1203 1204  }
1204 1205  
1205 1206  void
1206 1207  zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1207 1208  {
1208 1209          zap_cursor_init_serialized(zc, os, zapobj, 0);
1209 1210  }
1210 1211  
1211 1212  void
1212 1213  zap_cursor_fini(zap_cursor_t *zc)
1213 1214  {
1214 1215          if (zc->zc_zap) {
1215 1216                  rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1216 1217                  zap_unlockdir(zc->zc_zap);
1217 1218                  zc->zc_zap = NULL;
1218 1219          }
1219 1220          if (zc->zc_leaf) {
1220 1221                  rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1221 1222                  zap_put_leaf(zc->zc_leaf);
1222 1223                  zc->zc_leaf = NULL;
1223 1224          }
1224 1225          zc->zc_objset = NULL;
1225 1226  }
1226 1227  
1227 1228  uint64_t
1228 1229  zap_cursor_serialize(zap_cursor_t *zc)
1229 1230  {
1230 1231          if (zc->zc_hash == -1ULL)
1231 1232                  return (-1ULL);
1232 1233          if (zc->zc_zap == NULL)
1233 1234                  return (zc->zc_serialized);
1234 1235          ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1235 1236          ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1236 1237  
1237 1238          /*
1238 1239           * We want to keep the high 32 bits of the cursor zero if we can, so
1239 1240           * that 32-bit programs can access this.  So usually use a small
1240 1241           * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1241 1242           * of the cursor.
1242 1243           *
1243 1244           * [ collision differentiator | zap_hashbits()-bit hash value ]
1244 1245           */
1245 1246          return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1246 1247              ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1247 1248  }
1248 1249  
1249 1250  int
1250 1251  zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1251 1252  {
1252 1253          int err;
1253 1254          avl_index_t idx;
1254 1255          mzap_ent_t mze_tofind;
1255 1256          mzap_ent_t *mze;
1256 1257  
1257 1258          if (zc->zc_hash == -1ULL)
1258 1259                  return (SET_ERROR(ENOENT));
1259 1260  
1260 1261          if (zc->zc_zap == NULL) {
1261 1262                  int hb;
1262 1263                  err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1263 1264                      RW_READER, TRUE, FALSE, &zc->zc_zap);
1264 1265                  if (err)
1265 1266                          return (err);
1266 1267  
1267 1268                  /*
1268 1269                   * To support zap_cursor_init_serialized, advance, retrieve,
1269 1270                   * we must add to the existing zc_cd, which may already
1270 1271                   * be 1 due to the zap_cursor_advance.
1271 1272                   */
1272 1273                  ASSERT(zc->zc_hash == 0);
1273 1274                  hb = zap_hashbits(zc->zc_zap);
1274 1275                  zc->zc_hash = zc->zc_serialized << (64 - hb);
1275 1276                  zc->zc_cd += zc->zc_serialized >> hb;
1276 1277                  if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1277 1278                          zc->zc_cd = 0;
1278 1279          } else {
1279 1280                  rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1280 1281          }
1281 1282          if (!zc->zc_zap->zap_ismicro) {
1282 1283                  err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1283 1284          } else {
1284 1285                  mze_tofind.mze_hash = zc->zc_hash;
1285 1286                  mze_tofind.mze_cd = zc->zc_cd;
1286 1287  
1287 1288                  mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
1288 1289                  if (mze == NULL) {
1289 1290                          mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
1290 1291                              idx, AVL_AFTER);
1291 1292                  }
1292 1293                  if (mze) {
1293 1294                          mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1294 1295                          ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1295 1296                          za->za_normalization_conflict =
1296 1297                              mzap_normalization_conflict(zc->zc_zap, NULL, mze);
1297 1298                          za->za_integer_length = 8;
1298 1299                          za->za_num_integers = 1;
1299 1300                          za->za_first_integer = mzep->mze_value;
1300 1301                          (void) strcpy(za->za_name, mzep->mze_name);
1301 1302                          zc->zc_hash = mze->mze_hash;
1302 1303                          zc->zc_cd = mze->mze_cd;
1303 1304                          err = 0;
1304 1305                  } else {
1305 1306                          zc->zc_hash = -1ULL;
1306 1307                          err = SET_ERROR(ENOENT);
1307 1308                  }
1308 1309          }
1309 1310          rw_exit(&zc->zc_zap->zap_rwlock);
1310 1311          return (err);
1311 1312  }
1312 1313  
1313 1314  void
1314 1315  zap_cursor_advance(zap_cursor_t *zc)
1315 1316  {
1316 1317          if (zc->zc_hash == -1ULL)
1317 1318                  return;
1318 1319          zc->zc_cd++;
1319 1320  }
1320 1321  
1321 1322  int
1322 1323  zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1323 1324  {
1324 1325          int err;
1325 1326          zap_t *zap;
1326 1327  
1327 1328          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1328 1329          if (err)
1329 1330                  return (err);
1330 1331  
1331 1332          bzero(zs, sizeof (zap_stats_t));
1332 1333  
1333 1334          if (zap->zap_ismicro) {
1334 1335                  zs->zs_blocksize = zap->zap_dbuf->db_size;
1335 1336                  zs->zs_num_entries = zap->zap_m.zap_num_entries;
1336 1337                  zs->zs_num_blocks = 1;
1337 1338          } else {
1338 1339                  fzap_get_stats(zap, zs);
1339 1340          }
1340 1341          zap_unlockdir(zap);
1341 1342          return (0);
1342 1343  }
1343 1344  
1344 1345  int
1345 1346  zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
1346 1347      uint64_t *towrite, uint64_t *tooverwrite)
1347 1348  {
1348 1349          zap_t *zap;
1349 1350          int err = 0;
1350 1351  
1351 1352          /*
1352 1353           * Since, we don't have a name, we cannot figure out which blocks will
1353 1354           * be affected in this operation. So, account for the worst case :
1354 1355           * - 3 blocks overwritten: target leaf, ptrtbl block, header block
1355 1356           * - 4 new blocks written if adding:
1356 1357           *      - 2 blocks for possibly split leaves,
1357 1358           *      - 2 grown ptrtbl blocks
1358 1359           *
1359 1360           * This also accomodates the case where an add operation to a fairly
1360 1361           * large microzap results in a promotion to fatzap.
1361 1362           */
1362 1363          if (name == NULL) {
1363 1364                  *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
1364 1365                  return (err);
1365 1366          }
1366 1367  
1367 1368          /*
1368 1369           * We lock the zap with adding == FALSE. Because, if we pass
1369 1370           * the actual value of add, it could trigger a mzap_upgrade().
1370 1371           * At present we are just evaluating the possibility of this operation
1371 1372           * and hence we donot want to trigger an upgrade.
1372 1373           */
1373 1374          err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1374 1375          if (err)
1375 1376                  return (err);
1376 1377  
1377 1378          if (!zap->zap_ismicro) {
1378 1379                  zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
1379 1380                  if (zn) {
1380 1381                          err = fzap_count_write(zn, add, towrite,
1381 1382                              tooverwrite);
1382 1383                          zap_name_free(zn);
1383 1384                  } else {
1384 1385                          /*
1385 1386                           * We treat this case as similar to (name == NULL)
1386 1387                           */
1387 1388                          *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
1388 1389                  }
1389 1390          } else {
1390 1391                  /*
1391 1392                   * We are here if (name != NULL) and this is a micro-zap.
1392 1393                   * We account for the header block depending on whether it
1393 1394                   * is freeable.
1394 1395                   *
1395 1396                   * Incase of an add-operation it is hard to find out
1396 1397                   * if this add will promote this microzap to fatzap.
1397 1398                   * Hence, we consider the worst case and account for the
1398 1399                   * blocks assuming this microzap would be promoted to a
1399 1400                   * fatzap.
1400 1401                   *
1401 1402                   * 1 block overwritten  : header block
1402 1403                   * 4 new blocks written : 2 new split leaf, 2 grown
1403 1404                   *                      ptrtbl blocks
1404 1405                   */
1405 1406                  if (dmu_buf_freeable(zap->zap_dbuf))
1406 1407                          *tooverwrite += MZAP_MAX_BLKSZ;
1407 1408                  else
1408 1409                          *towrite += MZAP_MAX_BLKSZ;
1409 1410  
1410 1411                  if (add) {
1411 1412                          *towrite += 4 * MZAP_MAX_BLKSZ;
1412 1413                  }
1413 1414          }
1414 1415  
1415 1416          zap_unlockdir(zap);
1416 1417          return (err);
1417 1418  }
  
    | 
      ↓ open down ↓ | 
    721 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX