Print this page
    
NEX-6088 ZFS scrub/resilver take excessively long due to issuing lots of random IO
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5366 Race between unique_insert() and unique_remove() causes ZFS fsid change
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Dan Vatca <dan.vatca@gmail.com>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Remaining fixes for the illumos merge
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zap.c
          +++ new/usr/src/uts/common/fs/zfs/zap.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * This file contains the top half of the zfs directory structure
  29   29   * implementation. The bottom half is in zap_leaf.c.
  30   30   *
  31   31   * The zdir is an extendable hash data structure. There is a table of
  32   32   * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  33   33   * each a constant size and hold a variable number of directory entries.
  34   34   * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  35   35   *
  36   36   * The pointer table holds a power of 2 number of pointers.
  37   37   * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  38   38   * by the pointer at index i in the table holds entries whose hash value
  39   39   * has a zd_prefix_len-bit prefix equal to i.
  40   40   */
  41   41  
  42   42  #include <sys/spa.h>
  43   43  #include <sys/dmu.h>
  44   44  #include <sys/zfs_context.h>
  45   45  #include <sys/zfs_znode.h>
  46   46  #include <sys/fs/zfs.h>
  47   47  #include <sys/zap.h>
  48   48  #include <sys/refcount.h>
  49   49  #include <sys/zap_impl.h>
  50   50  #include <sys/zap_leaf.h>
  51   51  
  52   52  int fzap_default_block_shift = 14; /* 16k blocksize */
  53   53  
  54   54  extern inline zap_phys_t *zap_f_phys(zap_t *zap);
  55   55  
  56   56  static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57   57  
  58   58  void
  59   59  fzap_byteswap(void *vbuf, size_t size)
  60   60  {
                   /*
                    * Byteswap one fat-zap block of `size' bytes in place.  The
                    * first 64-bit word of the buffer is read as the block type
                    * and compared against ZBT_LEAF in both its native and its
                    * byte-swapped form.  Leaf blocks are handed to
                    * zap_leaf_byteswap(); every other block is swapped as a
                    * flat array of 64-bit words.
                    */
  61   61          uint64_t block_type;
  62   62  
  63   63          block_type = *(uint64_t *)vbuf;
  64   64  
  65   65          if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
  66   66                  zap_leaf_byteswap(vbuf, size);
  67   67          else {
  68   68                  /* it's a ptrtbl block */
  69   69                  byteswap_uint64_array(vbuf, size);
  70   70          }
  71   71  }
  72   72  
  73   73  void
  74   74  fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  75   75  {
                   /*
                    * Rewrite the block held in zap->zap_dbuf as an empty fat-zap
                    * header and initialize block 1 as its first leaf.  The
                    * caller must hold zap_rwlock as writer (asserted below).
                    * `flags' is stored in the header's zap_flags field; salt and
                    * normflags are copied from the in-core zap_t.
                    */
  76   76          dmu_buf_t *db;
  77   77          zap_leaf_t *l;
  78   78          int i;
  79   79          zap_phys_t *zp;
  80   80  
  81   81          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  82   82          zap->zap_ismicro = FALSE;
  83   83  
  84   84          zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
  85   85          zap->zap_dbu.dbu_evict_func_async = NULL;
  86   86  
  87   87          mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
                   /* block shift = log2 of the header buffer's size */
  88   88          zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  89   89  
  90   90          zp = zap_f_phys(zap);
  91   91          /*
  92   92           * explicitly zero it since it might be coming from an
  93   93           * initialized microzap
  94   94           */
  95   95          bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
  96   96          zp->zap_block_type = ZBT_HEADER;
  97   97          zp->zap_magic = ZAP_MAGIC;
  98   98  
  99   99          zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
 100  100  
 101  101          zp->zap_freeblk = 2;            /* block 1 will be the first leaf */
 102  102          zp->zap_num_leafs = 1;
 103  103          zp->zap_num_entries = 0;
 104  104          zp->zap_salt = zap->zap_salt;
 105  105          zp->zap_normflags = zap->zap_normflags;
 106  106          zp->zap_flags = flags;
 107  107  
 108  108          /* block 1 will be the first leaf */
 109  109          for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
 110  110                  ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
 111  111  
 112  112          /*
 113  113           * set up block 1 - the first leaf
 114  114           */
 115  115          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 116  116              1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 117  117          dmu_buf_will_dirty(db, tx);
 118  118  
                   /*
                    * A throwaway zap_leaf_t is sufficient here: it only carries
                    * db into zap_leaf_init() and is freed right afterwards.
                    */
 119  119          l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 120  120          l->l_dbuf = db;
 121  121  
 122  122          zap_leaf_init(l, zp->zap_normflags != 0);
 123  123  
 124  124          kmem_free(l, sizeof (zap_leaf_t));
 125  125          dmu_buf_rele(db, FTAG);
 126  126  }
 127  127  
 128  128  static int
 129  129  zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
 130  130  {
                   /*
                    * Try to end up holding zap_rwlock as writer without dropping
                    * it.  Returns 1 if the lock is already write-held or if
                    * rw_tryupgrade() succeeds (in which case the header dbuf is
                    * also dirtied in tx); returns 0 if the upgrade failed.
                    */
 131  131          if (RW_WRITE_HELD(&zap->zap_rwlock))
 132  132                  return (1);
 133  133          if (rw_tryupgrade(&zap->zap_rwlock)) {
 134  134                  dmu_buf_will_dirty(zap->zap_dbuf, tx);
 135  135                  return (1);
 136  136          }
 137  137          return (0);
 138  138  }
 139  139  
 140  140  /*
 141  141   * Generic routines for dealing with the pointer & cookie tables.
 142  142   */
 143  143  
 144  144  static int
 145  145  zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 146  146      void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
 147  147      dmu_tx_t *tx)
 148  148  {
                   /*
                    * Perform one step of doubling the table described by tbl.
                    *
                    * On the first step (zt_nextblk == 0) a region of
                    * 2 * zt_numblks blocks is allocated, recorded in zt_nextblk,
                    * and a prefetch of the old table is issued.  Every call then
                    * copies old block number zt_blks_copied into new blocks
                    * 2*b and 2*b+1 using transfer_func, half of the old block's
                    * entries each.  Once all old blocks have been copied, the old
                    * range is freed and tbl is switched to the new region with
                    * zt_numblks doubled and zt_shift incremented.
                    *
                    * The caller must hold zap_rwlock as writer.  Returns 0, or
                    * the error from holding the old block.
                    */
 149  149          uint64_t b, newblk;
 150  150          dmu_buf_t *db_old, *db_new;
 151  151          int err;
 152  152          int bs = FZAP_BLOCK_SHIFT(zap);
 153  153          int hepb = 1<<(bs-4);
 154  154          /* hepb = half the number of entries in a block */
 155  155  
 156  156          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 157  157          ASSERT(tbl->zt_blk != 0);
 158  158          ASSERT(tbl->zt_numblks > 0);
 159  159  
 160  160          if (tbl->zt_nextblk != 0) {
 161  161                  newblk = tbl->zt_nextblk;
 162  162          } else {
 163  163                  newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 164  164                  tbl->zt_nextblk = newblk;
 165  165                  ASSERT0(tbl->zt_blks_copied);
 166  166                  dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 167  167                      tbl->zt_blk << bs, tbl->zt_numblks << bs,
 168  168                      ZIO_PRIORITY_SYNC_READ);
 169  169          }
 170  170  
 171  171          /*
 172  172           * Copy the ptrtbl from the old to new location.
 173  173           */
 174  174  
 175  175          b = tbl->zt_blks_copied;
 176  176          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 177  177              (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 178  178          if (err)
 179  179                  return (err);
 180  180  
 181  181          /* first half of entries in old[b] go to new[2*b+0] */
 182  182          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 183  183              (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 184  184          dmu_buf_will_dirty(db_new, tx);
 185  185          transfer_func(db_old->db_data, db_new->db_data, hepb);
 186  186          dmu_buf_rele(db_new, FTAG);
 187  187  
 188  188          /* second half of entries in old[b] go to new[2*b+1] */
 189  189          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 190  190              (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 191  191          dmu_buf_will_dirty(db_new, tx);
 192  192          transfer_func((uint64_t *)db_old->db_data + hepb,
 193  193              db_new->db_data, hepb);
 194  194          dmu_buf_rele(db_new, FTAG);
 195  195  
 196  196          dmu_buf_rele(db_old, FTAG);
 197  197  
 198  198          tbl->zt_blks_copied++;
 199  199  
 200  200          dprintf("copied block %llu of %llu\n",
 201  201              tbl->zt_blks_copied, tbl->zt_numblks);
 202  202  
 203  203          if (tbl->zt_blks_copied == tbl->zt_numblks) {
 204  204                  (void) dmu_free_range(zap->zap_objset, zap->zap_object,
 205  205                      tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
 206  206  
 207  207                  tbl->zt_blk = newblk;
 208  208                  tbl->zt_numblks *= 2;
 209  209                  tbl->zt_shift++;
 210  210                  tbl->zt_nextblk = 0;
 211  211                  tbl->zt_blks_copied = 0;
 212  212  
                           /*
                            * 1ULL (not plain 1) so the shifted value has the
                            * 64-bit type that the %llu conversion expects.
                            */
 213  213                  dprintf("finished; numblocks now %llu (%lluk entries)\n",
 214  214                      tbl->zt_numblks, 1ULL<<(tbl->zt_shift-10));
 215  215          }
 216  216  
 217  217          return (0);
 218  218  }
 219  219  
 220  220  static int
 221  221  zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
 222  222      dmu_tx_t *tx)
 223  223  {
                   /*
                    * Store val at entry idx of the table described by tbl.
                    *
                    * If a grow is in progress (zt_nextblk != 0), val is also
                    * written to entries 2*idx and 2*idx+1 of the new region, so
                    * both copies of the table agree.  Both buffers are held
                    * before either is modified; if holding the new-region block
                    * fails nothing has been written.
                    *
                    * Returns 0, or the error from dmu_buf_hold().
                    */
 224  224          int err;
 225  225          uint64_t blk, off;
 226  226          int bs = FZAP_BLOCK_SHIFT(zap);
 227  227          dmu_buf_t *db;
 228  228  
 229  229          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 230  230          ASSERT(tbl->zt_blk != 0);
 231  231  
 232  232          dprintf("storing %llx at index %llx\n", val, idx);
 233  233  
                   /* a block holds 1 << (bs-3) eight-byte entries */
 234  234          blk = idx >> (bs-3);
 235  235          off = idx & ((1<<(bs-3))-1);
 236  236  
 237  237          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 238  238              (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 239  239          if (err)
 240  240                  return (err);
 241  241          dmu_buf_will_dirty(db, tx);
 242  242  
 243  243          if (tbl->zt_nextblk != 0) {
 244  244                  uint64_t idx2 = idx * 2;
 245  245                  uint64_t blk2 = idx2 >> (bs-3);
 246  246                  uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 247  247                  dmu_buf_t *db2;
 248  248  
 249  249                  err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 250  250                      (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 251  251                      DMU_READ_NO_PREFETCH);
 252  252                  if (err) {
 253  253                          dmu_buf_rele(db, FTAG);
 254  254                          return (err);
 255  255                  }
 256  256                  dmu_buf_will_dirty(db2, tx);
                           /* idx2 is even, so off2+1 stays within the same block */
 257  257                  ((uint64_t *)db2->db_data)[off2] = val;
 258  258                  ((uint64_t *)db2->db_data)[off2+1] = val;
 259  259                  dmu_buf_rele(db2, FTAG);
 260  260          }
 261  261  
 262  262          ((uint64_t *)db->db_data)[off] = val;
 263  263          dmu_buf_rele(db, FTAG);
 264  264  
 265  265          return (0);
 266  266  }
 267  267  
 268  268  static int
 269  269  zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 270  270  {
                   /*
                    * Read entry idx of the table described by tbl into *valp.
                    *
                    * Returns 0 or the error from dmu_buf_hold_by_dnode().  Note
                    * that when a grow is in progress and only the second hold
                    * (of the new-region block) fails, *valp has already been
                    * written even though an error is returned.
                    */
 271  271          uint64_t blk, off;
 272  272          int err;
 273  273          dmu_buf_t *db;
 274  274          dnode_t *dn;
 275  275          int bs = FZAP_BLOCK_SHIFT(zap);
 276  276  
 277  277          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 278  278  
 279  279          blk = idx >> (bs-3);
 280  280          off = idx & ((1<<(bs-3))-1);
 281  281  
 282  282          /*
 283  283           * Note: this is equivalent to dmu_buf_hold(), but we use
 284  284           * _dnode_enter / _by_dnode because it's faster because we don't
 285  285           * have to hold the dnode.
 286  286           */
 287  287          dn = dmu_buf_dnode_enter(zap->zap_dbuf);
 288  288          err = dmu_buf_hold_by_dnode(dn,
 289  289              (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 290  290          dmu_buf_dnode_exit(zap->zap_dbuf);
 291  291          if (err)
 292  292                  return (err);
 293  293          *valp = ((uint64_t *)db->db_data)[off];
 294  294          dmu_buf_rele(db, FTAG);
 295  295  
 296  296          if (tbl->zt_nextblk != 0) {
 297  297                  /*
 298  298                   * read the nextblk for the sake of i/o error checking,
 299  299                   * so that zap_table_load() will catch errors for
 300  300                   * zap_table_store.
 301  301                   */
 302  302                  blk = (idx*2) >> (bs-3);
 303  303  
 304  304                  dn = dmu_buf_dnode_enter(zap->zap_dbuf);
 305  305                  err = dmu_buf_hold_by_dnode(dn,
 306  306                      (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 307  307                      DMU_READ_NO_PREFETCH);
 308  308                  dmu_buf_dnode_exit(zap->zap_dbuf);
                           /* the buffer's contents are not used; just drop the hold */
 309  309                  if (err == 0)
 310  310                          dmu_buf_rele(db, FTAG);
 311  311          }
 312  312          return (err);
 313  313  }
 314  314  
 315  315  /*
 316  316   * Routines for growing the ptrtbl.
 317  317   */
 318  318  
 319  319  static void
 320  320  zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 321  321  {
                   /*
                    * Expand n pointer-table entries from src into 2*n entries in
                    * dst: each source entry is written to two consecutive
                    * destination slots.
                    */
 322  322          int i;
 323  323          for (i = 0; i < n; i++) {
 324  324                  uint64_t lb = src[i];
 325  325                  dst[2*i+0] = lb;
 326  326                  dst[2*i+1] = lb;
 327  327          }
 328  328  }
 329  329  
 330  330  static int
 331  331  zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 332  332  {
                   /*
                    * Grow the pointer table.  If the table is still embedded in
                    * the header (zt_numblks == 0) it is moved, doubled, into one
                    * newly allocated block in a single step.  Otherwise one
                    * step of zap_table_grow() is performed, which may or may not
                    * complete the doubling.
                    *
                    * Returns 0, ENOSPC when zt_shift is within 2 bits of
                    * zap_hashbits(), or an error from holding a buffer.
                    */
 333  333          /*
 334  334           * The pointer table should never use more hash bits than we
 335  335           * have (otherwise we'd be using useless zero bits to index it).
 336  336           * If we are within 2 bits of running out, stop growing, since
 337  337           * this is already an aberrant condition.
 338  338           */
 339  339          if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 340  340                  return (SET_ERROR(ENOSPC));
 341  341  
 342  342          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 343  343                  /*
 344  344                   * We are outgrowing the "embedded" ptrtbl (the one
 345  345                   * stored in the header block).  Give it its own entire
 346  346                   * block, which will double the size of the ptrtbl.
 347  347                   */
 348  348                  uint64_t newblk;
 349  349                  dmu_buf_t *db_new;
 350  350                  int err;
 351  351  
 352  352                  ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 353  353                      ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 354  354                  ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 355  355  
 356  356                  newblk = zap_allocate_blocks(zap, 1);
 357  357                  err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 358  358                      newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 359  359                      DMU_READ_NO_PREFETCH);
                           /*
                            * NOTE(review): on this error return zap_freeblk has
                            * already been advanced by zap_allocate_blocks().
                            */
 360  360                  if (err)
 361  361                          return (err);
 362  362                  dmu_buf_will_dirty(db_new, tx);
 363  363                  zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 364  364                      db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 365  365                  dmu_buf_rele(db_new, FTAG);
 366  366  
 367  367                  zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
 368  368                  zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
 369  369                  zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 370  370  
                           /* the doubled table must exactly fill the one new block */
 371  371                  ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 372  372                      zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 373  373                      (FZAP_BLOCK_SHIFT(zap)-3));
 374  374  
 375  375                  return (0);
 376  376          } else {
 377  377                  return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 378  378                      zap_ptrtbl_transfer, tx));
 379  379          }
 380  380  }
 381  381  
 382  382  static void
 383  383  zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 384  384  {
                   /*
                    * Add delta (which may be negative) to the header's
                    * zap_num_entries under zap_num_entries_mtx, after dirtying
                    * the header dbuf in tx.  The assertion rejects a decrement
                    * larger than the current count.
                    */
 385  385          dmu_buf_will_dirty(zap->zap_dbuf, tx);
 386  386          mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 387  387          ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 388  388          zap_f_phys(zap)->zap_num_entries += delta;
 389  389          mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 390  390  }
 391  391  
 392  392  static uint64_t
 393  393  zap_allocate_blocks(zap_t *zap, int nblocks)
 394  394  {
                   /*
                    * Reserve nblocks consecutive block numbers by advancing the
                    * header's zap_freeblk, and return the first reserved block
                    * number.  The caller must hold zap_rwlock as writer.
                    */
 395  395          uint64_t newblk;
 396  396          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 397  397          newblk = zap_f_phys(zap)->zap_freeblk;
 398  398          zap_f_phys(zap)->zap_freeblk += nblocks;
 399  399          return (newblk);
 400  400  }
 401  401  
 402  402  static void
 403  403  zap_leaf_evict_sync(void *dbu)
 404  404  {
                   /*
                    * Eviction callback registered via dmu_buf_init_user() for
                    * leaf buffers: destroys the leaf's rwlock and frees the
                    * zap_leaf_t.  dbu is treated as the zap_leaf_t itself
                    * (callers in this file pass &l->l_dbu).
                    */
 405  405          zap_leaf_t *l = dbu;
 406  406  
 407  407          rw_destroy(&l->l_rwlock);
 408  408          kmem_free(l, sizeof (zap_leaf_t));
 409  409  }
 410  410  
 411  411  static zap_leaf_t *
 412  412  zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 413  413  {
                   /*
                    * Allocate a new block for a leaf, attach a fresh zap_leaf_t
                    * to its dbuf as the dbuf user, initialize the leaf and bump
                    * the header's zap_num_leafs.
                    *
                    * The leaf is returned with l_rwlock held as writer and its
                    * dbuf held (tag NULL), i.e. in the state zap_put_leaf()
                    * undoes.  The caller must hold zap_rwlock as writer.
                    */
 414  414          void *winner;
 415  415          zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 416  416  
 417  417          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 418  418  
 419  419          rw_init(&l->l_rwlock, 0, 0, 0);
 420  420          rw_enter(&l->l_rwlock, RW_WRITER);
 421  421          l->l_blkid = zap_allocate_blocks(zap, 1);
 422  422          l->l_dbuf = NULL;
 423  423  
 424  424          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 425  425              l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 426  426              DMU_READ_NO_PREFETCH));
 427  427          dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
 428  428          winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
                   /* the block was just allocated, so no other user is expected */
 429  429          ASSERT(winner == NULL);
 430  430          dmu_buf_will_dirty(l->l_dbuf, tx);
 431  431  
 432  432          zap_leaf_init(l, zap->zap_normflags != 0);
 433  433  
 434  434          zap_f_phys(zap)->zap_num_leafs++;
 435  435  
 436  436          return (l);
 437  437  }
 438  438  
 439  439  int
 440  440  fzap_count(zap_t *zap, uint64_t *count)
 441  441  {
                   /*
                    * Store the header's zap_num_entries in *count.  Only valid
                    * for a fat zap (asserted).  Always returns 0.
                    */
 442  442          ASSERT(!zap->zap_ismicro);
 443  443          mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
 444  444          *count = zap_f_phys(zap)->zap_num_entries;
 445  445          mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 446  446          return (0);
 447  447  }
 448  448  
 449  449  /*
 450  450   * Routines for obtaining zap_leaf_t's
 451  451   */
 452  452  
 453  453  void
 454  454  zap_put_leaf(zap_leaf_t *l)
 455  455  {
                   /*
                    * Release a leaf: drop l_rwlock and then the hold on the
                    * leaf's dbuf.  The NULL tag matches the tag used when leaf
                    * dbufs are held in this file.
                    */
 456  456          rw_exit(&l->l_rwlock);
 457  457          dmu_buf_rele(l->l_dbuf, NULL);
 458  458  }
 459  459  
 460  460  static zap_leaf_t *
 461  461  zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 462  462  {
                   /*
                    * Build the in-core zap_leaf_t for leaf block blkid backed by
                    * db and try to install it as db's user.  If another
                    * zap_leaf_t was installed first, ours is destroyed and the
                    * existing one is returned instead.  The leaf is returned
                    * with l_rwlock NOT held.
                    */
 463  463          zap_leaf_t *l, *winner;
 464  464  
 465  465          ASSERT(blkid != 0);
 466  466  
 467  467          l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 468  468          rw_init(&l->l_rwlock, 0, 0, 0);
                   /* hold the lock as writer while the leaf is being filled in */
 469  469          rw_enter(&l->l_rwlock, RW_WRITER);
 470  470          l->l_blkid = blkid;
 471  471          l->l_bs = highbit64(db->db_size) - 1;
 472  472          l->l_dbuf = db;
 473  473  
 474  474          dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
 475  475          winner = dmu_buf_set_user(db, &l->l_dbu);
 476  476  
 477  477          rw_exit(&l->l_rwlock);
 478  478          if (winner != NULL) {
 479  479                  /* someone else set it first */
 480  480                  zap_leaf_evict_sync(&l->l_dbu);
 481  481                  l = winner;
 482  482          }
 483  483  
 484  484          /*
 485  485           * lhr_pad was previously used for the next leaf in the leaf
 486  486           * chain.  There should be no chained leafs (as we have removed
 487  487           * support for them).
 488  488           */
 489  489          ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 490  490  
 491  491          /*
 492  492           * There should be more hash entries than there can be
 493  493           * chunks to put in the hash table
 494  494           */
 495  495          ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 496  496  
 497  497          /* The chunks should begin at the end of the hash table */
 498  498          ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
 499  499              &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 500  500  
 501  501          /* The chunks should end at the end of the block */
 502  502          ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
 503  503              (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 504  504  
 505  505          return (l);
 506  506  }
 507  507  
 508  508  static int
 509  509  zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
 510  510      zap_leaf_t **lp)
 511  511  {
                   /*
                    * Hold leaf block blkid, find or create its zap_leaf_t, lock
                    * it with lock type lt and return it in *lp.  When lt is
                    * RW_WRITER the leaf's dbuf is also dirtied in tx.
                    *
                    * Returns 0, or the error from dmu_buf_hold_by_dnode(), in
                    * which case *lp is not written.  Release the leaf with
                    * zap_put_leaf().
                    */
 512  512          dmu_buf_t *db;
 513  513          zap_leaf_t *l;
 514  514          int bs = FZAP_BLOCK_SHIFT(zap);
 515  515          int err;
 516  516  
 517  517          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 518  518  
 519  519          dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
 520  520          err = dmu_buf_hold_by_dnode(dn,
 521  521              blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
 522  522          dmu_buf_dnode_exit(zap->zap_dbuf);
 523  523          if (err)
 524  524                  return (err);
 525  525  
 526  526          ASSERT3U(db->db_object, ==, zap->zap_object);
 527  527          ASSERT3U(db->db_offset, ==, blkid << bs);
 528  528          ASSERT3U(db->db_size, ==, 1 << bs);
 529  529          ASSERT(blkid != 0);
 530  530  
                   /* reuse the zap_leaf_t already attached to the dbuf, if any */
 531  531          l = dmu_buf_get_user(db);
 532  532  
 533  533          if (l == NULL)
 534  534                  l = zap_open_leaf(blkid, db);
 535  535  
 536  536          rw_enter(&l->l_rwlock, lt);
 537  537          /*
 538  538           * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 539  539           * causing ASSERT below to fail.
 540  540           */
 541  541          if (lt == RW_WRITER)
 542  542                  dmu_buf_will_dirty(db, tx);
 543  543          ASSERT3U(l->l_blkid, ==, blkid);
 544  544          ASSERT3P(l->l_dbuf, ==, db);
 545  545          ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
 546  546          ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 547  547  
 548  548          *lp = l;
 549  549          return (0);
 550  550  }
 551  551  
 552  552  static int
 553  553  zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 554  554  {
                   /*
                    * Look up pointer-table entry idx and store the leaf block
                    * number in *valp.  The embedded table in the header is used
                    * while zt_numblks == 0; otherwise the external table is read
                    * through zap_table_load().  Returns 0 or that function's
                    * error.
                    *
                    * NOTE(review): this function tests zt_numblks == 0 while
                    * zap_set_idx_to_blk() tests zt_blk == 0 for the same
                    * embedded-vs-external decision.
                    */
 555  555          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 556  556  
 557  557          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 558  558                  ASSERT3U(idx, <,
 559  559                      (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 560  560                  *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 561  561                  return (0);
 562  562          } else {
 563  563                  return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 564  564                      idx, valp));
 565  565          }
 566  566  }
 567  567  
 568  568  static int
 569  569  zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 570  570  {
                   /*
                    * Point pointer-table entry idx at leaf block blk.  The
                    * embedded table in the header is written directly while
                    * zt_blk == 0; otherwise the entry is written through
                    * zap_table_store().  Requires a transaction and zap_rwlock
                    * held as writer (both asserted).  Returns 0 or the error
                    * from zap_table_store().
                    */
 571  571          ASSERT(tx != NULL);
 572  572          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 573  573  
 574  574          if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 575  575                  ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 576  576                  return (0);
 577  577          } else {
 578  578                  return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 579  579                      idx, blk, tx));
 580  580          }
 581  581  }
 582  582  
 583  583  static int
 584  584  zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 585  585  {
                   /*
                    * Find the leaf responsible for hash h: index the pointer
                    * table with the top zt_shift bits of h (ZAP_HASH_IDX), then
                    * hold and lock that leaf with lock type lt via
                    * zap_get_leaf_byblk().
                    *
                    * Returns 0 with the leaf in *lp, EIO if the header's block
                    * type or magic is not what a zap should have, or an error
                    * from the pointer-table or leaf lookup.
                    */
 586  586          uint64_t idx, blk;
 587  587          int err;
 588  588  
 589  589          ASSERT(zap->zap_dbuf == NULL ||
 590  590              zap_f_phys(zap) == zap->zap_dbuf->db_data);
 591  591  
 592  592          /* Reality check for corrupt zap objects (leaf or header). */
 593  593          if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
 594  594              zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
 595  595              zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
 596  596                  return (SET_ERROR(EIO));
 597  597          }
 598  598  
 599  599          idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 600  600          err = zap_idx_to_blk(zap, idx, &blk);
 601  601          if (err != 0)
 602  602                  return (err);
 603  603          err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 604  604  
                   /* on success, the leaf's own prefix must match the hash */
 605  605          ASSERT(err ||
 606  606              ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
 607  607              zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 608  608          return (err);
 609  609  }
 610  610  
 611  611  static int
 612  612  zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
 613  613      void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 614  614  {
                   /*
                    * Split leaf l, which covers zn->zn_hash, and return in *lp
                    * the leaf that should now be used for that hash.
                    *
                    * If the directory lock cannot be upgraded in place, or the
                    * leaf's prefix already uses every bit of the pointer table,
                    * the leaf and directory are released, the directory is
                    * re-locked as writer (which may replace zn->zn_zap), the
                    * pointer table is grown as needed and the leaf is looked up
                    * again.  If the leaf's prefix length changed meanwhile, the
                    * re-looked-up leaf is returned without splitting.
                    *
                    * NOTE(review): the error returns differ in leaf state.  The
                    * returns inside the relock branch happen after
                    * zap_put_leaf(l); the return from the i/o pre-check loop
                    * happens with l still held.  *lp is not written on any
                    * error path.  Confirm what callers expect.
                    */
 615  615          zap_t *zap = zn->zn_zap;
 616  616          uint64_t hash = zn->zn_hash;
 617  617          zap_leaf_t *nl;
 618  618          int prefix_diff, i, err;
 619  619          uint64_t sibling;
 620  620          int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 621  621  
 622  622          ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 623  623          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 624  624  
 625  625          ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 626  626              zap_leaf_phys(l)->l_hdr.lh_prefix);
 627  627  
 628  628          if (zap_tryupgradedir(zap, tx) == 0 ||
 629  629              old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 630  630                  /* We failed to upgrade, or need to grow the pointer table */
 631  631                  objset_t *os = zap->zap_objset;
 632  632                  uint64_t object = zap->zap_object;
 633  633  
 634  634                  zap_put_leaf(l);
 635  635                  zap_unlockdir(zap, tag);
 636  636                  err = zap_lockdir(os, object, tx, RW_WRITER,
 637  637                      FALSE, FALSE, tag, &zn->zn_zap);
 638  638                  zap = zn->zn_zap;
 639  639                  if (err)
 640  640                          return (err);
 641  641                  ASSERT(!zap->zap_ismicro);
 642  642  
                           /* a grow may take several steps before zt_shift changes */
 643  643                  while (old_prefix_len ==
 644  644                      zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 645  645                          err = zap_grow_ptrtbl(zap, tx);
 646  646                          if (err)
 647  647                                  return (err);
 648  648                  }
 649  649  
 650  650                  err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
 651  651                  if (err)
 652  652                          return (err);
 653  653  
 654  654                  if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 655  655                          /* it split while our locks were down */
 656  656                          *lp = l;
 657  657                          return (0);
 658  658                  }
 659  659          }
 660  660          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 661  661          ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 662  662          ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 663  663              zap_leaf_phys(l)->l_hdr.lh_prefix);
 664  664  
                   /*
                    * The sibling takes the half of l's range whose next prefix
                    * bit is 1.  `sibling' is the first pointer-table index of
                    * that half and 1 << prefix_diff is the number of entries
                    * in it.
                    */
 665  665          prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 666  666              (old_prefix_len + 1);
 667  667          sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 668  668  
 669  669          /* check for i/o errors before doing zap_leaf_split */
 670  670          for (i = 0; i < (1ULL<<prefix_diff); i++) {
 671  671                  uint64_t blk;
 672  672                  err = zap_idx_to_blk(zap, sibling+i, &blk);
 673  673                  if (err)
 674  674                          return (err);
 675  675                  ASSERT3U(blk, ==, l->l_blkid);
 676  676          }
 677  677  
 678  678          nl = zap_create_leaf(zap, tx);
 679  679          zap_leaf_split(l, nl, zap->zap_normflags != 0);
 680  680  
 681  681          /* set sibling pointers */
 682  682          for (i = 0; i < (1ULL << prefix_diff); i++) {
 683  683                  err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
 684  684                  ASSERT0(err); /* we checked for i/o errors above */
 685  685          }
 686  686  
                   /*
                    * Pick the leaf for our hash by testing one hash bit selected
                    * by l's current lh_prefix_len (presumably already updated by
                    * zap_leaf_split() -- confirm in zap_leaf.c).
                    */
 687  687          if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 688  688                  /* we want the sibling */
 689  689                  zap_put_leaf(l);
 690  690                  *lp = nl;
 691  691          } else {
 692  692                  zap_put_leaf(nl);
 693  693                  *lp = l;
 694  694          }
 695  695  
 696  696          return (0);
 697  697  }
 698  698  
 699  699  static void
 700  700  zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
 701  701      void *tag, dmu_tx_t *tx)
 702  702  {
                   /*
                    * Release leaf l and, when useful, take one pointer-table
                    * grow step.  A step is attempted if a grow is already in
                    * progress (zt_nextblk != 0) or if l already uses every
                    * pointer-table bit and has fewer than ZAP_LEAF_LOW_WATER
                    * free chunks.
                    *
                    * The function returns void: a failure to re-lock the
                    * directory ends it early, and the result of
                    * zap_grow_ptrtbl() is deliberately discarded.  Re-locking
                    * may replace zn->zn_zap.
                    */
 703  703          zap_t *zap = zn->zn_zap;
 704  704          int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
                   /* sampled before the leaf is released below */
 705  705          int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
 706  706              zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 707  707  
 708  708          zap_put_leaf(l);
 709  709  
 710  710          if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
 711  711                  int err;
 712  712  
 713  713                  /*
 714  714                   * We are in the middle of growing the pointer table, or
 715  715                   * this leaf will soon make us grow it.
 716  716                   */
 717  717                  if (zap_tryupgradedir(zap, tx) == 0) {
 718  718                          objset_t *os = zap->zap_objset;
 719  719                          uint64_t zapobj = zap->zap_object;
 720  720  
 721  721                          zap_unlockdir(zap, tag);
 722  722                          err = zap_lockdir(os, zapobj, tx,
 723  723                              RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
 724  724                          zap = zn->zn_zap;
 725  725                          if (err)
 726  726                                  return;
 727  727                  }
 728  728  
 729  729                  /* could have finished growing while our locks were down */
 730  730                  if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 731  731                          (void) zap_grow_ptrtbl(zap, tx);
 732  732          }
 733  733  }
 734  734  
           /*
            * Validate the key length of `zn`.
            *
            * Returns ENAMETOOLONG when zn_key_orig_numints * zn_key_intlen
            * exceeds ZAP_MAXNAMELEN, otherwise 0.
            */
 735  735  static int
 736  736  fzap_checkname(zap_name_t *zn)
 737  737  {
 738  738          if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
 739  739                  return (SET_ERROR(ENAMETOOLONG));
 740  740          return (0);
 741  741  }
 742  742  
           /*
            * Validate the dimensions of a value.
            *
            * integer_size must be 1, 2, 4 or 8, otherwise EINVAL is returned.
            * integer_size * num_integers must not exceed ZAP_MAXVALUELEN,
            * otherwise E2BIG is returned.  Returns 0 when both checks pass.
            */
 743  743  static int
 744  744  fzap_checksize(uint64_t integer_size, uint64_t num_integers)
 745  745  {
 746  746          /* Only integer sizes supported by C */
 747  747          switch (integer_size) {
  
    | 
      ↓ open down ↓ | 
    747 lines elided | 
    
      ↑ open up ↑ | 
  
 748  748          case 1:
 749  749          case 2:
 750  750          case 4:
 751  751          case 8:
 752  752                  break;
 753  753          default:
 754  754                  return (SET_ERROR(EINVAL));
 755  755          }
 756  756  
 757  757          if (integer_size * num_integers > ZAP_MAXVALUELEN)
 758      -                return (E2BIG);
      758 +                return (SET_ERROR(E2BIG));
 759  759  
 760  760          return (0);
 761  761  }
 762  762  
           /*
            * Combined argument validation: the key is checked first with
            * fzap_checkname(), then the value dimensions with fzap_checksize().
            * Returns the first nonzero error encountered, or 0.
            */
 763  763  static int
 764  764  fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 765  765  {
 766  766          int err;
 767  767  
 768  768          if ((err = fzap_checkname(zn)) != 0)
 769  769                  return (err);
 770  770          return (fzap_checksize(integer_size, num_integers));
 771  771  }
 772  772  
 773  773  /*
 774  774   * Routines for manipulating attributes.
 775  775   */
           /*
            * Look up the entry named by `zn` and read its value into `buf`.
            *
            * The key length is validated up front.  The value dimensions
            * (integer_size, num_integers) are validated only after the entry has
            * been found, so a failed lookup returns the lookup error instead.
            *
            * On a hit, the stored name is also copied into `realname` (at most
            * rn_len bytes are requested; the result of that copy is ignored) and,
            * when `ncp` is non-NULL, *ncp receives the result of
            * zap_entry_normalization_conflict().
            *
            * Returns the result of zap_entry_read() on a hit, otherwise the error
            * from validation, zap_deref_leaf() or zap_leaf_lookup().  The leaf is
            * released on every path once it has been obtained.
            */
 776  776  int
 777  777  fzap_lookup(zap_name_t *zn,
 778  778      uint64_t integer_size, uint64_t num_integers, void *buf,
 779  779      char *realname, int rn_len, boolean_t *ncp)
 780  780  {
 781  781          zap_leaf_t *l;
 782  782          int err;
 783  783          zap_entry_handle_t zeh;
 784  784  
 785  785          if ((err = fzap_checkname(zn)) != 0)
 786  786                  return (err);
 787  787  
 788  788          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 789  789          if (err != 0)
 790  790                  return (err);
 791  791          err = zap_leaf_lookup(l, zn, &zeh);
 792  792          if (err == 0) {
 793  793                  if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
 794  794                          zap_put_leaf(l);
 795  795                          return (err);
 796  796                  }
 797  797  
 798  798                  err = zap_entry_read(&zeh, integer_size, num_integers, buf);
 799  799                  (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 800  800                  if (ncp) {
 801  801                          *ncp = zap_entry_normalization_conflict(&zeh,
 802  802                              zn, NULL, zn->zn_zap);
 803  803                  }
 804  804          }
 805  805  
 806  806          zap_put_leaf(l);
 807  807          return (err);
 808  808  }
 809  809  
           /*
            * Add a new entry for `zn` using the caller-supplied collision
            * differentiator `cd`.
            *
            * The arguments are not validated here except through ASSERT; callers
            * are expected to have run fzap_check() themselves (fzap_add() does).
            *
            * Returns EEXIST if the name is already present.  When
            * zap_entry_create() reports EAGAIN the leaf is expanded with
            * zap_expand_leaf() and the lookup/create sequence is retried.
            * zap_expand_leaf() may replace zn->zn_zap, so the local `zap` is
            * reloaded afterwards; if it comes back NULL the final
            * zap_put_leaf_maybe_grow_ptrtbl() call is skipped.
            */
 810  810  int
 811  811  fzap_add_cd(zap_name_t *zn,
 812  812      uint64_t integer_size, uint64_t num_integers,
 813  813      const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
 814  814  {
 815  815          zap_leaf_t *l;
 816  816          int err;
 817  817          zap_entry_handle_t zeh;
 818  818          zap_t *zap = zn->zn_zap;
 819  819  
 820  820          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 821  821          ASSERT(!zap->zap_ismicro);
 822  822          ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
 823  823  
 824  824          err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 825  825          if (err != 0)
 826  826                  return (err);
 827  827  retry:
 828  828          err = zap_leaf_lookup(l, zn, &zeh);
 829  829          if (err == 0) {
 830  830                  err = SET_ERROR(EEXIST);
 831  831                  goto out;
 832  832          }
 833  833          if (err != ENOENT)
 834  834                  goto out;
 835  835  
 836  836          err = zap_entry_create(l, zn, cd,
 837  837              integer_size, num_integers, val, &zeh);
 838  838  
 839  839          if (err == 0) {
 840  840                  zap_increment_num_entries(zap, 1, tx);
 841  841          } else if (err == EAGAIN) {
 842  842                  err = zap_expand_leaf(zn, l, tag, tx, &l);
 843  843                  zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
 844  844                  if (err == 0)
 845  845                          goto retry;
 846  846          }
 847  847  
 848  848  out:
 849  849          if (zap != NULL)
 850  850                  zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 851  851          return (err);
 852  852  }
 853  853  
           /*
            * Add a new entry for `zn`.  Validates the key and value dimensions
            * with fzap_check(), then delegates to fzap_add_cd() passing
            * ZAP_NEED_CD as the collision differentiator.
            *
            * Returns the validation error, or whatever fzap_add_cd() returns.
            */
 854  854  int
 855  855  fzap_add(zap_name_t *zn,
 856  856      uint64_t integer_size, uint64_t num_integers,
 857  857      const void *val, void *tag, dmu_tx_t *tx)
 858  858  {
 859  859          int err = fzap_check(zn, integer_size, num_integers);
 860  860          if (err != 0)
 861  861                  return (err);
 862  862  
 863  863          return (fzap_add_cd(zn, integer_size, num_integers,
 864  864              val, ZAP_NEED_CD, tag, tx));
 865  865  }
 866  866  
           /*
            * Set the value of `zn`, creating the entry if it does not exist.
            *
            * After validation with fzap_check(), the leaf is looked up as
            * RW_WRITER.  A lookup result of ENOENT selects the create path
            * (zap_entry_create() with ZAP_NEED_CD, bumping the entry count on
            * success); any other result selects zap_entry_update().
            *
            * EAGAIN from either path triggers zap_expand_leaf() followed by a
            * retry.  zap_expand_leaf() may replace zn->zn_zap, so the local `zap`
            * is reloaded; if it comes back NULL the final
            * zap_put_leaf_maybe_grow_ptrtbl() call is skipped.
            *
            * NOTE(review): integer_size is declared `int` here while the sibling
            * routines take uint64_t.
            */
 867  867  int
 868  868  fzap_update(zap_name_t *zn,
 869  869      int integer_size, uint64_t num_integers, const void *val,
 870  870      void *tag, dmu_tx_t *tx)
 871  871  {
 872  872          zap_leaf_t *l;
 873  873          int err, create;
 874  874          zap_entry_handle_t zeh;
 875  875          zap_t *zap = zn->zn_zap;
 876  876  
 877  877          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 878  878          err = fzap_check(zn, integer_size, num_integers);
 879  879          if (err != 0)
 880  880                  return (err);
 881  881  
 882  882          err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 883  883          if (err != 0)
 884  884                  return (err);
 885  885  retry:
 886  886          err = zap_leaf_lookup(l, zn, &zeh);
 887  887          create = (err == ENOENT);
 888  888          ASSERT(err == 0 || err == ENOENT);
 889  889  
 890  890          if (create) {
 891  891                  err = zap_entry_create(l, zn, ZAP_NEED_CD,
 892  892                      integer_size, num_integers, val, &zeh);
 893  893                  if (err == 0)
 894  894                          zap_increment_num_entries(zap, 1, tx);
 895  895          } else {
 896  896                  err = zap_entry_update(&zeh, integer_size, num_integers, val);
 897  897          }
 898  898  
 899  899          if (err == EAGAIN) {
 900  900                  err = zap_expand_leaf(zn, l, tag, tx, &l);
 901  901                  zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
 902  902                  if (err == 0)
 903  903                          goto retry;
 904  904          }
 905  905  
 906  906          if (zap != NULL)
 907  907                  zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 908  908          return (err);
 909  909  }
 910  910  
           /*
            * Report the dimensions of the value stored under `zn`.
            *
            * On success *integer_size and *num_integers are filled in from the
            * entry handle; either pointer may be NULL to skip that output.
            * Returns 0, or the error from zap_deref_leaf() / zap_leaf_lookup().
            */
 911  911  int
 912  912  fzap_length(zap_name_t *zn,
 913  913      uint64_t *integer_size, uint64_t *num_integers)
 914  914  {
 915  915          zap_leaf_t *l;
 916  916          int err;
 917  917          zap_entry_handle_t zeh;
 918  918  
 919  919          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 920  920          if (err != 0)
 921  921                  return (err);
 922  922          err = zap_leaf_lookup(l, zn, &zeh);
 923  923          if (err != 0)
 924  924                  goto out;
 925  925  
 926  926          if (integer_size)
 927  927                  *integer_size = zeh.zeh_integer_size;
 928  928          if (num_integers)
 929  929                  *num_integers = zeh.zeh_num_integers;
 930  930  out:
 931  931          zap_put_leaf(l);
 932  932          return (err);
 933  933  }
 934  934  
           /*
            * Remove the entry named by `zn`.
            *
            * When the lookup succeeds the entry is removed and the entry count is
            * decremented by one.  Returns 0 on success, or the error from
            * zap_deref_leaf() / zap_leaf_lookup() when the entry is not removed.
            */
 935  935  int
 936  936  fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 937  937  {
 938  938          zap_leaf_t *l;
 939  939          int err;
 940  940          zap_entry_handle_t zeh;
 941  941  
 942  942          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
 943  943          if (err != 0)
 944  944                  return (err);
 945  945          err = zap_leaf_lookup(l, zn, &zeh);
 946  946          if (err == 0) {
 947  947                  zap_entry_remove(&zeh);
 948  948                  zap_increment_num_entries(zn->zn_zap, -1, tx);
 949  949          }
 950  950          zap_put_leaf(l);
 951  951          return (err);
 952  952  }
 953  953  
           /*
            * Issue a prefetch for the leaf block that `zn` hashes to.
            *
            * The pointer-table index is derived from the hash and the current
            * zt_shift, then translated to a block number with zap_idx_to_blk().
            * If that translation fails the function returns without prefetching.
            * Otherwise one block (1 << bs bytes at offset blk << bs) is requested
            * at ZIO_PRIORITY_SYNC_READ.
            */
 954  954  void
 955  955  fzap_prefetch(zap_name_t *zn)
 956  956  {
 957  957          uint64_t idx, blk;
 958  958          zap_t *zap = zn->zn_zap;
 959  959          int bs;
 960  960  
 961  961          idx = ZAP_HASH_IDX(zn->zn_hash,
 962  962              zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 963  963          if (zap_idx_to_blk(zap, idx, &blk) != 0)
 964  964                  return;
 965  965          bs = FZAP_BLOCK_SHIFT(zap);
 966  966          dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
 967  967              ZIO_PRIORITY_SYNC_READ);
 968  968  }
 969  969  
 970  970  /*
 971  971   * Helper functions for consumers.
 972  972   */
 973  973  
           /*
            * Create a new ZAP object of type `ot` and record its object number in
            * `parent_obj` under `name` as a single 8-byte integer.
            *
            * Both steps are wrapped in VERIFY, so the function has no error
            * return; it hands back the new object number.
            */
 974  974  uint64_t
 975  975  zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 976  976      const char *name, dmu_tx_t *tx)
 977  977  {
 978  978          uint64_t new_obj;
 979  979  
 980  980          VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
 981  981          VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 982  982              tx));
 983  983  
 984  984          return (new_obj);
 985  985  }
 986  986  
           /*
            * Scan `zapobj` for the first entry whose first integer matches `value`
            * under `mask`, and copy that entry's name into `name`.
            *
            * A mask of 0 is treated as "compare all bits".  Returns 0 when a match
            * was found, otherwise the nonzero result of zap_cursor_retrieve() that
            * ended the scan.
            *
            * NOTE(review): `name` is filled with strcpy() and no buffer length is
            * passed in, so the caller must supply a buffer large enough to hold
            * any za_name.
            */
 987  987  int
 988  988  zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 989  989      char *name)
 990  990  {
 991  991          zap_cursor_t zc;
 992  992          zap_attribute_t *za;
 993  993          int err;
 994  994  
 995  995          if (mask == 0)
 996  996                  mask = -1ULL;
 997  997  
 998  998          za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 999  999          for (zap_cursor_init(&zc, os, zapobj);
1000 1000              (err = zap_cursor_retrieve(&zc, za)) == 0;
1001 1001              zap_cursor_advance(&zc)) {
1002 1002                  if ((za->za_first_integer & mask) == (value & mask)) {
1003 1003                          (void) strcpy(name, za->za_name);
1004 1004                          break;
1005 1005                  }
1006 1006          }
1007 1007          zap_cursor_fini(&zc);
1008 1008          kmem_free(za, sizeof (zap_attribute_t));
1009 1009          return (err);
1010 1010  }
1011 1011  
           /*
            * Copy every entry of `fromobj` into `intoobj`, keeping each entry's
            * name and first integer.
            *
            * Every source entry must be a single 8-byte integer, otherwise the
            * copy stops with EINVAL.  The copy also stops at the first nonzero
            * result from zap_add(), which is returned.
            *
            * NOTE(review): a nonzero result from zap_cursor_retrieve() only ends
            * the loop; it is not stored in `err`, so it is not reported.
            */
1012 1012  int
1013 1013  zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
1014 1014  {
1015 1015          zap_cursor_t zc;
1016 1016          zap_attribute_t za;
1017 1017          int err;
1018 1018  
1019 1019          err = 0;
1020 1020          for (zap_cursor_init(&zc, os, fromobj);
1021 1021              zap_cursor_retrieve(&zc, &za) == 0;
1022 1022              (void) zap_cursor_advance(&zc)) {
1023 1023                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1024 1024                          err = SET_ERROR(EINVAL);
1025 1025                          break;
1026 1026                  }
1027 1027                  err = zap_add(os, intoobj, za.za_name,
  
    | 
      ↓ open down ↓ | 
    259 lines elided | 
    
      ↑ open up ↑ | 
  
1028 1028                      8, 1, &za.za_first_integer, tx);
1029 1029                  if (err)
1030 1030                          break;
1031 1031          }
1032 1032          zap_cursor_fini(&zc);
1033 1033          return (err);
1034 1034  }
1035 1035  
           /*
            * Add every name found in `fromobj` to `intoobj`, storing the fixed
            * `value` for each one instead of the source entry's own value.
            *
            * Every source entry must be a single 8-byte integer, otherwise the
            * loop stops with EINVAL.  In the revised version, when `exists_ok` is
            * set an EEXIST result from zap_add() is reset to 0 and iteration
            * continues; any other zap_add() error stops the loop and is returned.
            *
            * NOTE(review): a nonzero result from zap_cursor_retrieve() only ends
            * the loop; it is not stored in `err`, so it is not reported.
            */
1036 1036  int
1037 1037  zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1038      -    uint64_t value, dmu_tx_t *tx)
     1038 +    uint64_t value, dmu_tx_t *tx, boolean_t exists_ok)
1039 1039  {
1040 1040          zap_cursor_t zc;
1041 1041          zap_attribute_t za;
1042 1042          int err;
1043 1043  
1044 1044          err = 0;
1045 1045          for (zap_cursor_init(&zc, os, fromobj);
1046 1046              zap_cursor_retrieve(&zc, &za) == 0;
1047 1047              (void) zap_cursor_advance(&zc)) {
1048 1048                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1049 1049                          err = SET_ERROR(EINVAL);
1050 1050                          break;
1051 1051                  }
1052 1052                  err = zap_add(os, intoobj, za.za_name,
1053 1053                      8, 1, &value, tx);
1054      -                if (err)
1055      -                        break;
     1054 +                if (err != 0) {
     1055 +                        if (err == EEXIST && exists_ok)
     1056 +                                err = 0;
     1057 +                        else
     1058 +                                break;
     1059 +                }
1056 1060          }
1057 1061          zap_cursor_fini(&zc);
1058 1062          return (err);
1059 1063  }
1060 1064  
           /*
            * Merge `fromobj` into `intoobj` by summing values: for each source
            * entry, the value currently stored under the same name in `intoobj`
            * (0 when the lookup returns ENOENT) is increased by the source entry's
            * first integer and written back with zap_update().
            *
            * Every source entry must be a single 8-byte integer, otherwise the
            * loop stops with EINVAL.  Any other lookup or update error stops the
            * loop and is returned.
            *
            * NOTE(review): a nonzero result from zap_cursor_retrieve() only ends
            * the loop; it is not stored in `err`, so it is not reported.
            */
1061 1065  int
1062 1066  zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1063 1067      dmu_tx_t *tx)
1064 1068  {
1065 1069          zap_cursor_t zc;
1066 1070          zap_attribute_t za;
1067 1071          int err;
1068 1072  
1069 1073          err = 0;
1070 1074          for (zap_cursor_init(&zc, os, fromobj);
1071 1075              zap_cursor_retrieve(&zc, &za) == 0;
1072 1076              (void) zap_cursor_advance(&zc)) {
1073 1077                  uint64_t delta = 0;
1074 1078  
1075 1079                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1076 1080                          err = SET_ERROR(EINVAL);
1077 1081                          break;
1078 1082                  }
1079 1083  
1080 1084                  err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
1081 1085                  if (err != 0 && err != ENOENT)
1082 1086                          break;
1083 1087                  delta += za.za_first_integer;
1084 1088                  err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
1085 1089                  if (err)
1086 1090                          break;
1087 1091          }
1088 1092          zap_cursor_fini(&zc);
1089 1093          return (err);
1090 1094  }
1091 1095  
           /*
            * Add `value` to `obj`, keyed by its own lowercase hexadecimal text
            * ("%llx") and stored as a single 8-byte integer.  Returns the result
            * of zap_add().
            */
1092 1096  int
1093 1097  zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1094 1098  {
1095 1099          char name[20];
1096 1100  
1097 1101          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1098 1102          return (zap_add(os, obj, name, 8, 1, &value, tx));
1099 1103  }
1100 1104  
           /*
            * Remove from `obj` the entry whose name is the lowercase hexadecimal
            * text ("%llx") of `value`.  Returns the result of zap_remove().
            */
1101 1105  int
1102 1106  zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1103 1107  {
1104 1108          char name[20];
1105 1109  
1106 1110          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1107 1111          return (zap_remove(os, obj, name, tx));
1108 1112  }
1109 1113  
           /*
            * Test whether `obj` contains an entry named by the lowercase
            * hexadecimal text ("%llx") of `value`.
            *
            * The looked-up integer is written into the local copy of `value` and
            * discarded; only the return code of zap_lookup() is meaningful.
            */
1110 1114  int
1111 1115  zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
1112 1116  {
1113 1117          char name[20];
1114 1118  
1115 1119          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1116 1120          return (zap_lookup(os, obj, name, 8, 1, &value));
1117 1121  }
1118 1122  
           /*
            * Add `value` to `obj` as a single 8-byte integer under the name formed
            * from the lowercase hexadecimal text ("%llx") of `key`.  Returns the
            * result of zap_add().
            */
1119 1123  int
1120 1124  zap_add_int_key(objset_t *os, uint64_t obj,
1121 1125      uint64_t key, uint64_t value, dmu_tx_t *tx)
1122 1126  {
1123 1127          char name[20];
1124 1128  
1125 1129          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1126 1130          return (zap_add(os, obj, name, 8, 1, &value, tx));
1127 1131  }
1128 1132  
           /*
            * Store `value` in `obj` as a single 8-byte integer under the name
            * formed from the lowercase hexadecimal text ("%llx") of `key`, using
            * zap_update().  Returns the result of zap_update().
            */
1129 1133  int
1130 1134  zap_update_int_key(objset_t *os, uint64_t obj,
1131 1135      uint64_t key, uint64_t value, dmu_tx_t *tx)
1132 1136  {
1133 1137          char name[20];
1134 1138  
1135 1139          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1136 1140          return (zap_update(os, obj, name, 8, 1, &value, tx));
1137 1141  }
1138 1142  
           /*
            * Look up in `obj` the entry named by the lowercase hexadecimal text
            * ("%llx") of `key` and read its single 8-byte integer into *valuep.
            * Returns the result of zap_lookup().
            */
1139 1143  int
1140 1144  zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
1141 1145  {
1142 1146          char name[20];
1143 1147  
1144 1148          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1145 1149          return (zap_lookup(os, obj, name, 8, 1, valuep));
1146 1150  }
1147 1151  
           /*
            * Add the signed `delta` to the 8-byte integer stored under `name`.
            *
            * A delta of 0 returns 0 immediately without touching the object.  An
            * ENOENT result from the lookup is treated as a current value of 0; any
            * other lookup error is returned.  If the sum is 0 the entry is removed
            * with zap_remove(); otherwise the sum is written with zap_update().
            */
1148 1152  int
1149 1153  zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
1150 1154      dmu_tx_t *tx)
1151 1155  {
1152 1156          uint64_t value = 0;
1153 1157          int err;
1154 1158  
1155 1159          if (delta == 0)
1156 1160                  return (0);
1157 1161  
1158 1162          err = zap_lookup(os, obj, name, 8, 1, &value);
1159 1163          if (err != 0 && err != ENOENT)
1160 1164                  return (err);
1161 1165          value += delta;
1162 1166          if (value == 0)
1163 1167                  err = zap_remove(os, obj, name, tx);
1164 1168          else
1165 1169                  err = zap_update(os, obj, name, 8, 1, &value, tx);
1166 1170          return (err);
1167 1171  }
1168 1172  
           /*
            * Integer-keyed wrapper around zap_increment(): the entry name is the
            * lowercase hexadecimal text ("%llx") of `key`.  Returns the result of
            * zap_increment().
            */
1169 1173  int
1170 1174  zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
1171 1175      dmu_tx_t *tx)
1172 1176  {
1173 1177          char name[20];
1174 1178  
1175 1179          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1176 1180          return (zap_increment(os, obj, name, delta, tx));
1177 1181  }
1178 1182  
1179 1183  /*
1180 1184   * Routines for iterating over the attributes.
1181 1185   */
1182 1186  
           /*
            * Fetch into `za` the entry at or after the cursor position
            * (zc_hash, zc_cd), updating the cursor to that entry's hash and cd.
            *
            * A leaf cached in zc->zc_leaf is reused when its prefix still covers
            * zc_hash; otherwise it is dropped and the covering leaf is looked up
            * with zap_deref_leaf().  When the current leaf has no further entry,
            * zc_hash is moved to the first hash past the leaf's range and the
            * search restarts on the next leaf.  If the leaf's prefix length is 0
            * or the hash wrapped to 0, zc_hash is set to -1ULL and ENOENT is
            * returned.
            *
            * On success za receives the integer size, the integer count, the first
            * integer (0 when the entry has no integers), the name and the
            * normalization-conflict flag.  The leaf's rwlock is exited before
            * returning, while zc->zc_leaf itself is left set for the next call.
            *
            * Returns 0 on success, ENOENT at the end of the object, or the error
            * from zap_deref_leaf().
            */
1183 1187  int
1184 1188  fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
1185 1189  {
1186 1190          int err = ENOENT;
1187 1191          zap_entry_handle_t zeh;
1188 1192          zap_leaf_t *l;
1189 1193  
1190 1194          /* retrieve the next entry at or after zc_hash/zc_cd */
1191 1195          /* if no entry, return ENOENT */
1192 1196  
1193 1197          if (zc->zc_leaf &&
1194 1198              (ZAP_HASH_IDX(zc->zc_hash,
1195 1199              zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
1196 1200              zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
                           /*
                            * The cached leaf no longer covers zc_hash.  The lock is
                            * taken first, presumably because zap_put_leaf() expects
                            * to release it -- TODO confirm.
                            */
1197 1201                  rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1198 1202                  zap_put_leaf(zc->zc_leaf);
1199 1203                  zc->zc_leaf = NULL;
1200 1204          }
1201 1205  
1202 1206  again:
1203 1207          if (zc->zc_leaf == NULL) {
1204 1208                  err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
1205 1209                      &zc->zc_leaf);
1206 1210                  if (err != 0)
1207 1211                          return (err);
1208 1212          } else {
1209 1213                  rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1210 1214          }
1211 1215          l = zc->zc_leaf;
1212 1216  
1213 1217          err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
1214 1218  
1215 1219          if (err == ENOENT) {
                           /* mask of the hash bits below this leaf's prefix */
1216 1220                  uint64_t nocare =
1217 1221                      (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
1218 1222                  zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
1219 1223                  zc->zc_cd = 0;
1220 1224                  if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
1221 1225                      zc->zc_hash == 0) {
1222 1226                          zc->zc_hash = -1ULL;
1223 1227                  } else {
1224 1228                          zap_put_leaf(zc->zc_leaf);
1225 1229                          zc->zc_leaf = NULL;
1226 1230                          goto again;
1227 1231                  }
1228 1232          }
1229 1233  
1230 1234          if (err == 0) {
1231 1235                  zc->zc_hash = zeh.zeh_hash;
1232 1236                  zc->zc_cd = zeh.zeh_cd;
1233 1237                  za->za_integer_length = zeh.zeh_integer_size;
1234 1238                  za->za_num_integers = zeh.zeh_num_integers;
1235 1239                  if (zeh.zeh_num_integers == 0) {
1236 1240                          za->za_first_integer = 0;
1237 1241                  } else {
1238 1242                          err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
1239 1243                          ASSERT(err == 0 || err == EOVERFLOW);
1240 1244                  }
                           /* the zap_entry_read() result above is overwritten here */
1241 1245                  err = zap_entry_read_name(zap, &zeh,
1242 1246                      sizeof (za->za_name), za->za_name);
1243 1247                  ASSERT(err == 0);
1244 1248  
1245 1249                  za->za_normalization_conflict =
1246 1250                      zap_entry_normalization_conflict(&zeh,
1247 1251                      NULL, za->za_name, zap);
1248 1252          }
1249 1253          rw_exit(&zc->zc_leaf->l_rwlock);
1250 1254          return (err);
1251 1255  }
1252 1256  
           /*
            * Accumulate leaf statistics into `zs` for the `len` pointer-table
            * entries in `tbl`.
            *
            * An entry equal to the previously visited block number is skipped;
            * since `lastblk` starts at 0, leading entries equal to 0 are skipped
            * too.  Leaves that cannot be loaded are ignored.
            */
1253 1257  static void
1254 1258  zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
1255 1259  {
1256 1260          int i, err;
1257 1261          uint64_t lastblk = 0;
1258 1262  
1259 1263          /*
1260 1264           * NB: if a leaf has more pointers than an entire ptrtbl block
1261 1265           * can hold, then it'll be accounted for more than once, since
1262 1266           * we won't have lastblk.
1263 1267           */
1264 1268          for (i = 0; i < len; i++) {
1265 1269                  zap_leaf_t *l;
1266 1270  
1267 1271                  if (tbl[i] == lastblk)
1268 1272                          continue;
1269 1273                  lastblk = tbl[i];
1270 1274  
1271 1275                  err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
1272 1276                  if (err == 0) {
1273 1277                          zap_leaf_stats(zap, l, zs);
1274 1278                          zap_put_leaf(l);
1275 1279                  }
1276 1280          }
1277 1281  }
1278 1282  
           /*
            * Fill `zs` with statistics for a fat zap.
            *
            * Header and pointer-table fields are copied straight from the on-disk
            * zap_phys_t.  Leaf statistics are then gathered with
            * zap_stats_ptrtbl(): from the embedded table when zt_numblks is 0,
            * otherwise from each external pointer-table block in turn, after a
            * prefetch of the whole table.  Pointer-table blocks that fail to load
            * are skipped.
            */
1279 1283  void
1280 1284  fzap_get_stats(zap_t *zap, zap_stats_t *zs)
1281 1285  {
1282 1286          int bs = FZAP_BLOCK_SHIFT(zap);
1283 1287          zs->zs_blocksize = 1ULL << bs;
1284 1288  
1285 1289          /*
1286 1290           * Set zap_phys_t fields
1287 1291           */
1288 1292          zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
1289 1293          zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
1290 1294          zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
1291 1295          zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
1292 1296          zs->zs_magic = zap_f_phys(zap)->zap_magic;
1293 1297          zs->zs_salt = zap_f_phys(zap)->zap_salt;
1294 1298  
1295 1299          /*
1296 1300           * Set zap_ptrtbl fields
1297 1301           */
1298 1302          zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1299 1303          zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
1300 1304          zs->zs_ptrtbl_blks_copied =
1301 1305              zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
1302 1306          zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
1303 1307          zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1304 1308          zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1305 1309  
1306 1310          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
1307 1311                  /* the ptrtbl is entirely in the header block. */
1308 1312                  zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1309 1313                      1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
1310 1314          } else {
1311 1315                  int b;
1312 1316  
1313 1317                  dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
1314 1318                      zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
1315 1319                      zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
1316 1320                      ZIO_PRIORITY_SYNC_READ);
1317 1321  
1318 1322                  for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1319 1323                      b++) {
1320 1324                          dmu_buf_t *db;
1321 1325                          int err;
1322 1326  
1323 1327                          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1324 1328                              (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
1325 1329                              FTAG, &db, DMU_READ_NO_PREFETCH);
1326 1330                          if (err == 0) {
                                           /* 1 << (bs - 3): 8-byte entries per block */
1327 1331                                  zap_stats_ptrtbl(zap, db->db_data,
1328 1332                                      1<<(bs-3), zs);
1329 1333                                  dmu_buf_rele(db, FTAG);
1330 1334                          }
1331 1335                  }
1332 1336          }
1333 1337  }
  
    | 
      ↓ open down ↓ | 
    268 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX