Print this page
    
6842 Fix empty xattr dir causing lockup
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zap.c
          +++ new/usr/src/uts/common/fs/zfs/zap.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * This file contains the top half of the zfs directory structure
  29   29   * implementation. The bottom half is in zap_leaf.c.
  30   30   *
  31   31   * The zdir is an extendable hash data structure. There is a table of
  32   32   * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  33   33   * each a constant size and hold a variable number of directory entries.
  34   34   * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  35   35   *
  36   36   * The pointer table holds a power of 2 number of pointers.
  37   37   * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  38   38   * by the pointer at index i in the table holds entries whose hash value
  39   39   * has a zd_prefix_len-bit prefix equal to i.
  40   40   */
  41   41  
  42   42  #include <sys/spa.h>
  43   43  #include <sys/dmu.h>
  44   44  #include <sys/zfs_context.h>
  45   45  #include <sys/zfs_znode.h>
  46   46  #include <sys/fs/zfs.h>
  47   47  #include <sys/zap.h>
  48   48  #include <sys/refcount.h>
  49   49  #include <sys/zap_impl.h>
  50   50  #include <sys/zap_leaf.h>
  51   51  
  52   52  int fzap_default_block_shift = 14; /* 16k blocksize */
  53   53  
  54   54  extern inline zap_phys_t *zap_f_phys(zap_t *zap);
  55   55  
  56   56  static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57   57  
  58   58  void
  59   59  fzap_byteswap(void *vbuf, size_t size)
  60   60  {
  61   61          uint64_t block_type;
  62   62  
  63   63          block_type = *(uint64_t *)vbuf;
  64   64  
  65   65          if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
  66   66                  zap_leaf_byteswap(vbuf, size);
  67   67          else {
  68   68                  /* it's a ptrtbl block */
  69   69                  byteswap_uint64_array(vbuf, size);
  70   70          }
  71   71  }
  72   72  
  73   73  void
  74   74  fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  75   75  {
  76   76          dmu_buf_t *db;
  77   77          zap_leaf_t *l;
  78   78          int i;
  79   79          zap_phys_t *zp;
  80   80  
  81   81          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  82   82          zap->zap_ismicro = FALSE;
  83   83  
  84   84          zap->zap_dbu.dbu_evict_func = zap_evict;
  85   85  
  86   86          mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
  87   87          zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  88   88  
  89   89          zp = zap_f_phys(zap);
  90   90          /*
  91   91           * explicitly zero it since it might be coming from an
  92   92           * initialized microzap
  93   93           */
  94   94          bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
  95   95          zp->zap_block_type = ZBT_HEADER;
  96   96          zp->zap_magic = ZAP_MAGIC;
  97   97  
  98   98          zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
  99   99  
 100  100          zp->zap_freeblk = 2;            /* block 1 will be the first leaf */
 101  101          zp->zap_num_leafs = 1;
 102  102          zp->zap_num_entries = 0;
 103  103          zp->zap_salt = zap->zap_salt;
 104  104          zp->zap_normflags = zap->zap_normflags;
 105  105          zp->zap_flags = flags;
 106  106  
 107  107          /* block 1 will be the first leaf */
 108  108          for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
 109  109                  ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
 110  110  
 111  111          /*
 112  112           * set up block 1 - the first leaf
 113  113           */
 114  114          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 115  115              1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 116  116          dmu_buf_will_dirty(db, tx);
 117  117  
 118  118          l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 119  119          l->l_dbuf = db;
 120  120  
 121  121          zap_leaf_init(l, zp->zap_normflags != 0);
 122  122  
 123  123          kmem_free(l, sizeof (zap_leaf_t));
 124  124          dmu_buf_rele(db, FTAG);
 125  125  }
 126  126  
 127  127  static int
 128  128  zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
 129  129  {
 130  130          if (RW_WRITE_HELD(&zap->zap_rwlock))
 131  131                  return (1);
 132  132          if (rw_tryupgrade(&zap->zap_rwlock)) {
 133  133                  dmu_buf_will_dirty(zap->zap_dbuf, tx);
 134  134                  return (1);
 135  135          }
 136  136          return (0);
 137  137  }
 138  138  
 139  139  /*
 140  140   * Generic routines for dealing with the pointer & cookie tables.
 141  141   */
 142  142  
 143  143  static int
 144  144  zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 145  145      void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
 146  146      dmu_tx_t *tx)
 147  147  {
 148  148          uint64_t b, newblk;
 149  149          dmu_buf_t *db_old, *db_new;
 150  150          int err;
 151  151          int bs = FZAP_BLOCK_SHIFT(zap);
 152  152          int hepb = 1<<(bs-4);
 153  153          /* hepb = half the number of entries in a block */
 154  154  
 155  155          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 156  156          ASSERT(tbl->zt_blk != 0);
 157  157          ASSERT(tbl->zt_numblks > 0);
 158  158  
 159  159          if (tbl->zt_nextblk != 0) {
 160  160                  newblk = tbl->zt_nextblk;
 161  161          } else {
 162  162                  newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 163  163                  tbl->zt_nextblk = newblk;
 164  164                  ASSERT0(tbl->zt_blks_copied);
 165  165                  dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 166  166                      tbl->zt_blk << bs, tbl->zt_numblks << bs,
 167  167                      ZIO_PRIORITY_SYNC_READ);
 168  168          }
 169  169  
 170  170          /*
 171  171           * Copy the ptrtbl from the old to new location.
 172  172           */
 173  173  
 174  174          b = tbl->zt_blks_copied;
 175  175          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 176  176              (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 177  177          if (err)
 178  178                  return (err);
 179  179  
 180  180          /* first half of entries in old[b] go to new[2*b+0] */
 181  181          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 182  182              (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 183  183          dmu_buf_will_dirty(db_new, tx);
 184  184          transfer_func(db_old->db_data, db_new->db_data, hepb);
 185  185          dmu_buf_rele(db_new, FTAG);
 186  186  
 187  187          /* second half of entries in old[b] go to new[2*b+1] */
 188  188          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 189  189              (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 190  190          dmu_buf_will_dirty(db_new, tx);
 191  191          transfer_func((uint64_t *)db_old->db_data + hepb,
 192  192              db_new->db_data, hepb);
 193  193          dmu_buf_rele(db_new, FTAG);
 194  194  
 195  195          dmu_buf_rele(db_old, FTAG);
 196  196  
 197  197          tbl->zt_blks_copied++;
 198  198  
 199  199          dprintf("copied block %llu of %llu\n",
 200  200              tbl->zt_blks_copied, tbl->zt_numblks);
 201  201  
 202  202          if (tbl->zt_blks_copied == tbl->zt_numblks) {
 203  203                  (void) dmu_free_range(zap->zap_objset, zap->zap_object,
 204  204                      tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
 205  205  
 206  206                  tbl->zt_blk = newblk;
 207  207                  tbl->zt_numblks *= 2;
 208  208                  tbl->zt_shift++;
 209  209                  tbl->zt_nextblk = 0;
 210  210                  tbl->zt_blks_copied = 0;
 211  211  
 212  212                  dprintf("finished; numblocks now %llu (%lluk entries)\n",
 213  213                      tbl->zt_numblks, 1<<(tbl->zt_shift-10));
 214  214          }
 215  215  
 216  216          return (0);
 217  217  }
 218  218  
 219  219  static int
 220  220  zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
 221  221      dmu_tx_t *tx)
 222  222  {
 223  223          int err;
 224  224          uint64_t blk, off;
 225  225          int bs = FZAP_BLOCK_SHIFT(zap);
 226  226          dmu_buf_t *db;
 227  227  
 228  228          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 229  229          ASSERT(tbl->zt_blk != 0);
 230  230  
 231  231          dprintf("storing %llx at index %llx\n", val, idx);
 232  232  
 233  233          blk = idx >> (bs-3);
 234  234          off = idx & ((1<<(bs-3))-1);
 235  235  
 236  236          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 237  237              (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 238  238          if (err)
 239  239                  return (err);
 240  240          dmu_buf_will_dirty(db, tx);
 241  241  
 242  242          if (tbl->zt_nextblk != 0) {
 243  243                  uint64_t idx2 = idx * 2;
 244  244                  uint64_t blk2 = idx2 >> (bs-3);
 245  245                  uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 246  246                  dmu_buf_t *db2;
 247  247  
 248  248                  err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 249  249                      (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 250  250                      DMU_READ_NO_PREFETCH);
 251  251                  if (err) {
 252  252                          dmu_buf_rele(db, FTAG);
 253  253                          return (err);
 254  254                  }
 255  255                  dmu_buf_will_dirty(db2, tx);
 256  256                  ((uint64_t *)db2->db_data)[off2] = val;
 257  257                  ((uint64_t *)db2->db_data)[off2+1] = val;
 258  258                  dmu_buf_rele(db2, FTAG);
 259  259          }
 260  260  
 261  261          ((uint64_t *)db->db_data)[off] = val;
 262  262          dmu_buf_rele(db, FTAG);
 263  263  
 264  264          return (0);
 265  265  }
 266  266  
 267  267  static int
 268  268  zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 269  269  {
 270  270          uint64_t blk, off;
 271  271          int err;
 272  272          dmu_buf_t *db;
 273  273          int bs = FZAP_BLOCK_SHIFT(zap);
 274  274  
 275  275          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 276  276  
 277  277          blk = idx >> (bs-3);
 278  278          off = idx & ((1<<(bs-3))-1);
 279  279  
 280  280          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 281  281              (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 282  282          if (err)
 283  283                  return (err);
 284  284          *valp = ((uint64_t *)db->db_data)[off];
 285  285          dmu_buf_rele(db, FTAG);
 286  286  
 287  287          if (tbl->zt_nextblk != 0) {
 288  288                  /*
 289  289                   * read the nextblk for the sake of i/o error checking,
 290  290                   * so that zap_table_load() will catch errors for
 291  291                   * zap_table_store.
 292  292                   */
 293  293                  blk = (idx*2) >> (bs-3);
 294  294  
 295  295                  err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 296  296                      (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 297  297                      DMU_READ_NO_PREFETCH);
 298  298                  if (err == 0)
 299  299                          dmu_buf_rele(db, FTAG);
 300  300          }
 301  301          return (err);
 302  302  }
 303  303  
 304  304  /*
 305  305   * Routines for growing the ptrtbl.
 306  306   */
 307  307  
 /*
  * Expand n pointer-table entries from src into 2*n entries at dst:
  * every source entry is written to two consecutive destination slots.
  * Does nothing when n <= 0.
  */
 static void
 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 {
         uint64_t *out = dst;
         int k;

         for (k = 0; k < n; k++) {
                 const uint64_t ptr = src[k];

                 *out++ = ptr;
                 *out++ = ptr;
         }
 }
 318  318  
 319  319  static int
 320  320  zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 321  321  {
 322  322          /*
 323  323           * The pointer table should never use more hash bits than we
 324  324           * have (otherwise we'd be using useless zero bits to index it).
 325  325           * If we are within 2 bits of running out, stop growing, since
 326  326           * this is already an aberrant condition.
 327  327           */
 328  328          if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 329  329                  return (SET_ERROR(ENOSPC));
 330  330  
 331  331          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 332  332                  /*
 333  333                   * We are outgrowing the "embedded" ptrtbl (the one
 334  334                   * stored in the header block).  Give it its own entire
 335  335                   * block, which will double the size of the ptrtbl.
 336  336                   */
 337  337                  uint64_t newblk;
 338  338                  dmu_buf_t *db_new;
 339  339                  int err;
 340  340  
 341  341                  ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 342  342                      ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 343  343                  ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 344  344  
 345  345                  newblk = zap_allocate_blocks(zap, 1);
 346  346                  err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 347  347                      newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 348  348                      DMU_READ_NO_PREFETCH);
 349  349                  if (err)
 350  350                          return (err);
 351  351                  dmu_buf_will_dirty(db_new, tx);
 352  352                  zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 353  353                      db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 354  354                  dmu_buf_rele(db_new, FTAG);
 355  355  
 356  356                  zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
 357  357                  zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
 358  358                  zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 359  359  
 360  360                  ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 361  361                      zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 362  362                      (FZAP_BLOCK_SHIFT(zap)-3));
 363  363  
 364  364                  return (0);
 365  365          } else {
 366  366                  return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 367  367                      zap_ptrtbl_transfer, tx));
 368  368          }
 369  369  }
 370  370  
 371  371  static void
 372  372  zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 373  373  {
 374  374          dmu_buf_will_dirty(zap->zap_dbuf, tx);
 375  375          mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 376  376          ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 377  377          zap_f_phys(zap)->zap_num_entries += delta;
 378  378          mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 379  379  }
 380  380  
 381  381  static uint64_t
 382  382  zap_allocate_blocks(zap_t *zap, int nblocks)
 383  383  {
 384  384          uint64_t newblk;
 385  385          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 386  386          newblk = zap_f_phys(zap)->zap_freeblk;
 387  387          zap_f_phys(zap)->zap_freeblk += nblocks;
 388  388          return (newblk);
 389  389  }
 390  390  
 391  391  static void
 392  392  zap_leaf_pageout(void *dbu)
 393  393  {
 394  394          zap_leaf_t *l = dbu;
 395  395  
 396  396          rw_destroy(&l->l_rwlock);
 397  397          kmem_free(l, sizeof (zap_leaf_t));
 398  398  }
 399  399  
 400  400  static zap_leaf_t *
 401  401  zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 402  402  {
 403  403          void *winner;
 404  404          zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 405  405  
 406  406          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 407  407  
 408  408          rw_init(&l->l_rwlock, 0, 0, 0);
 409  409          rw_enter(&l->l_rwlock, RW_WRITER);
 410  410          l->l_blkid = zap_allocate_blocks(zap, 1);
 411  411          l->l_dbuf = NULL;
 412  412  
 413  413          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 414  414              l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 415  415              DMU_READ_NO_PREFETCH));
 416  416          dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
 417  417          winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
 418  418          ASSERT(winner == NULL);
 419  419          dmu_buf_will_dirty(l->l_dbuf, tx);
 420  420  
 421  421          zap_leaf_init(l, zap->zap_normflags != 0);
 422  422  
 423  423          zap_f_phys(zap)->zap_num_leafs++;
 424  424  
 425  425          return (l);
 426  426  }
 427  427  
 428  428  int
 429  429  fzap_count(zap_t *zap, uint64_t *count)
 430  430  {
 431  431          ASSERT(!zap->zap_ismicro);
 432  432          mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
 433  433          *count = zap_f_phys(zap)->zap_num_entries;
 434  434          mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 435  435          return (0);
 436  436  }
 437  437  
 438  438  /*
 439  439   * Routines for obtaining zap_leaf_t's
 440  440   */
 441  441  
 442  442  void
 443  443  zap_put_leaf(zap_leaf_t *l)
 444  444  {
 445  445          rw_exit(&l->l_rwlock);
 446  446          dmu_buf_rele(l->l_dbuf, NULL);
 447  447  }
 448  448  
 449  449  static zap_leaf_t *
 450  450  zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 451  451  {
 452  452          zap_leaf_t *l, *winner;
 453  453  
 454  454          ASSERT(blkid != 0);
 455  455  
 456  456          l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 457  457          rw_init(&l->l_rwlock, 0, 0, 0);
 458  458          rw_enter(&l->l_rwlock, RW_WRITER);
 459  459          l->l_blkid = blkid;
 460  460          l->l_bs = highbit64(db->db_size) - 1;
 461  461          l->l_dbuf = db;
 462  462  
 463  463          dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
 464  464          winner = dmu_buf_set_user(db, &l->l_dbu);
 465  465  
 466  466          rw_exit(&l->l_rwlock);
 467  467          if (winner != NULL) {
 468  468                  /* someone else set it first */
 469  469                  zap_leaf_pageout(&l->l_dbu);
 470  470                  l = winner;
 471  471          }
 472  472  
 473  473          /*
 474  474           * lhr_pad was previously used for the next leaf in the leaf
 475  475           * chain.  There should be no chained leafs (as we have removed
 476  476           * support for them).
 477  477           */
 478  478          ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 479  479  
 480  480          /*
 481  481           * There should be more hash entries than there can be
 482  482           * chunks to put in the hash table
 483  483           */
 484  484          ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 485  485  
 486  486          /* The chunks should begin at the end of the hash table */
 487  487          ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
 488  488              &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 489  489  
 490  490          /* The chunks should end at the end of the block */
 491  491          ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
 492  492              (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 493  493  
 494  494          return (l);
 495  495  }
 496  496  
 497  497  static int
 498  498  zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
 499  499      zap_leaf_t **lp)
 500  500  {
 501  501          dmu_buf_t *db;
 502  502          zap_leaf_t *l;
 503  503          int bs = FZAP_BLOCK_SHIFT(zap);
 504  504          int err;
 505  505  
 506  506          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 507  507  
 508  508          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 509  509              blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
 510  510          if (err)
 511  511                  return (err);
 512  512  
 513  513          ASSERT3U(db->db_object, ==, zap->zap_object);
 514  514          ASSERT3U(db->db_offset, ==, blkid << bs);
 515  515          ASSERT3U(db->db_size, ==, 1 << bs);
 516  516          ASSERT(blkid != 0);
 517  517  
 518  518          l = dmu_buf_get_user(db);
 519  519  
 520  520          if (l == NULL)
 521  521                  l = zap_open_leaf(blkid, db);
 522  522  
 523  523          rw_enter(&l->l_rwlock, lt);
 524  524          /*
 525  525           * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 526  526           * causing ASSERT below to fail.
 527  527           */
 528  528          if (lt == RW_WRITER)
 529  529                  dmu_buf_will_dirty(db, tx);
 530  530          ASSERT3U(l->l_blkid, ==, blkid);
 531  531          ASSERT3P(l->l_dbuf, ==, db);
 532  532          ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
 533  533          ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 534  534  
 535  535          *lp = l;
 536  536          return (0);
 537  537  }
 538  538  
 539  539  static int
 540  540  zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 541  541  {
 542  542          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 543  543  
 544  544          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 545  545                  ASSERT3U(idx, <,
 546  546                      (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 547  547                  *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 548  548                  return (0);
 549  549          } else {
 550  550                  return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 551  551                      idx, valp));
 552  552          }
 553  553  }
 554  554  
 555  555  static int
 556  556  zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 557  557  {
 558  558          ASSERT(tx != NULL);
 559  559          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 560  560  
 561  561          if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 562  562                  ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 563  563                  return (0);
 564  564          } else {
 565  565                  return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 566  566                      idx, blk, tx));
 567  567          }
  
    | 
      ↓ open down ↓ | 
    567 lines elided | 
    
      ↑ open up ↑ | 
  
 568  568  }
 569  569  
 570  570  static int
 571  571  zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 572  572  {
 573  573          uint64_t idx, blk;
 574  574          int err;
 575  575  
 576  576          ASSERT(zap->zap_dbuf == NULL ||
 577  577              zap_f_phys(zap) == zap->zap_dbuf->db_data);
 578      -        ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
      578 +
      579 +        /* Reality check for corrupt zap objects (leaf or header). */
      580 +        if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
      581 +            zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
      582 +            zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
      583 +                return (SET_ERROR(EIO));
      584 +        }
      585 +
 579  586          idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 580  587          err = zap_idx_to_blk(zap, idx, &blk);
 581  588          if (err != 0)
 582  589                  return (err);
 583  590          err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 584  591  
 585  592          ASSERT(err ||
 586  593              ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
 587  594              zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 588  595          return (err);
 589  596  }
 590  597  
 591  598  static int
 592  599  zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
 593  600  {
 594  601          zap_t *zap = zn->zn_zap;
 595  602          uint64_t hash = zn->zn_hash;
 596  603          zap_leaf_t *nl;
 597  604          int prefix_diff, i, err;
 598  605          uint64_t sibling;
 599  606          int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 600  607  
 601  608          ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 602  609          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 603  610  
 604  611          ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 605  612              zap_leaf_phys(l)->l_hdr.lh_prefix);
 606  613  
 607  614          if (zap_tryupgradedir(zap, tx) == 0 ||
 608  615              old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 609  616                  /* We failed to upgrade, or need to grow the pointer table */
 610  617                  objset_t *os = zap->zap_objset;
 611  618                  uint64_t object = zap->zap_object;
 612  619  
 613  620                  zap_put_leaf(l);
 614  621                  zap_unlockdir(zap);
 615  622                  err = zap_lockdir(os, object, tx, RW_WRITER,
 616  623                      FALSE, FALSE, &zn->zn_zap);
 617  624                  zap = zn->zn_zap;
 618  625                  if (err)
 619  626                          return (err);
 620  627                  ASSERT(!zap->zap_ismicro);
 621  628  
 622  629                  while (old_prefix_len ==
 623  630                      zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 624  631                          err = zap_grow_ptrtbl(zap, tx);
 625  632                          if (err)
 626  633                                  return (err);
 627  634                  }
 628  635  
 629  636                  err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
 630  637                  if (err)
 631  638                          return (err);
 632  639  
 633  640                  if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 634  641                          /* it split while our locks were down */
 635  642                          *lp = l;
 636  643                          return (0);
 637  644                  }
 638  645          }
 639  646          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 640  647          ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 641  648          ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 642  649              zap_leaf_phys(l)->l_hdr.lh_prefix);
 643  650  
 644  651          prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 645  652              (old_prefix_len + 1);
 646  653          sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 647  654  
 648  655          /* check for i/o errors before doing zap_leaf_split */
 649  656          for (i = 0; i < (1ULL<<prefix_diff); i++) {
 650  657                  uint64_t blk;
 651  658                  err = zap_idx_to_blk(zap, sibling+i, &blk);
 652  659                  if (err)
 653  660                          return (err);
 654  661                  ASSERT3U(blk, ==, l->l_blkid);
 655  662          }
 656  663  
 657  664          nl = zap_create_leaf(zap, tx);
 658  665          zap_leaf_split(l, nl, zap->zap_normflags != 0);
 659  666  
 660  667          /* set sibling pointers */
 661  668          for (i = 0; i < (1ULL << prefix_diff); i++) {
 662  669                  err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
 663  670                  ASSERT0(err); /* we checked for i/o errors above */
 664  671          }
 665  672  
 666  673          if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 667  674                  /* we want the sibling */
 668  675                  zap_put_leaf(l);
 669  676                  *lp = nl;
 670  677          } else {
 671  678                  zap_put_leaf(nl);
 672  679                  *lp = l;
 673  680          }
 674  681  
 675  682          return (0);
 676  683  }
 677  684  
 678  685  static void
 679  686  zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 680  687  {
 681  688          zap_t *zap = zn->zn_zap;
 682  689          int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 683  690          int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
 684  691              zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 685  692  
 686  693          zap_put_leaf(l);
 687  694  
 688  695          if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
 689  696                  int err;
 690  697  
 691  698                  /*
 692  699                   * We are in the middle of growing the pointer table, or
 693  700                   * this leaf will soon make us grow it.
 694  701                   */
 695  702                  if (zap_tryupgradedir(zap, tx) == 0) {
 696  703                          objset_t *os = zap->zap_objset;
 697  704                          uint64_t zapobj = zap->zap_object;
 698  705  
 699  706                          zap_unlockdir(zap);
 700  707                          err = zap_lockdir(os, zapobj, tx,
 701  708                              RW_WRITER, FALSE, FALSE, &zn->zn_zap);
 702  709                          zap = zn->zn_zap;
 703  710                          if (err)
 704  711                                  return;
 705  712                  }
 706  713  
 707  714                  /* could have finished growing while our locks were down */
 708  715                  if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 709  716                          (void) zap_grow_ptrtbl(zap, tx);
 710  717          }
 711  718  }
 712  719  
           /*
            * Check that the key described by zn is not too long: its total size
            * (zn_key_orig_numints * zn_key_intlen) must not exceed ZAP_MAXNAMELEN.
            *
            * Returns 0 when the key fits, ENAMETOOLONG otherwise.
            */
 713  720  static int
 714  721  fzap_checkname(zap_name_t *zn)
 715  722  {
 716  723          if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
 717  724                  return (SET_ERROR(ENAMETOOLONG));
 718  725          return (0);
 719  726  }
 720  727  
           /*
            * Validate the shape of a value: each integer must be 1, 2, 4 or 8 bytes
            * wide, and the whole value (integer_size * num_integers) must not exceed
            * ZAP_MAXVALUELEN.
            *
            * Returns 0 when the shape is acceptable, EINVAL for an unsupported
            * integer width and E2BIG when the value is too large.  Both failures go
            * through SET_ERROR(), like the other error returns in this file.
            */
 721  728  static int
 722  729  fzap_checksize(uint64_t integer_size, uint64_t num_integers)
 723  730  {
 724  731          /* Only integer sizes supported by C */
 725  732          switch (integer_size) {
 726  733          case 1:
 727  734          case 2:
 728  735          case 4:
 729  736          case 8:
 730  737                  break;
 731  738          default:
 732  739                  return (SET_ERROR(EINVAL));
 733  740          }
 734  741  
 735  742          if (integer_size * num_integers > ZAP_MAXVALUELEN)
 736  743                  return (SET_ERROR(E2BIG));
 737  744  
 738  745          return (0);
 739  746  }
 740  747  
           /*
            * Validate both the key (fzap_checkname) and the value shape
            * (fzap_checksize) for an entry.
            *
            * Returns 0 when both pass; otherwise the error from the first check
            * that fails, with the name checked before the size.
            */
 741  748  static int
 742  749  fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 743  750  {
 744  751          int err;
 745  752  
 746  753          if ((err = fzap_checkname(zn)) != 0)
 747  754                  return (err);
 748  755          return (fzap_checksize(integer_size, num_integers));
 749  756  }
 750  757  
 751  758  /*
 752  759   * Routines for manipulating attributes.
 753  760   */

           /*
            * Look up the entry named by zn and copy its value into buf.
            *
            * zn            name to look up; zn->zn_zap and zn->zn_hash select the leaf
            * integer_size  width of each integer the caller wants, in bytes
            * num_integers  number of integers buf can receive
            * buf           destination for the value
            * realname      buffer of rn_len bytes handed to zap_entry_read_name()
            * ncp           if non-NULL, receives the result of
            *               zap_entry_normalization_conflict() for the entry
            *
            * Returns the fzap_checkname(), zap_deref_leaf() or zap_leaf_lookup()
            * error if any of those fail, the fzap_checksize() error if the requested
            * shape is invalid, and otherwise the result of zap_entry_read().
            */
 754  761  int
 755  762  fzap_lookup(zap_name_t *zn,
 756  763      uint64_t integer_size, uint64_t num_integers, void *buf,
 757  764      char *realname, int rn_len, boolean_t *ncp)
 758  765  {
 759  766          zap_leaf_t *l;
 760  767          int err;
 761  768          zap_entry_handle_t zeh;
 762  769  
 763  770          if ((err = fzap_checkname(zn)) != 0)
 764  771                  return (err);
 765  772  
 766  773          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 767  774          if (err != 0)
 768  775                  return (err);
 769  776          err = zap_leaf_lookup(l, zn, &zeh);
 770  777          if (err == 0) {
                           /* The size is only validated once the entry is known to exist. */
 771  778                  if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
 772  779                          zap_put_leaf(l);
 773  780                          return (err);
 774  781                  }
 775  782  
 776  783                  err = zap_entry_read(&zeh, integer_size, num_integers, buf);
                           /* The name copy's result is ignored; err keeps the read status. */
 777  784                  (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 778  785                  if (ncp) {
 779  786                          *ncp = zap_entry_normalization_conflict(&zeh,
 780  787                              zn, NULL, zn->zn_zap);
 781  788                  }
 782  789          }
 783  790  
 784  791          zap_put_leaf(l);
 785  792          return (err);
 786  793  }
 787  794  
           /*
            * Add a new entry for zn, passing "cd" through to zap_entry_create().
            *
            * The caller must hold zap_rwlock, the zap must not be a microzap, and
            * the name and value shape must already be valid (all three are only
            * ASSERTed here).
            *
            * Returns 0 on success, EEXIST if the name is already present, or the
            * error from zap_deref_leaf(), zap_leaf_lookup(), zap_entry_create() or
            * zap_expand_leaf().  When zap_entry_create() returns EAGAIN the leaf is
            * expanded and the whole lookup/create sequence is retried.
            */
 788  795  int
 789  796  fzap_add_cd(zap_name_t *zn,
 790  797      uint64_t integer_size, uint64_t num_integers,
 791  798      const void *val, uint32_t cd, dmu_tx_t *tx)
 792  799  {
 793  800          zap_leaf_t *l;
 794  801          int err;
 795  802          zap_entry_handle_t zeh;
 796  803          zap_t *zap = zn->zn_zap;
 797  804  
 798  805          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 799  806          ASSERT(!zap->zap_ismicro);
 800  807          ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
 801  808  
 802  809          err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 803  810          if (err != 0)
 804  811                  return (err);
 805  812  retry:
 806  813          err = zap_leaf_lookup(l, zn, &zeh);
 807  814          if (err == 0) {
 808  815                  err = SET_ERROR(EEXIST);
 809  816                  goto out;
 810  817          }
 811  818          if (err != ENOENT)
 812  819                  goto out;
 813  820  
 814  821          err = zap_entry_create(l, zn, cd,
 815  822              integer_size, num_integers, val, &zeh);
 816  823  
 817  824          if (err == 0) {
 818  825                  zap_increment_num_entries(zap, 1, tx);
 819  826          } else if (err == EAGAIN) {
 820  827                  err = zap_expand_leaf(zn, l, tx, &l);
 821  828                  zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
 822  829                  if (err == 0)
 823  830                          goto retry;
 824  831          }
 825  832  
 826  833  out:
                   /*
                    * zn->zn_zap can be NULL after zap_expand_leaf() (presumably when it
                    * failed to retake the directory lock -- confirm); the leaf is only
                    * released here when a zap is still attached.
                    */
 827  834          if (zap != NULL)
 828  835                  zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 829  836          return (err);
 830  837  }
 831  838  
           /*
            * Add a new entry for zn after validating the name and value shape.
            *
            * Returns the fzap_check() error if validation fails; otherwise the
            * result of fzap_add_cd() called with ZAP_NEED_CD as the cd argument.
            */
 832  839  int
 833  840  fzap_add(zap_name_t *zn,
 834  841      uint64_t integer_size, uint64_t num_integers,
 835  842      const void *val, dmu_tx_t *tx)
 836  843  {
 837  844          int err = fzap_check(zn, integer_size, num_integers);
 838  845          if (err != 0)
 839  846                  return (err);
 840  847  
 841  848          return (fzap_add_cd(zn, integer_size, num_integers,
 842  849              val, ZAP_NEED_CD, tx));
 843  850  }
 844  851  
           /*
            * Set the value of the entry named by zn, creating the entry if it does
            * not exist yet.
            *
            * The caller must hold zap_rwlock (ASSERTed).  The name and value shape
            * are validated with fzap_check() first.
            *
            * Returns 0 on success, or the error from fzap_check(),
            * zap_deref_leaf(), zap_entry_create(), zap_entry_update() or
            * zap_expand_leaf().  An EAGAIN from create/update causes the leaf to be
            * expanded and the lookup to be retried.
            */
 845  852  int
 846  853  fzap_update(zap_name_t *zn,
 847  854      int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 848  855  {
 849  856          zap_leaf_t *l;
 850  857          int err, create;
 851  858          zap_entry_handle_t zeh;
 852  859          zap_t *zap = zn->zn_zap;
 853  860  
 854  861          ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 855  862          err = fzap_check(zn, integer_size, num_integers);
 856  863          if (err != 0)
 857  864                  return (err);
 858  865  
 859  866          err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 860  867          if (err != 0)
 861  868                  return (err);
 862  869  retry:
 863  870          err = zap_leaf_lookup(l, zn, &zeh);
                   /* ENOENT selects the create path; any other result means update. */
 864  871          create = (err == ENOENT);
 865  872          ASSERT(err == 0 || err == ENOENT);
 866  873  
 867  874          if (create) {
 868  875                  err = zap_entry_create(l, zn, ZAP_NEED_CD,
 869  876                      integer_size, num_integers, val, &zeh);
 870  877                  if (err == 0)
 871  878                          zap_increment_num_entries(zap, 1, tx);
 872  879          } else {
 873  880                  err = zap_entry_update(&zeh, integer_size, num_integers, val);
 874  881          }
 875  882  
 876  883          if (err == EAGAIN) {
 877  884                  err = zap_expand_leaf(zn, l, tx, &l);
 878  885                  zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
 879  886                  if (err == 0)
 880  887                          goto retry;
 881  888          }
 882  889  
                   /* Skipped when zap_expand_leaf() left zn->zn_zap NULL. */
 883  890          if (zap != NULL)
 884  891                  zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
 885  892          return (err);
 886  893  }
 887  894  
           /*
            * Report the shape of the value stored under zn.
            *
            * integer_size and num_integers are optional outputs; each one is only
            * written when non-NULL and when the entry was found.
            *
            * Returns 0 on success, or the error from zap_deref_leaf() or
            * zap_leaf_lookup().
            */
 888  895  int
 889  896  fzap_length(zap_name_t *zn,
 890  897      uint64_t *integer_size, uint64_t *num_integers)
 891  898  {
 892  899          zap_leaf_t *l;
 893  900          int err;
 894  901          zap_entry_handle_t zeh;
 895  902  
 896  903          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 897  904          if (err != 0)
 898  905                  return (err);
 899  906          err = zap_leaf_lookup(l, zn, &zeh);
 900  907          if (err != 0)
 901  908                  goto out;
 902  909  
 903  910          if (integer_size)
 904  911                  *integer_size = zeh.zeh_integer_size;
 905  912          if (num_integers)
 906  913                  *num_integers = zeh.zeh_num_integers;
 907  914  out:
 908  915          zap_put_leaf(l);
 909  916          return (err);
 910  917  }
 911  918  
           /*
            * Remove the entry named by zn.
            *
            * On a successful lookup the entry is removed and the zap's entry count
            * is decremented by one.  Returns 0 on success, or the error from
            * zap_deref_leaf() or zap_leaf_lookup(); in the latter case nothing is
            * modified.
            */
 912  919  int
 913  920  fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 914  921  {
 915  922          zap_leaf_t *l;
 916  923          int err;
 917  924          zap_entry_handle_t zeh;
 918  925  
 919  926          err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
 920  927          if (err != 0)
 921  928                  return (err);
 922  929          err = zap_leaf_lookup(l, zn, &zeh);
 923  930          if (err == 0) {
 924  931                  zap_entry_remove(&zeh);
 925  932                  zap_increment_num_entries(zn->zn_zap, -1, tx);
 926  933          }
 927  934          zap_put_leaf(l);
 928  935          return (err);
 929  936  }
 930  937  
           /*
            * Issue a prefetch for the leaf block that zn's hash maps to.
            *
            * The pointer-table index is derived from zn->zn_hash and the table
            * shift, then translated to a block number with zap_idx_to_blk().  If
            * that translation fails, nothing is prefetched and no error is reported.
            */
 931  938  void
 932  939  fzap_prefetch(zap_name_t *zn)
 933  940  {
 934  941          uint64_t idx, blk;
 935  942          zap_t *zap = zn->zn_zap;
 936  943          int bs;
 937  944  
 938  945          idx = ZAP_HASH_IDX(zn->zn_hash,
 939  946              zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 940  947          if (zap_idx_to_blk(zap, idx, &blk) != 0)
 941  948                  return;
 942  949          bs = FZAP_BLOCK_SHIFT(zap);
                   /* One block: byte offset blk << bs, length 1 << bs. */
 943  950          dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
 944  951              ZIO_PRIORITY_SYNC_READ);
 945  952  }
 946  953  
 947  954  /*
 948  955   * Helper functions for consumers.
 949  956   */
 950  957  
           /*
            * Create a new zap object of type "ot" and record its object number
            * under "name" in the zap object parent_obj.
            *
            * Both steps are wrapped in VERIFY(): zap_create() must return a
            * non-zero object number and zap_add() must return 0.
            *
            * Returns the object number of the new zap.
            */
 951  958  uint64_t
 952  959  zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 953  960      const char *name, dmu_tx_t *tx)
 954  961  {
 955  962          uint64_t new_obj;
 956  963  
 957  964          VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
 958  965          VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 959  966              tx) == 0);
 960  967  
 961  968          return (new_obj);
 962  969  }
 963  970  
           /*
            * Find the first entry in zapobj whose first integer matches "value"
            * under "mask", and copy that entry's name into "name".
            *
            * A mask of 0 is treated as all ones, i.e. an exact match on value.
            *
            * Returns 0 when a match was found; otherwise the non-zero result of
            * zap_cursor_retrieve() that ended the iteration.
            *
            * NOTE(review): the name is copied with strcpy() and no buffer length is
            * passed in, so the caller must supply a buffer large enough for any
            * za_name.
            */
 964  971  int
 965  972  zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 966  973      char *name)
 967  974  {
 968  975          zap_cursor_t zc;
 969  976          zap_attribute_t *za;
 970  977          int err;
 971  978  
 972  979          if (mask == 0)
 973  980                  mask = -1ULL;
 974  981  
                   /* The attribute buffer is heap-allocated here and freed before return. */
 975  982          za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 976  983          for (zap_cursor_init(&zc, os, zapobj);
 977  984              (err = zap_cursor_retrieve(&zc, za)) == 0;
 978  985              zap_cursor_advance(&zc)) {
 979  986                  if ((za->za_first_integer & mask) == (value & mask)) {
 980  987                          (void) strcpy(name, za->za_name);
 981  988                          break;
 982  989                  }
 983  990          }
 984  991          zap_cursor_fini(&zc);
 985  992          kmem_free(za, sizeof (zap_attribute_t));
 986  993          return (err);
 987  994  }
 988  995  
           /*
            * Add every entry of fromobj to intoobj, keeping each entry's name and
            * first integer.
            *
            * Every source entry must be a single 8-byte integer; otherwise the walk
            * stops with EINVAL.  The walk also stops at the first zap_add() failure
            * and returns that error.
            *
            * Returns 0 when the walk ends because zap_cursor_retrieve() returned
            * non-zero; that retrieve result itself is not propagated.
            */
 989  996  int
 990  997  zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 991  998  {
 992  999          zap_cursor_t zc;
 993 1000          zap_attribute_t za;
 994 1001          int err;
 995 1002  
 996 1003          err = 0;
 997 1004          for (zap_cursor_init(&zc, os, fromobj);
 998 1005              zap_cursor_retrieve(&zc, &za) == 0;
 999 1006              (void) zap_cursor_advance(&zc)) {
1000 1007                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1001 1008                          err = SET_ERROR(EINVAL);
1002 1009                          break;
1003 1010                  }
1004 1011                  err = zap_add(os, intoobj, za.za_name,
1005 1012                      8, 1, &za.za_first_integer, tx);
1006 1013                  if (err)
1007 1014                          break;
1008 1015          }
1009 1016          zap_cursor_fini(&zc);
1010 1017          return (err);
1011 1018  }
1012 1019  
           /*
            * Add the name of every entry in fromobj to intoobj, storing the same
            * caller-supplied "value" under each name.
            *
            * Every source entry must be a single 8-byte integer; otherwise the walk
            * stops with EINVAL.  The walk also stops at the first zap_add() failure
            * and returns that error.
            *
            * Returns 0 when the walk ends because zap_cursor_retrieve() returned
            * non-zero; that retrieve result itself is not propagated.
            */
1013 1020  int
1014 1021  zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1015 1022      uint64_t value, dmu_tx_t *tx)
1016 1023  {
1017 1024          zap_cursor_t zc;
1018 1025          zap_attribute_t za;
1019 1026          int err;
1020 1027  
1021 1028          err = 0;
1022 1029          for (zap_cursor_init(&zc, os, fromobj);
1023 1030              zap_cursor_retrieve(&zc, &za) == 0;
1024 1031              (void) zap_cursor_advance(&zc)) {
1025 1032                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1026 1033                          err = SET_ERROR(EINVAL);
1027 1034                          break;
1028 1035                  }
1029 1036                  err = zap_add(os, intoobj, za.za_name,
1030 1037                      8, 1, &value, tx);
1031 1038                  if (err)
1032 1039                          break;
1033 1040          }
1034 1041          zap_cursor_fini(&zc);
1035 1042          return (err);
1036 1043  }
1037 1044  
           /*
            * For every entry in fromobj, add its first integer to the value stored
            * under the same name in intoobj.
            *
            * A name missing from intoobj (zap_lookup() returns ENOENT) starts from
            * 0, and the sum is written with zap_update().  Every source entry must
            * be a single 8-byte integer, otherwise the walk stops with EINVAL.
            *
            * Returns the first zap_lookup() error other than ENOENT, or the first
            * zap_update() error.  Otherwise err holds the last zap_update() result,
            * or 0 if fromobj yielded no entries.
            */
1038 1045  int
1039 1046  zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1040 1047      dmu_tx_t *tx)
1041 1048  {
1042 1049          zap_cursor_t zc;
1043 1050          zap_attribute_t za;
1044 1051          int err;
1045 1052  
1046 1053          err = 0;
1047 1054          for (zap_cursor_init(&zc, os, fromobj);
1048 1055              zap_cursor_retrieve(&zc, &za) == 0;
1049 1056              (void) zap_cursor_advance(&zc)) {
                           /* Stays 0 when the name is not yet present in intoobj. */
1050 1057                  uint64_t delta = 0;
1051 1058  
1052 1059                  if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1053 1060                          err = SET_ERROR(EINVAL);
1054 1061                          break;
1055 1062                  }
1056 1063  
1057 1064                  err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
1058 1065                  if (err != 0 && err != ENOENT)
1059 1066                          break;
1060 1067                  delta += za.za_first_integer;
1061 1068                  err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
1062 1069                  if (err)
1063 1070                          break;
1064 1071          }
1065 1072          zap_cursor_fini(&zc);
1066 1073          return (err);
1067 1074  }
1068 1075  
           /*
            * Add an entry to obj whose name is "value" formatted as lowercase hex
            * ("%llx") and whose contents are that same 8-byte value.
            *
            * Returns the result of zap_add().
            */
1069 1076  int
1070 1077  zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1071 1078  {
                   /* A 64-bit value needs at most 16 hex digits plus the terminator. */
1072 1079          char name[20];
1073 1080  
1074 1081          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1075 1082          return (zap_add(os, obj, name, 8, 1, &value, tx));
1076 1083  }
1077 1084  
           /*
            * Remove from obj the entry whose name is "value" formatted as lowercase
            * hex ("%llx").
            *
            * Returns the result of zap_remove().
            */
1078 1085  int
1079 1086  zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1080 1087  {
1081 1088          char name[20];
1082 1089  
1083 1090          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1084 1091          return (zap_remove(os, obj, name, tx));
1085 1092  }
1086 1093  
           /*
            * Test whether obj has an entry whose name is "value" formatted as
            * lowercase hex ("%llx").
            *
            * The stored contents are read into the local copy of "value" and are
            * not handed back; only the zap_lookup() result is returned.
            */
1087 1094  int
1088 1095  zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
1089 1096  {
1090 1097          char name[20];
1091 1098  
1092 1099          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1093 1100          return (zap_lookup(os, obj, name, 8, 1, &value));
1094 1101  }
1095 1102  
           /*
            * Add an entry to obj whose name is "key" formatted as lowercase hex
            * ("%llx") and whose contents are the 8-byte "value".
            *
            * Returns the result of zap_add().
            */
1096 1103  int
1097 1104  zap_add_int_key(objset_t *os, uint64_t obj,
1098 1105      uint64_t key, uint64_t value, dmu_tx_t *tx)
1099 1106  {
1100 1107          char name[20];
1101 1108  
1102 1109          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1103 1110          return (zap_add(os, obj, name, 8, 1, &value, tx));
1104 1111  }
1105 1112  
           /*
            * Set the entry in obj whose name is "key" formatted as lowercase hex
            * ("%llx") to the 8-byte "value", using zap_update().
            *
            * Returns the result of zap_update().
            */
1106 1113  int
1107 1114  zap_update_int_key(objset_t *os, uint64_t obj,
1108 1115      uint64_t key, uint64_t value, dmu_tx_t *tx)
1109 1116  {
1110 1117          char name[20];
1111 1118  
1112 1119          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1113 1120          return (zap_update(os, obj, name, 8, 1, &value, tx));
1114 1121  }
1115 1122  
           /*
            * Look up the entry in obj whose name is "key" formatted as lowercase
            * hex ("%llx") and read its single 8-byte value into *valuep.
            *
            * Returns the result of zap_lookup().
            */
1116 1123  int
1117 1124  zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
1118 1125  {
1119 1126          char name[20];
1120 1127  
1121 1128          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1122 1129          return (zap_lookup(os, obj, name, 8, 1, valuep));
1123 1130  }
1124 1131  
           /*
            * Add "delta" to the 8-byte counter stored under "name" in obj.
            *
            * A delta of 0 returns 0 without touching the zap.  A missing entry
            * (zap_lookup() returns ENOENT) counts as 0.  If the new value is 0 the
            * entry is removed with zap_remove(); otherwise it is written with
            * zap_update().
            *
            * Returns any zap_lookup() error other than ENOENT, otherwise the result
            * of the remove or update.
            */
1125 1132  int
1126 1133  zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
1127 1134      dmu_tx_t *tx)
1128 1135  {
1129 1136          uint64_t value = 0;
1130 1137          int err;
1131 1138  
1132 1139          if (delta == 0)
1133 1140                  return (0);
1134 1141  
1135 1142          err = zap_lookup(os, obj, name, 8, 1, &value);
1136 1143          if (err != 0 && err != ENOENT)
1137 1144                  return (err);
1138 1145          value += delta;
1139 1146          if (value == 0)
1140 1147                  err = zap_remove(os, obj, name, tx);
1141 1148          else
1142 1149                  err = zap_update(os, obj, name, 8, 1, &value, tx);
1143 1150          return (err);
1144 1151  }
1145 1152  
           /*
            * Apply zap_increment() to the entry in obj whose name is "key"
            * formatted as lowercase hex ("%llx").
            *
            * Returns the result of zap_increment().
            */
1146 1153  int
1147 1154  zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
1148 1155      dmu_tx_t *tx)
1149 1156  {
1150 1157          char name[20];
1151 1158  
1152 1159          (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1153 1160          return (zap_increment(os, obj, name, delta, tx));
1154 1161  }
1155 1162  
1156 1163  /*
1157 1164   * Routines for iterating over the attributes.
1158 1165   */
1159 1166  
           /*
            * Fetch the entry at or after the cursor position (zc_hash, zc_cd) and
            * describe it in *za.
            *
            * The cursor caches a leaf in zc->zc_leaf between calls.  A cached leaf
            * whose prefix does not match zc_hash is released first.  If the current
            * leaf has no entry at or after the position, the cursor moves to the
            * first hash past that leaf's prefix range and the search repeats.  When
            * there is nothing further to visit, zc_hash is set to -1ULL and ENOENT
            * is returned.
            *
            * On success zc_hash/zc_cd are set to the found entry and za receives
            * its integer size, integer count, first integer (0 for an empty value),
            * name and normalization-conflict flag.
            *
            * Returns 0 on success, ENOENT when iteration is finished, or the error
            * from zap_deref_leaf() or zap_leaf_lookup_closest().
            */
1160 1167  int
1161 1168  fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
1162 1169  {
1163 1170          int err = ENOENT;
1164 1171          zap_entry_handle_t zeh;
1165 1172          zap_leaf_t *l;
1166 1173  
1167 1174          /* retrieve the next entry at or after zc_hash/zc_cd */
1168 1175          /* if no entry, return ENOENT */
1169 1176  
                   /*
                    * Drop a cached leaf that no longer covers zc_hash.  The leaf lock is
                    * taken before zap_put_leaf(), presumably because zap_put_leaf()
                    * releases it -- confirm.
                    */
1170 1177          if (zc->zc_leaf &&
1171 1178              (ZAP_HASH_IDX(zc->zc_hash,
1172 1179              zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
1173 1180              zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
1174 1181                  rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1175 1182                  zap_put_leaf(zc->zc_leaf);
1176 1183                  zc->zc_leaf = NULL;
1177 1184          }
1178 1185  
1179 1186  again:
1180 1187          if (zc->zc_leaf == NULL) {
1181 1188                  err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
1182 1189                      &zc->zc_leaf);
1183 1190                  if (err != 0)
1184 1191                          return (err);
1185 1192          } else {
1186 1193                  rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1187 1194          }
1188 1195          l = zc->zc_leaf;
1189 1196  
1190 1197          err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
1191 1198  
1192 1199          if (err == ENOENT) {
                           if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
                                   /*
                                    * Zero-length prefix: mark the cursor finished.
                                    * Handled before computing the mask because
                                    * 1ULL << 64 is undefined behavior.
                                    */
                                   zc->zc_hash = -1ULL;
                                   zc->zc_cd = 0;
                           } else {
                                   /* Low hash bits not covered by this leaf's prefix. */
                                   uint64_t nocare = (1ULL <<
                                       (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;

                                   /* First hash value past this leaf's prefix range. */
                                   zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
                                   zc->zc_cd = 0;
                                   if (zc->zc_hash == 0) {
                                           /* Wrapped around: iteration is finished. */
                                           zc->zc_hash = -1ULL;
                                   } else {
                                           zap_put_leaf(zc->zc_leaf);
                                           zc->zc_leaf = NULL;
                                           goto again;
                                   }
                           }
1205 1212          }
1206 1213  
1207 1214          if (err == 0) {
1208 1215                  zc->zc_hash = zeh.zeh_hash;
1209 1216                  zc->zc_cd = zeh.zeh_cd;
1210 1217                  za->za_integer_length = zeh.zeh_integer_size;
1211 1218                  za->za_num_integers = zeh.zeh_num_integers;
1212 1219                  if (zeh.zeh_num_integers == 0) {
1213 1220                          za->za_first_integer = 0;
1214 1221                  } else {
                                   /*
                                    * EOVERFLOW is tolerated here; err is overwritten
                                    * by the name read below.
                                    */
1215 1222                          err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
1216 1223                          ASSERT(err == 0 || err == EOVERFLOW);
1217 1224                  }
1218 1225                  err = zap_entry_read_name(zap, &zeh,
1219 1226                      sizeof (za->za_name), za->za_name);
1220 1227                  ASSERT(err == 0);
1221 1228  
1222 1229                  za->za_normalization_conflict =
1223 1230                      zap_entry_normalization_conflict(&zeh,
1224 1231                      NULL, za->za_name, zap);
1225 1232          }
                   /* Only the leaf lock is dropped; the leaf stays cached in zc_leaf. */
1226 1233          rw_exit(&zc->zc_leaf->l_rwlock);
1227 1234          return (err);
1228 1235  }
1229 1236  
           /*
            * Accumulate leaf statistics into zs for the leaves referenced by a
            * run of pointer-table entries.
            *
            * tbl is an array of "len" block numbers.  An entry equal to the one
            * handled just before it is skipped.  A leaf that cannot be obtained
            * with zap_get_leaf_byblk() is silently skipped.
            */
1230 1237  static void
1231 1238  zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
1232 1239  {
1233 1240          int i, err;
1234 1241          uint64_t lastblk = 0;
1235 1242  
1236 1243          /*
1237 1244           * NB: if a leaf has more pointers than an entire ptrtbl block
1238 1245           * can hold, then it'll be accounted for more than once, since
1239 1246           * we won't have lastblk.
1240 1247           */
1241 1248          for (i = 0; i < len; i++) {
1242 1249                  zap_leaf_t *l;
1243 1250  
1244 1251                  if (tbl[i] == lastblk)
1245 1252                          continue;
1246 1253                  lastblk = tbl[i];
1247 1254  
1248 1255                  err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
1249 1256                  if (err == 0) {
1250 1257                          zap_leaf_stats(zap, l, zs);
1251 1258                          zap_put_leaf(l);
1252 1259                  }
1253 1260          }
1254 1261  }
1255 1262  
           /*
            * Fill zs with statistics for a fat zap.
            *
            * Header and pointer-table fields are copied straight from the on-disk
            * header (zap_f_phys).  Per-leaf statistics are then gathered with
            * zap_stats_ptrtbl(), either from the pointer table embedded in the
            * header block (zt_numblks == 0) or from each external pointer-table
            * block.  An external block that cannot be held is silently skipped.
            */
1256 1263  void
1257 1264  fzap_get_stats(zap_t *zap, zap_stats_t *zs)
1258 1265  {
1259 1266          int bs = FZAP_BLOCK_SHIFT(zap);
1260 1267          zs->zs_blocksize = 1ULL << bs;
1261 1268  
1262 1269          /*
1263 1270           * Set zap_phys_t fields
1264 1271           */
1265 1272          zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
1266 1273          zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
1267 1274          zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
1268 1275          zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
1269 1276          zs->zs_magic = zap_f_phys(zap)->zap_magic;
1270 1277          zs->zs_salt = zap_f_phys(zap)->zap_salt;
1271 1278  
1272 1279          /*
1273 1280           * Set zap_ptrtbl fields
1274 1281           */
1275 1282          zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1276 1283          zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
1277 1284          zs->zs_ptrtbl_blks_copied =
1278 1285              zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
1279 1286          zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
1280 1287          zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1281 1288          zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1282 1289  
1283 1290          if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
1284 1291                  /* the ptrtbl is entirely in the header block. */
1285 1292                  zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1286 1293                      1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
1287 1294          } else {
1288 1295                  int b;
1289 1296  
                           /* Prefetch the whole external table before reading it block by block. */
1290 1297                  dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
1291 1298                      zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
1292 1299                      zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
1293 1300                      ZIO_PRIORITY_SYNC_READ);
1294 1301  
1295 1302                  for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1296 1303                      b++) {
1297 1304                          dmu_buf_t *db;
1298 1305                          int err;
1299 1306  
1300 1307                          err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1301 1308                              (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
1302 1309                              FTAG, &db, DMU_READ_NO_PREFETCH);
1303 1310                          if (err == 0) {
                                           /* 1 << (bs - 3): 8-byte entries per block. */
1304 1311                                  zap_stats_ptrtbl(zap, db->db_data,
1305 1312                                      1<<(bs-3), zs);
1306 1313                                  dmu_buf_rele(db, FTAG);
1307 1314                          }
1308 1315                  }
1309 1316          }
1310 1317  }
1311 1318  
           /*
            * Estimate how many bytes an operation on zn may write, adding to the
            * caller's running totals.
            *
            * zn           name whose hash selects the leaf to account for
            * add          non-zero when the operation adds an entry
            * towrite      incremented by the bytes counted as new writes
            * tooverwrite  incremented by the bytes counted as overwrites of
            *              blocks that dmu_buf_freeable() reports as freeable
            *
            * Returns 0 on success, or the error from zap_deref_leaf().  In that
            * case the header and pointer-table amounts have already been added to
            * the totals.
            */
1312 1319  int
1313 1320  fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
1314 1321      uint64_t *tooverwrite)
1315 1322  {
1316 1323          zap_t *zap = zn->zn_zap;
1317 1324          zap_leaf_t *l;
1318 1325          int err;
1319 1326  
1320 1327          /*
1321 1328           * Account for the header block of the fatzap.
1322 1329           */
1323 1330          if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
1324 1331                  *tooverwrite += zap->zap_dbuf->db_size;
1325 1332          } else {
1326 1333                  *towrite += zap->zap_dbuf->db_size;
1327 1334          }
1328 1335  
1329 1336          /*
1330 1337           * Account for the pointer table blocks.
1331 1338           * If we are adding we need to account for the following cases :
1332 1339           * - If the pointer table is embedded, this operation could force an
1333 1340           *   external pointer table.
1334 1341           * - If this already has an external pointer table this operation
1335 1342           *   could extend the table.
1336 1343           */
1337 1344          if (add) {
1338 1345                  if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0)
1339 1346                          *towrite += zap->zap_dbuf->db_size;
1340 1347                  else
1341 1348                          *towrite += (zap->zap_dbuf->db_size * 3);
1342 1349          }
1343 1350  
1344 1351          /*
1345 1352           * Now, check if the block containing leaf is freeable
1346 1353           * and account accordingly.
1347 1354           */
1348 1355          err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
1349 1356          if (err != 0) {
1350 1357                  return (err);
1351 1358          }
1352 1359  
1353 1360          if (!add && dmu_buf_freeable(l->l_dbuf)) {
1354 1361                  *tooverwrite += l->l_dbuf->db_size;
1355 1362          } else {
1356 1363                  /*
1357 1364                   * If this an add operation, the leaf block could split.
1358 1365                   * Hence, we need to account for an additional leaf block.
1359 1366                   */
1360 1367                  *towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
1361 1368          }
1362 1369  
1363 1370          zap_put_leaf(l);
1364 1371          return (0);
1365 1372  }
  
    | 
      ↓ open down ↓ | 
    777 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX