Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/zap.c
          +++ new/usr/src/uts/common/fs/zfs/zap.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  /*
  27   28   * This file contains the top half of the zfs directory structure
  28   29   * implementation. The bottom half is in zap_leaf.c.
  29   30   *
  30   31   * The zdir is an extendable hash data structure. There is a table of
  31   32   * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  32   33   * each a constant size and hold a variable number of directory entries.
  33   34   * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
↓ open down ↓ 11 lines elided ↑ open up ↑
  45   46  #include <sys/fs/zfs.h>
  46   47  #include <sys/zap.h>
  47   48  #include <sys/refcount.h>
  48   49  #include <sys/zap_impl.h>
  49   50  #include <sys/zap_leaf.h>
  50   51  
  51   52  int fzap_default_block_shift = 14; /* 16k blocksize */
  52   53  
  53   54  extern inline zap_phys_t *zap_f_phys(zap_t *zap);
  54   55  
  55      -static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
  56   56  static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57   57  
  58   58  void
  59   59  fzap_byteswap(void *vbuf, size_t size)
  60   60  {
  61   61          uint64_t block_type;
  62   62  
  63   63          block_type = *(uint64_t *)vbuf;
  64   64  
  65   65          if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
↓ open down ↓ 8 lines elided ↑ open up ↑
  74   74  fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  75   75  {
  76   76          dmu_buf_t *db;
  77   77          zap_leaf_t *l;
  78   78          int i;
  79   79          zap_phys_t *zp;
  80   80  
  81   81          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  82   82          zap->zap_ismicro = FALSE;
  83   83  
  84      -        (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict);
       84 +        zap->zap_dbu.dbu_evict_func = zap_evict;
  85   85  
  86   86          mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
  87   87          zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  88   88  
  89   89          zp = zap_f_phys(zap);
  90   90          /*
  91   91           * explicitly zero it since it might be coming from an
  92   92           * initialized microzap
  93   93           */
  94   94          bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
↓ open down ↓ 285 lines elided ↑ open up ↑
 380  380  static uint64_t
 381  381  zap_allocate_blocks(zap_t *zap, int nblocks)
 382  382  {
 383  383          uint64_t newblk;
 384  384          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 385  385          newblk = zap_f_phys(zap)->zap_freeblk;
 386  386          zap_f_phys(zap)->zap_freeblk += nblocks;
 387  387          return (newblk);
 388  388  }
 389  389  
      390 +static void
      391 +zap_leaf_pageout(void *dbu)
      392 +{
      393 +        zap_leaf_t *l = dbu;
      394 +
      395 +        rw_destroy(&l->l_rwlock);
      396 +        kmem_free(l, sizeof (zap_leaf_t));
      397 +}
      398 +
 390  399  static zap_leaf_t *
 391  400  zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 392  401  {
 393  402          void *winner;
 394      -        zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
      403 +        zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 395  404  
 396  405          ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 397  406  
 398  407          rw_init(&l->l_rwlock, 0, 0, 0);
 399  408          rw_enter(&l->l_rwlock, RW_WRITER);
 400  409          l->l_blkid = zap_allocate_blocks(zap, 1);
 401  410          l->l_dbuf = NULL;
 402  411  
 403  412          VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 404  413              l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 405  414              DMU_READ_NO_PREFETCH));
 406      -        winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout);
      415 +        dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
      416 +        winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
 407  417          ASSERT(winner == NULL);
 408  418          dmu_buf_will_dirty(l->l_dbuf, tx);
 409  419  
 410  420          zap_leaf_init(l, zap->zap_normflags != 0);
 411  421  
 412  422          zap_f_phys(zap)->zap_num_leafs++;
 413  423  
 414  424          return (l);
 415  425  }
 416  426  
↓ open down ↓ 11 lines elided ↑ open up ↑
 428  438   * Routines for obtaining zap_leaf_t's
 429  439   */
 430  440  
 431  441  void
 432  442  zap_put_leaf(zap_leaf_t *l)
 433  443  {
 434  444          rw_exit(&l->l_rwlock);
 435  445          dmu_buf_rele(l->l_dbuf, NULL);
 436  446  }
 437  447  
 438      -_NOTE(ARGSUSED(0))
 439      -static void
 440      -zap_leaf_pageout(dmu_buf_t *db, void *vl)
 441      -{
 442      -        zap_leaf_t *l = vl;
 443      -
 444      -        rw_destroy(&l->l_rwlock);
 445      -        kmem_free(l, sizeof (zap_leaf_t));
 446      -}
 447      -
 448  448  static zap_leaf_t *
 449  449  zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 450  450  {
 451  451          zap_leaf_t *l, *winner;
 452  452  
 453  453          ASSERT(blkid != 0);
 454  454  
 455      -        l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
      455 +        l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 456  456          rw_init(&l->l_rwlock, 0, 0, 0);
 457  457          rw_enter(&l->l_rwlock, RW_WRITER);
 458  458          l->l_blkid = blkid;
 459  459          l->l_bs = highbit64(db->db_size) - 1;
 460  460          l->l_dbuf = db;
 461  461  
 462      -        winner = dmu_buf_set_user(db, l, zap_leaf_pageout);
      462 +        dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
      463 +        winner = dmu_buf_set_user(db, &l->l_dbu);
 463  464  
 464  465          rw_exit(&l->l_rwlock);
 465  466          if (winner != NULL) {
 466  467                  /* someone else set it first */
 467      -                zap_leaf_pageout(NULL, l);
      468 +                zap_leaf_pageout(&l->l_dbu);
 468  469                  l = winner;
 469  470          }
 470  471  
 471  472          /*
 472  473           * lhr_pad was previously used for the next leaf in the leaf
 473  474           * chain.  There should be no chained leafs (as we have removed
 474  475           * support for them).
 475  476           */
 476  477          ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 477  478  
↓ open down ↓ 884 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX