illumos-gate Sdiff usr/src/uts/common/fs/zfs/zap.c

Print this page

5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.

  24  */
  25 
  26 /*
  27  * This file contains the top half of the zfs directory structure
  28  * implementation. The bottom half is in zap_leaf.c.
  29  *
  30  * The zdir is an extendable hash data structure. There is a table of
  31  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  32  * each a constant size and hold a variable number of directory entries.
  33  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  34  *
  35  * The pointer table holds a power of 2 number of pointers.
  36  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  37  * by the pointer at index i in the table holds entries whose hash value
  38  * has a zd_prefix_len-bit prefix equal to i.
  39  */
  40 
  41 #include <sys/spa.h>
  42 #include <sys/dmu.h>
  43 #include <sys/zfs_context.h>
  44 #include <sys/zfs_znode.h>
  45 #include <sys/fs/zfs.h>
  46 #include <sys/zap.h>
  47 #include <sys/refcount.h>
  48 #include <sys/zap_impl.h>
  49 #include <sys/zap_leaf.h>
  50 
  51 int fzap_default_block_shift = 14; /* log2 of the default block size: 1 << 14 = 16k */
  52 
  53 extern inline zap_phys_t *zap_f_phys(zap_t *zap);
  54 
  55 static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
  56 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57 
  58 void
  59 fzap_byteswap(void *vbuf, size_t size)
  60 {
  61         uint64_t block_type;
  62 
  63         block_type = *(uint64_t *)vbuf;
  64 
  65         if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
  66                 zap_leaf_byteswap(vbuf, size);
  67         else {
  68                 /* it's a ptrtbl block */
  69                 byteswap_uint64_array(vbuf, size);
  70         }
  71 }
  72 
  73 void
  74 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  75 {
  76         dmu_buf_t *db;
  77         zap_leaf_t *l;
  78         int i;
  79         zap_phys_t *zp;
  80 
  81         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  82         zap->zap_ismicro = FALSE;
  83 
  84         (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict);
  85 
  86         mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
  87         zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  88 
  89         zp = zap_f_phys(zap);
  90         /*
  91          * explicitly zero it since it might be coming from an
  92          * initialized microzap
  93          */
  94         bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
  95         zp->zap_block_type = ZBT_HEADER;
  96         zp->zap_magic = ZAP_MAGIC;
  97 
  98         zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
  99 
 100         zp->zap_freeblk = 2;         /* block 1 will be the first leaf */
 101         zp->zap_num_leafs = 1;
 102         zp->zap_num_entries = 0;
 103         zp->zap_salt = zap->zap_salt;
 104         zp->zap_normflags = zap->zap_normflags;

 370 static void
 371 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 372 {
 373         dmu_buf_will_dirty(zap->zap_dbuf, tx);
 374         mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 375         ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 376         zap_f_phys(zap)->zap_num_entries += delta;
 377         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 378 }
 379 
 380 static uint64_t
 381 zap_allocate_blocks(zap_t *zap, int nblocks)
 382 {
 383         uint64_t newblk;
 384         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 385         newblk = zap_f_phys(zap)->zap_freeblk;
 386         zap_f_phys(zap)->zap_freeblk += nblocks;
 387         return (newblk);
 388 }
 389 









 390 static zap_leaf_t *
 391 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 392 {
 393         void *winner;
 394         zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
 395 
 396         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 397 
 398         rw_init(&l->l_rwlock, 0, 0, 0);
 399         rw_enter(&l->l_rwlock, RW_WRITER);
 400         l->l_blkid = zap_allocate_blocks(zap, 1);
 401         l->l_dbuf = NULL;
 402 
 403         VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 404             l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 405             DMU_READ_NO_PREFETCH));
 406         winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout);

 407         ASSERT(winner == NULL);
 408         dmu_buf_will_dirty(l->l_dbuf, tx);
 409 
 410         zap_leaf_init(l, zap->zap_normflags != 0);
 411 
 412         zap_f_phys(zap)->zap_num_leafs++;
 413 
 414         return (l);
 415 }
 416 
 417 int
 418 fzap_count(zap_t *zap, uint64_t *count)
 419 {
 420         ASSERT(!zap->zap_ismicro);
 421         mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
 422         *count = zap_f_phys(zap)->zap_num_entries;
 423         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 424         return (0);
 425 }
 426 
 427 /*
 428  * Routines for obtaining zap_leaf_t's
 429  */
 430 
 431 void
 432 zap_put_leaf(zap_leaf_t *l)
 433 {
 434         rw_exit(&l->l_rwlock);
 435         dmu_buf_rele(l->l_dbuf, NULL);
 436 }
 437 
 438 _NOTE(ARGSUSED(0))
 439 static void
 440 zap_leaf_pageout(dmu_buf_t *db, void *vl)
 441 {
 442         zap_leaf_t *l = vl;
 443 
 444         rw_destroy(&l->l_rwlock);
 445         kmem_free(l, sizeof (zap_leaf_t));
 446 }
 447 
 448 static zap_leaf_t *
 449 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 450 {
 451         zap_leaf_t *l, *winner;
 452 
 453         ASSERT(blkid != 0);
 454 
 455         l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
 456         rw_init(&l->l_rwlock, 0, 0, 0);
 457         rw_enter(&l->l_rwlock, RW_WRITER);
 458         l->l_blkid = blkid;
 459         l->l_bs = highbit64(db->db_size) - 1;
 460         l->l_dbuf = db;
 461 
 462         winner = dmu_buf_set_user(db, l, zap_leaf_pageout);

 463 
 464         rw_exit(&l->l_rwlock);
 465         if (winner != NULL) {
 466                 /* someone else set it first */
 467                 zap_leaf_pageout(NULL, l);
 468                 l = winner;
 469         }
 470 
 471         /*
 472          * lh_pad1 was previously used for the next leaf in the leaf
 473          * chain.  There should be no chained leafs (as we have removed
 474          * support for them).
 475          */
 476         ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 477 
 478         /*
 479          * There should be more hash entries than there can be
 480          * chunks to put in the hash table
 481          */
 482         ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 483 
 484         /* The chunks should begin at the end of the hash table */
 485         ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
 486             &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 487

   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25  */
  26 
  27 /*
  28  * This file contains the top half of the zfs directory structure
  29  * implementation. The bottom half is in zap_leaf.c.
  30  *
  31  * The zdir is an extendable hash data structure. There is a table of
  32  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  33  * each a constant size and hold a variable number of directory entries.
  34  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  35  *
  36  * The pointer table holds a power of 2 number of pointers.
  37  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  38  * by the pointer at index i in the table holds entries whose hash value
  39  * has a zd_prefix_len-bit prefix equal to i.
  40  */
  41 
  42 #include <sys/spa.h>
  43 #include <sys/dmu.h>
  44 #include <sys/zfs_context.h>
  45 #include <sys/zfs_znode.h>
  46 #include <sys/fs/zfs.h>
  47 #include <sys/zap.h>
  48 #include <sys/refcount.h>
  49 #include <sys/zap_impl.h>
  50 #include <sys/zap_leaf.h>
  51 
  52 int fzap_default_block_shift = 14; /* log2 of the default block size: 1 << 14 = 16k */
  53 
  54 extern inline zap_phys_t *zap_f_phys(zap_t *zap);
  55 

  56 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57 
  58 void
  59 fzap_byteswap(void *vbuf, size_t size)
  60 {
  61         uint64_t block_type;
  62 
  63         block_type = *(uint64_t *)vbuf;
  64 
  65         if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
  66                 zap_leaf_byteswap(vbuf, size);
  67         else {
  68                 /* it's a ptrtbl block */
  69                 byteswap_uint64_array(vbuf, size);
  70         }
  71 }
  72 
  73 void
  74 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  75 {
  76         dmu_buf_t *db;
  77         zap_leaf_t *l;
  78         int i;
  79         zap_phys_t *zp;
  80 
  81         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  82         zap->zap_ismicro = FALSE;
  83 
  84         zap->zap_dbu.dbu_evict_func = zap_evict;
  85 
  86         mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
  87         zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  88 
  89         zp = zap_f_phys(zap);
  90         /*
  91          * explicitly zero it since it might be coming from an
  92          * initialized microzap
  93          */
  94         bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
  95         zp->zap_block_type = ZBT_HEADER;
  96         zp->zap_magic = ZAP_MAGIC;
  97 
  98         zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
  99 
 100         zp->zap_freeblk = 2;         /* block 1 will be the first leaf */
 101         zp->zap_num_leafs = 1;
 102         zp->zap_num_entries = 0;
 103         zp->zap_salt = zap->zap_salt;
 104         zp->zap_normflags = zap->zap_normflags;

 370 static void
 371 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 372 {
 373         dmu_buf_will_dirty(zap->zap_dbuf, tx);
 374         mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 375         ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 376         zap_f_phys(zap)->zap_num_entries += delta;
 377         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 378 }
 379 
 380 static uint64_t
 381 zap_allocate_blocks(zap_t *zap, int nblocks)
 382 {
 383         uint64_t newblk;
 384         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 385         newblk = zap_f_phys(zap)->zap_freeblk;
 386         zap_f_phys(zap)->zap_freeblk += nblocks;
 387         return (newblk);
 388 }
 389 
 390 static void
 391 zap_leaf_pageout(void *dbu)
 392 {
 393         zap_leaf_t *l = dbu;
 394 
 395         rw_destroy(&l->l_rwlock);
 396         kmem_free(l, sizeof (zap_leaf_t));
 397 }
 398 
 399 static zap_leaf_t *
 400 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 401 {
 402         void *winner;
 403         zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 404 
 405         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 406 
 407         rw_init(&l->l_rwlock, 0, 0, 0);
 408         rw_enter(&l->l_rwlock, RW_WRITER);
 409         l->l_blkid = zap_allocate_blocks(zap, 1);
 410         l->l_dbuf = NULL;
 411 
 412         VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 413             l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 414             DMU_READ_NO_PREFETCH));
 415         dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
 416         winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
 417         ASSERT(winner == NULL);
 418         dmu_buf_will_dirty(l->l_dbuf, tx);
 419 
 420         zap_leaf_init(l, zap->zap_normflags != 0);
 421 
 422         zap_f_phys(zap)->zap_num_leafs++;
 423 
 424         return (l);
 425 }
 426 
 427 int
 428 fzap_count(zap_t *zap, uint64_t *count)
 429 {
 430         ASSERT(!zap->zap_ismicro);
 431         mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
 432         *count = zap_f_phys(zap)->zap_num_entries;
 433         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 434         return (0);
 435 }
 436 
 437 /*
 438  * Routines for obtaining zap_leaf_t's
 439  */
 440 
 441 void
 442 zap_put_leaf(zap_leaf_t *l)
 443 {
 444         rw_exit(&l->l_rwlock);
 445         dmu_buf_rele(l->l_dbuf, NULL);
 446 }
 447 










 448 static zap_leaf_t *
 449 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 450 {
 451         zap_leaf_t *l, *winner;
 452 
 453         ASSERT(blkid != 0);
 454 
 455         l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 456         rw_init(&l->l_rwlock, 0, 0, 0);
 457         rw_enter(&l->l_rwlock, RW_WRITER);
 458         l->l_blkid = blkid;
 459         l->l_bs = highbit64(db->db_size) - 1;
 460         l->l_dbuf = db;
 461 
 462         dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
 463         winner = dmu_buf_set_user(db, &l->l_dbu);
 464 
 465         rw_exit(&l->l_rwlock);
 466         if (winner != NULL) {
 467                 /* someone else set it first */
 468                 zap_leaf_pageout(&l->l_dbu);
 469                 l = winner;
 470         }
 471 
 472         /*
 473          * lh_pad1 was previously used for the next leaf in the leaf
 474          * chain.  There should be no chained leafs (as we have removed
 475          * support for them).
 476          */
 477         ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 478 
 479         /*
 480          * There should be more hash entries than there can be
 481          * chunks to put in the hash table
 482          */
 483         ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 484 
 485         /* The chunks should begin at the end of the hash table */
 486         ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
 487             &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 488