Print this page
    
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/sys/dnode.h
          +++ new/usr/src/uts/common/fs/zfs/sys/dnode.h
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  #ifndef _SYS_DNODE_H
  27   28  #define _SYS_DNODE_H
  28   29  
  29   30  #include <sys/zfs_context.h>
  30   31  #include <sys/avl.h>
  31   32  #include <sys/spa.h>
  32   33  #include <sys/txg.h>
  33   34  #include <sys/zio.h>
  34   35  #include <sys/refcount.h>
  35   36  #include <sys/dmu_zfetch.h>
  36   37  #include <sys/zrlock.h>
  37   38  
  38   39  #ifdef  __cplusplus
  39   40  extern "C" {
  40   41  #endif
  41   42  
  42   43  /*
  43   44   * dnode_hold() flags.
  44   45   */
  45   46  #define DNODE_MUST_BE_ALLOCATED 1
  46   47  #define DNODE_MUST_BE_FREE      2
  47   48  
  48   49  /*
  49   50   * dnode_next_offset() flags.
  50   51   */
  51   52  #define DNODE_FIND_HOLE         1
  52   53  #define DNODE_FIND_BACKWARDS    2
  53   54  #define DNODE_FIND_HAVELOCK     4
  54   55  
  55   56  /*
  56   57   * Fixed constants.
  57   58   */
  58   59  #define DNODE_SHIFT             9       /* 512 bytes */
  59   60  #define DN_MIN_INDBLKSHIFT      12      /* 4k */
  60   61  #define DN_MAX_INDBLKSHIFT      14      /* 16k */
  61   62  #define DNODE_BLOCK_SHIFT       14      /* 16k */
  62   63  #define DNODE_CORE_SIZE         64      /* 64 bytes for dnode sans blkptrs */
  63   64  #define DN_MAX_OBJECT_SHIFT     48      /* 256 trillion (zfs_fid_t limit) */
  64   65  #define DN_MAX_OFFSET_SHIFT     64      /* 2^64 bytes in a dnode */
  65   66  
  66   67  /*
  67   68   * dnode id flags
  68   69   *
  69   70   * Note: a file will never ever have its
  70   71   * ids moved from bonus->spill
  71   72   * and only in a crypto environment would it be on spill
  72   73   */
  73   74  #define DN_ID_CHKED_BONUS       0x1
  74   75  #define DN_ID_CHKED_SPILL       0x2
  75   76  #define DN_ID_OLD_EXIST         0x4
  76   77  #define DN_ID_NEW_EXIST         0x8
  77   78  
  78   79  /*
  79   80   * Derived constants.
  80   81   */
  81   82  #define DNODE_SIZE      (1 << DNODE_SHIFT)
  82   83  #define DN_MAX_NBLKPTR  ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
  83   84  #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
  84   85  #define DN_MAX_OBJECT   (1ULL << DN_MAX_OBJECT_SHIFT)
  85   86  #define DN_ZERO_BONUSLEN        (DN_MAX_BONUSLEN + 1)
  86   87  #define DN_KILL_SPILLBLK (1)
  87   88  
  88   89  #define DNODES_PER_BLOCK_SHIFT  (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
  89   90  #define DNODES_PER_BLOCK        (1ULL << DNODES_PER_BLOCK_SHIFT)
  90   91  #define DNODES_PER_LEVEL_SHIFT  (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
  91   92  #define DNODES_PER_LEVEL        (1ULL << DNODES_PER_LEVEL_SHIFT)
  92   93  
  93   94  /* The +2 here is a cheesy way to round up */
  94   95  #define DN_MAX_LEVELS   (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
  95   96          (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
  96   97  
  97   98  #define DN_BONUS(dnp)   ((void*)((dnp)->dn_bonus + \
  98   99          (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
  99  100  
 100  101  #define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
 101  102          (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
 102  103  
 103  104  #define EPB(blkshift, typeshift)        (1 << (blkshift - typeshift))
 104  105  
 105  106  struct dmu_buf_impl;
 106  107  struct objset;
 107  108  struct zio;
 108  109  
 109  110  enum dnode_dirtycontext {
 110  111          DN_UNDIRTIED,
 111  112          DN_DIRTY_OPEN,
 112  113          DN_DIRTY_SYNC
 113  114  };
 114  115  
 115  116  /* Is dn_used in bytes?  If not, it's in multiples of SPA_MINBLOCKSIZE */
 116  117  #define DNODE_FLAG_USED_BYTES           (1<<0)
 117  118  #define DNODE_FLAG_USERUSED_ACCOUNTED   (1<<1)
 118  119  
 119  120  /* Does dnode have an SA spill blkptr in bonus? */
 120  121  #define DNODE_FLAG_SPILL_BLKPTR (1<<2)
 121  122  
 122  123  typedef struct dnode_phys {
 123  124          uint8_t dn_type;                /* dmu_object_type_t */
 124  125          uint8_t dn_indblkshift;         /* ln2(indirect block size) */
 125  126          uint8_t dn_nlevels;             /* 1=dn_blkptr->data blocks */
 126  127          uint8_t dn_nblkptr;             /* length of dn_blkptr */
 127  128          uint8_t dn_bonustype;           /* type of data in bonus buffer */
 128  129          uint8_t dn_checksum;            /* ZIO_CHECKSUM type */
 129  130          uint8_t dn_compress;            /* ZIO_COMPRESS type */
 130  131          uint8_t dn_flags;               /* DNODE_FLAG_* */
 131  132          uint16_t dn_datablkszsec;       /* data block size in 512b sectors */
 132  133          uint16_t dn_bonuslen;           /* length of dn_bonus */
 133  134          uint8_t dn_pad2[4];
 134  135  
 135  136          /* accounting is protected by dn_dirty_mtx */
 136  137          uint64_t dn_maxblkid;           /* largest allocated block ID */
 137  138          uint64_t dn_used;               /* bytes (or sectors) of disk space */
 138  139  
 139  140          uint64_t dn_pad3[4];
 140  141  
 141  142          blkptr_t dn_blkptr[1];
 142  143          uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
 143  144          blkptr_t dn_spill;
 144  145  } dnode_phys_t;
 145  146  
 146  147  typedef struct dnode {
 147  148          /*
 148  149           * Protects the structure of the dnode, including the number of levels
 149  150           * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
 150  151           */
 151  152          krwlock_t dn_struct_rwlock;
 152  153  
 153  154          /* Our link on dn_objset->os_dnodes list; protected by os_lock.  */
 154  155          list_node_t dn_link;
 155  156  
 156  157          /* immutable: */
 157  158          struct objset *dn_objset;
 158  159          uint64_t dn_object;
 159  160          struct dmu_buf_impl *dn_dbuf;
 160  161          struct dnode_handle *dn_handle;
 161  162          dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
 162  163  
 163  164          /*
 164  165           * Copies of stuff in dn_phys.  They're valid in the open
 165  166           * context (eg. even before the dnode is first synced).
 166  167           * Where necessary, these are protected by dn_struct_rwlock.
 167  168           */
 168  169          dmu_object_type_t dn_type;      /* object type */
 169  170          uint16_t dn_bonuslen;           /* bonus length */
 170  171          uint8_t dn_bonustype;           /* bonus type */
 171  172          uint8_t dn_nblkptr;             /* number of blkptrs (immutable) */
 172  173          uint8_t dn_checksum;            /* ZIO_CHECKSUM type */
 173  174          uint8_t dn_compress;            /* ZIO_COMPRESS type */
 174  175          uint8_t dn_nlevels;
 175  176          uint8_t dn_indblkshift;
 176  177          uint8_t dn_datablkshift;        /* zero if blksz not power of 2! */
 177  178          uint8_t dn_moved;               /* Has this dnode been moved? */
 178  179          uint16_t dn_datablkszsec;       /* in 512b sectors */
 179  180          uint32_t dn_datablksz;          /* in bytes */
 180  181          uint64_t dn_maxblkid;
 181  182          uint8_t dn_next_type[TXG_SIZE];
 182  183          uint8_t dn_next_nblkptr[TXG_SIZE];
 183  184          uint8_t dn_next_nlevels[TXG_SIZE];
 184  185          uint8_t dn_next_indblkshift[TXG_SIZE];
 185  186          uint8_t dn_next_bonustype[TXG_SIZE];
 186  187          uint8_t dn_rm_spillblk[TXG_SIZE];       /* for removing spill blk */
 187  188          uint16_t dn_next_bonuslen[TXG_SIZE];
 188  189          uint32_t dn_next_blksz[TXG_SIZE];       /* next block size in bytes */
 189  190  
 190  191          /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
 191  192          uint32_t dn_dbufs_count;        /* count of dn_dbufs */
 192  193          /* There are no level-0 blocks of this blkid or higher in dn_dbufs */
 193  194          uint64_t dn_unlisted_l0_blkid;
 194  195  
 195  196          /* protected by os_lock: */
 196  197          list_node_t dn_dirty_link[TXG_SIZE];    /* next on dataset's dirty */
 197  198  
 198  199          /* protected by dn_mtx: */
 199  200          kmutex_t dn_mtx;
 200  201          list_t dn_dirty_records[TXG_SIZE];
 201  202          struct range_tree *dn_free_ranges[TXG_SIZE];
 202  203          uint64_t dn_allocated_txg;
 203  204          uint64_t dn_free_txg;
 204  205          uint64_t dn_assigned_txg;
 205  206          kcondvar_t dn_notxholds;
 206  207          enum dnode_dirtycontext dn_dirtyctx;
 207  208          uint8_t *dn_dirtyctx_firstset;          /* dbg: contents meaningless */
 208  209  
 209  210          /* protected by own devices */
 210  211          refcount_t dn_tx_holds;
 211  212          refcount_t dn_holds;
 212  213  
 213  214          kmutex_t dn_dbufs_mtx;
 214  215          /*
 215  216           * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
 216  217           * can contain multiple dbufs of the same (level, blkid) when a
 217  218           * dbuf is marked DB_EVICTING without being removed from
 218  219           * dn_dbufs. To maintain the avl invariant that there cannot be
 219  220           * duplicate entries, we order the dbufs by an arbitrary value -
 220  221           * their address in memory. This means that dn_dbufs cannot be used to
 221  222           * directly look up a dbuf. Instead, callers must use avl_walk, have
 222  223           * a reference to the dbuf, or look up a non-existent node with
 223  224           * db_state = DB_SEARCH (see dbuf_free_range for an example).
 224  225           */
 225  226          avl_tree_t dn_dbufs;
 226  227  
 227  228          /* protected by dn_struct_rwlock */
 228  229          struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */
 229  230  
 230  231          boolean_t dn_have_spill;        /* have spill or are spilling */
 231  232  
 232  233          /* parent IO for current sync write */
 233  234          zio_t *dn_zio;
 234  235  
 235  236          /* used in syncing context */
 236  237          uint64_t dn_oldused;    /* old phys used bytes */
 237  238          uint64_t dn_oldflags;   /* old phys dn_flags */
 238  239          uint64_t dn_olduid, dn_oldgid;
 239  240          uint64_t dn_newuid, dn_newgid;
 240  241          int dn_id_flags;
 241  242  
 242  243          /* holds prefetch structure */
 243  244          struct zfetch   dn_zfetch;
 244  245  } dnode_t;
 245  246  
 246  247  /*
 247  248   * Adds a level of indirection between the dbuf and the dnode to avoid
  
    | 
      ↓ open down ↓ | 
    214 lines elided | 
    
      ↑ open up ↑ | 
  
 248  249   * iterating descendent dbufs in dnode_move(). Handles are not allocated
 249  250   * individually, but as an array of child dnodes in dnode_hold_impl().
 250  251   */
 251  252  typedef struct dnode_handle {
 252  253          /* Protects dnh_dnode from modification by dnode_move(). */
 253  254          zrlock_t dnh_zrlock;
 254  255          dnode_t *dnh_dnode;
 255  256  } dnode_handle_t;
 256  257  
 257  258  typedef struct dnode_children {
      259 +        dmu_buf_user_t dnc_dbu;         /* User evict data */
 258  260          size_t dnc_count;               /* number of children */
 259  261          dnode_handle_t dnc_children[];  /* sized dynamically */
 260  262  } dnode_children_t;
 261  263  
 262  264  typedef struct free_range {
 263  265          avl_node_t fr_node;
 264  266          uint64_t fr_blkid;
 265  267          uint64_t fr_nblks;
 266  268  } free_range_t;
 267  269  
 268      -dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
      270 +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
 269  271      uint64_t object, dnode_handle_t *dnh);
 270  272  void dnode_special_close(dnode_handle_t *dnh);
 271  273  
 272  274  void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
 273  275  void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
 274  276  void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
 275  277  
 276  278  int dnode_hold(struct objset *dd, uint64_t object,
 277  279      void *ref, dnode_t **dnp);
 278  280  int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
 279  281      void *ref, dnode_t **dnp);
 280  282  boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 281  283  void dnode_rele(dnode_t *dn, void *ref);
 282  284  void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 283  285  void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 284  286  void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 285  287      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 286  288  void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 287  289      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 288  290  void dnode_free(dnode_t *dn, dmu_tx_t *tx);
 289  291  void dnode_byteswap(dnode_phys_t *dnp);
 290  292  void dnode_buf_byteswap(void *buf, size_t size);
 291  293  void dnode_verify(dnode_t *dn);
 292  294  int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
 293  295  void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
 294  296  void dnode_diduse_space(dnode_t *dn, int64_t space);
 295  297  void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
 296  298  void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
 297  299  uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
 298  300  void dnode_init(void);
 299  301  void dnode_fini(void);
 300  302  int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
 301  303      int minlvl, uint64_t blkfill, uint64_t txg);
 302  304  void dnode_evict_dbufs(dnode_t *dn);
 303  305  
 304  306  #ifdef ZFS_DEBUG
 305  307  
 306  308  /*
 307  309   * There should be a ## between the string literal and fmt, to make it
 307  309   * clear that we're joining two strings together, but gcc does not
 308  310   * support that preprocessor token.
 310  312   */
 /*
  * dprintf_dnode(dn, fmt, ...): debug print tagged with a dnode's object id.
  *
  * Does nothing unless ZFS_DEBUG_DPRINTF is set in zfs_flags.  Otherwise it
  * builds a short label for the object -- the literal "mdn" when dn_object
  * equals DMU_META_DNODE_OBJECT, else the object number in decimal -- and
  * forwards to dprintf_ds() on the dnode's objset's os_dsl_dataset with
  * "obj=<label> " prepended to fmt.
  *
  * Caveats visible from the expansion:
  *  - fmt must be a string literal; it is joined to "obj=%s " by adjacent
  *    literal concatenation.
  *  - At least one variadic argument is required, because __VA_ARGS__
  *    follows a comma in the dprintf_ds() call.
  *  - dn is evaluated twice, so it must not have side effects.
  */
 311  313  #define dprintf_dnode(dn, fmt, ...) do { \
 312  314          if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 313  315          char __db_buf[32]; \
 314  316          uint64_t __db_obj = (dn)->dn_object; \
 315  317          if (__db_obj == DMU_META_DNODE_OBJECT) \
 316  318                  (void) strcpy(__db_buf, "mdn"); \
 317  319          else \
 318  320                  (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
 319  321                      (u_longlong_t)__db_obj);\
 320  322          dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
 321  323              __db_buf, __VA_ARGS__); \
 322  324          } \
 323  325  _NOTE(CONSTCOND) } while (0)
 324  326  
 325  327  #define DNODE_VERIFY(dn)                dnode_verify(dn)
 326  328  #define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
 327  329  
 328  330  #else
 329  331  
 330  332  #define dprintf_dnode(db, fmt, ...)
 331  333  #define DNODE_VERIFY(dn)
 332  334  #define FREE_VERIFY(db, start, end, tx)
 333  335  
 334  336  #endif
 335  337  
 336  338  #ifdef  __cplusplus
 337  339  }
 338  340  #endif
 339  341  
 340  342  #endif  /* _SYS_DNODE_H */
  
    | 
      ↓ open down ↓ | 
    62 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX