Print this page
    
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/sys/sa_impl.h
          +++ new/usr/src/uts/common/fs/zfs/sys/sa_impl.h
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  #ifndef _SYS_SA_IMPL_H
  27   28  #define _SYS_SA_IMPL_H
  28   29  
  29   30  #include <sys/dmu.h>
  30   31  #include <sys/refcount.h>
  31   32  #include <sys/list.h>
  32   33  
  33   34  /*
  34   35   * Array of known attributes and their
  35   36   * various characteristics.
  36   37   */
  37   38  typedef struct sa_attr_table {
  38   39          sa_attr_type_t  sa_attr;
  39   40          uint8_t sa_registered;
  40   41          uint16_t sa_length;
  41   42          sa_bswap_type_t sa_byteswap;
  42   43          char *sa_name;
  43   44  } sa_attr_table_t;
  44   45  
  45   46  /*
  46   47   * Zap attribute format for attribute registration
  47   48   *
  48   49   * 64      56      48      40      32      24      16      8       0
  49   50   * +-------+-------+-------+-------+-------+-------+-------+-------+
  50   51   * |        unused         |      len      | bswap |   attr num    |
  51   52   * +-------+-------+-------+-------+-------+-------+-------+-------+
  52   53   *
  53   54   * Zap attribute format for layout information.
  54   55   *
  55   56   * layout information is stored as an array of attribute numbers
  56   57   * The name of the attribute is the layout number (0, 1, 2, ...)
  57   58   *
  58   59   * 16       0
  59   60   * +--------+
  60   61   * | attr # |
  61   62   * +--------+
  62   63   * | attr # |
  63   64   * +--------+
  64   65   *  ......
  65   66   *
  66   67   */
  67   68  
  68   69  #define ATTR_BSWAP(x)   BF32_GET(x, 16, 8)
  69   70  #define ATTR_LENGTH(x)  BF32_GET(x, 24, 16)
  70   71  #define ATTR_NUM(x)     BF32_GET(x, 0, 16)
           /*
            * ATTR_ENCODE packs a registration entry into x using the same
            * field positions that the three getters above read back: the
            * attribute number at bit 0 (16 bits wide), the byteswap type at
            * bit 16 (8 bits wide) and the length at bit 24 (16 bits wide).
            *
            * NOTE(review): the encoder uses BF64_SET while the getters use
            * BF32_GET, and the length field ends at bit 39.  The BF* macro
            * definitions are not visible in this header; confirm BF32_GET
            * extracts correctly from a 64-bit value for a field that
            * extends past bit 31.
            */
  71   72  #define ATTR_ENCODE(x, attr, length, bswap) \
  72   73  { \
  73   74          BF64_SET(x, 24, 16, length); \
  74   75          BF64_SET(x, 16, 8, bswap); \
  75   76          BF64_SET(x, 0, 16, attr); \
  76   77  }
  77   78  
  78   79  #define TOC_OFF(x)              BF32_GET(x, 0, 23)
  79   80  #define TOC_ATTR_PRESENT(x)     BF32_GET(x, 31, 1)
  80   81  #define TOC_LEN_IDX(x)          BF32_GET(x, 24, 4)
           /*
            * TOC_ATTR_ENCODE builds one table-of-contents entry in x: it sets
            * the "present" flag at bit 31, stores len_idx in a 7-bit field at
            * bit 24 and stores offset in a 24-bit field at bit 0.
            *
            * NOTE(review): the field widths written here do not match the
            * widths read by the getters above.  TOC_OFF reads 23 bits where
            * the encoder writes 24, and TOC_LEN_IDX reads 4 bits where the
            * encoder writes 7.  Presumably real offsets and length indexes
            * never reach the extra bits; confirm that, or align the widths.
            */
  81   82  #define TOC_ATTR_ENCODE(x, len_idx, offset) \
  82   83  { \
  83   84          BF32_SET(x, 31, 1, 1); \
  84   85          BF32_SET(x, 24, 7, len_idx); \
  85   86          BF32_SET(x, 0, 24, offset); \
  86   87  }
  87   88  
  88   89  #define SA_LAYOUTS      "LAYOUTS"
  89   90  #define SA_REGISTRY     "REGISTRY"
  90   91  
  91   92  /*
  92   93   * Each unique layout will have its own table
  93   94   * sa_lot (layout_table)
  94   95   */
  95   96  typedef struct sa_lot {
  96   97          avl_node_t lot_num_node;
  97   98          avl_node_t lot_hash_node;
  98   99          uint64_t lot_num;
  99  100          uint64_t lot_hash;
 100  101          sa_attr_type_t *lot_attrs;      /* array of attr #'s */
 101  102          uint32_t lot_var_sizes; /* how many aren't fixed size */
 102  103          uint32_t lot_attr_count;        /* total attr count */
 103  104          list_t  lot_idx_tab;    /* should be only a couple of entries */
 104  105          int     lot_instance;   /* used with lot_hash to identify entry */
 105  106  } sa_lot_t;
 106  107  
 107  108  /* index table of offsets */
 108  109  typedef struct sa_idx_tab {
 109  110          list_node_t     sa_next;
 110  111          sa_lot_t        *sa_layout;
 111  112          uint16_t        *sa_variable_lengths;
 112  113          refcount_t      sa_refcount;
 113  114          uint32_t        *sa_idx_tab;    /* array of offsets */
 114  115  } sa_idx_tab_t;
 115  116  
 116  117  /*
 117  118   * Since the offset/index information into the actual data
 118  119   * will usually be identical we can share that information with
 119  120   * all handles that have the exact same offsets.
 120  121   *
 121  122   * You would typically only have a large number of different tables of
 122  123   * contents if you had several variable sized attributes.
 123  124   *
 124  125   * Two AVL trees are used to track the attribute layout numbers.
 125  126   * One is keyed by number and will be consulted when a DMU_OT_SA
 126  127   * object is first read.  The second tree is keyed by the hash signature
 127  128   * of the attributes and will be consulted when an attribute is added
 128  129   * to determine if we already have an instance of that layout.  Both
 129  130   * of these trees are interconnected.  The only difference is that
 130  131   * when an entry is found in the "hash" tree the list of attributes will
 131  132   * need to be compared against the list of attributes you have in hand.
 132  133   * The assumption is that typically attributes will just be updated and
 133  134   * adding a completely new attribute is a very rare operation.
 134  135   */
 135  136  struct sa_os {
 136  137          kmutex_t        sa_lock;
 137  138          boolean_t       sa_need_attr_registration;
 138  139          boolean_t       sa_force_spill;
 139  140          uint64_t        sa_master_obj;
 140  141          uint64_t        sa_reg_attr_obj;
 141  142          uint64_t        sa_layout_attr_obj;
 142  143          int             sa_num_attrs;
 143  144          sa_attr_table_t *sa_attr_table;  /* private attr table */
 144  145          sa_update_cb_t  *sa_update_cb;
 145  146          avl_tree_t      sa_layout_num_tree;  /* keyed by layout number */
 146  147          avl_tree_t      sa_layout_hash_tree; /* keyed by layout hash value */
 147  148          int             sa_user_table_sz;
 148  149          sa_attr_type_t  *sa_user_table; /* user name->attr mapping table */
 149  150  };
 150  151  
 151  152  /*
 152  153   * header for all bonus and spill buffers.
 153  154   *
 154  155   * The header has a fixed portion with a variable number
 155  156   * of "lengths" depending on the number of variable sized
 156  157   * attributes which are determined by the "layout number"
 157  158   */
 158  159  
 159  160  #define SA_MAGIC        0x2F505A  /* ZFS SA */
 160  161  typedef struct sa_hdr_phys {
 161  162          uint32_t sa_magic;
 162  163          /* BEGIN CSTYLED */
 163  164          /*
 164  165           * Encoded with hdrsize and layout number as follows:
 165  166           * 16      10       0
 166  167           * +--------+-------+
 167  168           * | hdrsz  |layout |
 168  169           * +--------+-------+
 169  170           *
 170  171           * Bits 0-9 are the layout number (10 bits)
 171  172           * Bits 10-15 are the size of the header (6 bits).
 172  173           * The hdrsize is the number * 8
 173  174           *
 174  175           * For example.
 175  176           * hdrsz of 1 ==> 8 byte header
 176  177           *          2 ==> 16 byte header
 177  178           *
 178  179           */
 179  180          /* END CSTYLED */
 180  181          uint16_t sa_layout_info;
 181  182          uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
 182  183          /* ... Data follows the lengths.  */
 183  184  } sa_hdr_phys_t;
 184  185  
 185  186  #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
 186  187  #define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0)
 187  188  #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
 188  189  { \
 189  190          BF32_SET_SB(x, 10, 6, 3, 0, size); \
 190  191          BF32_SET(x, 0, 10, num); \
 191  192  }
 192  193  
 193  194  typedef enum sa_buf_type {
 194  195          SA_BONUS = 1,
 195  196          SA_SPILL = 2
 196  197  } sa_buf_type_t;
 197  198  
 198  199  typedef enum sa_data_op {
 199  200          SA_LOOKUP,
 200  201          SA_UPDATE,
 201  202          SA_ADD,
 202  203          SA_REPLACE,
  
    | 
      ↓ open down ↓ | 
    169 lines elided | 
    
      ↑ open up ↑ | 
  
 203  204          SA_REMOVE
 204  205  } sa_data_op_t;
 205  206  
 206  207  /*
 207  208   * Opaque handle used for most sa functions
 208  209   *
 209  210   * This needs to be kept as small as possible.
 210  211   */
 211  212  
 212  213  struct sa_handle {
      214 +        dmu_buf_user_t  sa_dbu;
 213  215          kmutex_t        sa_lock;
 214  216          dmu_buf_t       *sa_bonus;
 215  217          dmu_buf_t       *sa_spill;
 216  218          objset_t        *sa_os;
 217      -        void            *sa_userp;
      219 +        void            *sa_userp;
 218  220          sa_idx_tab_t    *sa_bonus_tab;   /* idx of bonus */
 219  221          sa_idx_tab_t    *sa_spill_tab; /* only present if spill activated */
 220  222  };
 221  223  
 222  224  #define SA_GET_DB(hdl, type)    \
 223  225          (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
 224  226  
 225  227  #define SA_GET_HDR(hdl, type) \
 226  228          ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
 227  229          type))->db.db_data))
 228  230  
 229  231  #define SA_IDX_TAB_GET(hdl, type) \
 230  232          (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
 231  233  
 232  234  #define IS_SA_BONUSTYPE(a)      \
 233  235          ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
 234  236  
 235  237  #define SA_BONUSTYPE_FROM_DB(db) \
 236  238          (dmu_get_bonustype((dmu_buf_t *)db))
 237  239  
 238  240  #define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
 239  241  
 240  242  #define SA_LAYOUT_NUM(x, type) \
 241  243          ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
 242  244          ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
 243  245  
 244  246  
 245  247  #define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
 246  248  
 247  249  #define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
 248  250          hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
 249  251          SA_REGISTERED_LEN(sa, attr))
 250  252  
 251  253  #define SA_SET_HDR(hdr, num, size) \
 252  254          { \
 253  255                  hdr->sa_magic = SA_MAGIC; \
 254  256                  SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
 255  257          }
 256  258  
 257  259  #define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
 258  260          { \
 259  261                  bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
 260  262                  bulk.sa_buftype = type; \
 261  263                  bulk.sa_addr = \
 262  264                      (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
 263  265                      (uintptr_t)hdr); \
 264  266  }
           /*
            * SA_ATTR_INFO (above) fills in three fields of bulk for one
            * attribute: sa_size from SA_ATTR_LEN, sa_buftype from type, and
            * sa_addr as the header address plus the attribute's TOC offset.
            *
            * NOTE(review): the hdl parameter is accepted but never referenced
            * in the expansion, and the macro arguments are used without
            * parentheses, so callers should pass simple identifiers only.
            */
 265  267  
 266  268  #define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
 267  269          (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
 268  270          (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
 269  271          sizeof (uint16_t), 8) : 0)))
 270  272  
 271  273  int sa_add_impl(sa_handle_t *, sa_attr_type_t,
 272  274      uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
 273  275  
 274  276  void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
 275  277  int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
 276  278  
 277  279  void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
 278  280  int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
 279  281      uint16_t *, sa_hdr_phys_t *);
 280  282  
 281  283  #ifdef  __cplusplus
 282  284  extern "C" {
 283  285  #endif
           /*
            * NOTE(review): nothing is declared between the opening and
            * closing halves of this extern "C" guard.  The function
            * prototypes in this header appear before the guard opens, so
            * they are not covered by it when the header is included from
            * C++.  Confirm whether the guard was meant to enclose them.
            */
 284  286  
 285  287  #ifdef  __cplusplus
 286  288  }
 287  289  #endif
 288  290  
 289  291  #endif  /* _SYS_SA_IMPL_H */
  
    | 
      ↓ open down ↓ | 
    62 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX