Print this page
    
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/sa.c
          +++ new/usr/src/uts/common/fs/zfs/sa.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  
    | 
      ↓ open down ↓ | 
    10 lines elided | 
    
      ↑ open up ↑ | 
  
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23   24   * Portions Copyright 2011 iXsystems, Inc
       25 + * Copyright (c) 2012 by Delphix. All rights reserved.
  24   26   */
  25   27  
  26   28  #include <sys/zfs_context.h>
  27   29  #include <sys/types.h>
  28   30  #include <sys/param.h>
  29   31  #include <sys/systm.h>
  30   32  #include <sys/sysmacros.h>
  31   33  #include <sys/dmu.h>
  32   34  #include <sys/dmu_impl.h>
  33   35  #include <sys/dmu_objset.h>
  34   36  #include <sys/dbuf.h>
  35   37  #include <sys/dnode.h>
  36   38  #include <sys/zap.h>
  37   39  #include <sys/sa.h>
  38   40  #include <sys/sunddi.h>
  39   41  #include <sys/sa_impl.h>
  40   42  #include <sys/dnode.h>
  41   43  #include <sys/errno.h>
  42   44  #include <sys/zfs_context.h>
  43   45  
  44   46  /*
  45   47   * ZFS System attributes:
  46   48   *
  47   49   * A generic mechanism to allow for arbitrary attributes
  48   50   * to be stored in a dnode.  The data will be stored in the bonus buffer of
  49   51   * the dnode and if necessary a special "spill" block will be used to handle
  50   52   * overflow situations.  The spill block will be sized to fit the data
  51   53   * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
  52   54   * spill block is stored at the end of the current bonus buffer.  Any
  53   55   * attributes that would be in the way of the blkptr_t will be relocated
  54   56   * into the spill block.
  55   57   *
  56   58   * Attribute registration:
  57   59   *
  58   60   * Stored persistently on a per dataset basis
  59   61   * a mapping between attribute "string" names and their actual attribute
  60   62   * numeric values, length, and byteswap function.  The names are only used
  61   63   * during registration.  All attributes are known by their unique attribute
  62   64   * id value.  If an attribute can have a variable size then the value
  63   65   * 0 will be used to indicate this.
  64   66   *
  65   67   * Attribute Layout:
  66   68   *
  67   69   * Attribute layouts are a way to compactly store multiple attributes, but
  68   70   * without taking the overhead associated with managing each attribute
  69   71   * individually.  Since you will typically have the same set of attributes
  70   72   * stored in the same order a single table will be used to represent that
  71   73   * layout.  The ZPL for example will usually have only about 10 different
  72   74   * layouts (regular files, device files, symlinks,
  73   75   * regular files + scanstamp, files/dir with extended attributes, and then
  74   76   * you have the possibility of all of those minus ACL, because it would
  75   77   * be kicked out into the spill block)
  76   78   *
  77   79   * Layouts are simply an array of the attributes and their
  78   80   * ordering i.e. [0, 1, 4, 5, 2]
  79   81   *
  80   82   * Each distinct layout is given a unique layout number and that is what is
  81   83   * stored in the header at the beginning of the SA data buffer.
  82   84   *
  83   85   * A layout only covers a single dbuf (bonus or spill).  If a set of
  84   86   * attributes is split up between the bonus buffer and a spill buffer then
  85   87   * two different layouts will be used.  This allows us to byteswap the
  86   88   * spill without looking at the bonus buffer and keeps the on disk format of
  87   89   * the bonus and spill buffer the same.
  88   90   *
  89   91   * Adding a single attribute will cause the entire set of attributes to
  90   92   * be rewritten and could result in a new layout number being constructed
  91   93   * as part of the rewrite if no such layout exists for the new set of
  92   94   * attributes.  The new attribute will be appended to the end of the already
  93   95   * existing attributes.
  94   96   *
  95   97   * Both the attribute registration and attribute layout information are
  96   98   * stored in normal ZAP attributes.  There should be a small number of
  97   99   * known layouts and the set of attributes is assumed to typically be quite
  98  100   * small.
  99  101   *
 100  102   * The registered attributes and layout "table" information is maintained
 101  103   * in core and a special "sa_os_t" is attached to the objset_t.
 102  104   *
 103  105   * A special interface is provided to allow for quickly applying
 104  106   * a large set of attributes at once.  sa_replace_all_by_template() is
 105  107   * used to set an array of attributes.  This is used by the ZPL when
 106  108   * creating a brand new file.  The template that is passed into the function
 107  109   * specifies the attribute, size for variable length attributes, location of
 108  110   * data and special "data locator" function if the data isn't in a contiguous
 109  111   * location.
 110  112   *
 111  113   * Byteswap implications:
 112  114   * Since the SA attributes are not entirely self describing we can't do
 113  115   * the normal byteswap processing.  The special ZAP layout attribute and
 114  116   * attribute registration attributes define the byteswap function and the
 115  117   * size of the attributes, unless it is variable sized.
 116  118   * The normal ZFS byteswapping infrastructure assumes you don't need
 117  119   * to read any objects in order to do the necessary byteswapping.  Whereas
 118  120   * SA attributes can only be properly byteswapped if the dataset is opened
 119  121   * and the layout/attribute ZAP attributes are available.  Because of this
 120  122   * the SA attributes will be byteswapped when they are first accessed by
 121  123   * the SA code that will read the SA data.
 122  124   */
 123  125  
           /*
            * Callback signature (the name suggests it is invoked once per
            * attribute while iterating over SA data -- TODO confirm at the call
            * sites).  As named in the prototype, the callback receives the header
            * pointer, the attribute's address and type, its length and length
            * index, and a caller-supplied userp cookie.  The meaning of the
            * unnamed boolean_t argument is not visible here.
            */
 124  126  typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
 125  127      uint16_t length, int length_idx, boolean_t, void *userp);
 126  128  
           /*
            * Forward declarations of file-local (static) helpers so that they can
            * be referenced before their definitions.
            */
 127  129  static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
 128  130  static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
 129  131  static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
 130  132      void *data);
 131  133  static void sa_idx_tab_rele(objset_t *os, void *arg);
 132  134  static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
 133  135      int buflen);
 134  136  static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
 135  137      sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
 136  138      uint16_t buflen, dmu_tx_t *tx);
 137  139  
           /*
            * Table of byteswap routines for attribute data.  Presumably indexed by
            * the byteswap-type constants used in attribute registrations (e.g.
            * SA_UINT64_ARRAY, SA_UINT8_ARRAY) -- TODO confirm the constant order
            * against sa.h before reordering or extending this table.
            */
 138  140  arc_byteswap_func_t *sa_bswap_table[] = {
 139  141          byteswap_uint64_array,
 140  142          byteswap_uint32_array,
 141  143          byteswap_uint16_array,
 142  144          byteswap_uint8_array,
 143  145          zfs_acl_byteswap,
 144  146  };
 145  147  
 146  148  #define SA_COPY_DATA(f, s, t, l) \
 147  149          { \
 148  150                  if (f == NULL) { \
 149  151                          if (l == 8) { \
 150  152                                  *(uint64_t *)t = *(uint64_t *)s; \
 151  153                          } else if (l == 16) { \
 152  154                                  *(uint64_t *)t = *(uint64_t *)s; \
 153  155                                  *(uint64_t *)((uintptr_t)t + 8) = \
 154  156                                      *(uint64_t *)((uintptr_t)s + 8); \
 155  157                          } else { \
 156  158                                  bcopy(s, t, l); \
 157  159                          } \
 158  160                  } else \
 159  161                          sa_copy_data(f, s, t, l); \
 160  162          }
 161  163  
 162  164  /*
 163  165   * This table is fixed and cannot be changed.  Its purpose is to
 164  166   * allow the SA code to work with both old/new ZPL file systems.
 165  167   * It contains the list of legacy attributes.  These attributes aren't
 166  168   * stored in the "attribute" registry zap objects, since older ZPL file systems
 167  169   * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
 168  170   * use this static table.
            *
            * Each entry appears to be { name, length in bytes, byteswap type,
            * attribute number } -- field order inferred from the initializers;
            * confirm against the sa_attr_reg_t definition.  The attribute numbers
            * run 0..15 in table order, matching sa_legacy_zpl_layout below.
 169  171   */
 170  172  sa_attr_reg_t sa_legacy_attrs[] = {
 171  173          {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
 172  174          {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
 173  175          {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
 174  176          {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
 175  177          {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
 176  178          {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
 177  179          {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
 178  180          {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
 179  181          {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
 180  182          {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
 181  183          {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
 182  184          {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
 183  185          {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
 184  186          {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
 185  187          {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
 186  188          {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
 187  189  };
 188  190  
 189  191  /*
 190  192   * ZPL legacy layout
 191  193   * This is only used for objects of type DMU_OT_ZNODE
            *
            * The layout lists attribute numbers 0 through 15, i.e. every entry of
            * sa_legacy_attrs in table order.
 192  194   */
 193  195  sa_attr_type_t sa_legacy_zpl_layout[] = {
 194  196      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 195  197  };
 196  198  
 197  199  /*
 198  200   * Special dummy layout used for buffers with no attributes.
            * The array holds a single 0 entry as a placeholder.
 199  201   */
 200  202  
 201  203  sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
 202  204  
           /* Number of entries in sa_legacy_attrs (attribute numbers 0..15). */
 203  205  static int sa_legacy_attr_count = 16;
           /* kmem cache of sa_handle_t objects; created by sa_cache_init(). */
 204  206  static kmem_cache_t *sa_cache = NULL;
 205  207  
           /*
            * kmem cache constructor for sa_handle_t objects.  Clears the handle's
            * index-table, objset, user-pointer and bonus/spill buffer pointers and
            * initializes its mutex.  The "unused" and "kmflag" arguments are
            * ignored.  Always returns 0.
            */
 206  208  /*ARGSUSED*/
 207  209  static int
 208  210  sa_cache_constructor(void *buf, void *unused, int kmflag)
 209  211  {
 210  212          sa_handle_t *hdl = buf;
 211  213  
 212  214          hdl->sa_bonus_tab = NULL;
 213  215          hdl->sa_spill_tab = NULL;
 214  216          hdl->sa_os = NULL;
 215  217          hdl->sa_userp = NULL;
 216  218          hdl->sa_bonus = NULL;
 217  219          hdl->sa_spill = NULL;
 218  220          mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 219  221          return (0);
 220  222  }
 221  223  
           /*
            * kmem cache destructor for sa_handle_t objects: destroys the mutex
            * that sa_cache_constructor() initialized.  The "unused" argument is
            * ignored.
            */
 222  224  /*ARGSUSED*/
 223  225  static void
 224  226  sa_cache_destructor(void *buf, void *unused)
 225  227  {
 226  228          sa_handle_t *hdl = buf;
 227  229          mutex_destroy(&hdl->sa_lock);
 228  230  }
 229  231  
           /*
            * Create the "sa_cache" kmem cache of sa_handle_t objects, wiring in
            * sa_cache_constructor() and sa_cache_destructor().
            */
 230  232  void
 231  233  sa_cache_init(void)
 232  234  {
 233  235          sa_cache = kmem_cache_create("sa_cache",
 234  236              sizeof (sa_handle_t), 0, sa_cache_constructor,
 235  237              sa_cache_destructor, NULL, NULL, NULL, 0);
 236  238  }
 237  239  
           /*
            * Destroy the sa_handle_t kmem cache.  Does nothing if the cache
            * pointer is still NULL (sa_cache_init() was never called).
            */
 238  240  void
 239  241  sa_cache_fini(void)
 240  242  {
 241  243          if (sa_cache)
 242  244                  kmem_cache_destroy(sa_cache);
 243  245  }
 244  246  
           /*
            * Comparison function ordering sa_lot_t nodes by layout number
            * (lot_num).  Returns 1, -1 or 0 when arg1 is greater than, less than
            * or equal to arg2.
            */
 245  247  static int
 246  248  layout_num_compare(const void *arg1, const void *arg2)
 247  249  {
 248  250          const sa_lot_t *node1 = arg1;
 249  251          const sa_lot_t *node2 = arg2;
 250  252  
 251  253          if (node1->lot_num > node2->lot_num)
 252  254                  return (1);
 253  255          else if (node1->lot_num < node2->lot_num)
 254  256                  return (-1);
 255  257          return (0);
 256  258  }
 257  259  
           /*
            * Comparison function ordering sa_lot_t nodes by layout hash
            * (lot_hash), with lot_instance as the tie-breaker so that several
            * layouts sharing the same hash can coexist in one tree.  Returns 1, -1
            * or 0.
            */
 258  260  static int
 259  261  layout_hash_compare(const void *arg1, const void *arg2)
 260  262  {
 261  263          const sa_lot_t *node1 = arg1;
 262  264          const sa_lot_t *node2 = arg2;
 263  265  
 264  266          if (node1->lot_hash > node2->lot_hash)
 265  267                  return (1);
 266  268          if (node1->lot_hash < node2->lot_hash)
 267  269                  return (-1);
 268  270          if (node1->lot_instance > node2->lot_instance)
 269  271                  return (1);
 270  272          if (node1->lot_instance < node2->lot_instance)
 271  273                  return (-1);
 272  274          return (0);
 273  275  }
 274  276  
           /*
            * Compare the attribute list of layout "tbf" against attrs[0..count-1].
            *
            * NOTE: despite the name and the boolean_t return type, the result
            * follows the memcmp() convention: 0 means the layouts are identical
            * (same count and same attributes in the same order) and 1 means they
            * differ.  Callers must test the result against 0.
            */
 275  277  boolean_t
 276  278  sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
 277  279  {
 278  280          int i;
 279  281  
 280  282          if (count != tbf->lot_attr_count)
 281  283                  return (1);
 282  284  
 283  285          for (i = 0; i != count; i++) {
 284  286                  if (attrs[i] != tbf->lot_attrs[i])
 285  287                          return (1);
 286  288          }
 287  289          return (0);
 288  290  }
 289  291  
           /*
            * Per-attribute hash: the zfs_crc64_table entry selected by the low 8
            * bits of (-1ULL ^ attr).  Only the low byte of attr participates, so
            * attribute numbers that are equal modulo 256 hash identically.
            */
 290  292  #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
 291  293  
           /*
            * Compute the hash of a layout: starting from -1ULL, XOR in
            * SA_ATTR_HASH() of each of the attr_count attributes.  Because XOR is
            * commutative the result does not depend on attribute order, so
            * different layouts can share a hash; sa_find_layout() therefore also
            * compares the attribute arrays.
            */
 292  294  static uint64_t
 293  295  sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
 294  296  {
 295  297          int i;
 296  298          uint64_t crc = -1ULL;
 297  299  
 298  300          for (i = 0; i != attr_count; i++)
 299  301                  crc ^= SA_ATTR_HASH(attrs[i]);
 300  302  
 301  303          return (crc);
 302  304  }
 303  305  
           /*
            * Make sure the handle's spill buffer is held.  If hdl->sa_spill is not
            * yet set, try dmu_spill_hold_existing() on the bonus buffer and, on
            * success, build the spill index (a failure to build it trips VERIFY).
            *
            * Returns 0 when the spill buffer is held (either already or newly),
            * otherwise the error returned by dmu_spill_hold_existing().
            */
 304  306  static int
 305  307  sa_get_spill(sa_handle_t *hdl)
 306  308  {
 307  309          int rc;
 308  310          if (hdl->sa_spill == NULL) {
 309  311                  if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
 310  312                      &hdl->sa_spill)) == 0)
 311  313                          VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 312  314          } else {
 313  315                  rc = 0;
 314  316          }
 315  317  
 316  318          return (rc);
 317  319  }
 318  320  
 319  321  /*
 320  322   * Main attribute lookup/update function
 321  323   * returns 0 for success or non zero for failures
 322  324   *
 323  325   * Operates on bulk array, first failure will abort further processing
            *
            * For each of the "count" entries in "bulk" the attribute is located
            * first in the bonus buffer and then, if not found there, in the spill
            * buffer.  What happens next depends on data_op:
            *
            *   SA_LOOKUP - returns ENOENT if the attribute was not found;
            *               otherwise, if sa_data is set, the value is copied out.
            *   SA_UPDATE - an existing attribute of unchanged length is
            *               overwritten in place; a length change or a new
            *               attribute goes through sa_modify_attrs().
            *
            * Any other data_op value is not handled by the switch and simply
            * proceeds to the next entry.  When "tx" is non-NULL the buffers that
            * hold the located attributes are marked dirty.  An ECKSUM error from
            * fetching the spill buffer is reported to the caller as EIO.
 324  326   */
 325  327  int
 326  328  sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
 327  329      sa_data_op_t data_op, dmu_tx_t *tx)
 328  330  {
 329  331          sa_os_t *sa = hdl->sa_os->os_sa;
 330  332          int i;
 331  333          int error = 0;
                   /* bitmask of buffer types already dirtied during this call */
 332  334          sa_buf_type_t buftypes;
 333  335  
 334  336          buftypes = 0;
 335  337  
 336  338          ASSERT(count > 0);
 337  339          for (i = 0; i != count; i++) {
 338  340                  ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
 339  341  
 340  342                  bulk[i].sa_addr = NULL;
 341  343                  /* First check the bonus buffer */
 342  344  
 343  345                  if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
 344  346                      hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
 345  347                          SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
 346  348                              SA_GET_HDR(hdl, SA_BONUS),
 347  349                              bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
 348  350                          if (tx && !(buftypes & SA_BONUS)) {
 349  351                                  dmu_buf_will_dirty(hdl->sa_bonus, tx);
 350  352                                  buftypes |= SA_BONUS;
 351  353                          }
 352  354                  }
                           /* Not in the bonus buffer: try the spill buffer, if any */
 353  355                  if (bulk[i].sa_addr == NULL &&
 354  356                      ((error = sa_get_spill(hdl)) == 0)) {
 355  357                          if (TOC_ATTR_PRESENT(
 356  358                              hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
 357  359                                  SA_ATTR_INFO(sa, hdl->sa_spill_tab,
 358  360                                      SA_GET_HDR(hdl, SA_SPILL),
 359  361                                      bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
                                           /*
                                            * Unlike the bonus buffer, the spill is only
                                            * dirtied here when the length is unchanged.
                                            */
 360  362                                  if (tx && !(buftypes & SA_SPILL) &&
 361  363                                      bulk[i].sa_size == bulk[i].sa_length) {
 362  364                                          dmu_buf_will_dirty(hdl->sa_spill, tx);
 363  365                                          buftypes |= SA_SPILL;
 364  366                                  }
 365  367                          }
 366  368                  }
                           /*
                            * ENOENT from sa_get_spill() is tolerated here; the
                            * switch below decides what a missing attribute means.
                            */
 367  369                  if (error && error != ENOENT) {
 368  370                          return ((error == ECKSUM) ? EIO : error);
 369  371                  }
 370  372  
 371  373                  switch (data_op) {
 372  374                  case SA_LOOKUP:
 373  375                          if (bulk[i].sa_addr == NULL)
 374  376                                  return (ENOENT);
 375  377                          if (bulk[i].sa_data) {
 376  378                                  SA_COPY_DATA(bulk[i].sa_data_func,
 377  379                                      bulk[i].sa_addr, bulk[i].sa_data,
 378  380                                      bulk[i].sa_size);
 379  381                          }
 380  382                          continue;
 381  383  
 382  384                  case SA_UPDATE:
 383  385                          /* existing rewrite of attr */
 384  386                          if (bulk[i].sa_addr &&
 385  387                              bulk[i].sa_size == bulk[i].sa_length) {
 386  388                                  SA_COPY_DATA(bulk[i].sa_data_func,
 387  389                                      bulk[i].sa_data, bulk[i].sa_addr,
 388  390                                      bulk[i].sa_length);
 389  391                                  continue;
 390  392                          } else if (bulk[i].sa_addr) { /* attr size change */
 391  393                                  error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 392  394                                      SA_REPLACE, bulk[i].sa_data_func,
 393  395                                      bulk[i].sa_data, bulk[i].sa_length, tx);
 394  396                          } else { /* adding new attribute */
 395  397                                  error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 396  398                                      SA_ADD, bulk[i].sa_data_func,
 397  399                                      bulk[i].sa_data, bulk[i].sa_length, tx);
 398  400                          }
 399  401                          if (error)
 400  402                                  return (error);
 401  403                          break;
 402  404                  }
 403  405          }
 404  406          return (error);
 405  407  }
 406  408  
 407  409  static sa_lot_t *
 408  410  sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
 409  411      uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
 410  412  {
 411  413          sa_os_t *sa = os->os_sa;
 412  414          sa_lot_t *tb, *findtb;
 413  415          int i;
 414  416          avl_index_t loc;
 415  417  
 416  418          ASSERT(MUTEX_HELD(&sa->sa_lock));
 417  419          tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
 418  420          tb->lot_attr_count = attr_count;
 419  421          tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
  
    | 
      ↓ open down ↓ | 
    386 lines elided | 
    
      ↑ open up ↑ | 
  
 420  422              KM_SLEEP);
 421  423          bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
 422  424          tb->lot_num = lot_num;
 423  425          tb->lot_hash = hash;
 424  426          tb->lot_instance = 0;
 425  427  
 426  428          if (zapadd) {
 427  429                  char attr_name[8];
 428  430  
 429  431                  if (sa->sa_layout_attr_obj == 0) {
 430      -                        sa->sa_layout_attr_obj = zap_create(os,
 431      -                            DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
 432      -                        VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
 433      -                            &sa->sa_layout_attr_obj, tx) == 0);
      432 +                        sa->sa_layout_attr_obj = zap_create_link(os,
      433 +                            DMU_OT_SA_ATTR_LAYOUTS,
      434 +                            sa->sa_master_obj, SA_LAYOUTS, tx);
 434  435                  }
 435  436  
 436  437                  (void) snprintf(attr_name, sizeof (attr_name),
 437  438                      "%d", (int)lot_num);
 438  439                  VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
 439  440                      attr_name, 2, attr_count, attrs, tx));
 440  441          }
 441  442  
 442  443          list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
 443  444              offsetof(sa_idx_tab_t, sa_next));
 444  445  
 445  446          for (i = 0; i != attr_count; i++) {
 446  447                  if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
 447  448                          tb->lot_var_sizes++;
 448  449          }
 449  450  
 450  451          avl_add(&sa->sa_layout_num_tree, tb);
 451  452  
 452  453          /* verify we don't have a hash collision */
 453  454          if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
 454  455                  for (; findtb && findtb->lot_hash == hash;
 455  456                      findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
 456  457                          if (findtb->lot_instance != tb->lot_instance)
 457  458                                  break;
 458  459                          tb->lot_instance++;
 459  460                  }
 460  461          }
 461  462          avl_add(&sa->sa_layout_hash_tree, tb);
 462  463          return (tb);
 463  464  }
 464  465  
           /*
            * Find the layout whose attribute list equals attrs[0..count-1], or
            * create it.  The search starts at the hash-tree node with the given
            * hash and instance 0 and walks forward over all nodes sharing that
            * hash, comparing attribute arrays.  If none matches, a new layout is
            * added (and written to the ZAP, since zapadd is B_TRUE) using the
            * current node count of the layout-number tree as its layout number.
            *
            * The result is returned through *lot.  sa->sa_lock is held for the
            * duration of the search/insert.
            */
 465  466  static void
 466  467  sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
 467  468      int count, dmu_tx_t *tx, sa_lot_t **lot)
 468  469  {
 469  470          sa_lot_t *tb, tbsearch;
 470  471          avl_index_t loc;
 471  472          sa_os_t *sa = os->os_sa;
 472  473          boolean_t found = B_FALSE;
 473  474  
 474  475          mutex_enter(&sa->sa_lock);
 475  476          tbsearch.lot_hash = hash;
 476  477          tbsearch.lot_instance = 0;
 477  478          tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
 478  479          if (tb) {
 479  480                  for (; tb && tb->lot_hash == hash;
 480  481                      tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
                                   /* sa_layout_equal() returns 0 on a match */
 481  482                          if (sa_layout_equal(tb, attrs, count) == 0) {
 482  483                                  found = B_TRUE;
 483  484                                  break;
 484  485                          }
 485  486                  }
 486  487          }
 487  488          if (!found) {
 488  489                  tb = sa_add_layout_entry(os, attrs, count,
 489  490                      avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
 490  491          }
 491  492          mutex_exit(&sa->sa_lock);
 492  493          *lot = tb;
 493  494  }
 494  495  
           /*
            * Set the block size of the handle's spill buffer so that it can hold
            * "size" bytes.  A size of 0 selects SPA_MINBLOCKSIZE; any other size
            * is rounded up to a multiple of SPA_MINBLOCKSIZE.  Sizes larger than
            * SPA_MAXBLOCKSIZE are rejected with EFBIG (and trip an ASSERT in
            * debug builds).
            *
            * Returns 0 on success or the error from dbuf_spill_set_blksz().
            */
 495  496  static int
 496  497  sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 497  498  {
 498  499          int error;
 499  500          uint32_t blocksize;
 500  501  
 501  502          if (size == 0) {
 502  503                  blocksize = SPA_MINBLOCKSIZE;
 503  504          } else if (size > SPA_MAXBLOCKSIZE) {
 504  505                  ASSERT(0);
 505  506                  return (EFBIG);
 506  507          } else {
 507  508                  blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
 508  509          }
 509  510  
 510  511          error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
 511  512          ASSERT(error == 0);
 512  513          return (error);
 513  514  }
 514  515  
           /*
            * Copy buflen bytes of attribute data into "target".
            *
            * With no locator function the data at "datastart" is contiguous and is
            * copied with a single bcopy().  Otherwise "func" is called repeatedly;
            * each call yields the address and length of the next piece of data,
            * which is appended to the target, until buflen bytes have been
            * gathered.  The "start" flag passed to the locator is B_TRUE only on
            * the first call.
            *
            * NOTE(review): the loop trusts the locator to return a non-zero length
            * that does not overrun the remaining space; there is no bounds check
            * on the pieces it hands back.
            */
 515  516  static void
 516  517  sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
 517  518  {
 518  519          if (func == NULL) {
 519  520                  bcopy(datastart, target, buflen);
 520  521          } else {
 521  522                  boolean_t start;
 522  523                  int bytes;
 523  524                  void *dataptr;
 524  525                  void *saptr = target;
 525  526                  uint32_t length;
 526  527  
 527  528                  start = B_TRUE;
 528  529                  bytes = 0;
 529  530                  while (bytes < buflen) {
 530  531                          func(&dataptr, &length, buflen, start, datastart);
 531  532                          bcopy(dataptr, saptr, length);
 532  533                          saptr = (void *)((caddr_t)saptr + length);
 533  534                          bytes += length;
 534  535                          start = B_FALSE;
 535  536                  }
 536  537          }
 537  538  }
 538  539  
 539  540  /*
 540  541   * Determine several different sizes:
 541  542   * first the SA header size (the return value, rounded up to 8 bytes),
 542  543   * the number of bytes to be stored (returned in *total), and,
 543  544   * if a spill would occur, the index in the attribute array (in *index).
 544  545   *
 545  546   * The boolean will_spill will be set when spilling is necessary.  It
 546  547   * is only set when the buftype is SA_BONUS.
            *
            * If buftype is SA_BONUS and sa->sa_force_spill is set, the function
            * returns 0 immediately with *total = 0, *index = 0 and *will_spill =
            * B_TRUE.  Otherwise *index is left at -1 when no spill point is found.
 547  548   */
 548  549  static int
 549  550  sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
 550  551      dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
 551  552      boolean_t *will_spill)
 552  553  {
 553  554          int var_size = 0;
 554  555          int i;
 555  556          int full_space;
 556  557          int hdrsize;
 557  558          boolean_t done = B_FALSE;
 558  559  
 559  560          if (buftype == SA_BONUS && sa->sa_force_spill) {
 560  561                  *total = 0;
 561  562                  *index = 0;
 562  563                  *will_spill = B_TRUE;
 563  564                  return (0);
 564  565          }
 565  566  
 566  567          *index = -1;
 567  568          *total = 0;
 568  569  
 569  570          if (buftype == SA_BONUS)
 570  571                  *will_spill = B_FALSE;
 571  572  
                   /* DMU_OT_ZNODE buffers are sized without an SA header */
 572  573          hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
 573  574              sizeof (sa_hdr_phys_t);
 574  575  
 575  576          full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
 576  577  
 577  578          for (i = 0; i != attr_count; i++) {
 578  579                  boolean_t is_var_sz;
 579  580  
 580  581                  *total += attr_desc[i].sa_length;
                           /* spill point already found: only keep summing sizes */
 581  582                  if (done)
 582  583                          goto next;
 583  584  
 584  585                  is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
 585  586                  if (is_var_sz) {
 586  587                          var_size++;
 587  588                  }
 588  589  
                           /*
                            * Each variable-length attribute after the first grows
                            * the header by one uint16_t, provided the data still
                            * fits; otherwise this attribute is the spill point.
                            * Note that the "continue" below bypasses the check at
                            * the "next" label for this iteration.
                            */
 589  590                  if (is_var_sz && var_size > 1) {
 590  591                          if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
 591  592                              *total < full_space) {
 592  593                                  hdrsize += sizeof (uint16_t);
 593  594                          } else {
 594  595                                  done = B_TRUE;
 595  596                                  *index = i;
 596  597                                  if (buftype == SA_BONUS)
 597  598                                          *will_spill = B_TRUE;
 598  599                                  continue;
 599  600                          }
 600  601                  }
 601  602  
 602  603                  /*
 603  604                   * find index of where spill *could* occur.
 604  605                   * Then continue to count the space used by the
 605  606                   * remaining attributes.  The sum is used later for
 606  607                   * sizing the bonus and spill buffers.
 607  608                   */
 608  609                  if (buftype == SA_BONUS && *index == -1 &&
 609  610                      *total + P2ROUNDUP(hdrsize, 8) >
 610  611                      (full_space - sizeof (blkptr_t))) {
 611  612                          *index = i;
 612  613                          done = B_TRUE;
 613  614                  }
 614  615  
 615  616  next:
                           /* data plus header no longer fits in the bonus buffer */
 616  617                  if (*total + P2ROUNDUP(hdrsize, 8) > full_space &&
 617  618                      buftype == SA_BONUS)
 618  619                          *will_spill = B_TRUE;
 619  620          }
 620  621  
 621  622          hdrsize = P2ROUNDUP(hdrsize, 8);
 622  623          return (hdrsize);
 623  624  }
 624  625  
           /*
            * Buffer space needed for "total" bytes of attribute data plus a header
            * of "header" bytes.  The expansion is not parenthesized, so pass only
            * simple expressions and do not embed the macro in a larger arithmetic
            * expression.
            */
 625  626  #define BUF_SPACE_NEEDED(total, header) (total + header)
 626  627  
 627  628  /*
 628  629   * Find layout that corresponds to ordering of attributes
 629  630   * If not found a new layout number is created and added to
 630  631   * persistent layout tables.
 631  632   */
 632  633  static int
 633  634  sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
 634  635      dmu_tx_t *tx)
 635  636  {
 636  637          sa_os_t *sa = hdl->sa_os->os_sa;
 637  638          uint64_t hash;
 638  639          sa_buf_type_t buftype;
 639  640          sa_hdr_phys_t *sahdr;
 640  641          void *data_start;
 641  642          int buf_space;
 642  643          sa_attr_type_t *attrs, *attrs_start;
 643  644          int i, lot_count;
 644  645          int hdrsize, spillhdrsize;
 645  646          int used;
 646  647          dmu_object_type_t bonustype;
 647  648          sa_lot_t *lot;
 648  649          int len_idx;
 649  650          int spill_used;
 650  651          boolean_t spilling;
 651  652  
 652  653          dmu_buf_will_dirty(hdl->sa_bonus, tx);
 653  654          bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
 654  655  
 655  656          /* first determine bonus header size and sum of all attributes */
 656  657          hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
 657  658              SA_BONUS, &i, &used, &spilling);
 658  659  
 659  660          if (used > SPA_MAXBLOCKSIZE)
 660  661                  return (EFBIG);
 661  662  
 662  663          VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
 663  664              MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
 664  665              used + hdrsize, tx));
 665  666  
 666  667          ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
 667  668              bonustype == DMU_OT_SA);
 668  669  
 669  670          /* setup and size spill buffer when needed */
 670  671          if (spilling) {
 671  672                  boolean_t dummy;
 672  673  
 673  674                  if (hdl->sa_spill == NULL) {
 674  675                          VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
 675  676                              &hdl->sa_spill) == 0);
 676  677                  }
 677  678                  dmu_buf_will_dirty(hdl->sa_spill, tx);
 678  679  
 679  680                  spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
 680  681                      attr_count - i, hdl->sa_spill, SA_SPILL, &i,
 681  682                      &spill_used, &dummy);
 682  683  
 683  684                  if (spill_used > SPA_MAXBLOCKSIZE)
 684  685                          return (EFBIG);
 685  686  
 686  687                  buf_space = hdl->sa_spill->db_size - spillhdrsize;
 687  688                  if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
 688  689                      hdl->sa_spill->db_size)
 689  690                          VERIFY(0 == sa_resize_spill(hdl,
 690  691                              BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
 691  692          }
 692  693  
 693  694          /* setup starting pointers to lay down data */
 694  695          data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
 695  696          sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
 696  697          buftype = SA_BONUS;
 697  698  
 698  699          if (spilling)
 699  700                  buf_space = (sa->sa_force_spill) ?
 700  701                      0 : SA_BLKPTR_SPACE - hdrsize;
 701  702          else
 702  703                  buf_space = hdl->sa_bonus->db_size - hdrsize;
 703  704  
 704  705          attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 705  706              KM_SLEEP);
 706  707          lot_count = 0;
 707  708  
 708  709          for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
 709  710                  uint16_t length;
 710  711  
 711  712                  attrs[i] = attr_desc[i].sa_attr;
 712  713                  length = SA_REGISTERED_LEN(sa, attrs[i]);
 713  714                  if (length == 0)
 714  715                          length = attr_desc[i].sa_length;
 715  716  
 716  717                  if (buf_space < length) {  /* switch to spill buffer */
 717  718                          VERIFY(bonustype == DMU_OT_SA);
 718  719                          if (buftype == SA_BONUS && !sa->sa_force_spill) {
 719  720                                  sa_find_layout(hdl->sa_os, hash, attrs_start,
 720  721                                      lot_count, tx, &lot);
 721  722                                  SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
 722  723                          }
 723  724  
 724  725                          buftype = SA_SPILL;
 725  726                          hash = -1ULL;
 726  727                          len_idx = 0;
 727  728  
 728  729                          sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
 729  730                          sahdr->sa_magic = SA_MAGIC;
 730  731                          data_start = (void *)((uintptr_t)sahdr +
 731  732                              spillhdrsize);
 732  733                          attrs_start = &attrs[i];
 733  734                          buf_space = hdl->sa_spill->db_size - spillhdrsize;
 734  735                          lot_count = 0;
 735  736                  }
 736  737                  hash ^= SA_ATTR_HASH(attrs[i]);
 737  738                  attr_desc[i].sa_addr = data_start;
 738  739                  attr_desc[i].sa_size = length;
 739  740                  SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
 740  741                      data_start, length);
 741  742                  if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
 742  743                          sahdr->sa_lengths[len_idx++] = length;
 743  744                  }
 744  745                  data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 745  746                      length), 8);
 746  747                  buf_space -= P2ROUNDUP(length, 8);
 747  748                  lot_count++;
 748  749          }
 749  750  
 750  751          sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
 751  752  
 752  753          /*
 753  754           * Verify that old znodes always have layout number 0.
 754  755           * Must be DMU_OT_SA for arbitrary layouts
 755  756           */
 756  757          VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
 757  758              (bonustype == DMU_OT_SA && lot->lot_num > 1));
 758  759  
 759  760          if (bonustype == DMU_OT_SA) {
 760  761                  SA_SET_HDR(sahdr, lot->lot_num,
 761  762                      buftype == SA_BONUS ? hdrsize : spillhdrsize);
 762  763          }
 763  764  
 764  765          kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
 765  766          if (hdl->sa_bonus_tab) {
 766  767                  sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 767  768                  hdl->sa_bonus_tab = NULL;
 768  769          }
 769  770          if (!sa->sa_force_spill)
 770  771                  VERIFY(0 == sa_build_index(hdl, SA_BONUS));
 771  772          if (hdl->sa_spill) {
 772  773                  sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 773  774                  if (!spilling) {
 774  775                          /*
 775  776                           * remove spill block that is no longer needed.
 776  777                           */
 777  778                          dmu_buf_rele(hdl->sa_spill, NULL);
 778  779                          hdl->sa_spill = NULL;
 779  780                          hdl->sa_spill_tab = NULL;
 780  781                          VERIFY(0 == dmu_rm_spill(hdl->sa_os,
 781  782                              sa_handle_object(hdl), tx));
 782  783                  } else {
 783  784                          VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 784  785                  }
 785  786          }
 786  787  
 787  788          return (0);
 788  789  }
 789  790  
 790  791  static void
 791  792  sa_free_attr_table(sa_os_t *sa)
 792  793  {
 793  794          int i;
 794  795  
 795  796          if (sa->sa_attr_table == NULL)
 796  797                  return;
 797  798  
 798  799          for (i = 0; i != sa->sa_num_attrs; i++) {
 799  800                  if (sa->sa_attr_table[i].sa_name)
 800  801                          kmem_free(sa->sa_attr_table[i].sa_name,
 801  802                              strlen(sa->sa_attr_table[i].sa_name) + 1);
 802  803          }
 803  804  
 804  805          kmem_free(sa->sa_attr_table,
 805  806              sizeof (sa_attr_table_t) * sa->sa_num_attrs);
 806  807  
 807  808          sa->sa_attr_table = NULL;
 808  809  }
 809  810  
 810  811  static int
 811  812  sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 812  813  {
 813  814          sa_os_t *sa = os->os_sa;
 814  815          uint64_t sa_attr_count = 0;
 815  816          uint64_t sa_reg_count;
 816  817          int error = 0;
 817  818          uint64_t attr_value;
 818  819          sa_attr_table_t *tb;
 819  820          zap_cursor_t zc;
 820  821          zap_attribute_t za;
 821  822          int registered_count = 0;
 822  823          int i;
 823  824          dmu_objset_type_t ostype = dmu_objset_type(os);
 824  825  
 825  826          sa->sa_user_table =
 826  827              kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
 827  828          sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
 828  829  
 829  830          if (sa->sa_reg_attr_obj != 0) {
 830  831                  error = zap_count(os, sa->sa_reg_attr_obj,
 831  832                      &sa_attr_count);
 832  833  
 833  834                  /*
 834  835                   * Make sure we retrieved a count and that it isn't zero
 835  836                   */
 836  837                  if (error || (error == 0 && sa_attr_count == 0)) {
 837  838                          if (error == 0)
 838  839                                  error = EINVAL;
 839  840                          goto bail;
 840  841                  }
 841  842                  sa_reg_count = sa_attr_count;
 842  843          }
 843  844  
 844  845          if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
 845  846                  sa_attr_count += sa_legacy_attr_count;
 846  847  
 847  848          /* Allocate attribute numbers for attributes that aren't registered */
 848  849          for (i = 0; i != count; i++) {
 849  850                  boolean_t found = B_FALSE;
 850  851                  int j;
 851  852  
 852  853                  if (ostype == DMU_OST_ZFS) {
 853  854                          for (j = 0; j != sa_legacy_attr_count; j++) {
 854  855                                  if (strcmp(reg_attrs[i].sa_name,
 855  856                                      sa_legacy_attrs[j].sa_name) == 0) {
 856  857                                          sa->sa_user_table[i] =
 857  858                                              sa_legacy_attrs[j].sa_attr;
 858  859                                          found = B_TRUE;
 859  860                                  }
 860  861                          }
 861  862                  }
 862  863                  if (found)
 863  864                          continue;
 864  865  
 865  866                  if (sa->sa_reg_attr_obj)
 866  867                          error = zap_lookup(os, sa->sa_reg_attr_obj,
 867  868                              reg_attrs[i].sa_name, 8, 1, &attr_value);
 868  869                  else
 869  870                          error = ENOENT;
 870  871                  switch (error) {
 871  872                  case ENOENT:
 872  873                          sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
 873  874                          sa_attr_count++;
 874  875                          break;
 875  876                  case 0:
 876  877                          sa->sa_user_table[i] = ATTR_NUM(attr_value);
 877  878                          break;
 878  879                  default:
 879  880                          goto bail;
 880  881                  }
 881  882          }
 882  883  
 883  884          sa->sa_num_attrs = sa_attr_count;
 884  885          tb = sa->sa_attr_table =
 885  886              kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
 886  887  
 887  888          /*
 888  889           * Attribute table is constructed from requested attribute list,
 889  890           * previously foreign registered attributes, and also the legacy
 890  891           * ZPL set of attributes.
 891  892           */
 892  893  
 893  894          if (sa->sa_reg_attr_obj) {
 894  895                  for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
 895  896                      (error = zap_cursor_retrieve(&zc, &za)) == 0;
 896  897                      zap_cursor_advance(&zc)) {
 897  898                          uint64_t value;
 898  899                          value  = za.za_first_integer;
 899  900  
 900  901                          registered_count++;
 901  902                          tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
 902  903                          tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
 903  904                          tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
 904  905                          tb[ATTR_NUM(value)].sa_registered = B_TRUE;
 905  906  
 906  907                          if (tb[ATTR_NUM(value)].sa_name) {
 907  908                                  continue;
 908  909                          }
 909  910                          tb[ATTR_NUM(value)].sa_name =
 910  911                              kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
 911  912                          (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
 912  913                              strlen(za.za_name) +1);
 913  914                  }
 914  915                  zap_cursor_fini(&zc);
 915  916                  /*
 916  917                   * Make sure we processed the correct number of registered
 917  918                   * attributes
 918  919                   */
 919  920                  if (registered_count != sa_reg_count) {
 920  921                          ASSERT(error != 0);
 921  922                          goto bail;
 922  923                  }
 923  924  
 924  925          }
 925  926  
 926  927          if (ostype == DMU_OST_ZFS) {
 927  928                  for (i = 0; i != sa_legacy_attr_count; i++) {
 928  929                          if (tb[i].sa_name)
 929  930                                  continue;
 930  931                          tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
 931  932                          tb[i].sa_length = sa_legacy_attrs[i].sa_length;
 932  933                          tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
 933  934                          tb[i].sa_registered = B_FALSE;
 934  935                          tb[i].sa_name =
 935  936                              kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
 936  937                              KM_SLEEP);
 937  938                          (void) strlcpy(tb[i].sa_name,
 938  939                              sa_legacy_attrs[i].sa_name,
 939  940                              strlen(sa_legacy_attrs[i].sa_name) + 1);
 940  941                  }
 941  942          }
 942  943  
 943  944          for (i = 0; i != count; i++) {
 944  945                  sa_attr_type_t attr_id;
 945  946  
 946  947                  attr_id = sa->sa_user_table[i];
 947  948                  if (tb[attr_id].sa_name)
 948  949                          continue;
 949  950  
 950  951                  tb[attr_id].sa_length = reg_attrs[i].sa_length;
 951  952                  tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
 952  953                  tb[attr_id].sa_attr = attr_id;
 953  954                  tb[attr_id].sa_name =
 954  955                      kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
 955  956                  (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
 956  957                      strlen(reg_attrs[i].sa_name) + 1);
 957  958          }
 958  959  
 959  960          sa->sa_need_attr_registration =
 960  961              (sa_attr_count != registered_count);
 961  962  
 962  963          return (0);
 963  964  bail:
 964  965          kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
 965  966          sa->sa_user_table = NULL;
 966  967          sa_free_attr_table(sa);
 967  968          return ((error != 0) ? error : EINVAL);
 968  969  }
 969  970  
 970  971  int
 971  972  sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
 972  973      sa_attr_type_t **user_table)
 973  974  {
 974  975          zap_cursor_t zc;
 975  976          zap_attribute_t za;
 976  977          sa_os_t *sa;
 977  978          dmu_objset_type_t ostype = dmu_objset_type(os);
 978  979          sa_attr_type_t *tb;
 979  980          int error;
 980  981  
 981  982          mutex_enter(&os->os_lock);
 982  983          if (os->os_sa) {
 983  984                  mutex_enter(&os->os_sa->sa_lock);
 984  985                  mutex_exit(&os->os_lock);
 985  986                  tb = os->os_sa->sa_user_table;
 986  987                  mutex_exit(&os->os_sa->sa_lock);
 987  988                  *user_table = tb;
 988  989                  return (0);
 989  990          }
 990  991  
 991  992          sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
 992  993          mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 993  994          sa->sa_master_obj = sa_obj;
 994  995  
 995  996          os->os_sa = sa;
 996  997          mutex_enter(&sa->sa_lock);
 997  998          mutex_exit(&os->os_lock);
 998  999          avl_create(&sa->sa_layout_num_tree, layout_num_compare,
 999 1000              sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
1000 1001          avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
1001 1002              sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
1002 1003  
1003 1004          if (sa_obj) {
1004 1005                  error = zap_lookup(os, sa_obj, SA_LAYOUTS,
1005 1006                      8, 1, &sa->sa_layout_attr_obj);
1006 1007                  if (error != 0 && error != ENOENT)
1007 1008                          goto fail;
1008 1009                  error = zap_lookup(os, sa_obj, SA_REGISTRY,
1009 1010                      8, 1, &sa->sa_reg_attr_obj);
1010 1011                  if (error != 0 && error != ENOENT)
1011 1012                          goto fail;
1012 1013          }
1013 1014  
1014 1015          if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
1015 1016                  goto fail;
1016 1017  
1017 1018          if (sa->sa_layout_attr_obj != 0) {
1018 1019                  uint64_t layout_count;
1019 1020  
1020 1021                  error = zap_count(os, sa->sa_layout_attr_obj,
1021 1022                      &layout_count);
1022 1023  
1023 1024                  /*
1024 1025                   * Layout number count should be > 0
1025 1026                   */
1026 1027                  if (error || (error == 0 && layout_count == 0)) {
1027 1028                          if (error == 0)
1028 1029                                  error = EINVAL;
1029 1030                          goto fail;
1030 1031                  }
1031 1032  
1032 1033                  for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
1033 1034                      (error = zap_cursor_retrieve(&zc, &za)) == 0;
1034 1035                      zap_cursor_advance(&zc)) {
1035 1036                          sa_attr_type_t *lot_attrs;
1036 1037                          uint64_t lot_num;
1037 1038  
1038 1039                          lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
1039 1040                              za.za_num_integers, KM_SLEEP);
1040 1041  
1041 1042                          if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
1042 1043                              za.za_name, 2, za.za_num_integers,
1043 1044                              lot_attrs))) != 0) {
1044 1045                                  kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1045 1046                                      za.za_num_integers);
1046 1047                                  break;
1047 1048                          }
1048 1049                          VERIFY(ddi_strtoull(za.za_name, NULL, 10,
1049 1050                              (unsigned long long *)&lot_num) == 0);
1050 1051  
1051 1052                          (void) sa_add_layout_entry(os, lot_attrs,
1052 1053                              za.za_num_integers, lot_num,
1053 1054                              sa_layout_info_hash(lot_attrs,
1054 1055                              za.za_num_integers), B_FALSE, NULL);
1055 1056                          kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1056 1057                              za.za_num_integers);
1057 1058                  }
1058 1059                  zap_cursor_fini(&zc);
1059 1060  
1060 1061                  /*
1061 1062                   * Make sure layout count matches number of entries added
1062 1063                   * to AVL tree
1063 1064                   */
1064 1065                  if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
1065 1066                          ASSERT(error != 0);
1066 1067                          goto fail;
1067 1068                  }
1068 1069          }
1069 1070  
1070 1071          /* Add special layout number for old ZNODES */
1071 1072          if (ostype == DMU_OST_ZFS) {
1072 1073                  (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
1073 1074                      sa_legacy_attr_count, 0,
1074 1075                      sa_layout_info_hash(sa_legacy_zpl_layout,
1075 1076                      sa_legacy_attr_count), B_FALSE, NULL);
1076 1077  
1077 1078                  (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
1078 1079                      0, B_FALSE, NULL);
1079 1080          }
1080 1081          *user_table = os->os_sa->sa_user_table;
1081 1082          mutex_exit(&sa->sa_lock);
1082 1083          return (0);
1083 1084  fail:
1084 1085          os->os_sa = NULL;
1085 1086          sa_free_attr_table(sa);
1086 1087          if (sa->sa_user_table)
1087 1088                  kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1088 1089          mutex_exit(&sa->sa_lock);
1089 1090          kmem_free(sa, sizeof (sa_os_t));
1090 1091          return ((error == ECKSUM) ? EIO : error);
1091 1092  }
1092 1093  
1093 1094  void
1094 1095  sa_tear_down(objset_t *os)
1095 1096  {
1096 1097          sa_os_t *sa = os->os_sa;
1097 1098          sa_lot_t *layout;
1098 1099          void *cookie;
1099 1100  
1100 1101          kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1101 1102  
1102 1103          /* Free up attr table */
1103 1104  
1104 1105          sa_free_attr_table(sa);
1105 1106  
1106 1107          cookie = NULL;
1107 1108          while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
1108 1109                  sa_idx_tab_t *tab;
1109 1110                  while (tab = list_head(&layout->lot_idx_tab)) {
1110 1111                          ASSERT(refcount_count(&tab->sa_refcount));
1111 1112                          sa_idx_tab_rele(os, tab);
1112 1113                  }
1113 1114          }
1114 1115  
1115 1116          cookie = NULL;
1116 1117          while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
1117 1118                  kmem_free(layout->lot_attrs,
1118 1119                      sizeof (sa_attr_type_t) * layout->lot_attr_count);
1119 1120                  kmem_free(layout, sizeof (sa_lot_t));
1120 1121          }
1121 1122  
1122 1123          avl_destroy(&sa->sa_layout_hash_tree);
1123 1124          avl_destroy(&sa->sa_layout_num_tree);
1124 1125  
1125 1126          kmem_free(sa, sizeof (sa_os_t));
1126 1127          os->os_sa = NULL;
1127 1128  }
1128 1129  
1129 1130  void
1130 1131  sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
1131 1132      uint16_t length, int length_idx, boolean_t var_length, void *userp)
1132 1133  {
1133 1134          sa_idx_tab_t *idx_tab = userp;
1134 1135  
1135 1136          if (var_length) {
1136 1137                  ASSERT(idx_tab->sa_variable_lengths);
1137 1138                  idx_tab->sa_variable_lengths[length_idx] = length;
1138 1139          }
1139 1140          TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
1140 1141              (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
1141 1142  }
1142 1143  
1143 1144  static void
1144 1145  sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
1145 1146      sa_iterfunc_t func, sa_lot_t *tab, void *userp)
1146 1147  {
1147 1148          void *data_start;
1148 1149          sa_lot_t *tb = tab;
1149 1150          sa_lot_t search;
1150 1151          avl_index_t loc;
1151 1152          sa_os_t *sa = os->os_sa;
1152 1153          int i;
1153 1154          uint16_t *length_start = NULL;
1154 1155          uint8_t length_idx = 0;
1155 1156  
1156 1157          if (tab == NULL) {
1157 1158                  search.lot_num = SA_LAYOUT_NUM(hdr, type);
1158 1159                  tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1159 1160                  ASSERT(tb);
1160 1161          }
1161 1162  
1162 1163          if (IS_SA_BONUSTYPE(type)) {
1163 1164                  data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
1164 1165                      offsetof(sa_hdr_phys_t, sa_lengths) +
1165 1166                      (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
1166 1167                  length_start = hdr->sa_lengths;
1167 1168          } else {
1168 1169                  data_start = hdr;
1169 1170          }
1170 1171  
1171 1172          for (i = 0; i != tb->lot_attr_count; i++) {
1172 1173                  int attr_length, reg_length;
1173 1174                  uint8_t idx_len;
1174 1175  
1175 1176                  reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
1176 1177                  if (reg_length) {
1177 1178                          attr_length = reg_length;
1178 1179                          idx_len = 0;
1179 1180                  } else {
1180 1181                          attr_length = length_start[length_idx];
1181 1182                          idx_len = length_idx++;
1182 1183                  }
1183 1184  
1184 1185                  func(hdr, data_start, tb->lot_attrs[i], attr_length,
1185 1186                      idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
1186 1187  
1187 1188                  data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
1188 1189                      attr_length), 8);
1189 1190          }
1190 1191  }
1191 1192  
1192 1193  /*ARGSUSED*/
1193 1194  void
1194 1195  sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
1195 1196      uint16_t length, int length_idx, boolean_t variable_length, void *userp)
1196 1197  {
1197 1198          sa_handle_t *hdl = userp;
1198 1199          sa_os_t *sa = hdl->sa_os->os_sa;
1199 1200  
1200 1201          sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
1201 1202  }
1202 1203  
1203 1204  void
1204 1205  sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
1205 1206  {
1206 1207          sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1207 1208          dmu_buf_impl_t *db;
1208 1209          sa_os_t *sa = hdl->sa_os->os_sa;
1209 1210          int num_lengths = 1;
1210 1211          int i;
1211 1212  
1212 1213          ASSERT(MUTEX_HELD(&sa->sa_lock));
1213 1214          if (sa_hdr_phys->sa_magic == SA_MAGIC)
1214 1215                  return;
1215 1216  
1216 1217          db = SA_GET_DB(hdl, buftype);
1217 1218  
1218 1219          if (buftype == SA_SPILL) {
1219 1220                  arc_release(db->db_buf, NULL);
1220 1221                  arc_buf_thaw(db->db_buf);
1221 1222          }
1222 1223  
1223 1224          sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
1224 1225          sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
1225 1226  
1226 1227          /*
1227 1228           * Determine number of variable lenghts in header
1228 1229           * The standard 8 byte header has one for free and a
1229 1230           * 16 byte header would have 4 + 1;
1230 1231           */
1231 1232          if (SA_HDR_SIZE(sa_hdr_phys) > 8)
1232 1233                  num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
1233 1234          for (i = 0; i != num_lengths; i++)
1234 1235                  sa_hdr_phys->sa_lengths[i] =
1235 1236                      BSWAP_16(sa_hdr_phys->sa_lengths[i]);
1236 1237  
1237 1238          sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
1238 1239              sa_byteswap_cb, NULL, hdl);
1239 1240  
1240 1241          if (buftype == SA_SPILL)
1241 1242                  arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
1242 1243  }
1243 1244  
1244 1245  static int
1245 1246  sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
1246 1247  {
1247 1248          sa_hdr_phys_t *sa_hdr_phys;
1248 1249          dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
1249 1250          dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
1250 1251          sa_os_t *sa = hdl->sa_os->os_sa;
1251 1252          sa_idx_tab_t *idx_tab;
1252 1253  
1253 1254          sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1254 1255  
1255 1256          mutex_enter(&sa->sa_lock);
1256 1257  
1257 1258          /* Do we need to byteswap? */
1258 1259  
1259 1260          /* only check if not old znode */
1260 1261          if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
1261 1262              sa_hdr_phys->sa_magic != 0) {
1262 1263                  VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
1263 1264                  sa_byteswap(hdl, buftype);
1264 1265          }
1265 1266  
1266 1267          idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
1267 1268  
1268 1269          if (buftype == SA_BONUS)
1269 1270                  hdl->sa_bonus_tab = idx_tab;
1270 1271          else
1271 1272                  hdl->sa_spill_tab = idx_tab;
1272 1273  
1273 1274          mutex_exit(&sa->sa_lock);
1274 1275          return (0);
1275 1276  }
1276 1277  
1277 1278  /*ARGSUSED*/
1278 1279  void
1279 1280  sa_evict(dmu_buf_t *db, void *sap)
1280 1281  {
1281 1282          panic("evicting sa dbuf %p\n", (void *)db);
1282 1283  }
1283 1284  
1284 1285  static void
1285 1286  sa_idx_tab_rele(objset_t *os, void *arg)
1286 1287  {
1287 1288          sa_os_t *sa = os->os_sa;
1288 1289          sa_idx_tab_t *idx_tab = arg;
1289 1290  
1290 1291          if (idx_tab == NULL)
1291 1292                  return;
1292 1293  
1293 1294          mutex_enter(&sa->sa_lock);
1294 1295          if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
1295 1296                  list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
1296 1297                  if (idx_tab->sa_variable_lengths)
1297 1298                          kmem_free(idx_tab->sa_variable_lengths,
1298 1299                              sizeof (uint16_t) *
1299 1300                              idx_tab->sa_layout->lot_var_sizes);
1300 1301                  refcount_destroy(&idx_tab->sa_refcount);
1301 1302                  kmem_free(idx_tab->sa_idx_tab,
1302 1303                      sizeof (uint32_t) * sa->sa_num_attrs);
1303 1304                  kmem_free(idx_tab, sizeof (sa_idx_tab_t));
1304 1305          }
1305 1306          mutex_exit(&sa->sa_lock);
1306 1307  }
1307 1308  
1308 1309  static void
1309 1310  sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
1310 1311  {
1311 1312          sa_os_t *sa = os->os_sa;
1312 1313  
1313 1314          ASSERT(MUTEX_HELD(&sa->sa_lock));
1314 1315          (void) refcount_add(&idx_tab->sa_refcount, NULL);
1315 1316  }
1316 1317  
1317 1318  void
1318 1319  sa_handle_destroy(sa_handle_t *hdl)
1319 1320  {
1320 1321          mutex_enter(&hdl->sa_lock);
1321 1322          (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
1322 1323              NULL, NULL, NULL);
1323 1324  
1324 1325          if (hdl->sa_bonus_tab) {
1325 1326                  sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
1326 1327                  hdl->sa_bonus_tab = NULL;
1327 1328          }
1328 1329          if (hdl->sa_spill_tab) {
1329 1330                  sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
1330 1331                  hdl->sa_spill_tab = NULL;
1331 1332          }
1332 1333  
1333 1334          dmu_buf_rele(hdl->sa_bonus, NULL);
1334 1335  
1335 1336          if (hdl->sa_spill)
1336 1337                  dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
1337 1338          mutex_exit(&hdl->sa_lock);
1338 1339  
1339 1340          kmem_cache_free(sa_cache, hdl);
1340 1341  }
1341 1342  
1342 1343  int
1343 1344  sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
1344 1345      sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1345 1346  {
1346 1347          int error = 0;
1347 1348          dmu_object_info_t doi;
1348 1349          sa_handle_t *handle;
1349 1350  
1350 1351  #ifdef ZFS_DEBUG
1351 1352          dmu_object_info_from_db(db, &doi);
1352 1353          ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
1353 1354              doi.doi_bonus_type == DMU_OT_ZNODE);
1354 1355  #endif
1355 1356          /* find handle, if it exists */
1356 1357          /* if one doesn't exist then create a new one, and initialize it */
1357 1358  
1358 1359          handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
1359 1360          if (handle == NULL) {
1360 1361                  sa_handle_t *newhandle;
1361 1362                  handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
1362 1363                  handle->sa_userp = userp;
1363 1364                  handle->sa_bonus = db;
1364 1365                  handle->sa_os = os;
1365 1366                  handle->sa_spill = NULL;
1366 1367  
1367 1368                  error = sa_build_index(handle, SA_BONUS);
1368 1369                  newhandle = (hdl_type == SA_HDL_SHARED) ?
1369 1370                      dmu_buf_set_user_ie(db, handle,
1370 1371                      NULL, sa_evict) : NULL;
1371 1372  
1372 1373                  if (newhandle != NULL) {
1373 1374                          kmem_cache_free(sa_cache, handle);
1374 1375                          handle = newhandle;
1375 1376                  }
1376 1377          }
1377 1378          *handlepp = handle;
1378 1379  
1379 1380          return (error);
1380 1381  }
1381 1382  
1382 1383  int
1383 1384  sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
1384 1385      sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1385 1386  {
1386 1387          dmu_buf_t *db;
1387 1388          int error;
1388 1389  
1389 1390          if (error = dmu_bonus_hold(objset, objid, NULL, &db))
1390 1391                  return (error);
1391 1392  
1392 1393          return (sa_handle_get_from_db(objset, db, userp, hdl_type,
1393 1394              handlepp));
1394 1395  }
1395 1396  
1396 1397  int
1397 1398  sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
1398 1399  {
1399 1400          return (dmu_bonus_hold(objset, obj_num, tag, db));
1400 1401  }
1401 1402  
1402 1403  void
1403 1404  sa_buf_rele(dmu_buf_t *db, void *tag)
1404 1405  {
1405 1406          dmu_buf_rele(db, tag);
1406 1407  }
1407 1408  
1408 1409  int
1409 1410  sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
1410 1411  {
1411 1412          ASSERT(hdl);
1412 1413          ASSERT(MUTEX_HELD(&hdl->sa_lock));
1413 1414          return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
1414 1415  }
1415 1416  
1416 1417  int
1417 1418  sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
1418 1419  {
1419 1420          int error;
1420 1421          sa_bulk_attr_t bulk;
1421 1422  
1422 1423          bulk.sa_attr = attr;
1423 1424          bulk.sa_data = buf;
1424 1425          bulk.sa_length = buflen;
1425 1426          bulk.sa_data_func = NULL;
1426 1427  
1427 1428          ASSERT(hdl);
1428 1429          mutex_enter(&hdl->sa_lock);
1429 1430          error = sa_lookup_impl(hdl, &bulk, 1);
1430 1431          mutex_exit(&hdl->sa_lock);
1431 1432          return (error);
1432 1433  }
1433 1434  
#ifdef _KERNEL
/*
 * Look up a single attribute and move its data into 'uio'.
 *
 * The attribute is located with a NULL data buffer; on success the
 * address and size reported in the bulk descriptor are handed to
 * uiomove(), limited to the space remaining in the uio.  The handle lock
 * is held across both the lookup and the move.
 */
int
sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
{
	sa_bulk_attr_t desc;
	int error;

	ASSERT(hdl);

	desc.sa_attr = attr;
	desc.sa_data = NULL;
	desc.sa_data_func = NULL;

	mutex_enter(&hdl->sa_lock);
	error = sa_attr_op(hdl, &desc, 1, SA_LOOKUP, NULL);
	if (error == 0) {
		error = uiomove((void *)desc.sa_addr,
		    MIN(desc.sa_size, uio->uio_resid), UIO_READ, uio);
	}
	mutex_exit(&hdl->sa_lock);

	return (error);
}
#endif
1457 1458  
1458 1459  void *
1459 1460  sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
1460 1461  {
1461 1462          sa_idx_tab_t *idx_tab;
1462 1463          sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
1463 1464          sa_os_t *sa = os->os_sa;
1464 1465          sa_lot_t *tb, search;
1465 1466          avl_index_t loc;
1466 1467  
1467 1468          /*
1468 1469           * Deterimine layout number.  If SA node and header == 0 then
1469 1470           * force the index table to the dummy "1" empty layout.
1470 1471           *
1471 1472           * The layout number would only be zero for a newly created file
1472 1473           * that has not added any attributes yet, or with crypto enabled which
1473 1474           * doesn't write any attributes to the bonus buffer.
1474 1475           */
1475 1476  
1476 1477          search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
1477 1478  
1478 1479          tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1479 1480  
1480 1481          /* Verify header size is consistent with layout information */
1481 1482          ASSERT(tb);
1482 1483          ASSERT(IS_SA_BONUSTYPE(bonustype) &&
1483 1484              SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
1484 1485              (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
1485 1486  
1486 1487          /*
1487 1488           * See if any of the already existing TOC entries can be reused?
1488 1489           */
1489 1490  
1490 1491          for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
1491 1492              idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
1492 1493                  boolean_t valid_idx = B_TRUE;
1493 1494                  int i;
1494 1495  
1495 1496                  if (tb->lot_var_sizes != 0 &&
1496 1497                      idx_tab->sa_variable_lengths != NULL) {
1497 1498                          for (i = 0; i != tb->lot_var_sizes; i++) {
1498 1499                                  if (hdr->sa_lengths[i] !=
1499 1500                                      idx_tab->sa_variable_lengths[i]) {
1500 1501                                          valid_idx = B_FALSE;
1501 1502                                          break;
1502 1503                                  }
1503 1504                          }
1504 1505                  }
1505 1506                  if (valid_idx) {
1506 1507                          sa_idx_tab_hold(os, idx_tab);
1507 1508                          return (idx_tab);
1508 1509                  }
1509 1510          }
1510 1511  
1511 1512          /* No such luck, create a new entry */
1512 1513          idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
1513 1514          idx_tab->sa_idx_tab =
1514 1515              kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
1515 1516          idx_tab->sa_layout = tb;
1516 1517          refcount_create(&idx_tab->sa_refcount);
1517 1518          if (tb->lot_var_sizes)
1518 1519                  idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
1519 1520                      tb->lot_var_sizes, KM_SLEEP);
1520 1521  
1521 1522          sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
1522 1523              tb, idx_tab);
1523 1524          sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
1524 1525          sa_idx_tab_hold(os, idx_tab);   /* one for layout */
1525 1526          list_insert_tail(&tb->lot_idx_tab, idx_tab);
1526 1527          return (idx_tab);
1527 1528  }
1528 1529  
1529 1530  void
1530 1531  sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
1531 1532      boolean_t start, void *userdata)
1532 1533  {
1533 1534          ASSERT(start);
1534 1535  
1535 1536          *dataptr = userdata;
1536 1537          *len = total_len;
1537 1538  }
1538 1539  
1539 1540  static void
1540 1541  sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
1541 1542  {
1542 1543          uint64_t attr_value = 0;
1543 1544          sa_os_t *sa = hdl->sa_os->os_sa;
1544 1545          sa_attr_table_t *tb = sa->sa_attr_table;
  
    | 
      ↓ open down ↓ | 
    1101 lines elided | 
    
      ↑ open up ↑ | 
  
1545 1546          int i;
1546 1547  
1547 1548          mutex_enter(&sa->sa_lock);
1548 1549  
1549 1550          if (!sa->sa_need_attr_registration || sa->sa_master_obj == NULL) {
1550 1551                  mutex_exit(&sa->sa_lock);
1551 1552                  return;
1552 1553          }
1553 1554  
1554 1555          if (sa->sa_reg_attr_obj == NULL) {
1555      -                sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
1556      -                    DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
1557      -                VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
1558      -                    SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
     1556 +                sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
     1557 +                    DMU_OT_SA_ATTR_REGISTRATION,
     1558 +                    sa->sa_master_obj, SA_REGISTRY, tx);
1559 1559          }
1560 1560          for (i = 0; i != sa->sa_num_attrs; i++) {
1561 1561                  if (sa->sa_attr_table[i].sa_registered)
1562 1562                          continue;
1563 1563                  ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
1564 1564                      tb[i].sa_byteswap);
1565 1565                  VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
1566 1566                      tb[i].sa_name, 8, 1, &attr_value, tx));
1567 1567                  tb[i].sa_registered = B_TRUE;
1568 1568          }
1569 1569          sa->sa_need_attr_registration = B_FALSE;
1570 1570          mutex_exit(&sa->sa_lock);
1571 1571  }
1572 1572  
1573 1573  /*
1574 1574   * Replace all attributes with attributes specified in template.
1575 1575   * If dnode had a spill buffer then those attributes will be
1576 1576   * also be replaced, possibly with just an empty spill block
1577 1577   *
1578 1578   * This interface is intended to only be used for bulk adding of
1579 1579   * attributes for a new file.  It will also be used by the ZPL
1580 1580   * when converting and old formatted znode to native SA support.
1581 1581   */
1582 1582  int
1583 1583  sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1584 1584      int attr_count, dmu_tx_t *tx)
1585 1585  {
1586 1586          sa_os_t *sa = hdl->sa_os->os_sa;
1587 1587  
1588 1588          if (sa->sa_need_attr_registration)
1589 1589                  sa_attr_register_sync(hdl, tx);
1590 1590          return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
1591 1591  }
1592 1592  
1593 1593  int
1594 1594  sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1595 1595      int attr_count, dmu_tx_t *tx)
1596 1596  {
1597 1597          int error;
1598 1598  
1599 1599          mutex_enter(&hdl->sa_lock);
1600 1600          error = sa_replace_all_by_template_locked(hdl, attr_desc,
1601 1601              attr_count, tx);
1602 1602          mutex_exit(&hdl->sa_lock);
1603 1603          return (error);
1604 1604  }
1605 1605  
1606 1606  /*
1607 1607   * add/remove/replace a single attribute and then rewrite the entire set
1608 1608   * of attributes.
1609 1609   */
1610 1610  static int
1611 1611  sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
1612 1612      sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
1613 1613      uint16_t buflen, dmu_tx_t *tx)
1614 1614  {
1615 1615          sa_os_t *sa = hdl->sa_os->os_sa;
1616 1616          dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1617 1617          dnode_t *dn;
1618 1618          sa_bulk_attr_t *attr_desc;
1619 1619          void *old_data[2];
1620 1620          int bonus_attr_count = 0;
1621 1621          int bonus_data_size, spill_data_size;
1622 1622          int spill_attr_count = 0;
1623 1623          int error;
1624 1624          uint16_t length;
1625 1625          int i, j, k, length_idx;
1626 1626          sa_hdr_phys_t *hdr;
1627 1627          sa_idx_tab_t *idx_tab;
1628 1628          int attr_count;
1629 1629          int count;
1630 1630  
1631 1631          ASSERT(MUTEX_HELD(&hdl->sa_lock));
1632 1632  
1633 1633          /* First make of copy of the old data */
1634 1634  
1635 1635          DB_DNODE_ENTER(db);
1636 1636          dn = DB_DNODE(db);
1637 1637          if (dn->dn_bonuslen != 0) {
1638 1638                  bonus_data_size = hdl->sa_bonus->db_size;
1639 1639                  old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
1640 1640                  bcopy(hdl->sa_bonus->db_data, old_data[0],
1641 1641                      hdl->sa_bonus->db_size);
1642 1642                  bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
1643 1643          } else {
1644 1644                  old_data[0] = NULL;
1645 1645          }
1646 1646          DB_DNODE_EXIT(db);
1647 1647  
1648 1648          /* Bring spill buffer online if it isn't currently */
1649 1649  
1650 1650          if ((error = sa_get_spill(hdl)) == 0) {
1651 1651                  spill_data_size = hdl->sa_spill->db_size;
1652 1652                  old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
1653 1653                  bcopy(hdl->sa_spill->db_data, old_data[1],
1654 1654                      hdl->sa_spill->db_size);
1655 1655                  spill_attr_count =
1656 1656                      hdl->sa_spill_tab->sa_layout->lot_attr_count;
1657 1657          } else if (error && error != ENOENT) {
1658 1658                  if (old_data[0])
1659 1659                          kmem_free(old_data[0], bonus_data_size);
1660 1660                  return (error);
1661 1661          } else {
1662 1662                  old_data[1] = NULL;
1663 1663          }
1664 1664  
1665 1665          /* build descriptor of all attributes */
1666 1666  
1667 1667          attr_count = bonus_attr_count + spill_attr_count;
1668 1668          if (action == SA_ADD)
1669 1669                  attr_count++;
1670 1670          else if (action == SA_REMOVE)
1671 1671                  attr_count--;
1672 1672  
1673 1673          attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
1674 1674  
1675 1675          /*
1676 1676           * loop through bonus and spill buffer if it exists, and
1677 1677           * build up new attr_descriptor to reset the attributes
1678 1678           */
1679 1679          k = j = 0;
1680 1680          count = bonus_attr_count;
1681 1681          hdr = SA_GET_HDR(hdl, SA_BONUS);
1682 1682          idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
1683 1683          for (; k != 2; k++) {
1684 1684                  /* iterate over each attribute in layout */
1685 1685                  for (i = 0, length_idx = 0; i != count; i++) {
1686 1686                          sa_attr_type_t attr;
1687 1687  
1688 1688                          attr = idx_tab->sa_layout->lot_attrs[i];
1689 1689                          if (attr == newattr) {
1690 1690                                  if (action == SA_REMOVE) {
1691 1691                                          j++;
1692 1692                                          continue;
1693 1693                                  }
1694 1694                                  ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
1695 1695                                  ASSERT(action == SA_REPLACE);
1696 1696                                  SA_ADD_BULK_ATTR(attr_desc, j, attr,
1697 1697                                      locator, datastart, buflen);
1698 1698                          } else {
1699 1699                                  length = SA_REGISTERED_LEN(sa, attr);
1700 1700                                  if (length == 0) {
1701 1701                                          length = hdr->sa_lengths[length_idx++];
1702 1702                                  }
1703 1703  
1704 1704                                  SA_ADD_BULK_ATTR(attr_desc, j, attr,
1705 1705                                      NULL, (void *)
1706 1706                                      (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
1707 1707                                      (uintptr_t)old_data[k]), length);
1708 1708                          }
1709 1709                  }
1710 1710                  if (k == 0 && hdl->sa_spill) {
1711 1711                          hdr = SA_GET_HDR(hdl, SA_SPILL);
1712 1712                          idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
1713 1713                          count = spill_attr_count;
1714 1714                  } else {
1715 1715                          break;
1716 1716                  }
1717 1717          }
1718 1718          if (action == SA_ADD) {
1719 1719                  length = SA_REGISTERED_LEN(sa, newattr);
1720 1720                  if (length == 0) {
1721 1721                          length = buflen;
1722 1722                  }
1723 1723                  SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
1724 1724                      datastart, buflen);
1725 1725          }
1726 1726  
1727 1727          error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
1728 1728  
1729 1729          if (old_data[0])
1730 1730                  kmem_free(old_data[0], bonus_data_size);
1731 1731          if (old_data[1])
1732 1732                  kmem_free(old_data[1], spill_data_size);
1733 1733          kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
1734 1734  
1735 1735          return (error);
1736 1736  }
1737 1737  
1738 1738  static int
1739 1739  sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
1740 1740      dmu_tx_t *tx)
1741 1741  {
1742 1742          int error;
1743 1743          sa_os_t *sa = hdl->sa_os->os_sa;
1744 1744          dmu_object_type_t bonustype;
1745 1745  
1746 1746          bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
1747 1747  
1748 1748          ASSERT(hdl);
1749 1749          ASSERT(MUTEX_HELD(&hdl->sa_lock));
1750 1750  
1751 1751          /* sync out registration table if necessary */
1752 1752          if (sa->sa_need_attr_registration)
1753 1753                  sa_attr_register_sync(hdl, tx);
1754 1754  
1755 1755          error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
1756 1756          if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
1757 1757                  sa->sa_update_cb(hdl, tx);
1758 1758  
1759 1759          return (error);
1760 1760  }
1761 1761  
1762 1762  /*
1763 1763   * update or add new attribute
1764 1764   */
1765 1765  int
1766 1766  sa_update(sa_handle_t *hdl, sa_attr_type_t type,
1767 1767      void *buf, uint32_t buflen, dmu_tx_t *tx)
1768 1768  {
1769 1769          int error;
1770 1770          sa_bulk_attr_t bulk;
1771 1771  
1772 1772          bulk.sa_attr = type;
1773 1773          bulk.sa_data_func = NULL;
1774 1774          bulk.sa_length = buflen;
1775 1775          bulk.sa_data = buf;
1776 1776  
1777 1777          mutex_enter(&hdl->sa_lock);
1778 1778          error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
1779 1779          mutex_exit(&hdl->sa_lock);
1780 1780          return (error);
1781 1781  }
1782 1782  
1783 1783  int
1784 1784  sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
1785 1785      uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
1786 1786  {
1787 1787          int error;
1788 1788          sa_bulk_attr_t bulk;
1789 1789  
1790 1790          bulk.sa_attr = attr;
1791 1791          bulk.sa_data = userdata;
1792 1792          bulk.sa_data_func = locator;
1793 1793          bulk.sa_length = buflen;
1794 1794  
1795 1795          mutex_enter(&hdl->sa_lock);
1796 1796          error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
1797 1797          mutex_exit(&hdl->sa_lock);
1798 1798          return (error);
1799 1799  }
1800 1800  
1801 1801  /*
1802 1802   * Return size of an attribute
1803 1803   */
1804 1804  
1805 1805  int
1806 1806  sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
1807 1807  {
1808 1808          sa_bulk_attr_t bulk;
1809 1809          int error;
1810 1810  
1811 1811          bulk.sa_data = NULL;
1812 1812          bulk.sa_attr = attr;
1813 1813          bulk.sa_data_func = NULL;
1814 1814  
1815 1815          ASSERT(hdl);
1816 1816          mutex_enter(&hdl->sa_lock);
1817 1817          if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
1818 1818                  mutex_exit(&hdl->sa_lock);
1819 1819                  return (error);
1820 1820          }
1821 1821          *size = bulk.sa_size;
1822 1822  
1823 1823          mutex_exit(&hdl->sa_lock);
1824 1824          return (0);
1825 1825  }
1826 1826  
1827 1827  int
1828 1828  sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
1829 1829  {
1830 1830          ASSERT(hdl);
1831 1831          ASSERT(MUTEX_HELD(&hdl->sa_lock));
1832 1832          return (sa_lookup_impl(hdl, attrs, count));
1833 1833  }
1834 1834  
1835 1835  int
1836 1836  sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
1837 1837  {
1838 1838          int error;
1839 1839  
1840 1840          ASSERT(hdl);
1841 1841          mutex_enter(&hdl->sa_lock);
1842 1842          error = sa_bulk_lookup_locked(hdl, attrs, count);
1843 1843          mutex_exit(&hdl->sa_lock);
1844 1844          return (error);
1845 1845  }
1846 1846  
1847 1847  int
1848 1848  sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
1849 1849  {
1850 1850          int error;
1851 1851  
1852 1852          ASSERT(hdl);
1853 1853          mutex_enter(&hdl->sa_lock);
1854 1854          error = sa_bulk_update_impl(hdl, attrs, count, tx);
1855 1855          mutex_exit(&hdl->sa_lock);
1856 1856          return (error);
1857 1857  }
1858 1858  
1859 1859  int
1860 1860  sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
1861 1861  {
1862 1862          int error;
1863 1863  
1864 1864          mutex_enter(&hdl->sa_lock);
1865 1865          error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
1866 1866              NULL, 0, tx);
1867 1867          mutex_exit(&hdl->sa_lock);
1868 1868          return (error);
1869 1869  }
1870 1870  
1871 1871  void
1872 1872  sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
1873 1873  {
1874 1874          dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
1875 1875  }
1876 1876  
1877 1877  void
1878 1878  sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
1879 1879  {
1880 1880          dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
1881 1881              blksize, nblocks);
1882 1882  }
1883 1883  
1884 1884  void
1885 1885  sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
1886 1886  {
1887 1887          (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
1888 1888              oldhdl, newhdl, NULL, sa_evict);
1889 1889          oldhdl->sa_bonus = NULL;
1890 1890  }
1891 1891  
1892 1892  void
1893 1893  sa_set_userp(sa_handle_t *hdl, void *ptr)
1894 1894  {
1895 1895          hdl->sa_userp = ptr;
1896 1896  }
1897 1897  
1898 1898  dmu_buf_t *
1899 1899  sa_get_db(sa_handle_t *hdl)
1900 1900  {
1901 1901          return ((dmu_buf_t *)hdl->sa_bonus);
1902 1902  }
1903 1903  
1904 1904  void *
1905 1905  sa_get_userdata(sa_handle_t *hdl)
1906 1906  {
1907 1907          return (hdl->sa_userp);
1908 1908  }
1909 1909  
1910 1910  void
1911 1911  sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
1912 1912  {
1913 1913          ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
1914 1914          os->os_sa->sa_update_cb = func;
1915 1915  }
1916 1916  
1917 1917  void
1918 1918  sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
1919 1919  {
1920 1920  
1921 1921          mutex_enter(&os->os_sa->sa_lock);
1922 1922          sa_register_update_callback_locked(os, func);
1923 1923          mutex_exit(&os->os_sa->sa_lock);
1924 1924  }
1925 1925  
1926 1926  uint64_t
1927 1927  sa_handle_object(sa_handle_t *hdl)
1928 1928  {
1929 1929          return (hdl->sa_bonus->db_object);
1930 1930  }
1931 1931  
1932 1932  boolean_t
1933 1933  sa_enabled(objset_t *os)
1934 1934  {
1935 1935          return (os->os_sa == NULL);
1936 1936  }
1937 1937  
1938 1938  int
1939 1939  sa_set_sa_object(objset_t *os, uint64_t sa_object)
1940 1940  {
1941 1941          sa_os_t *sa = os->os_sa;
1942 1942  
1943 1943          if (sa->sa_master_obj)
1944 1944                  return (1);
1945 1945  
1946 1946          sa->sa_master_obj = sa_object;
1947 1947  
1948 1948          return (0);
1949 1949  }
1950 1950  
1951 1951  int
1952 1952  sa_hdrsize(void *arg)
1953 1953  {
1954 1954          sa_hdr_phys_t *hdr = arg;
1955 1955  
1956 1956          return (SA_HDR_SIZE(hdr));
1957 1957  }
1958 1958  
1959 1959  void
1960 1960  sa_handle_lock(sa_handle_t *hdl)
1961 1961  {
1962 1962          ASSERT(hdl);
1963 1963          mutex_enter(&hdl->sa_lock);
1964 1964  }
1965 1965  
1966 1966  void
1967 1967  sa_handle_unlock(sa_handle_t *hdl)
1968 1968  {
1969 1969          ASSERT(hdl);
1970 1970          mutex_exit(&hdl->sa_lock);
1971 1971  }
  
    | 
      ↓ open down ↓ | 
    403 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX