2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
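
The hunks below replace direct comparisons of object types against DMU_OT_NUMTYPES with the DMU_OT_IS_VALID() macro, and replace direct indexing of dmu_ot[].ot_byteswap with a DMU_OT_BYTESWAP() lookup into the dmu_ot_byteswap[] table, so that feature-flag object types (which encode their byteswap routine in the type value itself) validate and byteswap correctly. A rough sketch of how those macros are presumably defined in sys/dmu.h elsewhere in this changeset (that file is not part of this webrev, so the exact names and bit values are an assumption):

	/* Sketch only: new-style types set a flag bit and embed a byteswap index. */
	#define	DMU_OT_NEWTYPE		0x80
	#define	DMU_OT_BYTESWAP_MASK	0x3f

	#define	DMU_OT_IS_VALID(ot)	(((ot) & DMU_OT_NEWTYPE) ? \
		((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
		(ot) < DMU_OT_NUMTYPES)

	#define	DMU_OT_BYTESWAP(ot)	(((ot) & DMU_OT_NEWTYPE) ? \
		((ot) & DMU_OT_BYTESWAP_MASK) : \
		dmu_ot[(ot)].ot_byteswap)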
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dnode.c
          +++ new/usr/src/uts/common/fs/zfs/dnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright (c) 2012 by Delphix. All rights reserved.
  23   24   */
  24   25  
  25   26  #include <sys/zfs_context.h>
  26   27  #include <sys/dbuf.h>
  27   28  #include <sys/dnode.h>
  28   29  #include <sys/dmu.h>
  29   30  #include <sys/dmu_impl.h>
  30   31  #include <sys/dmu_tx.h>
  31   32  #include <sys/dmu_objset.h>
  32   33  #include <sys/dsl_dir.h>
  33   34  #include <sys/dsl_dataset.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/zio.h>
  36   37  #include <sys/dmu_zfetch.h>
  37   38  
  38   39  static int free_range_compar(const void *node1, const void *node2);
  39   40  
  40   41  static kmem_cache_t *dnode_cache;
  41   42  /*
  42   43   * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  43   44   * turned on when DEBUG is also defined.
  44   45   */
  45   46  #ifdef  DEBUG
  46   47  #define DNODE_STATS
  47   48  #endif  /* DEBUG */
  48   49  
  49   50  #ifdef  DNODE_STATS
  50   51  #define DNODE_STAT_ADD(stat)                    ((stat)++)
  51   52  #else
  52   53  #define DNODE_STAT_ADD(stat)                    /* nothing */
  53   54  #endif  /* DNODE_STATS */
  54   55  
  55   56  static dnode_phys_t dnode_phys_zero;
  56   57  
  57   58  int zfs_default_bs = SPA_MINBLOCKSHIFT;
  58   59  int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
  59   60  
  60   61  static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
  61   62  
  62   63  /* ARGSUSED */
  63   64  static int
  64   65  dnode_cons(void *arg, void *unused, int kmflag)
  65   66  {
  66   67          dnode_t *dn = arg;
  67   68          int i;
  68   69  
  69   70          rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
  70   71          mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
  71   72          mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
  72   73          cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
  73   74  
  74   75          refcount_create(&dn->dn_holds);
  75   76          refcount_create(&dn->dn_tx_holds);
  76   77          list_link_init(&dn->dn_link);
  77   78  
  78   79          bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
  79   80          bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
  80   81          bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
  81   82          bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
  82   83          bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
  83   84          bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
  84   85          bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
  85   86  
  86   87          for (i = 0; i < TXG_SIZE; i++) {
  87   88                  list_link_init(&dn->dn_dirty_link[i]);
  88   89                  avl_create(&dn->dn_ranges[i], free_range_compar,
  89   90                      sizeof (free_range_t),
  90   91                      offsetof(struct free_range, fr_node));
  91   92                  list_create(&dn->dn_dirty_records[i],
  92   93                      sizeof (dbuf_dirty_record_t),
  93   94                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
  94   95          }
  95   96  
  96   97          dn->dn_allocated_txg = 0;
  97   98          dn->dn_free_txg = 0;
  98   99          dn->dn_assigned_txg = 0;
  99  100          dn->dn_dirtyctx = 0;
 100  101          dn->dn_dirtyctx_firstset = NULL;
 101  102          dn->dn_bonus = NULL;
 102  103          dn->dn_have_spill = B_FALSE;
 103  104          dn->dn_zio = NULL;
 104  105          dn->dn_oldused = 0;
 105  106          dn->dn_oldflags = 0;
 106  107          dn->dn_olduid = 0;
 107  108          dn->dn_oldgid = 0;
 108  109          dn->dn_newuid = 0;
 109  110          dn->dn_newgid = 0;
 110  111          dn->dn_id_flags = 0;
 111  112  
 112  113          dn->dn_dbufs_count = 0;
 113  114          list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
 114  115              offsetof(dmu_buf_impl_t, db_link));
 115  116  
 116  117          dn->dn_moved = 0;
 117  118          return (0);
 118  119  }
 119  120  
 120  121  /* ARGSUSED */
 121  122  static void
 122  123  dnode_dest(void *arg, void *unused)
 123  124  {
 124  125          int i;
 125  126          dnode_t *dn = arg;
 126  127  
 127  128          rw_destroy(&dn->dn_struct_rwlock);
 128  129          mutex_destroy(&dn->dn_mtx);
 129  130          mutex_destroy(&dn->dn_dbufs_mtx);
 130  131          cv_destroy(&dn->dn_notxholds);
 131  132          refcount_destroy(&dn->dn_holds);
 132  133          refcount_destroy(&dn->dn_tx_holds);
 133  134          ASSERT(!list_link_active(&dn->dn_link));
 134  135  
 135  136          for (i = 0; i < TXG_SIZE; i++) {
 136  137                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 137  138                  avl_destroy(&dn->dn_ranges[i]);
 138  139                  list_destroy(&dn->dn_dirty_records[i]);
 139  140                  ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
 140  141                  ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 141  142                  ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 142  143                  ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
 143  144                  ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
 144  145                  ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
 145  146                  ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 146  147          }
 147  148  
 148  149          ASSERT3U(dn->dn_allocated_txg, ==, 0);
 149  150          ASSERT3U(dn->dn_free_txg, ==, 0);
 150  151          ASSERT3U(dn->dn_assigned_txg, ==, 0);
 151  152          ASSERT3U(dn->dn_dirtyctx, ==, 0);
 152  153          ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 153  154          ASSERT3P(dn->dn_bonus, ==, NULL);
 154  155          ASSERT(!dn->dn_have_spill);
 155  156          ASSERT3P(dn->dn_zio, ==, NULL);
 156  157          ASSERT3U(dn->dn_oldused, ==, 0);
 157  158          ASSERT3U(dn->dn_oldflags, ==, 0);
 158  159          ASSERT3U(dn->dn_olduid, ==, 0);
 159  160          ASSERT3U(dn->dn_oldgid, ==, 0);
 160  161          ASSERT3U(dn->dn_newuid, ==, 0);
 161  162          ASSERT3U(dn->dn_newgid, ==, 0);
 162  163          ASSERT3U(dn->dn_id_flags, ==, 0);
 163  164  
 164  165          ASSERT3U(dn->dn_dbufs_count, ==, 0);
 165  166          list_destroy(&dn->dn_dbufs);
 166  167  }
 167  168  
 168  169  void
 169  170  dnode_init(void)
 170  171  {
 171  172          ASSERT(dnode_cache == NULL);
 172  173          dnode_cache = kmem_cache_create("dnode_t",
 173  174              sizeof (dnode_t),
 174  175              0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 175  176          kmem_cache_set_move(dnode_cache, dnode_move);
 176  177  }
 177  178  
 178  179  void
 179  180  dnode_fini(void)
 180  181  {
 181  182          kmem_cache_destroy(dnode_cache);
 182  183          dnode_cache = NULL;
 183  184  }
 184  185  
 185  186  
  
  
 186  187  #ifdef ZFS_DEBUG
 187  188  void
 188  189  dnode_verify(dnode_t *dn)
 189  190  {
 190  191          int drop_struct_lock = FALSE;
 191  192  
 192  193          ASSERT(dn->dn_phys);
 193  194          ASSERT(dn->dn_objset);
 194  195          ASSERT(dn->dn_handle->dnh_dnode == dn);
 195  196  
 196      -        ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
      197 +        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 197  198  
 198  199          if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 199  200                  return;
 200  201  
 201  202          if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 202  203                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 203  204                  drop_struct_lock = TRUE;
 204  205          }
 205  206          if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 206  207                  int i;
 207  208                  ASSERT3U(dn->dn_indblkshift, >=, 0);
 208  209                  ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 209  210                  if (dn->dn_datablkshift) {
 210  211                          ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 211  212                          ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 212  213                          ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 213  214                  }
 214  215                  ASSERT3U(dn->dn_nlevels, <=, 30);
 215      -                ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
      216 +                ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 216  217                  ASSERT3U(dn->dn_nblkptr, >=, 1);
 217  218                  ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 218  219                  ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 219  220                  ASSERT3U(dn->dn_datablksz, ==,
 220  221                      dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 221  222                  ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 222  223                  ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 223  224                      dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 224  225                  for (i = 0; i < TXG_SIZE; i++) {
 225  226                          ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 226  227                  }
 227  228          }
 228  229          if (dn->dn_phys->dn_type != DMU_OT_NONE)
 229  230                  ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 230  231          ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 231  232          if (dn->dn_dbuf != NULL) {
 232  233                  ASSERT3P(dn->dn_phys, ==,
 233  234                      (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 234  235                      (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 235  236          }
 236  237          if (drop_struct_lock)
 237  238                  rw_exit(&dn->dn_struct_rwlock);
 238  239  }
 239  240  #endif
 240  241  
 241  242  void
 242  243  dnode_byteswap(dnode_phys_t *dnp)
 243  244  {
 244  245          uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 245  246          int i;
 246  247  
 247  248          if (dnp->dn_type == DMU_OT_NONE) {
 248  249                  bzero(dnp, sizeof (dnode_phys_t));
 249  250                  return;
 250  251          }
 251  252  
 252  253          dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 253  254          dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 254  255          dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 255  256          dnp->dn_used = BSWAP_64(dnp->dn_used);
 256  257  
 257  258          /*
 258  259           * dn_nblkptr is only one byte, so it's OK to read it in either
  259  260           * byte order.  We can't read dn_bonuslen.
 260  261           */
 261  262          ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 262  263          ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 263  264          for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 264  265                  buf64[i] = BSWAP_64(buf64[i]);
 265  266  
 266  267          /*
 267  268           * OK to check dn_bonuslen for zero, because it won't matter if
 268  269           * we have the wrong byte order.  This is necessary because the
 269  270           * dnode dnode is smaller than a regular dnode.
 270  271           */
  
  
 271  272          if (dnp->dn_bonuslen != 0) {
 272  273                  /*
 273  274                   * Note that the bonus length calculated here may be
 274  275                   * longer than the actual bonus buffer.  This is because
 275  276                   * we always put the bonus buffer after the last block
 276  277                   * pointer (instead of packing it against the end of the
 277  278                   * dnode buffer).
 278  279                   */
 279  280                  int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 280  281                  size_t len = DN_MAX_BONUSLEN - off;
 281      -                ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
 282      -                dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
      282 +                ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
      283 +                dmu_object_byteswap_t byteswap =
      284 +                    DMU_OT_BYTESWAP(dnp->dn_bonustype);
      285 +                dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 283  286          }
 284  287  
 285  288          /* Swap SPILL block if we have one */
 286  289          if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 287  290                  byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
 288  291  
 289  292  }
 290  293  
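The bonus-buffer swap in the hunk above now resolves its routine through a small table indexed by dmu_object_byteswap_t rather than by object type, so a new feature-flag type only needs to name one of the existing swap functions. A sketch of the supporting declarations this code presumably relies on (they live in sys/dmu.h, which is not part of this webrev, so treat the exact names as approximate):

	typedef void arc_byteswap_func_t(void *buf, size_t size);

	typedef struct dmu_object_byteswap_info {
		arc_byteswap_func_t	*ob_func;	/* e.g. byteswap_uint64_array */
		char			*ob_name;
	} dmu_object_byteswap_info_t;

	extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];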
 291  294  void
 292  295  dnode_buf_byteswap(void *vbuf, size_t size)
 293  296  {
 294  297          dnode_phys_t *buf = vbuf;
 295  298          int i;
 296  299  
 297  300          ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 298  301          ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 299  302  
 300  303          size >>= DNODE_SHIFT;
 301  304          for (i = 0; i < size; i++) {
 302  305                  dnode_byteswap(buf);
 303  306                  buf++;
 304  307          }
 305  308  }
 306  309  
 307  310  static int
 308  311  free_range_compar(const void *node1, const void *node2)
 309  312  {
 310  313          const free_range_t *rp1 = node1;
 311  314          const free_range_t *rp2 = node2;
 312  315  
 313  316          if (rp1->fr_blkid < rp2->fr_blkid)
 314  317                  return (-1);
 315  318          else if (rp1->fr_blkid > rp2->fr_blkid)
 316  319                  return (1);
 317  320          else return (0);
 318  321  }
 319  322  
 320  323  void
 321  324  dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 322  325  {
 323  326          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 324  327  
 325  328          dnode_setdirty(dn, tx);
 326  329          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 327  330          ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 328  331              (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 329  332          dn->dn_bonuslen = newsize;
 330  333          if (newsize == 0)
 331  334                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
 332  335          else
 333  336                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 334  337          rw_exit(&dn->dn_struct_rwlock);
 335  338  }
 336  339  
 337  340  void
 338  341  dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 339  342  {
 340  343          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 341  344          dnode_setdirty(dn, tx);
 342  345          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 343  346          dn->dn_bonustype = newtype;
 344  347          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 345  348          rw_exit(&dn->dn_struct_rwlock);
 346  349  }
 347  350  
 348  351  void
 349  352  dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 350  353  {
 351  354          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 352  355          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 353  356          dnode_setdirty(dn, tx);
 354  357          dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
 355  358          dn->dn_have_spill = B_FALSE;
 356  359  }
 357  360  
 358  361  static void
 359  362  dnode_setdblksz(dnode_t *dn, int size)
 360  363  {
 361  364          ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
 362  365          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 363  366          ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 364  367          ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 365  368              1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 366  369          dn->dn_datablksz = size;
 367  370          dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 368  371          dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
 369  372  }
 370  373  
 371  374  static dnode_t *
 372  375  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 373  376      uint64_t object, dnode_handle_t *dnh)
 374  377  {
 375  378          dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 376  379  
 377  380          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 378  381          dn->dn_moved = 0;
 379  382  
 380  383          /*
 381  384           * Defer setting dn_objset until the dnode is ready to be a candidate
 382  385           * for the dnode_move() callback.
 383  386           */
 384  387          dn->dn_object = object;
 385  388          dn->dn_dbuf = db;
 386  389          dn->dn_handle = dnh;
 387  390          dn->dn_phys = dnp;
 388  391  
 389  392          if (dnp->dn_datablkszsec) {
 390  393                  dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 391  394          } else {
 392  395                  dn->dn_datablksz = 0;
 393  396                  dn->dn_datablkszsec = 0;
 394  397                  dn->dn_datablkshift = 0;
 395  398          }
 396  399          dn->dn_indblkshift = dnp->dn_indblkshift;
 397  400          dn->dn_nlevels = dnp->dn_nlevels;
 398  401          dn->dn_type = dnp->dn_type;
 399  402          dn->dn_nblkptr = dnp->dn_nblkptr;
  
  
 400  403          dn->dn_checksum = dnp->dn_checksum;
 401  404          dn->dn_compress = dnp->dn_compress;
 402  405          dn->dn_bonustype = dnp->dn_bonustype;
 403  406          dn->dn_bonuslen = dnp->dn_bonuslen;
 404  407          dn->dn_maxblkid = dnp->dn_maxblkid;
 405  408          dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 406  409          dn->dn_id_flags = 0;
 407  410  
 408  411          dmu_zfetch_init(&dn->dn_zfetch, dn);
 409  412  
 410      -        ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
      413 +        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 411  414  
 412  415          mutex_enter(&os->os_lock);
 413  416          list_insert_head(&os->os_dnodes, dn);
 414  417          membar_producer();
 415  418          /*
 416  419           * Everything else must be valid before assigning dn_objset makes the
 417  420           * dnode eligible for dnode_move().
 418  421           */
 419  422          dn->dn_objset = os;
 420  423          mutex_exit(&os->os_lock);
 421  424  
 422  425          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 423  426          return (dn);
 424  427  }
 425  428  
 426  429  /*
 427  430   * Caller must be holding the dnode handle, which is released upon return.
 428  431   */
 429  432  static void
 430  433  dnode_destroy(dnode_t *dn)
 431  434  {
 432  435          objset_t *os = dn->dn_objset;
 433  436  
 434  437          ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 435  438  
 436  439          mutex_enter(&os->os_lock);
 437  440          POINTER_INVALIDATE(&dn->dn_objset);
 438  441          list_remove(&os->os_dnodes, dn);
 439  442          mutex_exit(&os->os_lock);
 440  443  
 441  444          /* the dnode can no longer move, so we can release the handle */
 442  445          zrl_remove(&dn->dn_handle->dnh_zrlock);
 443  446  
 444  447          dn->dn_allocated_txg = 0;
 445  448          dn->dn_free_txg = 0;
 446  449          dn->dn_assigned_txg = 0;
 447  450  
 448  451          dn->dn_dirtyctx = 0;
 449  452          if (dn->dn_dirtyctx_firstset != NULL) {
 450  453                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 451  454                  dn->dn_dirtyctx_firstset = NULL;
 452  455          }
 453  456          if (dn->dn_bonus != NULL) {
 454  457                  mutex_enter(&dn->dn_bonus->db_mtx);
 455  458                  dbuf_evict(dn->dn_bonus);
 456  459                  dn->dn_bonus = NULL;
 457  460          }
 458  461          dn->dn_zio = NULL;
 459  462  
 460  463          dn->dn_have_spill = B_FALSE;
 461  464          dn->dn_oldused = 0;
 462  465          dn->dn_oldflags = 0;
 463  466          dn->dn_olduid = 0;
 464  467          dn->dn_oldgid = 0;
 465  468          dn->dn_newuid = 0;
 466  469          dn->dn_newgid = 0;
 467  470          dn->dn_id_flags = 0;
 468  471  
 469  472          dmu_zfetch_rele(&dn->dn_zfetch);
 470  473          kmem_cache_free(dnode_cache, dn);
 471  474          arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 472  475  }
 473  476  
 474  477  void
 475  478  dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 476  479      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 477  480  {
 478  481          int i;
 479  482  
 480  483          if (blocksize == 0)
 481  484                  blocksize = 1 << zfs_default_bs;
 482  485          else if (blocksize > SPA_MAXBLOCKSIZE)
 483  486                  blocksize = SPA_MAXBLOCKSIZE;
 484  487          else
 485  488                  blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 486  489  
 487  490          if (ibs == 0)
 488  491                  ibs = zfs_default_ibs;
  
  
 489  492  
 490  493          ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 491  494  
 492  495          dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
 493  496              dn->dn_object, tx->tx_txg, blocksize, ibs);
 494  497  
 495  498          ASSERT(dn->dn_type == DMU_OT_NONE);
 496  499          ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 497  500          ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 498  501          ASSERT(ot != DMU_OT_NONE);
 499      -        ASSERT3U(ot, <, DMU_OT_NUMTYPES);
      502 +        ASSERT(DMU_OT_IS_VALID(ot));
 500  503          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 501  504              (bonustype == DMU_OT_SA && bonuslen == 0) ||
 502  505              (bonustype != DMU_OT_NONE && bonuslen != 0));
 503      -        ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
      506 +        ASSERT(DMU_OT_IS_VALID(bonustype));
 504  507          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 505  508          ASSERT(dn->dn_type == DMU_OT_NONE);
 506  509          ASSERT3U(dn->dn_maxblkid, ==, 0);
 507  510          ASSERT3U(dn->dn_allocated_txg, ==, 0);
 508  511          ASSERT3U(dn->dn_assigned_txg, ==, 0);
 509  512          ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 510  513          ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
 511  514          ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 512  515  
 513  516          for (i = 0; i < TXG_SIZE; i++) {
 514  517                  ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
 515  518                  ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 516  519                  ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 517  520                  ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
 518  521                  ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
 519  522                  ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
 520  523                  ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 521  524                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 522  525                  ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 523  526                  ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
 524  527          }
 525  528  
 526  529          dn->dn_type = ot;
 527  530          dnode_setdblksz(dn, blocksize);
 528  531          dn->dn_indblkshift = ibs;
 529  532          dn->dn_nlevels = 1;
 530  533          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 531  534                  dn->dn_nblkptr = 1;
 532  535          else
 533  536                  dn->dn_nblkptr = 1 +
 534  537                      ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 535  538          dn->dn_bonustype = bonustype;
 536  539          dn->dn_bonuslen = bonuslen;
 537  540          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 538  541          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 539  542          dn->dn_dirtyctx = 0;
 540  543  
 541  544          dn->dn_free_txg = 0;
 542  545          if (dn->dn_dirtyctx_firstset) {
 543  546                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 544  547                  dn->dn_dirtyctx_firstset = NULL;
 545  548          }
 546  549  
 547  550          dn->dn_allocated_txg = tx->tx_txg;
 548  551          dn->dn_id_flags = 0;
 549  552  
 550  553          dnode_setdirty(dn, tx);
 551  554          dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 552  555          dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 553  556          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 554  557          dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 555  558  }
 556  559  
 557  560  void
 558  561  dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 559  562      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 560  563  {
  
  
 561  564          int nblkptr;
 562  565  
 563  566          ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 564  567          ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
 565  568          ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
 566  569          ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 567  570          ASSERT(tx->tx_txg != 0);
 568  571          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 569  572              (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 570  573              (bonustype == DMU_OT_SA && bonuslen == 0));
 571      -        ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
      574 +        ASSERT(DMU_OT_IS_VALID(bonustype));
 572  575          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 573  576  
 574  577          /* clean up any unreferenced dbufs */
 575  578          dnode_evict_dbufs(dn);
 576  579  
 577  580          dn->dn_id_flags = 0;
 578  581  
 579  582          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 580  583          dnode_setdirty(dn, tx);
 581  584          if (dn->dn_datablksz != blocksize) {
 582  585                  /* change blocksize */
 583  586                  ASSERT(dn->dn_maxblkid == 0 &&
 584  587                      (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 585  588                      dnode_block_freed(dn, 0)));
 586  589                  dnode_setdblksz(dn, blocksize);
 587  590                  dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 588  591          }
 589  592          if (dn->dn_bonuslen != bonuslen)
 590  593                  dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 591  594  
 592  595          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 593  596                  nblkptr = 1;
 594  597          else
 595  598                  nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 596  599          if (dn->dn_bonustype != bonustype)
 597  600                  dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 598  601          if (dn->dn_nblkptr != nblkptr)
 599  602                  dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
 600  603          if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 601  604                  dbuf_rm_spill(dn, tx);
 602  605                  dnode_rm_spill(dn, tx);
 603  606          }
 604  607          rw_exit(&dn->dn_struct_rwlock);
 605  608  
 606  609          /* change type */
 607  610          dn->dn_type = ot;
 608  611  
 609  612          /* change bonus size and type */
 610  613          mutex_enter(&dn->dn_mtx);
 611  614          dn->dn_bonustype = bonustype;
 612  615          dn->dn_bonuslen = bonuslen;
 613  616          dn->dn_nblkptr = nblkptr;
 614  617          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 615  618          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 616  619          ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 617  620  
 618  621          /* fix up the bonus db_size */
 619  622          if (dn->dn_bonus) {
 620  623                  dn->dn_bonus->db.db_size =
 621  624                      DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 622  625                  ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 623  626          }
 624  627  
 625  628          dn->dn_allocated_txg = tx->tx_txg;
 626  629          mutex_exit(&dn->dn_mtx);
 627  630  }
 628  631  
 629  632  #ifdef  DNODE_STATS
 630  633  static struct {
 631  634          uint64_t dms_dnode_invalid;
 632  635          uint64_t dms_dnode_recheck1;
 633  636          uint64_t dms_dnode_recheck2;
 634  637          uint64_t dms_dnode_special;
 635  638          uint64_t dms_dnode_handle;
 636  639          uint64_t dms_dnode_rwlock;
 637  640          uint64_t dms_dnode_active;
 638  641  } dnode_move_stats;
 639  642  #endif  /* DNODE_STATS */
 640  643  
 641  644  static void
 642  645  dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 643  646  {
 644  647          int i;
 645  648  
 646  649          ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 647  650          ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 648  651          ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 649  652          ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
 650  653  
 651  654          /* Copy fields. */
 652  655          ndn->dn_objset = odn->dn_objset;
 653  656          ndn->dn_object = odn->dn_object;
 654  657          ndn->dn_dbuf = odn->dn_dbuf;
 655  658          ndn->dn_handle = odn->dn_handle;
 656  659          ndn->dn_phys = odn->dn_phys;
 657  660          ndn->dn_type = odn->dn_type;
 658  661          ndn->dn_bonuslen = odn->dn_bonuslen;
 659  662          ndn->dn_bonustype = odn->dn_bonustype;
 660  663          ndn->dn_nblkptr = odn->dn_nblkptr;
 661  664          ndn->dn_checksum = odn->dn_checksum;
 662  665          ndn->dn_compress = odn->dn_compress;
 663  666          ndn->dn_nlevels = odn->dn_nlevels;
 664  667          ndn->dn_indblkshift = odn->dn_indblkshift;
 665  668          ndn->dn_datablkshift = odn->dn_datablkshift;
 666  669          ndn->dn_datablkszsec = odn->dn_datablkszsec;
 667  670          ndn->dn_datablksz = odn->dn_datablksz;
 668  671          ndn->dn_maxblkid = odn->dn_maxblkid;
 669  672          bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 670  673              sizeof (odn->dn_next_nblkptr));
 671  674          bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
 672  675              sizeof (odn->dn_next_nlevels));
 673  676          bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
 674  677              sizeof (odn->dn_next_indblkshift));
 675  678          bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
 676  679              sizeof (odn->dn_next_bonustype));
 677  680          bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 678  681              sizeof (odn->dn_rm_spillblk));
 679  682          bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 680  683              sizeof (odn->dn_next_bonuslen));
 681  684          bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 682  685              sizeof (odn->dn_next_blksz));
 683  686          for (i = 0; i < TXG_SIZE; i++) {
 684  687                  list_move_tail(&ndn->dn_dirty_records[i],
 685  688                      &odn->dn_dirty_records[i]);
 686  689          }
 687  690          bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
 688  691          ndn->dn_allocated_txg = odn->dn_allocated_txg;
 689  692          ndn->dn_free_txg = odn->dn_free_txg;
 690  693          ndn->dn_assigned_txg = odn->dn_assigned_txg;
 691  694          ndn->dn_dirtyctx = odn->dn_dirtyctx;
 692  695          ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 693  696          ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 694  697          refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 695  698          ASSERT(list_is_empty(&ndn->dn_dbufs));
 696  699          list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
 697  700          ndn->dn_dbufs_count = odn->dn_dbufs_count;
 698  701          ndn->dn_bonus = odn->dn_bonus;
 699  702          ndn->dn_have_spill = odn->dn_have_spill;
 700  703          ndn->dn_zio = odn->dn_zio;
 701  704          ndn->dn_oldused = odn->dn_oldused;
 702  705          ndn->dn_oldflags = odn->dn_oldflags;
 703  706          ndn->dn_olduid = odn->dn_olduid;
 704  707          ndn->dn_oldgid = odn->dn_oldgid;
 705  708          ndn->dn_newuid = odn->dn_newuid;
 706  709          ndn->dn_newgid = odn->dn_newgid;
 707  710          ndn->dn_id_flags = odn->dn_id_flags;
 708  711          dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 709  712          list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 710  713          ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
 711  714          ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
 712  715          ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 713  716  
 714  717          /*
 715  718           * Update back pointers. Updating the handle fixes the back pointer of
 716  719           * every descendant dbuf as well as the bonus dbuf.
 717  720           */
 718  721          ASSERT(ndn->dn_handle->dnh_dnode == odn);
 719  722          ndn->dn_handle->dnh_dnode = ndn;
 720  723          if (ndn->dn_zfetch.zf_dnode == odn) {
 721  724                  ndn->dn_zfetch.zf_dnode = ndn;
 722  725          }
 723  726  
 724  727          /*
 725  728           * Invalidate the original dnode by clearing all of its back pointers.
 726  729           */
 727  730          odn->dn_dbuf = NULL;
 728  731          odn->dn_handle = NULL;
 729  732          list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
 730  733              offsetof(dmu_buf_impl_t, db_link));
 731  734          odn->dn_dbufs_count = 0;
 732  735          odn->dn_bonus = NULL;
 733  736          odn->dn_zfetch.zf_dnode = NULL;
 734  737  
 735  738          /*
 736  739           * Set the low bit of the objset pointer to ensure that dnode_move()
 737  740           * recognizes the dnode as invalid in any subsequent callback.
 738  741           */
 739  742          POINTER_INVALIDATE(&odn->dn_objset);
 740  743  
 741  744          /*
 742  745           * Satisfy the destructor.
 743  746           */
 744  747          for (i = 0; i < TXG_SIZE; i++) {
 745  748                  list_create(&odn->dn_dirty_records[i],
 746  749                      sizeof (dbuf_dirty_record_t),
 747  750                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
 748  751                  odn->dn_ranges[i].avl_root = NULL;
 749  752                  odn->dn_ranges[i].avl_numnodes = 0;
 750  753                  odn->dn_next_nlevels[i] = 0;
 751  754                  odn->dn_next_indblkshift[i] = 0;
 752  755                  odn->dn_next_bonustype[i] = 0;
 753  756                  odn->dn_rm_spillblk[i] = 0;
 754  757                  odn->dn_next_bonuslen[i] = 0;
 755  758                  odn->dn_next_blksz[i] = 0;
 756  759          }
 757  760          odn->dn_allocated_txg = 0;
 758  761          odn->dn_free_txg = 0;
 759  762          odn->dn_assigned_txg = 0;
 760  763          odn->dn_dirtyctx = 0;
 761  764          odn->dn_dirtyctx_firstset = NULL;
 762  765          odn->dn_have_spill = B_FALSE;
 763  766          odn->dn_zio = NULL;
 764  767          odn->dn_oldused = 0;
 765  768          odn->dn_oldflags = 0;
 766  769          odn->dn_olduid = 0;
 767  770          odn->dn_oldgid = 0;
 768  771          odn->dn_newuid = 0;
 769  772          odn->dn_newgid = 0;
 770  773          odn->dn_id_flags = 0;
 771  774  
 772  775          /*
 773  776           * Mark the dnode.
 774  777           */
 775  778          ndn->dn_moved = 1;
 776  779          odn->dn_moved = (uint8_t)-1;
 777  780  }
 778  781  
 779  782  #ifdef  _KERNEL
 780  783  /*ARGSUSED*/
 781  784  static kmem_cbrc_t
 782  785  dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 783  786  {
 784  787          dnode_t *odn = buf, *ndn = newbuf;
 785  788          objset_t *os;
 786  789          int64_t refcount;
 787  790          uint32_t dbufs;
 788  791  
 789  792          /*
 790  793           * The dnode is on the objset's list of known dnodes if the objset
 791  794           * pointer is valid. We set the low bit of the objset pointer when
 792  795           * freeing the dnode to invalidate it, and the memory patterns written
 793  796           * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 794  797           * A newly created dnode sets the objset pointer last of all to indicate
 795  798           * that the dnode is known and in a valid state to be moved by this
 796  799           * function.
 797  800           */
 798  801          os = odn->dn_objset;
 799  802          if (!POINTER_IS_VALID(os)) {
 800  803                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
 801  804                  return (KMEM_CBRC_DONT_KNOW);
 802  805          }
 803  806  
 804  807          /*
 805  808           * Ensure that the objset does not go away during the move.
 806  809           */
 807  810          rw_enter(&os_lock, RW_WRITER);
 808  811          if (os != odn->dn_objset) {
 809  812                  rw_exit(&os_lock);
 810  813                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
 811  814                  return (KMEM_CBRC_DONT_KNOW);
 812  815          }
 813  816  
 814  817          /*
 815  818           * If the dnode is still valid, then so is the objset. We know that no
 816  819           * valid objset can be freed while we hold os_lock, so we can safely
 817  820           * ensure that the objset remains in use.
 818  821           */
 819  822          mutex_enter(&os->os_lock);
 820  823  
 821  824          /*
 822  825           * Recheck the objset pointer in case the dnode was removed just before
 823  826           * acquiring the lock.
 824  827           */
 825  828          if (os != odn->dn_objset) {
 826  829                  mutex_exit(&os->os_lock);
 827  830                  rw_exit(&os_lock);
 828  831                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
 829  832                  return (KMEM_CBRC_DONT_KNOW);
 830  833          }
 831  834  
 832  835          /*
 833  836           * At this point we know that as long as we hold os->os_lock, the dnode
 834  837           * cannot be freed and fields within the dnode can be safely accessed.
 835  838           * The objset listing this dnode cannot go away as long as this dnode is
 836  839           * on its list.
 837  840           */
 838  841          rw_exit(&os_lock);
 839  842          if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 840  843                  mutex_exit(&os->os_lock);
 841  844                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
 842  845                  return (KMEM_CBRC_NO);
 843  846          }
 844  847          ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 845  848  
 846  849          /*
 847  850           * Lock the dnode handle to prevent the dnode from obtaining any new
 848  851           * holds. This also prevents the descendant dbufs and the bonus dbuf
 849  852           * from accessing the dnode, so that we can discount their holds. The
 850  853           * handle is safe to access because we know that while the dnode cannot
 851  854           * go away, neither can its handle. Once we hold dnh_zrlock, we can
 852  855           * safely move any dnode referenced only by dbufs.
 853  856           */
 854  857          if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 855  858                  mutex_exit(&os->os_lock);
 856  859                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
 857  860                  return (KMEM_CBRC_LATER);
 858  861          }
 859  862  
 860  863          /*
 861  864           * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 862  865           * We need to guarantee that there is a hold for every dbuf in order to
 863  866           * determine whether the dnode is actively referenced. Falsely matching
 864  867           * a dbuf to an active hold would lead to an unsafe move. It's possible
 865  868           * that a thread already having an active dnode hold is about to add a
 866  869           * dbuf, and we can't compare hold and dbuf counts while the add is in
 867  870           * progress.
 868  871           */
 869  872          if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 870  873                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 871  874                  mutex_exit(&os->os_lock);
 872  875                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
 873  876                  return (KMEM_CBRC_LATER);
 874  877          }
 875  878  
 876  879          /*
 877  880           * A dbuf may be removed (evicted) without an active dnode hold. In that
 878  881           * case, the dbuf count is decremented under the handle lock before the
 879  882           * dbuf's hold is released. This order ensures that if we count the hold
 880  883           * after the dbuf is removed but before its hold is released, we will
 881  884           * treat the unmatched hold as active and exit safely. If we count the
 882  885           * hold before the dbuf is removed, the hold is discounted, and the
 883  886           * removal is blocked until the move completes.
 884  887           */
 885  888          refcount = refcount_count(&odn->dn_holds);
 886  889          ASSERT(refcount >= 0);
 887  890          dbufs = odn->dn_dbufs_count;
 888  891  
 889  892          /* We can't have more dbufs than dnode holds. */
 890  893          ASSERT3U(dbufs, <=, refcount);
 891  894          DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 892  895              uint32_t, dbufs);
 893  896  
 894  897          if (refcount > dbufs) {
 895  898                  rw_exit(&odn->dn_struct_rwlock);
 896  899                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 897  900                  mutex_exit(&os->os_lock);
 898  901                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
 899  902                  return (KMEM_CBRC_LATER);
 900  903          }
 901  904  
 902  905          rw_exit(&odn->dn_struct_rwlock);
 903  906  
 904  907          /*
 905  908           * At this point we know that anyone with a hold on the dnode is not
 906  909           * actively referencing it. The dnode is known and in a valid state to
 907  910           * move. We're holding the locks needed to execute the critical section.
 908  911           */
 909  912          dnode_move_impl(odn, ndn);
 910  913  
 911  914          list_link_replace(&odn->dn_link, &ndn->dn_link);
 912  915          /* If the dnode was safe to move, the refcount cannot have changed. */
 913  916          ASSERT(refcount == refcount_count(&ndn->dn_holds));
 914  917          ASSERT(dbufs == ndn->dn_dbufs_count);
 915  918          zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 916  919          mutex_exit(&os->os_lock);
 917  920  
 918  921          return (KMEM_CBRC_YES);
 919  922  }
 920  923  #endif  /* _KERNEL */
 921  924  
 922  925  void
 923  926  dnode_special_close(dnode_handle_t *dnh)
 924  927  {
 925  928          dnode_t *dn = dnh->dnh_dnode;
 926  929  
 927  930          /*
 928  931           * Wait for final references to the dnode to clear.  This can
  929  932           * only happen if the arc is asynchronously evicting state that
 930  933           * has a hold on this dnode while we are trying to evict this
 931  934           * dnode.
 932  935           */
 933  936          while (refcount_count(&dn->dn_holds) > 0)
 934  937                  delay(1);
 935  938          zrl_add(&dnh->dnh_zrlock);
 936  939          dnode_destroy(dn); /* implicit zrl_remove() */
 937  940          zrl_destroy(&dnh->dnh_zrlock);
 938  941          dnh->dnh_dnode = NULL;
 939  942  }
 940  943  
 941  944  dnode_t *
 942  945  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 943  946      dnode_handle_t *dnh)
 944  947  {
 945  948          dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
 946  949          dnh->dnh_dnode = dn;
 947  950          zrl_init(&dnh->dnh_zrlock);
 948  951          DNODE_VERIFY(dn);
 949  952          return (dn);
 950  953  }
 951  954  
 952  955  static void
 953  956  dnode_buf_pageout(dmu_buf_t *db, void *arg)
 954  957  {
 955  958          dnode_children_t *children_dnodes = arg;
 956  959          int i;
 957  960          int epb = db->db_size >> DNODE_SHIFT;
 958  961  
 959  962          ASSERT(epb == children_dnodes->dnc_count);
 960  963  
 961  964          for (i = 0; i < epb; i++) {
 962  965                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 963  966                  dnode_t *dn;
 964  967  
 965  968                  /*
 966  969                   * The dnode handle lock guards against the dnode moving to
 967  970                   * another valid address, so there is no need here to guard
 968  971                   * against changes to or from NULL.
 969  972                   */
 970  973                  if (dnh->dnh_dnode == NULL) {
 971  974                          zrl_destroy(&dnh->dnh_zrlock);
 972  975                          continue;
 973  976                  }
 974  977  
 975  978                  zrl_add(&dnh->dnh_zrlock);
 976  979                  dn = dnh->dnh_dnode;
 977  980                  /*
 978  981                   * If there are holds on this dnode, then there should
 979  982                   * be holds on the dnode's containing dbuf as well; thus
 980  983                   * it wouldn't be eligible for eviction and this function
 981  984                   * would not have been called.
 982  985                   */
 983  986                  ASSERT(refcount_is_zero(&dn->dn_holds));
 984  987                  ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 985  988  
 986  989                  dnode_destroy(dn); /* implicit zrl_remove() */
 987  990                  zrl_destroy(&dnh->dnh_zrlock);
 988  991                  dnh->dnh_dnode = NULL;
 989  992          }
 990  993          kmem_free(children_dnodes, sizeof (dnode_children_t) +
 991  994              (epb - 1) * sizeof (dnode_handle_t));
 992  995  }
 993  996  
 994  997  /*
 995  998   * errors:
 996  999   * EINVAL - invalid object number.
 997 1000   * EIO - i/o error.
 998 1001   * succeeds even for free dnodes.
 999 1002   */
1000 1003  int
1001 1004  dnode_hold_impl(objset_t *os, uint64_t object, int flag,
1002 1005      void *tag, dnode_t **dnp)
1003 1006  {
1004 1007          int epb, idx, err;
1005 1008          int drop_struct_lock = FALSE;
1006 1009          int type;
1007 1010          uint64_t blk;
1008 1011          dnode_t *mdn, *dn;
1009 1012          dmu_buf_impl_t *db;
1010 1013          dnode_children_t *children_dnodes;
1011 1014          dnode_handle_t *dnh;
1012 1015  
1013 1016          /*
1014 1017           * If you are holding the spa config lock as writer, you shouldn't
1015 1018           * be asking the DMU to do *anything* unless it's the root pool
1016 1019           * which may require us to read from the root filesystem while
1017 1020           * holding some (not all) of the locks as writer.
1018 1021           */
1019 1022          ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1020 1023              (spa_is_root(os->os_spa) &&
1021 1024              spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1022 1025  
1023 1026          if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
1024 1027                  dn = (object == DMU_USERUSED_OBJECT) ?
1025 1028                      DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
1026 1029                  if (dn == NULL)
1027 1030                          return (ENOENT);
1028 1031                  type = dn->dn_type;
1029 1032                  if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1030 1033                          return (ENOENT);
1031 1034                  if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1032 1035                          return (EEXIST);
1033 1036                  DNODE_VERIFY(dn);
1034 1037                  (void) refcount_add(&dn->dn_holds, tag);
1035 1038                  *dnp = dn;
1036 1039                  return (0);
1037 1040          }
1038 1041  
1039 1042          if (object == 0 || object >= DN_MAX_OBJECT)
1040 1043                  return (EINVAL);
1041 1044  
1042 1045          mdn = DMU_META_DNODE(os);
1043 1046          ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1044 1047  
1045 1048          DNODE_VERIFY(mdn);
1046 1049  
1047 1050          if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1048 1051                  rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1049 1052                  drop_struct_lock = TRUE;
1050 1053          }
1051 1054  
1052 1055          blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
1053 1056  
1054 1057          db = dbuf_hold(mdn, blk, FTAG);
1055 1058          if (drop_struct_lock)
1056 1059                  rw_exit(&mdn->dn_struct_rwlock);
1057 1060          if (db == NULL)
1058 1061                  return (EIO);
1059 1062          err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1060 1063          if (err) {
1061 1064                  dbuf_rele(db, FTAG);
1062 1065                  return (err);
1063 1066          }
1064 1067  
1065 1068          ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1066 1069          epb = db->db.db_size >> DNODE_SHIFT;
1067 1070  
1068 1071          idx = object & (epb-1);
1069 1072  
1070 1073          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1071 1074          children_dnodes = dmu_buf_get_user(&db->db);
1072 1075          if (children_dnodes == NULL) {
1073 1076                  int i;
1074 1077                  dnode_children_t *winner;
1075 1078                  children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
1076 1079                      (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
1077 1080                  children_dnodes->dnc_count = epb;
1078 1081                  dnh = &children_dnodes->dnc_children[0];
1079 1082                  for (i = 0; i < epb; i++) {
1080 1083                          zrl_init(&dnh[i].dnh_zrlock);
1081 1084                          dnh[i].dnh_dnode = NULL;
1082 1085                  }
1083 1086                  if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
1084 1087                      dnode_buf_pageout)) {
1085 1088                          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1086 1089                              (epb - 1) * sizeof (dnode_handle_t));
1087 1090                          children_dnodes = winner;
1088 1091                  }
1089 1092          }
1090 1093          ASSERT(children_dnodes->dnc_count == epb);
1091 1094  
1092 1095          dnh = &children_dnodes->dnc_children[idx];
1093 1096          zrl_add(&dnh->dnh_zrlock);
1094 1097          if ((dn = dnh->dnh_dnode) == NULL) {
1095 1098                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1096 1099                  dnode_t *winner;
1097 1100  
1098 1101                  dn = dnode_create(os, phys, db, object, dnh);
1099 1102                  winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
1100 1103                  if (winner != NULL) {
1101 1104                          zrl_add(&dnh->dnh_zrlock);
1102 1105                          dnode_destroy(dn); /* implicit zrl_remove() */
1103 1106                          dn = winner;
1104 1107                  }
1105 1108          }
1106 1109  
1107 1110          mutex_enter(&dn->dn_mtx);
1108 1111          type = dn->dn_type;
1109 1112          if (dn->dn_free_txg ||
1110 1113              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1111 1114              ((flag & DNODE_MUST_BE_FREE) &&
1112 1115              (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1113 1116                  mutex_exit(&dn->dn_mtx);
1114 1117                  zrl_remove(&dnh->dnh_zrlock);
1115 1118                  dbuf_rele(db, FTAG);
1116 1119                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1117 1120          }
1118 1121          mutex_exit(&dn->dn_mtx);
1119 1122  
1120 1123          if (refcount_add(&dn->dn_holds, tag) == 1)
1121 1124                  dbuf_add_ref(db, dnh);
1122 1125          /* Now we can rely on the hold to prevent the dnode from moving. */
1123 1126          zrl_remove(&dnh->dnh_zrlock);
1124 1127  
1125 1128          DNODE_VERIFY(dn);
1126 1129          ASSERT3P(dn->dn_dbuf, ==, db);
1127 1130          ASSERT3U(dn->dn_object, ==, object);
1128 1131          dbuf_rele(db, FTAG);
1129 1132  
1130 1133          *dnp = dn;
1131 1134          return (0);
1132 1135  }
1133 1136  
1134 1137  /*
1135 1138   * Return held dnode if the object is allocated, NULL if not.
1136 1139   */
1137 1140  int
1138 1141  dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1139 1142  {
1140 1143          return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
1141 1144  }
1142 1145  
1143 1146  /*
1144 1147   * Can only add a reference if there is already at least one
1145 1148   * reference on the dnode.  Returns FALSE if unable to add a
1146 1149   * new reference.
1147 1150   */
1148 1151  boolean_t
1149 1152  dnode_add_ref(dnode_t *dn, void *tag)
1150 1153  {
1151 1154          mutex_enter(&dn->dn_mtx);
1152 1155          if (refcount_is_zero(&dn->dn_holds)) {
1153 1156                  mutex_exit(&dn->dn_mtx);
1154 1157                  return (FALSE);
1155 1158          }
1156 1159          VERIFY(1 < refcount_add(&dn->dn_holds, tag));
1157 1160          mutex_exit(&dn->dn_mtx);
1158 1161          return (TRUE);
1159 1162  }
1160 1163  
1161 1164  void
1162 1165  dnode_rele(dnode_t *dn, void *tag)
1163 1166  {
1164 1167          uint64_t refs;
1165 1168          /* Get while the hold prevents the dnode from moving. */
1166 1169          dmu_buf_impl_t *db = dn->dn_dbuf;
1167 1170          dnode_handle_t *dnh = dn->dn_handle;
1168 1171  
1169 1172          mutex_enter(&dn->dn_mtx);
1170 1173          refs = refcount_remove(&dn->dn_holds, tag);
1171 1174          mutex_exit(&dn->dn_mtx);
1172 1175  
1173 1176          /*
1174 1177           * It's unsafe to release the last hold on a dnode by dnode_rele() or
1175 1178           * indirectly by dbuf_rele() while relying on the dnode handle to
1176 1179           * prevent the dnode from moving, since releasing the last hold could
1177 1180           * result in the dnode's parent dbuf evicting its dnode handles. For
1178 1181           * that reason anyone calling dnode_rele() or dbuf_rele() without some
1179 1182           * other direct or indirect hold on the dnode must first drop the dnode
1180 1183           * handle.
1181 1184           */
1182 1185          ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1183 1186  
1184 1187          /* NOTE: the DNODE_DNODE (the meta-dnode) does not have a dn_dbuf */
1185 1188          if (refs == 0 && db != NULL) {
1186 1189                  /*
1187 1190                   * Another thread could add a hold to the dnode handle in
1188 1191                   * dnode_hold_impl() while holding the parent dbuf. Since the
1189 1192                   * hold on the parent dbuf prevents the handle from being
1190 1193                   * destroyed, the hold on the handle is OK. We can't yet assert
1191 1194                   * that the handle has zero references, but that will be
1192 1195                   * asserted anyway when the handle gets destroyed.
1193 1196                   */
1194 1197                  dbuf_rele(db, dnh);
1195 1198          }
1196 1199  }
1197 1200  
1198 1201  void
1199 1202  dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1200 1203  {
1201 1204          objset_t *os = dn->dn_objset;
1202 1205          uint64_t txg = tx->tx_txg;
1203 1206  
1204 1207          if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1205 1208                  dsl_dataset_dirty(os->os_dsl_dataset, tx);
1206 1209                  return;
1207 1210          }
1208 1211  
1209 1212          DNODE_VERIFY(dn);
1210 1213  
1211 1214  #ifdef ZFS_DEBUG
1212 1215          mutex_enter(&dn->dn_mtx);
1213 1216          ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1214 1217          ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1215 1218          mutex_exit(&dn->dn_mtx);
1216 1219  #endif
1217 1220  
1218 1221          /*
1219 1222           * Determine old uid/gid when necessary
1220 1223           */
1221 1224          dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1222 1225  
1223 1226          mutex_enter(&os->os_lock);
1224 1227  
1225 1228          /*
1226 1229           * If we are already marked dirty, we're done.
1227 1230           */
1228 1231          if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1229 1232                  mutex_exit(&os->os_lock);
1230 1233                  return;
1231 1234          }
1232 1235  
1233 1236          ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
1234 1237          ASSERT(dn->dn_datablksz != 0);
1235 1238          ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
1236 1239          ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
1237 1240          ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
1238 1241  
1239 1242          dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1240 1243              dn->dn_object, txg);
1241 1244  
1242 1245          if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
1243 1246                  list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
1244 1247          } else {
1245 1248                  list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
1246 1249          }
1247 1250  
1248 1251          mutex_exit(&os->os_lock);
1249 1252  
1250 1253          /*
1251 1254           * The dnode maintains a hold on its containing dbuf as
1252 1255           * long as there are holds on it.  Each instantiated child
1253 1256           * dbuf maintains a hold on the dnode.  When the last child
1254 1257           * drops its hold, the dnode will drop its hold on the
1255 1258           * containing dbuf. We add a "dirty hold" here so that the
1256 1259           * dnode will hang around after we finish processing its
1257 1260           * children.
1258 1261           */
1259 1262          VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1260 1263  
1261 1264          (void) dbuf_dirty(dn->dn_dbuf, tx);
1262 1265  
1263 1266          dsl_dataset_dirty(os->os_dsl_dataset, tx);
1264 1267  }
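
           /*
            * Illustrative note (not part of dnode.c): per-txg state such as
            * os_dirty_dnodes[] and dn_dirty_link[] above is kept in TXG_SIZE
            * slots, where TXG_SIZE is a power of two and TXG_MASK is
            * TXG_SIZE - 1, so a txg's slot is simply its low bits.  A
            * hypothetical helper:
            */
           static int
           txg_slot(uint64_t txg)
           {
                   /* e.g. with TXG_SIZE == 4, txgs 8..11 map to slots 0..3 */
                   return ((int)(txg & TXG_MASK));
           }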
1265 1268  
1266 1269  void
1267 1270  dnode_free(dnode_t *dn, dmu_tx_t *tx)
1268 1271  {
1269 1272          int txgoff = tx->tx_txg & TXG_MASK;
1270 1273  
1271 1274          dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
1272 1275  
1273 1276          /* we should be the only holder... hopefully */
1274 1277          /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
1275 1278  
1276 1279          mutex_enter(&dn->dn_mtx);
1277 1280          if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1278 1281                  mutex_exit(&dn->dn_mtx);
1279 1282                  return;
1280 1283          }
1281 1284          dn->dn_free_txg = tx->tx_txg;
1282 1285          mutex_exit(&dn->dn_mtx);
1283 1286  
1284 1287          /*
1285 1288           * If the dnode is already dirty, it needs to be moved from
1286 1289           * the dirty list to the free list.
1287 1290           */
1288 1291          mutex_enter(&dn->dn_objset->os_lock);
1289 1292          if (list_link_active(&dn->dn_dirty_link[txgoff])) {
1290 1293                  list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
1291 1294                  list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
1292 1295                  mutex_exit(&dn->dn_objset->os_lock);
1293 1296          } else {
1294 1297                  mutex_exit(&dn->dn_objset->os_lock);
1295 1298                  dnode_setdirty(dn, tx);
1296 1299          }
1297 1300  }
1298 1301  
1299 1302  /*
1300 1303   * Try to change the block size for the indicated dnode.  This can only
1301 1304   * succeed if there are no blocks allocated or dirty beyond the first block.
1302 1305   */
1303 1306  int
1304 1307  dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1305 1308  {
1306 1309          dmu_buf_impl_t *db, *db_next;
1307 1310          int err;
1308 1311  
1309 1312          if (size == 0)
1310 1313                  size = SPA_MINBLOCKSIZE;
1311 1314          if (size > SPA_MAXBLOCKSIZE)
1312 1315                  size = SPA_MAXBLOCKSIZE;
1313 1316          else
1314 1317                  size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1315 1318  
1316 1319          if (ibs == dn->dn_indblkshift)
1317 1320                  ibs = 0;
1318 1321  
1319 1322          if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
1320 1323                  return (0);
1321 1324  
1322 1325          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1323 1326  
1324 1327          /* Check for any allocated blocks beyond the first */
1325 1328          if (dn->dn_phys->dn_maxblkid != 0)
1326 1329                  goto fail;
1327 1330  
1328 1331          mutex_enter(&dn->dn_dbufs_mtx);
1329 1332          for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
1330 1333                  db_next = list_next(&dn->dn_dbufs, db);
1331 1334  
1332 1335                  if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1333 1336                      db->db_blkid != DMU_SPILL_BLKID) {
1334 1337                          mutex_exit(&dn->dn_dbufs_mtx);
1335 1338                          goto fail;
1336 1339                  }
1337 1340          }
1338 1341          mutex_exit(&dn->dn_dbufs_mtx);
1339 1342  
1340 1343          if (ibs && dn->dn_nlevels != 1)
1341 1344                  goto fail;
1342 1345  
1343 1346          /* resize the old block */
1344 1347          err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
1345 1348          if (err == 0)
1346 1349                  dbuf_new_size(db, size, tx);
1347 1350          else if (err != ENOENT)
1348 1351                  goto fail;
1349 1352  
1350 1353          dnode_setdblksz(dn, size);
1351 1354          dnode_setdirty(dn, tx);
1352 1355          dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1353 1356          if (ibs) {
1354 1357                  dn->dn_indblkshift = ibs;
1355 1358                  dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1356 1359          }
1357 1360          /* rele after we have fixed the blocksize in the dnode */
1358 1361          if (db)
1359 1362                  dbuf_rele(db, FTAG);
1360 1363  
1361 1364          rw_exit(&dn->dn_struct_rwlock);
1362 1365          return (0);
1363 1366  
1364 1367  fail:
1365 1368          rw_exit(&dn->dn_struct_rwlock);
1366 1369          return (ENOTSUP);
1367 1370  }
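
           /*
            * Illustrative sketch (not part of dnode.c) of the size
            * normalization at the top of dnode_set_blksz(): requests are
            * clamped to SPA_MAXBLOCKSIZE and rounded up to a multiple of
            * SPA_MINBLOCKSIZE (512 bytes), so e.g. a request of 1000 bytes
            * becomes 1024.
            */
           static uint64_t
           normalize_blksz(uint64_t size)
           {
                   if (size == 0)
                           size = SPA_MINBLOCKSIZE;
                   if (size > SPA_MAXBLOCKSIZE)
                           return (SPA_MAXBLOCKSIZE);
                   /* P2ROUNDUP() rounds up to the next multiple of a power of two */
                   return (P2ROUNDUP(size, SPA_MINBLOCKSIZE));
           }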
1368 1371  
1369 1372  /* read-holding callers must not rely on the lock being continuously held */
1370 1373  void
1371 1374  dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
1372 1375  {
1373 1376          uint64_t txgoff = tx->tx_txg & TXG_MASK;
1374 1377          int epbs, new_nlevels;
1375 1378          uint64_t sz;
1376 1379  
1377 1380          ASSERT(blkid != DMU_BONUS_BLKID);
1378 1381  
1379 1382          ASSERT(have_read ?
1380 1383              RW_READ_HELD(&dn->dn_struct_rwlock) :
1381 1384              RW_WRITE_HELD(&dn->dn_struct_rwlock));
1382 1385  
1383 1386          /*
1384 1387           * if we have a read-lock, check to see if we need to do any work
1385 1388           * before upgrading to a write-lock.
1386 1389           */
1387 1390          if (have_read) {
1388 1391                  if (blkid <= dn->dn_maxblkid)
1389 1392                          return;
1390 1393  
1391 1394                  if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1392 1395                          rw_exit(&dn->dn_struct_rwlock);
1393 1396                          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1394 1397                  }
1395 1398          }
1396 1399  
1397 1400          if (blkid <= dn->dn_maxblkid)
1398 1401                  goto out;
1399 1402  
1400 1403          dn->dn_maxblkid = blkid;
1401 1404  
1402 1405          /*
1403 1406           * Compute the number of levels necessary to support the new maxblkid.
1404 1407           */
1405 1408          new_nlevels = 1;
1406 1409          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1407 1410          for (sz = dn->dn_nblkptr;
1408 1411              sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1409 1412                  new_nlevels++;
1410 1413  
1411 1414          if (new_nlevels > dn->dn_nlevels) {
1412 1415                  int old_nlevels = dn->dn_nlevels;
1413 1416                  dmu_buf_impl_t *db;
1414 1417                  list_t *list;
1415 1418                  dbuf_dirty_record_t *new, *dr, *dr_next;
1416 1419  
1417 1420                  dn->dn_nlevels = new_nlevels;
1418 1421  
1419 1422                  ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1420 1423                  dn->dn_next_nlevels[txgoff] = new_nlevels;
1421 1424  
1422 1425                  /* dirty the left indirects */
1423 1426                  db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1424 1427                  ASSERT(db != NULL);
1425 1428                  new = dbuf_dirty(db, tx);
1426 1429                  dbuf_rele(db, FTAG);
1427 1430  
1428 1431                  /* transfer the dirty records to the new indirect */
1429 1432                  mutex_enter(&dn->dn_mtx);
1430 1433                  mutex_enter(&new->dt.di.dr_mtx);
1431 1434                  list = &dn->dn_dirty_records[txgoff];
1432 1435                  for (dr = list_head(list); dr; dr = dr_next) {
1433 1436                          dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1434 1437                          if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1435 1438                              dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1436 1439                              dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1437 1440                                  ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1438 1441                                  list_remove(&dn->dn_dirty_records[txgoff], dr);
1439 1442                                  list_insert_tail(&new->dt.di.dr_children, dr);
1440 1443                                  dr->dr_parent = new;
1441 1444                          }
1442 1445                  }
1443 1446                  mutex_exit(&new->dt.di.dr_mtx);
1444 1447                  mutex_exit(&dn->dn_mtx);
1445 1448          }
1446 1449  
1447 1450  out:
1448 1451          if (have_read)
1449 1452                  rw_downgrade(&dn->dn_struct_rwlock);
1450 1453  }
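
           /*
            * Illustrative sketch (not part of dnode.c) of the nlevels
            * computation in dnode_new_blkid() above: the dnode's nblkptr
            * block pointers address the first nblkptr blocks directly, and
            * each additional indirect level multiplies the addressable range
            * by 2^epbs.  For example, with nblkptr == 3 and epbs == 7,
            * blkid 1000 needs 3 levels (both 3 and 3 << 7 == 384 are <= 1000).
            * The overflow guard from the original loop is omitted here.
            */
           static int
           levels_needed(uint64_t blkid, int nblkptr, int epbs)
           {
                   int nlevels = 1;
                   uint64_t sz;

                   for (sz = nblkptr; sz <= blkid; sz <<= epbs)
                           nlevels++;
                   return (nlevels);
           }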
1451 1454  
1452 1455  void
1453 1456  dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
1454 1457  {
1455 1458          avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1456 1459          avl_index_t where;
1457 1460          free_range_t *rp;
1458 1461          free_range_t rp_tofind;
1459 1462          uint64_t endblk = blkid + nblks;
1460 1463  
1461 1464          ASSERT(MUTEX_HELD(&dn->dn_mtx));
1462 1465          ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
1463 1466  
1464 1467          dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1465 1468              blkid, nblks, tx->tx_txg);
1466 1469          rp_tofind.fr_blkid = blkid;
1467 1470          rp = avl_find(tree, &rp_tofind, &where);
1468 1471          if (rp == NULL)
1469 1472                  rp = avl_nearest(tree, where, AVL_BEFORE);
1470 1473          if (rp == NULL)
1471 1474                  rp = avl_nearest(tree, where, AVL_AFTER);
1472 1475  
1473 1476          while (rp && (rp->fr_blkid <= blkid + nblks)) {
1474 1477                  uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
1475 1478                  free_range_t *nrp = AVL_NEXT(tree, rp);
1476 1479  
1477 1480                  if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
1478 1481                          /* clear this entire range */
1479 1482                          avl_remove(tree, rp);
1480 1483                          kmem_free(rp, sizeof (free_range_t));
1481 1484                  } else if (blkid <= rp->fr_blkid &&
1482 1485                      endblk > rp->fr_blkid && endblk < fr_endblk) {
1483 1486                          /* clear the beginning of this range */
1484 1487                          rp->fr_blkid = endblk;
1485 1488                          rp->fr_nblks = fr_endblk - endblk;
1486 1489                  } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
1487 1490                      endblk >= fr_endblk) {
1488 1491                          /* clear the end of this range */
1489 1492                          rp->fr_nblks = blkid - rp->fr_blkid;
1490 1493                  } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
1491 1494                          /* clear a chunk out of this range */
1492 1495                          free_range_t *new_rp =
1493 1496                              kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1494 1497  
1495 1498                          new_rp->fr_blkid = endblk;
1496 1499                          new_rp->fr_nblks = fr_endblk - endblk;
1497 1500                          avl_insert_here(tree, new_rp, rp, AVL_AFTER);
1498 1501                          rp->fr_nblks = blkid - rp->fr_blkid;
1499 1502                  }
1500 1503                  /* there may be no overlap */
1501 1504                  rp = nrp;
1502 1505          }
1503 1506  }
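
           /*
            * Illustrative sketch (not part of dnode.c; names hypothetical) of
            * the four overlap cases dnode_clear_range() handles above when
            * removing [c_start, c_end) from an existing free range
            * [*startp, *endp): drop it entirely, trim its front, trim its
            * back, or split it in two.  Returns how many pieces remain; a
            * split writes the second piece to *new_start/*new_end.
            */
           static int
           clip_range(uint64_t *startp, uint64_t *endp, uint64_t c_start,
               uint64_t c_end, uint64_t *new_start, uint64_t *new_end)
           {
                   if (c_end <= *startp || c_start >= *endp)
                           return (1);             /* no overlap; unchanged */
                   if (c_start <= *startp && c_end >= *endp)
                           return (0);             /* fully covered; drop it */
                   if (c_start <= *startp) {
                           *startp = c_end;        /* clear the beginning */
                           return (1);
                   }
                   if (c_end >= *endp) {
                           *endp = c_start;        /* clear the end */
                           return (1);
                   }
                   *new_start = c_end;             /* clear a chunk; split */
                   *new_end = *endp;
                   *endp = c_start;
                   return (2);
           }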
1504 1507  
1505 1508  void
1506 1509  dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1507 1510  {
1508 1511          dmu_buf_impl_t *db;
1509 1512          uint64_t blkoff, blkid, nblks;
1510 1513          int blksz, blkshift, head, tail;
1511 1514          int trunc = FALSE;
1512 1515          int epbs;
1513 1516  
1514 1517          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1515 1518          blksz = dn->dn_datablksz;
1516 1519          blkshift = dn->dn_datablkshift;
1517 1520          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1518 1521  
1519 1522          if (len == -1ULL) {
1520 1523                  len = UINT64_MAX - off;
1521 1524                  trunc = TRUE;
1522 1525          }
1523 1526  
1524 1527          /*
1525 1528           * First, block align the region to free:
1526 1529           */
1527 1530          if (ISP2(blksz)) {
1528 1531                  head = P2NPHASE(off, blksz);
1529 1532                  blkoff = P2PHASE(off, blksz);
1530 1533                  if ((off >> blkshift) > dn->dn_maxblkid)
1531 1534                          goto out;
1532 1535          } else {
1533 1536                  ASSERT(dn->dn_maxblkid == 0);
1534 1537                  if (off == 0 && len >= blksz) {
1535 1538                          /* Freeing the whole block; fast-track this request */
1536 1539                          blkid = 0;
1537 1540                          nblks = 1;
1538 1541                          goto done;
1539 1542                  } else if (off >= blksz) {
1540 1543                          /* Freeing past end-of-data */
1541 1544                          goto out;
1542 1545                  } else {
1543 1546                          /* Freeing part of the block. */
1544 1547                          head = blksz - off;
1545 1548                          ASSERT3U(head, >, 0);
1546 1549                  }
1547 1550                  blkoff = off;
1548 1551          }
1549 1552          /* zero out any partial block data at the start of the range */
1550 1553          if (head) {
1551 1554                  ASSERT3U(blkoff + head, ==, blksz);
1552 1555                  if (len < head)
1553 1556                          head = len;
1554 1557                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
1555 1558                      FTAG, &db) == 0) {
1556 1559                          caddr_t data;
1557 1560  
1558 1561                          /* don't dirty if it isn't on disk and isn't dirty */
1559 1562                          if (db->db_last_dirty ||
1560 1563                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1561 1564                                  rw_exit(&dn->dn_struct_rwlock);
1562 1565                                  dbuf_will_dirty(db, tx);
1563 1566                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1564 1567                                  data = db->db.db_data;
1565 1568                                  bzero(data + blkoff, head);
1566 1569                          }
1567 1570                          dbuf_rele(db, FTAG);
1568 1571                  }
1569 1572                  off += head;
1570 1573                  len -= head;
1571 1574          }
1572 1575  
1573 1576          /* If the range was less than one block, we're done */
1574 1577          if (len == 0)
1575 1578                  goto out;
1576 1579  
1577 1580          /* If the remaining range is past end of file, we're done */
1578 1581          if ((off >> blkshift) > dn->dn_maxblkid)
1579 1582                  goto out;
1580 1583  
1581 1584          ASSERT(ISP2(blksz));
1582 1585          if (trunc)
1583 1586                  tail = 0;
1584 1587          else
1585 1588                  tail = P2PHASE(len, blksz);
1586 1589  
1587 1590          ASSERT3U(P2PHASE(off, blksz), ==, 0);
1588 1591          /* zero out any partial block data at the end of the range */
1589 1592          if (tail) {
1590 1593                  if (len < tail)
1591 1594                          tail = len;
1592 1595                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
1593 1596                      TRUE, FTAG, &db) == 0) {
1594 1597                          /* don't dirty if not on disk and not dirty */
1595 1598                          if (db->db_last_dirty ||
1596 1599                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1597 1600                                  rw_exit(&dn->dn_struct_rwlock);
1598 1601                                  dbuf_will_dirty(db, tx);
1599 1602                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1600 1603                                  bzero(db->db.db_data, tail);
1601 1604                          }
1602 1605                          dbuf_rele(db, FTAG);
1603 1606                  }
1604 1607                  len -= tail;
1605 1608          }
1606 1609  
1607 1610          /* If the range did not include a full block, we are done */
1608 1611          if (len == 0)
1609 1612                  goto out;
1610 1613  
1611 1614          ASSERT(IS_P2ALIGNED(off, blksz));
1612 1615          ASSERT(trunc || IS_P2ALIGNED(len, blksz));
1613 1616          blkid = off >> blkshift;
1614 1617          nblks = len >> blkshift;
1615 1618          if (trunc)
1616 1619                  nblks += 1;
1617 1620  
1618 1621          /*
1619 1622           * Read in and mark all the level-1 indirects dirty,
1620 1623           * so that they will stay in memory until syncing phase.
1621 1624           * Always dirty the first and last indirect to make sure
1622 1625           * we dirty all the partial indirects.
1623 1626           */
1624 1627          if (dn->dn_nlevels > 1) {
1625 1628                  uint64_t i, first, last;
1626 1629                  int shift = epbs + dn->dn_datablkshift;
1627 1630  
1628 1631                  first = blkid >> epbs;
1629 1632                  if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
1630 1633                          dbuf_will_dirty(db, tx);
1631 1634                          dbuf_rele(db, FTAG);
1632 1635                  }
1633 1636                  if (trunc)
1634 1637                          last = dn->dn_maxblkid >> epbs;
1635 1638                  else
1636 1639                          last = (blkid + nblks - 1) >> epbs;
1637 1640                  if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1638 1641                          dbuf_will_dirty(db, tx);
1639 1642                          dbuf_rele(db, FTAG);
1640 1643                  }
1641 1644                  for (i = first + 1; i < last; i++) {
1642 1645                          uint64_t ibyte = i << shift;
1643 1646                          int err;
1644 1647  
1645 1648                          err = dnode_next_offset(dn,
1646 1649                              DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
1647 1650                          i = ibyte >> shift;
1648 1651                          if (err == ESRCH || i >= last)
1649 1652                                  break;
1650 1653                          ASSERT(err == 0);
1651 1654                          db = dbuf_hold_level(dn, 1, i, FTAG);
1652 1655                          if (db) {
1653 1656                                  dbuf_will_dirty(db, tx);
1654 1657                                  dbuf_rele(db, FTAG);
1655 1658                          }
1656 1659                  }
1657 1660          }
1658 1661  done:
1659 1662          /*
1660 1663           * Add this range to the dnode range list.
1661 1664           * We will finish up this free operation in the syncing phase.
1662 1665           */
1663 1666          mutex_enter(&dn->dn_mtx);
1664 1667          dnode_clear_range(dn, blkid, nblks, tx);
1665 1668          {
1666 1669                  free_range_t *rp, *found;
1667 1670                  avl_index_t where;
1668 1671                  avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1669 1672  
1670 1673                  /* Add new range to dn_ranges */
1671 1674                  rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1672 1675                  rp->fr_blkid = blkid;
1673 1676                  rp->fr_nblks = nblks;
1674 1677                  found = avl_find(tree, rp, &where);
1675 1678                  ASSERT(found == NULL);
1676 1679                  avl_insert(tree, rp, where);
1677 1680                  dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1678 1681                      blkid, nblks, tx->tx_txg);
1679 1682          }
1680 1683          mutex_exit(&dn->dn_mtx);
1681 1684  
1682 1685          dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1683 1686          dnode_setdirty(dn, tx);
1684 1687  out:
1685 1688          if (trunc && dn->dn_maxblkid >= (off >> blkshift))
1686 1689                  dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
1687 1690  
1688 1691          rw_exit(&dn->dn_struct_rwlock);
1689 1692  }
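
           /*
            * Illustrative sketch (not part of dnode.c; names hypothetical) of
            * the block-alignment arithmetic in dnode_free_range() for a
            * power-of-two block size: "head" is the partial data at the front
            * of the range, "tail" the partial data at the back, and only the
            * whole blocks in between are actually freed.  For example, with
            * blksz == 4096, off == 6000 and len == 10000 split into
            * head 2192, one whole block, and tail 3712.
            */
           static void
           split_free_range(uint64_t off, uint64_t len, uint64_t blksz,
               uint64_t *head, uint64_t *tail, uint64_t *nblks)
           {
                   /* bytes from off up to the next block boundary (P2NPHASE) */
                   *head = (blksz - (off % blksz)) % blksz;
                   if (*head > len)
                           *head = len;
                   off += *head;
                   len -= *head;
                   /* bytes past the last whole block boundary (P2PHASE) */
                   *tail = len % blksz;
                   *nblks = (len - *tail) / blksz;
           }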
1690 1693  
1691 1694  static boolean_t
1692 1695  dnode_spill_freed(dnode_t *dn)
1693 1696  {
1694 1697          int i;
1695 1698  
1696 1699          mutex_enter(&dn->dn_mtx);
1697 1700          for (i = 0; i < TXG_SIZE; i++) {
1698 1701                  if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
1699 1702                          break;
1700 1703          }
1701 1704          mutex_exit(&dn->dn_mtx);
1702 1705          return (i < TXG_SIZE);
1703 1706  }
1704 1707  
1705 1708  /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
1706 1709  uint64_t
1707 1710  dnode_block_freed(dnode_t *dn, uint64_t blkid)
1708 1711  {
1709 1712          free_range_t range_tofind;
1710 1713          void *dp = spa_get_dsl(dn->dn_objset->os_spa);
1711 1714          int i;
1712 1715  
1713 1716          if (blkid == DMU_BONUS_BLKID)
1714 1717                  return (FALSE);
1715 1718  
1716 1719          /*
1717 1720           * If we're in the process of opening the pool, dp will not be
1718 1721           * set yet, but there shouldn't be anything dirty.
1719 1722           */
1720 1723          if (dp == NULL)
1721 1724                  return (FALSE);
1722 1725  
1723 1726          if (dn->dn_free_txg)
1724 1727                  return (TRUE);
1725 1728  
1726 1729          if (blkid == DMU_SPILL_BLKID)
1727 1730                  return (dnode_spill_freed(dn));
1728 1731  
1729 1732          range_tofind.fr_blkid = blkid;
1730 1733          mutex_enter(&dn->dn_mtx);
1731 1734          for (i = 0; i < TXG_SIZE; i++) {
1732 1735                  free_range_t *range_found;
1733 1736                  avl_index_t idx;
1734 1737  
1735 1738                  range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
1736 1739                  if (range_found) {
1737 1740                          ASSERT(range_found->fr_nblks > 0);
1738 1741                          break;
1739 1742                  }
1740 1743                  range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
1741 1744                  if (range_found &&
1742 1745                      range_found->fr_blkid + range_found->fr_nblks > blkid)
1743 1746                          break;
1744 1747          }
1745 1748          mutex_exit(&dn->dn_mtx);
1746 1749          return (i < TXG_SIZE);
1747 1750  }
1748 1751  
1749 1752  /* call from syncing context when we actually write/free space for this dnode */
1750 1753  void
1751 1754  dnode_diduse_space(dnode_t *dn, int64_t delta)
1752 1755  {
1753 1756          uint64_t space;
1754 1757          dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
1755 1758              dn, dn->dn_phys,
1756 1759              (u_longlong_t)dn->dn_phys->dn_used,
1757 1760              (longlong_t)delta);
1758 1761  
1759 1762          mutex_enter(&dn->dn_mtx);
1760 1763          space = DN_USED_BYTES(dn->dn_phys);
1761 1764          if (delta > 0) {
1762 1765                  ASSERT3U(space + delta, >=, space); /* no overflow */
1763 1766          } else {
1764 1767                  ASSERT3U(space, >=, -delta); /* no underflow */
1765 1768          }
1766 1769          space += delta;
1767 1770          if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
1768 1771                  ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
1769 1772                  ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
1770 1773                  dn->dn_phys->dn_used = space >> DEV_BSHIFT;
1771 1774          } else {
1772 1775                  dn->dn_phys->dn_used = space;
1773 1776                  dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
1774 1777          }
1775 1778          mutex_exit(&dn->dn_mtx);
1776 1779  }
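
           /*
            * Illustrative sketch (not part of dnode.c): on pools older than
            * SPA_VERSION_DNODE_BYTES, dn_used stores 512-byte sectors rather
            * than bytes, which is why dnode_diduse_space() shifts by
            * DEV_BSHIFT before storing.  Reading the value back follows the
            * same convention (roughly what the DN_USED_BYTES() macro used
            * above does):
            */
           static uint64_t
           used_bytes(const dnode_phys_t *dnp)
           {
                   if (dnp->dn_flags & DNODE_FLAG_USED_BYTES)
                           return (dnp->dn_used);                  /* already bytes */
                   return (dnp->dn_used << DEV_BSHIFT);            /* sectors -> bytes */
           }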
1777 1780  
1778 1781  /*
1779 1782   * Call when we think we're going to write/free space in open context.
1780 1783   * Be conservative (ie. OK to write less than this or free more than
1781 1784   * this, but don't write more or free less).
1782 1785   */
1783 1786  void
1784 1787  dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
1785 1788  {
1786 1789          objset_t *os = dn->dn_objset;
1787 1790          dsl_dataset_t *ds = os->os_dsl_dataset;
1788 1791  
1789 1792          if (space > 0)
1790 1793                  space = spa_get_asize(os->os_spa, space);
1791 1794  
1792 1795          if (ds)
1793 1796                  dsl_dir_willuse_space(ds->ds_dir, space, tx);
1794 1797  
1795 1798          dmu_tx_willuse_space(tx, space);
1796 1799  }
1797 1800  
1798 1801  /*
1799 1802   * This function scans a block at the indicated "level" looking for
1800 1803   * a hole or data (depending on 'flags').  If level > 0, then we are
1801 1804   * scanning an indirect block looking at its pointers.  If level == 0,
1802 1805   * then we are looking at a block of dnodes.  If we don't find what we
1803 1806   * are looking for in the block, we return ESRCH.  Otherwise, return
1804 1807   * with *offset pointing to the beginning (if searching forwards) or
1805 1808   * end (if searching backwards) of the range covered by the block
1806 1809   * pointer we matched on (or dnode).
1807 1810   *
1808 1811   * The basic search algorithm used below by dnode_next_offset() is to
1809 1812   * use this function to search up the block tree (widen the search) until
1810 1813   * we find something (i.e., we don't return ESRCH) and then search back
1811 1814   * down the tree (narrow the search) until we reach our original search
1812 1815   * level.
1813 1816   */
1814 1817  static int
1815 1818  dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
1816 1819          int lvl, uint64_t blkfill, uint64_t txg)
1817 1820  {
1818 1821          dmu_buf_impl_t *db = NULL;
1819 1822          void *data = NULL;
1820 1823          uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1821 1824          uint64_t epb = 1ULL << epbs;
1822 1825          uint64_t minfill, maxfill;
1823 1826          boolean_t hole;
1824 1827          int i, inc, error, span;
1825 1828  
1826 1829          dprintf("probing object %llu offset %llx level %d of %u\n",
1827 1830              dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
1828 1831  
1829 1832          hole = ((flags & DNODE_FIND_HOLE) != 0);
1830 1833          inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
1831 1834          ASSERT(txg == 0 || !hole);
1832 1835  
1833 1836          if (lvl == dn->dn_phys->dn_nlevels) {
1834 1837                  error = 0;
1835 1838                  epb = dn->dn_phys->dn_nblkptr;
1836 1839                  data = dn->dn_phys->dn_blkptr;
1837 1840          } else {
1838 1841                  uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
1839 1842                  error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
1840 1843                  if (error) {
1841 1844                          if (error != ENOENT)
1842 1845                                  return (error);
1843 1846                          if (hole)
1844 1847                                  return (0);
1845 1848                          /*
1846 1849                           * This can only happen when we are searching up
1847 1850                           * the block tree for data.  We don't really need to
1848 1851                           * adjust the offset, as we will just end up looking
1849 1852                           * at the pointer to this block in its parent, and it's
1850 1853                           * going to be unallocated, so we will skip over it.
1851 1854                           */
1852 1855                          return (ESRCH);
1853 1856                  }
1854 1857                  error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
1855 1858                  if (error) {
1856 1859                          dbuf_rele(db, FTAG);
1857 1860                          return (error);
1858 1861                  }
1859 1862                  data = db->db.db_data;
1860 1863          }
1861 1864  
1862 1865          if (db && txg &&
1863 1866              (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
1864 1867                  /*
1865 1868                   * This can only happen when we are searching up the tree
1866 1869                   * and these conditions mean that we need to keep climbing.
1867 1870                   */
1868 1871                  error = ESRCH;
1869 1872          } else if (lvl == 0) {
1870 1873                  dnode_phys_t *dnp = data;
1871 1874                  span = DNODE_SHIFT;
1872 1875                  ASSERT(dn->dn_type == DMU_OT_DNODE);
1873 1876  
1874 1877                  for (i = (*offset >> span) & (blkfill - 1);
1875 1878                      i >= 0 && i < blkfill; i += inc) {
1876 1879                          if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
1877 1880                                  break;
1878 1881                          *offset += (1ULL << span) * inc;
1879 1882                  }
1880 1883                  if (i < 0 || i == blkfill)
1881 1884                          error = ESRCH;
1882 1885          } else {
1883 1886                  blkptr_t *bp = data;
1884 1887                  uint64_t start = *offset;
1885 1888                  span = (lvl - 1) * epbs + dn->dn_datablkshift;
1886 1889                  minfill = 0;
1887 1890                  maxfill = blkfill << ((lvl - 1) * epbs);
1888 1891  
1889 1892                  if (hole)
1890 1893                          maxfill--;
1891 1894                  else
1892 1895                          minfill++;
1893 1896  
1894 1897                  *offset = *offset >> span;
1895 1898                  for (i = BF64_GET(*offset, 0, epbs);
1896 1899                      i >= 0 && i < epb; i += inc) {
1897 1900                          if (bp[i].blk_fill >= minfill &&
1898 1901                              bp[i].blk_fill <= maxfill &&
1899 1902                              (hole || bp[i].blk_birth > txg))
1900 1903                                  break;
1901 1904                          if (inc > 0 || *offset > 0)
1902 1905                                  *offset += inc;
1903 1906                  }
1904 1907                  *offset = *offset << span;
1905 1908                  if (inc < 0) {
1906 1909                          /* traversing backwards; position offset at the end */
1907 1910                          ASSERT3U(*offset, <=, start);
1908 1911                          *offset = MIN(*offset + (1ULL << span) - 1, start);
1909 1912                  } else if (*offset < start) {
1910 1913                          *offset = start;
1911 1914                  }
1912 1915                  if (i < 0 || i >= epb)
1913 1916                          error = ESRCH;
1914 1917          }
1915 1918  
1916 1919          if (db)
1917 1920                  dbuf_rele(db, FTAG);
1918 1921  
1919 1922          return (error);
1920 1923  }
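
           /*
            * Illustrative sketch (not part of dnode.c) of the "span" computed
            * above: when scanning an indirect block at level lvl, each block
            * pointer in it covers 2^((lvl - 1) * epbs + datablkshift) bytes
            * of the object.  With 128K data blocks (shift 17) and epbs == 7,
            * an L1 pointer covers 128K and an L2 pointer covers 16M.
            */
           static uint64_t
           bp_span(int datablkshift, int epbs, int lvl)
           {
                   return (1ULL << ((lvl - 1) * epbs + datablkshift));
           }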
1921 1924  
1922 1925  /*
1923 1926   * Find the next hole, data, or sparse region at or after *offset.
1924 1927   * The value 'blkfill' tells us how many items we expect to find
1925 1928   * in an L0 data block; this value is 1 for normal objects,
1926 1929   * DNODES_PER_BLOCK for the meta dnode, and some fraction of
1927 1930   * DNODES_PER_BLOCK when searching for sparse regions thereof.
1928 1931   *
1929 1932   * Examples:
1930 1933   *
1931 1934   * dnode_next_offset(dn, flags, offset, 1, 1, 0);
1932 1935   *      Finds the next/previous hole/data in a file.
1933 1936   *      Used in dmu_offset_next().
1934 1937   *
1935 1938   * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
1936 1939   *      Finds the next free/allocated dnode in an objset's meta-dnode.
1937 1940   *      Only finds objects that have new contents since txg (ie.
1938 1941   *      bonus buffer changes and content removal are ignored).
1939 1942   *      Used in dmu_object_next().
1940 1943   *
1941 1944   * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
1942 1945   *      Finds the next L2 meta-dnode bp that's at most 1/4 full.
1943 1946   *      Used in dmu_object_alloc().
1944 1947   */
1945 1948  int
1946 1949  dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
1947 1950      int minlvl, uint64_t blkfill, uint64_t txg)
1948 1951  {
1949 1952          uint64_t initial_offset = *offset;
1950 1953          int lvl, maxlvl;
1951 1954          int error = 0;
1952 1955  
1953 1956          if (!(flags & DNODE_FIND_HAVELOCK))
1954 1957                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
1955 1958  
1956 1959          if (dn->dn_phys->dn_nlevels == 0) {
1957 1960                  error = ESRCH;
1958 1961                  goto out;
1959 1962          }
1960 1963  
1961 1964          if (dn->dn_datablkshift == 0) {
1962 1965                  if (*offset < dn->dn_datablksz) {
1963 1966                          if (flags & DNODE_FIND_HOLE)
1964 1967                                  *offset = dn->dn_datablksz;
1965 1968                  } else {
1966 1969                          error = ESRCH;
1967 1970                  }
1968 1971                  goto out;
1969 1972          }
1970 1973  
1971 1974          maxlvl = dn->dn_phys->dn_nlevels;
1972 1975  
1973 1976          for (lvl = minlvl; lvl <= maxlvl; lvl++) {
1974 1977                  error = dnode_next_offset_level(dn,
1975 1978                      flags, offset, lvl, blkfill, txg);
1976 1979                  if (error != ESRCH)
1977 1980                          break;
1978 1981          }
1979 1982  
1980 1983          while (error == 0 && --lvl >= minlvl) {
1981 1984                  error = dnode_next_offset_level(dn,
1982 1985                      flags, offset, lvl, blkfill, txg);
1983 1986          }
1984 1987  
1985 1988          if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
1986 1989              initial_offset < *offset : initial_offset > *offset))
1987 1990                  error = ESRCH;
1988 1991  out:
1989 1992          if (!(flags & DNODE_FIND_HAVELOCK))
1990 1993                  rw_exit(&dn->dn_struct_rwlock);
1991 1994  
1992 1995          return (error);
1993 1996  }
  