5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
    
          --- old/usr/src/uts/common/fs/zfs/dnode.c
          +++ new/usr/src/uts/common/fs/zfs/dnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/zfs_context.h>
  27   28  #include <sys/dbuf.h>
  28   29  #include <sys/dnode.h>
  29   30  #include <sys/dmu.h>
  30   31  #include <sys/dmu_impl.h>
  31   32  #include <sys/dmu_tx.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dir.h>
  34   35  #include <sys/dsl_dataset.h>
  35   36  #include <sys/spa.h>
  36   37  #include <sys/zio.h>
  37   38  #include <sys/dmu_zfetch.h>
  38   39  #include <sys/range_tree.h>
  39   40  
  40   41  static kmem_cache_t *dnode_cache;
  41   42  /*
  42   43   * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  43   44   * turned on when DEBUG is also defined.
  44   45   */
  45   46  #ifdef  DEBUG
  46   47  #define DNODE_STATS
  47   48  #endif  /* DEBUG */
  48   49  
  49   50  #ifdef  DNODE_STATS
  50   51  #define DNODE_STAT_ADD(stat)                    ((stat)++)
  51   52  #else
  52   53  #define DNODE_STAT_ADD(stat)                    /* nothing */
  53   54  #endif  /* DNODE_STATS */
  54   55  
  55   56  static dnode_phys_t dnode_phys_zero;
  56   57  
  57   58  int zfs_default_bs = SPA_MINBLOCKSHIFT;
  58   59  int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
  59   60  
  60   61  static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
  61   62  
  62   63  static int
  63   64  dbuf_compare(const void *x1, const void *x2)
  64   65  {
  65   66          const dmu_buf_impl_t *d1 = x1;
  66   67          const dmu_buf_impl_t *d2 = x2;
  67   68  
  68   69          if (d1->db_level < d2->db_level) {
  69   70                  return (-1);
  70   71          }
  71   72          if (d1->db_level > d2->db_level) {
  72   73                  return (1);
  73   74          }
  74   75  
  75   76          if (d1->db_blkid < d2->db_blkid) {
  76   77                  return (-1);
  77   78          }
  78   79          if (d1->db_blkid > d2->db_blkid) {
  79   80                  return (1);
  80   81          }
  81   82  
  82   83          if (d1->db_state == DB_SEARCH) {
  83   84                  ASSERT3S(d2->db_state, !=, DB_SEARCH);
  84   85                  return (-1);
  85   86          } else if (d2->db_state == DB_SEARCH) {
  86   87                  ASSERT3S(d1->db_state, !=, DB_SEARCH);
  87   88                  return (1);
  88   89          }
  89   90  
  90   91          if ((uintptr_t)d1 < (uintptr_t)d2) {
  91   92                  return (-1);
  92   93          }
  93   94          if ((uintptr_t)d1 > (uintptr_t)d2) {
  94   95                  return (1);
  95   96          }
  96   97          return (0);
  97   98  }
  98   99  
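The comparator above keys each dnode's dbuf AVL tree on (db_level, db_blkid), breaking ties by address so equal keys still order deterministically; a DB_SEARCH sentinel sorts strictly before every real dbuf with the same key, which lets an ordered lookup land on the first matching dbuf. A minimal userland sketch of the sentinel trick, with made-up names (not the ZFS or illumos AVL API):

#include <stdio.h>
#include <stdint.h>

enum buf_state { BUF_REAL, BUF_SEARCH };

struct buf {
	int		b_level;
	uint64_t	b_blkid;
	enum buf_state	b_state;
};

static int
buf_compare(const struct buf *a, const struct buf *b)
{
	if (a->b_level != b->b_level)
		return (a->b_level < b->b_level ? -1 : 1);
	if (a->b_blkid != b->b_blkid)
		return (a->b_blkid < b->b_blkid ? -1 : 1);
	/* The sentinel sorts before any real buf with the same key. */
	if (a->b_state == BUF_SEARCH)
		return (-1);
	if (b->b_state == BUF_SEARCH)
		return (1);
	if ((uintptr_t)a != (uintptr_t)b)
		return ((uintptr_t)a < (uintptr_t)b ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct buf bufs[] = {
		{ 0, 1, BUF_REAL }, { 0, 2, BUF_REAL },
		{ 0, 2, BUF_REAL }, { 1, 0, BUF_REAL }
	};
	struct buf key = { 0, 2, BUF_SEARCH };
	int i;

	/* A linear lower bound stands in for avl_find(..., AVL_AFTER). */
	for (i = 0; i < 4; i++)
		if (buf_compare(&key, &bufs[i]) < 0)
			break;
	printf("first (level 0, blkid 2) buf is at index %d\n", i);
	return (0);
}
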
  99  100  /* ARGSUSED */
 100  101  static int
 101  102  dnode_cons(void *arg, void *unused, int kmflag)
 102  103  {
 103  104          dnode_t *dn = arg;
 104  105          int i;
 105  106  
 106  107          rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
 107  108          mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
 108  109          mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 109  110          cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 110  111  
 111  112          /*
 112  113           * Every dbuf has a reference, and dropping a tracked reference is
 113  114           * O(number of references), so don't track dn_holds.
 114  115           */
 115  116          refcount_create_untracked(&dn->dn_holds);
 116  117          refcount_create(&dn->dn_tx_holds);
 117  118          list_link_init(&dn->dn_link);
 118  119  
 119  120          bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
 120  121          bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
 121  122          bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
 122  123          bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
 123  124          bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
 124  125          bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
 125  126          bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 126  127  
 127  128          for (i = 0; i < TXG_SIZE; i++) {
 128  129                  list_link_init(&dn->dn_dirty_link[i]);
 129  130                  dn->dn_free_ranges[i] = NULL;
 130  131                  list_create(&dn->dn_dirty_records[i],
 131  132                      sizeof (dbuf_dirty_record_t),
 132  133                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
 133  134          }
 134  135  
 135  136          dn->dn_allocated_txg = 0;
 136  137          dn->dn_free_txg = 0;
 137  138          dn->dn_assigned_txg = 0;
 138  139          dn->dn_dirtyctx = 0;
 139  140          dn->dn_dirtyctx_firstset = NULL;
 140  141          dn->dn_bonus = NULL;
 141  142          dn->dn_have_spill = B_FALSE;
 142  143          dn->dn_zio = NULL;
 143  144          dn->dn_oldused = 0;
 144  145          dn->dn_oldflags = 0;
 145  146          dn->dn_olduid = 0;
 146  147          dn->dn_oldgid = 0;
 147  148          dn->dn_newuid = 0;
 148  149          dn->dn_newgid = 0;
 149  150          dn->dn_id_flags = 0;
 150  151  
 151  152          dn->dn_dbufs_count = 0;
 152  153          dn->dn_unlisted_l0_blkid = 0;
 153  154          avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 154  155              offsetof(dmu_buf_impl_t, db_link));
 155  156  
 156  157          dn->dn_moved = 0;
 157  158          return (0);
 158  159  }
 159  160  
 160  161  /* ARGSUSED */
 161  162  static void
 162  163  dnode_dest(void *arg, void *unused)
 163  164  {
 164  165          int i;
 165  166          dnode_t *dn = arg;
 166  167  
 167  168          rw_destroy(&dn->dn_struct_rwlock);
 168  169          mutex_destroy(&dn->dn_mtx);
 169  170          mutex_destroy(&dn->dn_dbufs_mtx);
 170  171          cv_destroy(&dn->dn_notxholds);
 171  172          refcount_destroy(&dn->dn_holds);
 172  173          refcount_destroy(&dn->dn_tx_holds);
 173  174          ASSERT(!list_link_active(&dn->dn_link));
 174  175  
 175  176          for (i = 0; i < TXG_SIZE; i++) {
 176  177                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 177  178                  ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 178  179                  list_destroy(&dn->dn_dirty_records[i]);
 179  180                  ASSERT0(dn->dn_next_nblkptr[i]);
 180  181                  ASSERT0(dn->dn_next_nlevels[i]);
 181  182                  ASSERT0(dn->dn_next_indblkshift[i]);
 182  183                  ASSERT0(dn->dn_next_bonustype[i]);
 183  184                  ASSERT0(dn->dn_rm_spillblk[i]);
 184  185                  ASSERT0(dn->dn_next_bonuslen[i]);
 185  186                  ASSERT0(dn->dn_next_blksz[i]);
 186  187          }
 187  188  
 188  189          ASSERT0(dn->dn_allocated_txg);
 189  190          ASSERT0(dn->dn_free_txg);
 190  191          ASSERT0(dn->dn_assigned_txg);
 191  192          ASSERT0(dn->dn_dirtyctx);
 192  193          ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 193  194          ASSERT3P(dn->dn_bonus, ==, NULL);
 194  195          ASSERT(!dn->dn_have_spill);
 195  196          ASSERT3P(dn->dn_zio, ==, NULL);
 196  197          ASSERT0(dn->dn_oldused);
 197  198          ASSERT0(dn->dn_oldflags);
 198  199          ASSERT0(dn->dn_olduid);
 199  200          ASSERT0(dn->dn_oldgid);
 200  201          ASSERT0(dn->dn_newuid);
 201  202          ASSERT0(dn->dn_newgid);
 202  203          ASSERT0(dn->dn_id_flags);
 203  204  
 204  205          ASSERT0(dn->dn_dbufs_count);
 205  206          ASSERT0(dn->dn_unlisted_l0_blkid);
 206  207          avl_destroy(&dn->dn_dbufs);
 207  208  }
 208  209  
 209  210  void
 210  211  dnode_init(void)
 211  212  {
 212  213          ASSERT(dnode_cache == NULL);
 213  214          dnode_cache = kmem_cache_create("dnode_t",
 214  215              sizeof (dnode_t),
 215  216              0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 216  217          kmem_cache_set_move(dnode_cache, dnode_move);
 217  218  }
 218  219  
 219  220  void
 220  221  dnode_fini(void)
 221  222  {
 222  223          kmem_cache_destroy(dnode_cache);
 223  224          dnode_cache = NULL;
 224  225  }
 225  226  
 226  227  
 227  228  #ifdef ZFS_DEBUG
 228  229  void
 229  230  dnode_verify(dnode_t *dn)
 230  231  {
 231  232          int drop_struct_lock = FALSE;
 232  233  
 233  234          ASSERT(dn->dn_phys);
 234  235          ASSERT(dn->dn_objset);
 235  236          ASSERT(dn->dn_handle->dnh_dnode == dn);
 236  237  
 237  238          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 238  239  
 239  240          if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 240  241                  return;
 241  242  
 242  243          if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 243  244                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 244  245                  drop_struct_lock = TRUE;
 245  246          }
 246  247          if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 247  248                  int i;
 248  249                  ASSERT3U(dn->dn_indblkshift, >=, 0);
 249  250                  ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 250  251                  if (dn->dn_datablkshift) {
 251  252                          ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 252  253                          ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 253  254                          ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 254  255                  }
 255  256                  ASSERT3U(dn->dn_nlevels, <=, 30);
 256  257                  ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 257  258                  ASSERT3U(dn->dn_nblkptr, >=, 1);
 258  259                  ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 259  260                  ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 260  261                  ASSERT3U(dn->dn_datablksz, ==,
 261  262                      dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 262  263                  ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 263  264                  ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 264  265                      dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 265  266                  for (i = 0; i < TXG_SIZE; i++) {
 266  267                          ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 267  268                  }
 268  269          }
 269  270          if (dn->dn_phys->dn_type != DMU_OT_NONE)
 270  271                  ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 271  272          ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 272  273          if (dn->dn_dbuf != NULL) {
 273  274                  ASSERT3P(dn->dn_phys, ==,
 274  275                      (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 275  276                      (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 276  277          }
 277  278          if (drop_struct_lock)
 278  279                  rw_exit(&dn->dn_struct_rwlock);
 279  280  }
 280  281  #endif
 281  282  
 282  283  void
 283  284  dnode_byteswap(dnode_phys_t *dnp)
 284  285  {
 285  286          uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 286  287          int i;
 287  288  
 288  289          if (dnp->dn_type == DMU_OT_NONE) {
 289  290                  bzero(dnp, sizeof (dnode_phys_t));
 290  291                  return;
 291  292          }
 292  293  
 293  294          dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 294  295          dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 295  296          dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 296  297          dnp->dn_used = BSWAP_64(dnp->dn_used);
 297  298  
 298  299          /*
 299  300           * dn_nblkptr is only one byte, so it's OK to read it in either
  300  301           * byte order.  We can't read dn_bonuslen.
 301  302           */
 302  303          ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 303  304          ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 304  305          for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 305  306                  buf64[i] = BSWAP_64(buf64[i]);
 306  307  
 307  308          /*
 308  309           * OK to check dn_bonuslen for zero, because it won't matter if
 309  310           * we have the wrong byte order.  This is necessary because the
 310  311           * dnode dnode is smaller than a regular dnode.
 311  312           */
 312  313          if (dnp->dn_bonuslen != 0) {
 313  314                  /*
 314  315                   * Note that the bonus length calculated here may be
 315  316                   * longer than the actual bonus buffer.  This is because
 316  317                   * we always put the bonus buffer after the last block
 317  318                   * pointer (instead of packing it against the end of the
 318  319                   * dnode buffer).
 319  320                   */
 320  321                  int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 321  322                  size_t len = DN_MAX_BONUSLEN - off;
 322  323                  ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
 323  324                  dmu_object_byteswap_t byteswap =
 324  325                      DMU_OT_BYTESWAP(dnp->dn_bonustype);
 325  326                  dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 326  327          }
 327  328  
 328  329          /* Swap SPILL block if we have one */
 329  330          if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 330  331                  byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
 331  332  
 332  333  }
 333  334  
 334  335  void
 335  336  dnode_buf_byteswap(void *vbuf, size_t size)
 336  337  {
 337  338          dnode_phys_t *buf = vbuf;
 338  339          int i;
 339  340  
 340  341          ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 341  342          ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 342  343  
 343  344          size >>= DNODE_SHIFT;
 344  345          for (i = 0; i < size; i++) {
 345  346                  dnode_byteswap(buf);
 346  347                  buf++;
 347  348          }
 348  349  }
 349  350  
 350  351  void
 351  352  dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 352  353  {
 353  354          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 354  355  
 355  356          dnode_setdirty(dn, tx);
 356  357          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 357  358          ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 358  359              (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 359  360          dn->dn_bonuslen = newsize;
 360  361          if (newsize == 0)
 361  362                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
 362  363          else
 363  364                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 364  365          rw_exit(&dn->dn_struct_rwlock);
 365  366  }
 366  367  
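dnode_setbonuslen() stages the pending value in a per-txg slot: because TXG_SIZE is a power of two (4 in ZFS), tx_txg & TXG_MASK maps the 64-bit txg onto a small ring, giving each open transaction group its own pending state. A sketch of that indexing, with made-up structure and function names:

#include <stdint.h>

#define	TXG_SIZE	4		/* power of two, as in ZFS */
#define	TXG_MASK	(TXG_SIZE - 1)

struct obj {
	int	o_next_len[TXG_SIZE];	/* pending value, one per open txg */
};

static void
set_pending_len(struct obj *o, uint64_t txg, int len)
{
	/* txg & TXG_MASK maps the 64-bit txg onto a small ring of slots. */
	o->o_next_len[txg & TXG_MASK] = len;
}
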
 367  368  void
 368  369  dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 369  370  {
 370  371          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 371  372          dnode_setdirty(dn, tx);
 372  373          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 373  374          dn->dn_bonustype = newtype;
 374  375          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 375  376          rw_exit(&dn->dn_struct_rwlock);
 376  377  }
 377  378  
 378  379  void
 379  380  dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 380  381  {
 381  382          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 382  383          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 383  384          dnode_setdirty(dn, tx);
 384  385          dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
 385  386          dn->dn_have_spill = B_FALSE;
 386  387  }
 387  388  
 388  389  static void
 389  390  dnode_setdblksz(dnode_t *dn, int size)
 390  391  {
 391  392          ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 392  393          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 393  394          ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 394  395          ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
  
  
 395  396              1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 396  397          dn->dn_datablksz = size;
 397  398          dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 398  399          dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 399  400  }
 400  401  
 401  402  static dnode_t *
 402  403  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 403  404      uint64_t object, dnode_handle_t *dnh)
 404  405  {
 405      -        dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
      406 +        dnode_t *dn;
 406  407  
      408 +        dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 407  409          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 408  410          dn->dn_moved = 0;
 409  411  
 410  412          /*
 411  413           * Defer setting dn_objset until the dnode is ready to be a candidate
 412  414           * for the dnode_move() callback.
 413  415           */
 414  416          dn->dn_object = object;
 415  417          dn->dn_dbuf = db;
 416  418          dn->dn_handle = dnh;
 417  419          dn->dn_phys = dnp;
 418  420  
 419  421          if (dnp->dn_datablkszsec) {
 420  422                  dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 421  423          } else {
 422  424                  dn->dn_datablksz = 0;
 423  425                  dn->dn_datablkszsec = 0;
 424  426                  dn->dn_datablkshift = 0;
 425  427          }
 426  428          dn->dn_indblkshift = dnp->dn_indblkshift;
 427  429          dn->dn_nlevels = dnp->dn_nlevels;
 428  430          dn->dn_type = dnp->dn_type;
 429  431          dn->dn_nblkptr = dnp->dn_nblkptr;
 430  432          dn->dn_checksum = dnp->dn_checksum;
 431  433          dn->dn_compress = dnp->dn_compress;
 432  434          dn->dn_bonustype = dnp->dn_bonustype;
  
  
 433  435          dn->dn_bonuslen = dnp->dn_bonuslen;
 434  436          dn->dn_maxblkid = dnp->dn_maxblkid;
 435  437          dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 436  438          dn->dn_id_flags = 0;
 437  439  
 438  440          dmu_zfetch_init(&dn->dn_zfetch, dn);
 439  441  
 440  442          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 441  443  
 442  444          mutex_enter(&os->os_lock);
 443      -        list_insert_head(&os->os_dnodes, dn);
      445 +        if (dnh->dnh_dnode != NULL) {
      446 +                /* Lost the allocation race. */
      447 +                mutex_exit(&os->os_lock);
      448 +                kmem_cache_free(dnode_cache, dn);
      449 +                return (dnh->dnh_dnode);
      450 +        }
      451 +
      452 +        /*
      453 +         * Exclude special dnodes from os_dnodes so an empty os_dnodes
      454 +         * signifies that the special dnodes have no references from
      455 +         * their children (the entries in os_dnodes).  This allows
      456 +         * dnode_destroy() to easily determine if the last child has
      457 +         * been removed and then complete eviction of the objset.
      458 +         */
      459 +        if (!DMU_OBJECT_IS_SPECIAL(object))
      460 +                list_insert_head(&os->os_dnodes, dn);
 444  461          membar_producer();
      462 +
 445  463          /*
 446      -         * Everything else must be valid before assigning dn_objset makes the
 447      -         * dnode eligible for dnode_move().
      464 +         * Everything else must be valid before assigning dn_objset
      465 +         * makes the dnode eligible for dnode_move().
 448  466           */
 449  467          dn->dn_objset = os;
      468 +
      469 +        dnh->dnh_dnode = dn;
 450  470          mutex_exit(&os->os_lock);
 451  471  
 452  472          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 453  473          return (dn);
 454  474  }
 455  475  
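dnode_create() now resolves allocation races itself, under os_lock: a loser frees its freshly constructed dnode and adopts the published winner, and the winner sets dnh->dnh_dnode before dropping the lock, which is what lets dnode_hold_impl() (further below) shed its old atomic_cas_ptr() dance. A compact userland sketch of the publish-under-lock pattern, with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct handle {
	pthread_mutex_t	h_lock;
	void		*h_obj;		/* published object, or NULL */
};

static void *
handle_get_or_create(struct handle *h, size_t size)
{
	void *obj = calloc(1, size);	/* construct before taking the lock */
	void *winner;

	pthread_mutex_lock(&h->h_lock);
	if (h->h_obj != NULL) {
		/* Lost the allocation race; discard ours. */
		winner = h->h_obj;
		pthread_mutex_unlock(&h->h_lock);
		free(obj);
		return (winner);
	}
	h->h_obj = obj;			/* publish while the lock is held */
	pthread_mutex_unlock(&h->h_lock);
	return (obj);
}
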
 456  476  /*
 457  477   * Caller must be holding the dnode handle, which is released upon return.
 458  478   */
 459  479  static void
 460  480  dnode_destroy(dnode_t *dn)
 461  481  {
 462  482          objset_t *os = dn->dn_objset;
      483 +        boolean_t complete_os_eviction = B_FALSE;
 463  484  
 464  485          ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 465  486  
 466  487          mutex_enter(&os->os_lock);
 467  488          POINTER_INVALIDATE(&dn->dn_objset);
 468      -        list_remove(&os->os_dnodes, dn);
      489 +        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
      490 +                list_remove(&os->os_dnodes, dn);
      491 +                complete_os_eviction =
      492 +                    list_is_empty(&os->os_dnodes) &&
      493 +                    list_link_active(&os->os_evicting_node);
      494 +        }
 469  495          mutex_exit(&os->os_lock);
 470  496  
 471  497          /* the dnode can no longer move, so we can release the handle */
 472  498          zrl_remove(&dn->dn_handle->dnh_zrlock);
 473  499  
 474  500          dn->dn_allocated_txg = 0;
 475  501          dn->dn_free_txg = 0;
 476  502          dn->dn_assigned_txg = 0;
 477  503  
 478  504          dn->dn_dirtyctx = 0;
 479  505          if (dn->dn_dirtyctx_firstset != NULL) {
 480  506                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 481  507                  dn->dn_dirtyctx_firstset = NULL;
 482  508          }
 483  509          if (dn->dn_bonus != NULL) {
 484  510                  mutex_enter(&dn->dn_bonus->db_mtx);
 485  511                  dbuf_evict(dn->dn_bonus);
 486  512                  dn->dn_bonus = NULL;
 487  513          }
 488  514          dn->dn_zio = NULL;
 489  515  
 490  516          dn->dn_have_spill = B_FALSE;
 491  517          dn->dn_oldused = 0;
 492  518          dn->dn_oldflags = 0;
  
  
 493  519          dn->dn_olduid = 0;
 494  520          dn->dn_oldgid = 0;
 495  521          dn->dn_newuid = 0;
 496  522          dn->dn_newgid = 0;
 497  523          dn->dn_id_flags = 0;
 498  524          dn->dn_unlisted_l0_blkid = 0;
 499  525  
 500  526          dmu_zfetch_rele(&dn->dn_zfetch);
 501  527          kmem_cache_free(dnode_cache, dn);
 502  528          arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
      529 +
      530 +        if (complete_os_eviction)
      531 +                dmu_objset_evict_done(os);
 503  532  }
 504  533  
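dnode_destroy() computes, while still under os_lock, whether it removed the last entry from an objset that is waiting to be evicted, but calls dmu_objset_evict_done() only after all locks are dropped, since that callback may tear the objset down. The defer-past-the-lock idiom in a standalone sketch, with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct container {
	pthread_mutex_t	c_lock;
	int		c_count;	/* stands in for os_dnodes */
	int		c_evicting;	/* owner is waiting on eviction */
};

/* Hypothetical completion hook; models dmu_objset_evict_done(). */
static void
container_evict_done(struct container *c)
{
	pthread_mutex_destroy(&c->c_lock);
	free(c);
}

static void
child_destroy(struct container *c)
{
	int last;

	pthread_mutex_lock(&c->c_lock);
	c->c_count--;			/* list_remove() of this child */
	last = (c->c_count == 0 && c->c_evicting);
	pthread_mutex_unlock(&c->c_lock);

	/* Complete eviction only after c_lock has been dropped. */
	if (last)
		container_evict_done(c);
}
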
 505  534  void
 506  535  dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 507  536      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 508  537  {
 509  538          int i;
 510  539  
 511  540          ASSERT3U(blocksize, <=,
 512  541              spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 513  542          if (blocksize == 0)
 514  543                  blocksize = 1 << zfs_default_bs;
 515  544          else
 516  545                  blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 517  546  
 518  547          if (ibs == 0)
 519  548                  ibs = zfs_default_ibs;
 520  549  
 521  550          ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 522  551  
 523  552          dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
 524  553              dn->dn_object, tx->tx_txg, blocksize, ibs);
 525  554  
 526  555          ASSERT(dn->dn_type == DMU_OT_NONE);
 527  556          ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 528  557          ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 529  558          ASSERT(ot != DMU_OT_NONE);
 530  559          ASSERT(DMU_OT_IS_VALID(ot));
 531  560          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 532  561              (bonustype == DMU_OT_SA && bonuslen == 0) ||
 533  562              (bonustype != DMU_OT_NONE && bonuslen != 0));
 534  563          ASSERT(DMU_OT_IS_VALID(bonustype));
 535  564          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 536  565          ASSERT(dn->dn_type == DMU_OT_NONE);
 537  566          ASSERT0(dn->dn_maxblkid);
 538  567          ASSERT0(dn->dn_allocated_txg);
 539  568          ASSERT0(dn->dn_assigned_txg);
 540  569          ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 541  570          ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
 542  571          ASSERT(avl_is_empty(&dn->dn_dbufs));
 543  572  
 544  573          for (i = 0; i < TXG_SIZE; i++) {
 545  574                  ASSERT0(dn->dn_next_nblkptr[i]);
 546  575                  ASSERT0(dn->dn_next_nlevels[i]);
 547  576                  ASSERT0(dn->dn_next_indblkshift[i]);
 548  577                  ASSERT0(dn->dn_next_bonuslen[i]);
 549  578                  ASSERT0(dn->dn_next_bonustype[i]);
 550  579                  ASSERT0(dn->dn_rm_spillblk[i]);
 551  580                  ASSERT0(dn->dn_next_blksz[i]);
 552  581                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 553  582                  ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 554  583                  ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 555  584          }
 556  585  
 557  586          dn->dn_type = ot;
 558  587          dnode_setdblksz(dn, blocksize);
 559  588          dn->dn_indblkshift = ibs;
 560  589          dn->dn_nlevels = 1;
 561  590          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 562  591                  dn->dn_nblkptr = 1;
 563  592          else
 564  593                  dn->dn_nblkptr = 1 +
 565  594                      ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 566  595          dn->dn_bonustype = bonustype;
 567  596          dn->dn_bonuslen = bonuslen;
 568  597          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 569  598          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 570  599          dn->dn_dirtyctx = 0;
 571  600  
 572  601          dn->dn_free_txg = 0;
 573  602          if (dn->dn_dirtyctx_firstset) {
 574  603                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 575  604                  dn->dn_dirtyctx_firstset = NULL;
 576  605          }
 577  606  
 578  607          dn->dn_allocated_txg = tx->tx_txg;
 579  608          dn->dn_id_flags = 0;
 580  609  
 581  610          dnode_setdirty(dn, tx);
 582  611          dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 583  612          dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 584  613          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 585  614          dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 586  615  }
 587  616  
 588  617  void
 589  618  dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 590  619      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 591  620  {
 592  621          int nblkptr;
 593  622  
 594  623          ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 595  624          ASSERT3U(blocksize, <=,
 596  625              spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 597  626          ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 598  627          ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 599  628          ASSERT(tx->tx_txg != 0);
 600  629          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 601  630              (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 602  631              (bonustype == DMU_OT_SA && bonuslen == 0));
 603  632          ASSERT(DMU_OT_IS_VALID(bonustype));
 604  633          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 605  634  
 606  635          /* clean up any unreferenced dbufs */
 607  636          dnode_evict_dbufs(dn);
 608  637  
 609  638          dn->dn_id_flags = 0;
 610  639  
 611  640          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 612  641          dnode_setdirty(dn, tx);
 613  642          if (dn->dn_datablksz != blocksize) {
 614  643                  /* change blocksize */
 615  644                  ASSERT(dn->dn_maxblkid == 0 &&
 616  645                      (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 617  646                      dnode_block_freed(dn, 0)));
 618  647                  dnode_setdblksz(dn, blocksize);
 619  648                  dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 620  649          }
 621  650          if (dn->dn_bonuslen != bonuslen)
 622  651                  dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 623  652  
 624  653          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 625  654                  nblkptr = 1;
 626  655          else
 627  656                  nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 628  657          if (dn->dn_bonustype != bonustype)
 629  658                  dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 630  659          if (dn->dn_nblkptr != nblkptr)
 631  660                  dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
 632  661          if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 633  662                  dbuf_rm_spill(dn, tx);
 634  663                  dnode_rm_spill(dn, tx);
 635  664          }
 636  665          rw_exit(&dn->dn_struct_rwlock);
 637  666  
 638  667          /* change type */
 639  668          dn->dn_type = ot;
 640  669  
 641  670          /* change bonus size and type */
 642  671          mutex_enter(&dn->dn_mtx);
 643  672          dn->dn_bonustype = bonustype;
 644  673          dn->dn_bonuslen = bonuslen;
 645  674          dn->dn_nblkptr = nblkptr;
 646  675          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 647  676          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 648  677          ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 649  678  
 650  679          /* fix up the bonus db_size */
 651  680          if (dn->dn_bonus) {
 652  681                  dn->dn_bonus->db.db_size =
 653  682                      DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 654  683                  ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 655  684          }
 656  685  
 657  686          dn->dn_allocated_txg = tx->tx_txg;
 658  687          mutex_exit(&dn->dn_mtx);
 659  688  }
 660  689  
 661  690  #ifdef  DNODE_STATS
 662  691  static struct {
 663  692          uint64_t dms_dnode_invalid;
 664  693          uint64_t dms_dnode_recheck1;
 665  694          uint64_t dms_dnode_recheck2;
 666  695          uint64_t dms_dnode_special;
 667  696          uint64_t dms_dnode_handle;
 668  697          uint64_t dms_dnode_rwlock;
 669  698          uint64_t dms_dnode_active;
 670  699  } dnode_move_stats;
 671  700  #endif  /* DNODE_STATS */
 672  701  
 673  702  static void
 674  703  dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 675  704  {
 676  705          int i;
 677  706  
 678  707          ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 679  708          ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 680  709          ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 681  710          ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
 682  711  
 683  712          /* Copy fields. */
 684  713          ndn->dn_objset = odn->dn_objset;
 685  714          ndn->dn_object = odn->dn_object;
 686  715          ndn->dn_dbuf = odn->dn_dbuf;
 687  716          ndn->dn_handle = odn->dn_handle;
 688  717          ndn->dn_phys = odn->dn_phys;
 689  718          ndn->dn_type = odn->dn_type;
 690  719          ndn->dn_bonuslen = odn->dn_bonuslen;
 691  720          ndn->dn_bonustype = odn->dn_bonustype;
 692  721          ndn->dn_nblkptr = odn->dn_nblkptr;
 693  722          ndn->dn_checksum = odn->dn_checksum;
 694  723          ndn->dn_compress = odn->dn_compress;
 695  724          ndn->dn_nlevels = odn->dn_nlevels;
 696  725          ndn->dn_indblkshift = odn->dn_indblkshift;
 697  726          ndn->dn_datablkshift = odn->dn_datablkshift;
 698  727          ndn->dn_datablkszsec = odn->dn_datablkszsec;
 699  728          ndn->dn_datablksz = odn->dn_datablksz;
 700  729          ndn->dn_maxblkid = odn->dn_maxblkid;
 701  730          bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 702  731              sizeof (odn->dn_next_nblkptr));
 703  732          bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
 704  733              sizeof (odn->dn_next_nlevels));
 705  734          bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
 706  735              sizeof (odn->dn_next_indblkshift));
 707  736          bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
 708  737              sizeof (odn->dn_next_bonustype));
 709  738          bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 710  739              sizeof (odn->dn_rm_spillblk));
 711  740          bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 712  741              sizeof (odn->dn_next_bonuslen));
 713  742          bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 714  743              sizeof (odn->dn_next_blksz));
 715  744          for (i = 0; i < TXG_SIZE; i++) {
 716  745                  list_move_tail(&ndn->dn_dirty_records[i],
 717  746                      &odn->dn_dirty_records[i]);
 718  747          }
 719  748          bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
 720  749              sizeof (odn->dn_free_ranges));
 721  750          ndn->dn_allocated_txg = odn->dn_allocated_txg;
 722  751          ndn->dn_free_txg = odn->dn_free_txg;
 723  752          ndn->dn_assigned_txg = odn->dn_assigned_txg;
 724  753          ndn->dn_dirtyctx = odn->dn_dirtyctx;
 725  754          ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 726  755          ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 727  756          refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 728  757          ASSERT(avl_is_empty(&ndn->dn_dbufs));
 729  758          avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 730  759          ndn->dn_dbufs_count = odn->dn_dbufs_count;
 731  760          ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
 732  761          ndn->dn_bonus = odn->dn_bonus;
 733  762          ndn->dn_have_spill = odn->dn_have_spill;
 734  763          ndn->dn_zio = odn->dn_zio;
 735  764          ndn->dn_oldused = odn->dn_oldused;
 736  765          ndn->dn_oldflags = odn->dn_oldflags;
 737  766          ndn->dn_olduid = odn->dn_olduid;
 738  767          ndn->dn_oldgid = odn->dn_oldgid;
 739  768          ndn->dn_newuid = odn->dn_newuid;
 740  769          ndn->dn_newgid = odn->dn_newgid;
 741  770          ndn->dn_id_flags = odn->dn_id_flags;
 742  771          dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 743  772          list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 744  773          ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
 745  774          ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
 746  775          ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 747  776  
 748  777          /*
 749  778           * Update back pointers. Updating the handle fixes the back pointer of
 750  779           * every descendant dbuf as well as the bonus dbuf.
 751  780           */
 752  781          ASSERT(ndn->dn_handle->dnh_dnode == odn);
 753  782          ndn->dn_handle->dnh_dnode = ndn;
 754  783          if (ndn->dn_zfetch.zf_dnode == odn) {
 755  784                  ndn->dn_zfetch.zf_dnode = ndn;
 756  785          }
 757  786  
 758  787          /*
 759  788           * Invalidate the original dnode by clearing all of its back pointers.
 760  789           */
 761  790          odn->dn_dbuf = NULL;
 762  791          odn->dn_handle = NULL;
 763  792          avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 764  793              offsetof(dmu_buf_impl_t, db_link));
 765  794          odn->dn_dbufs_count = 0;
 766  795          odn->dn_unlisted_l0_blkid = 0;
 767  796          odn->dn_bonus = NULL;
 768  797          odn->dn_zfetch.zf_dnode = NULL;
 769  798  
 770  799          /*
 771  800           * Set the low bit of the objset pointer to ensure that dnode_move()
 772  801           * recognizes the dnode as invalid in any subsequent callback.
 773  802           */
 774  803          POINTER_INVALIDATE(&odn->dn_objset);
 775  804  
 776  805          /*
 777  806           * Satisfy the destructor.
 778  807           */
 779  808          for (i = 0; i < TXG_SIZE; i++) {
 780  809                  list_create(&odn->dn_dirty_records[i],
 781  810                      sizeof (dbuf_dirty_record_t),
 782  811                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
 783  812                  odn->dn_free_ranges[i] = NULL;
 784  813                  odn->dn_next_nlevels[i] = 0;
 785  814                  odn->dn_next_indblkshift[i] = 0;
 786  815                  odn->dn_next_bonustype[i] = 0;
 787  816                  odn->dn_rm_spillblk[i] = 0;
 788  817                  odn->dn_next_bonuslen[i] = 0;
 789  818                  odn->dn_next_blksz[i] = 0;
 790  819          }
 791  820          odn->dn_allocated_txg = 0;
 792  821          odn->dn_free_txg = 0;
 793  822          odn->dn_assigned_txg = 0;
 794  823          odn->dn_dirtyctx = 0;
 795  824          odn->dn_dirtyctx_firstset = NULL;
 796  825          odn->dn_have_spill = B_FALSE;
 797  826          odn->dn_zio = NULL;
 798  827          odn->dn_oldused = 0;
 799  828          odn->dn_oldflags = 0;
 800  829          odn->dn_olduid = 0;
 801  830          odn->dn_oldgid = 0;
 802  831          odn->dn_newuid = 0;
 803  832          odn->dn_newgid = 0;
 804  833          odn->dn_id_flags = 0;
 805  834  
 806  835          /*
 807  836           * Mark the dnode.
 808  837           */
 809  838          ndn->dn_moved = 1;
 810  839          odn->dn_moved = (uint8_t)-1;
 811  840  }
 812  841  
 813  842  #ifdef  _KERNEL
 814  843  /*ARGSUSED*/
 815  844  static kmem_cbrc_t
 816  845  dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 817  846  {
 818  847          dnode_t *odn = buf, *ndn = newbuf;
 819  848          objset_t *os;
 820  849          int64_t refcount;
 821  850          uint32_t dbufs;
 822  851  
 823  852          /*
 824  853           * The dnode is on the objset's list of known dnodes if the objset
 825  854           * pointer is valid. We set the low bit of the objset pointer when
 826  855           * freeing the dnode to invalidate it, and the memory patterns written
 827  856           * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 828  857           * A newly created dnode sets the objset pointer last of all to indicate
 829  858           * that the dnode is known and in a valid state to be moved by this
 830  859           * function.
 831  860           */
 832  861          os = odn->dn_objset;
 833  862          if (!POINTER_IS_VALID(os)) {
 834  863                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
 835  864                  return (KMEM_CBRC_DONT_KNOW);
 836  865          }
 837  866  
 838  867          /*
 839  868           * Ensure that the objset does not go away during the move.
 840  869           */
 841  870          rw_enter(&os_lock, RW_WRITER);
 842  871          if (os != odn->dn_objset) {
 843  872                  rw_exit(&os_lock);
 844  873                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
 845  874                  return (KMEM_CBRC_DONT_KNOW);
 846  875          }
 847  876  
 848  877          /*
 849  878           * If the dnode is still valid, then so is the objset. We know that no
 850  879           * valid objset can be freed while we hold os_lock, so we can safely
 851  880           * ensure that the objset remains in use.
 852  881           */
 853  882          mutex_enter(&os->os_lock);
 854  883  
 855  884          /*
 856  885           * Recheck the objset pointer in case the dnode was removed just before
 857  886           * acquiring the lock.
 858  887           */
 859  888          if (os != odn->dn_objset) {
 860  889                  mutex_exit(&os->os_lock);
 861  890                  rw_exit(&os_lock);
 862  891                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
 863  892                  return (KMEM_CBRC_DONT_KNOW);
 864  893          }
 865  894  
 866  895          /*
 867  896           * At this point we know that as long as we hold os->os_lock, the dnode
 868  897           * cannot be freed and fields within the dnode can be safely accessed.
 869  898           * The objset listing this dnode cannot go away as long as this dnode is
 870  899           * on its list.
 871  900           */
 872  901          rw_exit(&os_lock);
 873  902          if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 874  903                  mutex_exit(&os->os_lock);
 875  904                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
 876  905                  return (KMEM_CBRC_NO);
 877  906          }
 878  907          ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 879  908  
 880  909          /*
 881  910           * Lock the dnode handle to prevent the dnode from obtaining any new
 882  911           * holds. This also prevents the descendant dbufs and the bonus dbuf
 883  912           * from accessing the dnode, so that we can discount their holds. The
 884  913           * handle is safe to access because we know that while the dnode cannot
 885  914           * go away, neither can its handle. Once we hold dnh_zrlock, we can
 886  915           * safely move any dnode referenced only by dbufs.
 887  916           */
 888  917          if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 889  918                  mutex_exit(&os->os_lock);
 890  919                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
 891  920                  return (KMEM_CBRC_LATER);
 892  921          }
 893  922  
 894  923          /*
 895  924           * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 896  925           * We need to guarantee that there is a hold for every dbuf in order to
 897  926           * determine whether the dnode is actively referenced. Falsely matching
 898  927           * a dbuf to an active hold would lead to an unsafe move. It's possible
 899  928           * that a thread already having an active dnode hold is about to add a
 900  929           * dbuf, and we can't compare hold and dbuf counts while the add is in
 901  930           * progress.
 902  931           */
 903  932          if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 904  933                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 905  934                  mutex_exit(&os->os_lock);
 906  935                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
 907  936                  return (KMEM_CBRC_LATER);
 908  937          }
 909  938  
 910  939          /*
 911  940           * A dbuf may be removed (evicted) without an active dnode hold. In that
 912  941           * case, the dbuf count is decremented under the handle lock before the
 913  942           * dbuf's hold is released. This order ensures that if we count the hold
 914  943           * after the dbuf is removed but before its hold is released, we will
 915  944           * treat the unmatched hold as active and exit safely. If we count the
 916  945           * hold before the dbuf is removed, the hold is discounted, and the
 917  946           * removal is blocked until the move completes.
 918  947           */
 919  948          refcount = refcount_count(&odn->dn_holds);
 920  949          ASSERT(refcount >= 0);
 921  950          dbufs = odn->dn_dbufs_count;
 922  951  
 923  952          /* We can't have more dbufs than dnode holds. */
 924  953          ASSERT3U(dbufs, <=, refcount);
 925  954          DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 926  955              uint32_t, dbufs);
 927  956  
 928  957          if (refcount > dbufs) {
 929  958                  rw_exit(&odn->dn_struct_rwlock);
 930  959                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 931  960                  mutex_exit(&os->os_lock);
 932  961                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
 933  962                  return (KMEM_CBRC_LATER);
 934  963          }
 935  964  
 936  965          rw_exit(&odn->dn_struct_rwlock);
 937  966  
 938  967          /*
 939  968           * At this point we know that anyone with a hold on the dnode is not
 940  969           * actively referencing it. The dnode is known and in a valid state to
 941  970           * move. We're holding the locks needed to execute the critical section.
 942  971           */
 943  972          dnode_move_impl(odn, ndn);
 944  973  
 945  974          list_link_replace(&odn->dn_link, &ndn->dn_link);
 946  975          /* If the dnode was safe to move, the refcount cannot have changed. */
 947  976          ASSERT(refcount == refcount_count(&ndn->dn_holds));
 948  977          ASSERT(dbufs == ndn->dn_dbufs_count);
 949  978          zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 950  979          mutex_exit(&os->os_lock);
 951  980  
 952  981          return (KMEM_CBRC_YES);
 953  982  }
 954  983  #endif  /* _KERNEL */
 955  984  
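Because dnode_move() runs as a kmem relocation callback it may never block, so every lock is taken with a try-variant and the move is deferred (KMEM_CBRC_LATER) whenever one is contended or an unmatched hold reveals an active user. A userland sketch of that back-off discipline, with made-up names:

#include <pthread.h>

enum cbrc { CBRC_YES, CBRC_LATER };

static enum cbrc
try_move(pthread_mutex_t *handle_lock, pthread_rwlock_t *struct_lock,
    long holds, long dbufs)
{
	if (pthread_mutex_trylock(handle_lock) != 0)
		return (CBRC_LATER);	/* like zrl_tryenter() failing */
	if (pthread_rwlock_trywrlock(struct_lock) != 0) {
		pthread_mutex_unlock(handle_lock);
		return (CBRC_LATER);	/* like rw_tryenter() failing */
	}
	if (holds > dbufs) {
		/* An unmatched hold means an active user; back off. */
		pthread_rwlock_unlock(struct_lock);
		pthread_mutex_unlock(handle_lock);
		return (CBRC_LATER);
	}
	/* ... relocate the object here ... */
	pthread_rwlock_unlock(struct_lock);
	pthread_mutex_unlock(handle_lock);
	return (CBRC_YES);
}
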
 956  985  void
 957  986  dnode_special_close(dnode_handle_t *dnh)
 958  987  {
  
  
 959  988          dnode_t *dn = dnh->dnh_dnode;
 960  989  
 961  990          /*
 962  991           * Wait for final references to the dnode to clear.  This can
  963  992           * only happen if the arc is asynchronously evicting state that
 964  993           * has a hold on this dnode while we are trying to evict this
 965  994           * dnode.
 966  995           */
 967  996          while (refcount_count(&dn->dn_holds) > 0)
 968  997                  delay(1);
      998 +        ASSERT(dn->dn_dbuf == NULL ||
      999 +            dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
 969 1000          zrl_add(&dnh->dnh_zrlock);
 970 1001          dnode_destroy(dn); /* implicit zrl_remove() */
 971 1002          zrl_destroy(&dnh->dnh_zrlock);
 972 1003          dnh->dnh_dnode = NULL;
 973 1004  }
 974 1005  
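dnode_special_close() above spins until asynchronous ARC eviction drops its final hold, since only then is it safe to destroy the dnode. A trivial sketch of the wait, with made-up names (delay(1) in the kernel plays the role of usleep() here):

#include <unistd.h>
#include <stdatomic.h>

static void
wait_for_holds(_Atomic long *holds)
{
	/* Spin until concurrent users drop their final holds. */
	while (atomic_load(holds) > 0)
		usleep(1000);
}
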
 975      -dnode_t *
     1006 +void
 976 1007  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 977 1008      dnode_handle_t *dnh)
 978 1009  {
 979      -        dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
 980      -        dnh->dnh_dnode = dn;
     1010 +        dnode_t *dn;
     1011 +
     1012 +        dn = dnode_create(os, dnp, NULL, object, dnh);
 981 1013          zrl_init(&dnh->dnh_zrlock);
 982 1014          DNODE_VERIFY(dn);
 983      -        return (dn);
 984 1015  }
 985 1016  
 986 1017  static void
 987      -dnode_buf_pageout(dmu_buf_t *db, void *arg)
     1018 +dnode_buf_pageout(void *dbu)
 988 1019  {
 989      -        dnode_children_t *children_dnodes = arg;
     1020 +        dnode_children_t *children_dnodes = dbu;
 990 1021          int i;
 991      -        int epb = db->db_size >> DNODE_SHIFT;
 992 1022  
 993      -        ASSERT(epb == children_dnodes->dnc_count);
 994      -
 995      -        for (i = 0; i < epb; i++) {
     1023 +        for (i = 0; i < children_dnodes->dnc_count; i++) {
 996 1024                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 997 1025                  dnode_t *dn;
 998 1026  
 999 1027                  /*
1000 1028                   * The dnode handle lock guards against the dnode moving to
1001 1029                   * another valid address, so there is no need here to guard
1002 1030                   * against changes to or from NULL.
1003 1031                   */
1004 1032                  if (dnh->dnh_dnode == NULL) {
1005 1033                          zrl_destroy(&dnh->dnh_zrlock);
1006 1034                          continue;
1007 1035                  }
1008 1036  
1009 1037                  zrl_add(&dnh->dnh_zrlock);
1010 1038                  dn = dnh->dnh_dnode;
1011 1039                  /*
1012 1040                   * If there are holds on this dnode, then there should
1013 1041                   * be holds on the dnode's containing dbuf as well; thus
1014 1042                   * it wouldn't be eligible for eviction and this function
  
  
1015 1043                   * would not have been called.
1016 1044                   */
1017 1045                  ASSERT(refcount_is_zero(&dn->dn_holds));
1018 1046                  ASSERT(refcount_is_zero(&dn->dn_tx_holds));
1019 1047  
1020 1048                  dnode_destroy(dn); /* implicit zrl_remove() */
1021 1049                  zrl_destroy(&dnh->dnh_zrlock);
1022 1050                  dnh->dnh_dnode = NULL;
1023 1051          }
1024 1052          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1025      -            epb * sizeof (dnode_handle_t));
     1053 +            children_dnodes->dnc_count * sizeof (dnode_handle_t));
1026 1054  }
1027 1055  
1028 1056  /*
1029 1057   * errors:
1030 1058   * EINVAL - invalid object number.
1031 1059   * EIO - i/o error.
1032 1060   * succeeds even for free dnodes.
1033 1061   */
1034 1062  int
1035 1063  dnode_hold_impl(objset_t *os, uint64_t object, int flag,
1036 1064      void *tag, dnode_t **dnp)
1037 1065  {
1038 1066          int epb, idx, err;
1039 1067          int drop_struct_lock = FALSE;
1040 1068          int type;
1041 1069          uint64_t blk;
1042 1070          dnode_t *mdn, *dn;
1043 1071          dmu_buf_impl_t *db;
1044 1072          dnode_children_t *children_dnodes;
1045 1073          dnode_handle_t *dnh;
1046 1074  
1047 1075          /*
1048 1076           * If you are holding the spa config lock as writer, you shouldn't
1049 1077           * be asking the DMU to do *anything* unless it's the root pool
1050 1078           * which may require us to read from the root filesystem while
1051 1079           * holding some (not all) of the locks as writer.
1052 1080           */
1053 1081          ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1054 1082              (spa_is_root(os->os_spa) &&
1055 1083              spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1056 1084  
1057 1085          if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
1058 1086                  dn = (object == DMU_USERUSED_OBJECT) ?
1059 1087                      DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
1060 1088                  if (dn == NULL)
1061 1089                          return (SET_ERROR(ENOENT));
1062 1090                  type = dn->dn_type;
1063 1091                  if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1064 1092                          return (SET_ERROR(ENOENT));
1065 1093                  if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1066 1094                          return (SET_ERROR(EEXIST));
1067 1095                  DNODE_VERIFY(dn);
1068 1096                  (void) refcount_add(&dn->dn_holds, tag);
1069 1097                  *dnp = dn;
1070 1098                  return (0);
1071 1099          }
1072 1100  
1073 1101          if (object == 0 || object >= DN_MAX_OBJECT)
1074 1102                  return (SET_ERROR(EINVAL));
1075 1103  
1076 1104          mdn = DMU_META_DNODE(os);
1077 1105          ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1078 1106  
1079 1107          DNODE_VERIFY(mdn);
1080 1108  
1081 1109          if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1082 1110                  rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1083 1111                  drop_struct_lock = TRUE;
1084 1112          }
1085 1113  
1086 1114          blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
1087 1115  
1088 1116          db = dbuf_hold(mdn, blk, FTAG);
1089 1117          if (drop_struct_lock)
1090 1118                  rw_exit(&mdn->dn_struct_rwlock);
1091 1119          if (db == NULL)
1092 1120                  return (SET_ERROR(EIO));
1093 1121          err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1094 1122          if (err) {
1095 1123                  dbuf_rele(db, FTAG);
1096 1124                  return (err);
1097 1125          }
1098 1126  
  
  
1099 1127          ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1100 1128          epb = db->db.db_size >> DNODE_SHIFT;
1101 1129  
1102 1130          idx = object & (epb-1);
1103 1131  
1104 1132          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1105 1133          children_dnodes = dmu_buf_get_user(&db->db);
1106 1134          if (children_dnodes == NULL) {
1107 1135                  int i;
1108 1136                  dnode_children_t *winner;
1109      -                children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
     1137 +                children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
1110 1138                      epb * sizeof (dnode_handle_t), KM_SLEEP);
1111 1139                  children_dnodes->dnc_count = epb;
1112 1140                  dnh = &children_dnodes->dnc_children[0];
1113 1141                  for (i = 0; i < epb; i++) {
1114 1142                          zrl_init(&dnh[i].dnh_zrlock);
1115      -                        dnh[i].dnh_dnode = NULL;
1116 1143                  }
1117      -                if (winner = dmu_buf_set_user(&db->db, children_dnodes,
1118      -                    dnode_buf_pageout)) {
     1144 +                dmu_buf_init_user(&children_dnodes->dnc_dbu,
     1145 +                    dnode_buf_pageout, NULL);
     1146 +                winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
     1147 +                if (winner != NULL) {
1119 1148  
1120 1149                          for (i = 0; i < epb; i++) {
1121 1150                                  zrl_destroy(&dnh[i].dnh_zrlock);
1122 1151                          }
1123 1152  
1124 1153                          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1125 1154                              epb * sizeof (dnode_handle_t));
1126 1155                          children_dnodes = winner;
1127 1156                  }
1128 1157          }
1129 1158          ASSERT(children_dnodes->dnc_count == epb);
1130 1159  
1131 1160          dnh = &children_dnodes->dnc_children[idx];
1132 1161          zrl_add(&dnh->dnh_zrlock);
1133      -        if ((dn = dnh->dnh_dnode) == NULL) {
     1162 +        dn = dnh->dnh_dnode;
     1163 +        if (dn == NULL) {
1134 1164                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1135      -                dnode_t *winner;
1136 1165  
1137 1166                  dn = dnode_create(os, phys, db, object, dnh);
1138      -                winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
1139      -                if (winner != NULL) {
1140      -                        zrl_add(&dnh->dnh_zrlock);
1141      -                        dnode_destroy(dn); /* implicit zrl_remove() */
1142      -                        dn = winner;
1143      -                }
1144 1167          }
1145 1168  
1146 1169          mutex_enter(&dn->dn_mtx);
1147 1170          type = dn->dn_type;
1148 1171          if (dn->dn_free_txg ||
1149 1172              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1150 1173              ((flag & DNODE_MUST_BE_FREE) &&
1151 1174              (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1152 1175                  mutex_exit(&dn->dn_mtx);
1153 1176                  zrl_remove(&dnh->dnh_zrlock);
1154 1177                  dbuf_rele(db, FTAG);
1155 1178                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1156 1179          }
1157      -        mutex_exit(&dn->dn_mtx);
1158      -
1159 1180          if (refcount_add(&dn->dn_holds, tag) == 1)
1160 1181                  dbuf_add_ref(db, dnh);
     1182 +        mutex_exit(&dn->dn_mtx);
     1183 +
1161 1184          /* Now we can rely on the hold to prevent the dnode from moving. */
1162 1185          zrl_remove(&dnh->dnh_zrlock);
1163 1186  
1164 1187          DNODE_VERIFY(dn);
1165 1188          ASSERT3P(dn->dn_dbuf, ==, db);
1166 1189          ASSERT3U(dn->dn_object, ==, object);
1167 1190          dbuf_rele(db, FTAG);
1168 1191  
1169 1192          *dnp = dn;
1170 1193          return (0);
1171 1194  }
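
In the revised dnode_hold_impl(), refcount_add() now runs before mutex_exit(&dn->dn_mtx), so the state check and the hold are taken under a single lock acquisition instead of two, and the old atomic_cas_ptr() winner/loser path for dnh_dnode is dropped along with it. A minimal userland sketch of the check-then-hold-under-one-lock pattern; obj_t and obj_try_hold() are hypothetical stand-ins, not ZFS interfaces:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct obj {
        pthread_mutex_t o_lock;         /* plays the role of dn_mtx */
        int             o_holds;        /* plays the role of dn_holds */
        bool            o_freeing;      /* roughly, dn_free_txg != 0 */
} obj_t;

/*
 * Check the object's state and take the hold under a single lock
 * acquisition.  Releasing the lock between the check and the
 * increment would leave a window in which another thread could drop
 * the last hold and start teardown.
 */
static bool
obj_try_hold(obj_t *o)
{
        bool ok;

        pthread_mutex_lock(&o->o_lock);
        ok = !o->o_freeing;
        if (ok)
                o->o_holds++;
        pthread_mutex_unlock(&o->o_lock);
        return (ok);
}

int
main(void)
{
        obj_t o = { PTHREAD_MUTEX_INITIALIZER, 0, false };

        printf("held=%d holds=%d\n", obj_try_hold(&o), o.o_holds);
        return (0);
}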
1172 1195  
1173 1196  /*
1174 1197   * Return held dnode if the object is allocated, NULL if not.
1175 1198   */
1176 1199  int
1177 1200  dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1178 1201  {
1179 1202          return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
1180 1203  }
1181 1204  
1182 1205  /*
1183 1206   * Can only add a reference if there is already at least one
1184 1207   * reference on the dnode.  Returns FALSE if unable to add a
1185 1208   * new reference.
1186 1209   */
1187 1210  boolean_t
1188 1211  dnode_add_ref(dnode_t *dn, void *tag)
1189 1212  {
1190 1213          mutex_enter(&dn->dn_mtx);
1191 1214          if (refcount_is_zero(&dn->dn_holds)) {
1192 1215                  mutex_exit(&dn->dn_mtx);
1193 1216                  return (FALSE);
1194 1217          }
1195 1218          VERIFY(1 < refcount_add(&dn->dn_holds, tag));
1196 1219          mutex_exit(&dn->dn_mtx);
1197 1220          return (TRUE);
1198 1221  }
1199 1222  
1200 1223  void
1201 1224  dnode_rele(dnode_t *dn, void *tag)
1202 1225  {
1203 1226          uint64_t refs;
1204 1227          /* Get while the hold prevents the dnode from moving. */
1205 1228          dmu_buf_impl_t *db = dn->dn_dbuf;
1206 1229          dnode_handle_t *dnh = dn->dn_handle;
1207 1230  
1208 1231          mutex_enter(&dn->dn_mtx);
1209 1232          refs = refcount_remove(&dn->dn_holds, tag);
1210 1233          mutex_exit(&dn->dn_mtx);
1211 1234  
1212 1235          /*
1213 1236           * It's unsafe to release the last hold on a dnode by dnode_rele() or
1214 1237           * indirectly by dbuf_rele() while relying on the dnode handle to
1215 1238           * prevent the dnode from moving, since releasing the last hold could
1216 1239           * result in the dnode's parent dbuf evicting its dnode handles. For
1217 1240           * that reason anyone calling dnode_rele() or dbuf_rele() without some
1218 1241           * other direct or indirect hold on the dnode must first drop the dnode
1219 1242           * handle.
1220 1243           */
1221 1244          ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1222 1245  
1223 1246          /* NOTE: special dnodes such as the meta-dnode have no dn_dbuf */
1224 1247          if (refs == 0 && db != NULL) {
1225 1248                  /*
1226 1249                   * Another thread could add a hold to the dnode handle in
1227 1250                   * dnode_hold_impl() while holding the parent dbuf. Since the
1228 1251                   * hold on the parent dbuf prevents the handle from being
1229 1252                   * destroyed, the hold on the handle is OK. We can't yet assert
1230 1253                   * that the handle has zero references, but that will be
1231 1254                   * asserted anyway when the handle gets destroyed.
1232 1255                   */
1233 1256                  dbuf_rele(db, dnh);
1234 1257          }
1235 1258  }
1236 1259  
1237 1260  void
1238 1261  dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1239 1262  {
1240 1263          objset_t *os = dn->dn_objset;
1241 1264          uint64_t txg = tx->tx_txg;
1242 1265  
1243 1266          if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1244 1267                  dsl_dataset_dirty(os->os_dsl_dataset, tx);
1245 1268                  return;
1246 1269          }
1247 1270  
1248 1271          DNODE_VERIFY(dn);
1249 1272  
1250 1273  #ifdef ZFS_DEBUG
1251 1274          mutex_enter(&dn->dn_mtx);
1252 1275          ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1253 1276          ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1254 1277          mutex_exit(&dn->dn_mtx);
1255 1278  #endif
1256 1279  
1257 1280          /*
1258 1281           * Determine old uid/gid when necessary
1259 1282           */
1260 1283          dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1261 1284  
1262 1285          mutex_enter(&os->os_lock);
1263 1286  
1264 1287          /*
1265 1288           * If we are already marked dirty, we're done.
1266 1289           */
1267 1290          if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1268 1291                  mutex_exit(&os->os_lock);
1269 1292                  return;
1270 1293          }
1271 1294  
1272 1295          ASSERT(!refcount_is_zero(&dn->dn_holds) ||
1273 1296              !avl_is_empty(&dn->dn_dbufs));
1274 1297          ASSERT(dn->dn_datablksz != 0);
1275 1298          ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
1276 1299          ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
1277 1300          ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1278 1301  
1279 1302          dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1280 1303              dn->dn_object, txg);
1281 1304  
1282 1305          if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
1283 1306                  list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
1284 1307          } else {
1285 1308                  list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
1286 1309          }
1287 1310  
1288 1311          mutex_exit(&os->os_lock);
1289 1312  
1290 1313          /*
1291 1314           * The dnode maintains a hold on its containing dbuf as
1292 1315           * long as there are holds on it.  Each instantiated child
1293 1316           * dbuf maintains a hold on the dnode.  When the last child
1294 1317           * drops its hold, the dnode will drop its hold on the
1295 1318           * containing dbuf. We add a "dirty hold" here so that the
1296 1319           * dnode will hang around after we finish processing its
1297 1320           * children.
1298 1321           */
1299 1322          VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1300 1323  
1301 1324          (void) dbuf_dirty(dn->dn_dbuf, tx);
1302 1325  
1303 1326          dsl_dataset_dirty(os->os_dsl_dataset, tx);
1304 1327  }
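
dnode_setdirty() files the dnode on a per-txg list indexed by (txg & TXG_MASK): because at most TXG_SIZE transaction groups are in flight at once, the low bits of the txg number select a slot that is free of stale state by the time the counter wraps onto it again. A small demonstration of the slot arithmetic, assuming the illumos value TXG_SIZE == 4:

#include <stdio.h>
#include <stdint.h>

#define TXG_SIZE        4               /* illumos value */
#define TXG_MASK        (TXG_SIZE - 1)

int
main(void)
{
        uint64_t txg;

        /* Consecutive txgs cycle through the TXG_SIZE list slots. */
        for (txg = 20; txg < 25; txg++)
                printf("txg %llu -> slot %llu\n",
                    (unsigned long long)txg,
                    (unsigned long long)(txg & TXG_MASK));
        return (0);
}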
1305 1328  
1306 1329  void
1307 1330  dnode_free(dnode_t *dn, dmu_tx_t *tx)
1308 1331  {
1309 1332          int txgoff = tx->tx_txg & TXG_MASK;
1310 1333  
1311 1334          dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
1312 1335  
1313 1336          /* we should be the only holder... hopefully */
1314 1337          /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
1315 1338  
1316 1339          mutex_enter(&dn->dn_mtx);
1317 1340          if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1318 1341                  mutex_exit(&dn->dn_mtx);
1319 1342                  return;
1320 1343          }
1321 1344          dn->dn_free_txg = tx->tx_txg;
1322 1345          mutex_exit(&dn->dn_mtx);
1323 1346  
1324 1347          /*
1325 1348           * If the dnode is already dirty, it needs to be moved from
1326 1349           * the dirty list to the free list.
1327 1350           */
1328 1351          mutex_enter(&dn->dn_objset->os_lock);
1329 1352          if (list_link_active(&dn->dn_dirty_link[txgoff])) {
1330 1353                  list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
1331 1354                  list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
1332 1355                  mutex_exit(&dn->dn_objset->os_lock);
1333 1356          } else {
1334 1357                  mutex_exit(&dn->dn_objset->os_lock);
1335 1358                  dnode_setdirty(dn, tx);
1336 1359          }
1337 1360  }
1338 1361  
1339 1362  /*
1340 1363   * Try to change the block size for the indicated dnode.  This can only
1341 1364   * succeed if there are no blocks allocated or dirty beyond the first block.
1342 1365   */
1343 1366  int
1344 1367  dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1345 1368  {
1346 1369          dmu_buf_impl_t *db;
1347 1370          int err;
1348 1371  
1349 1372          ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
1350 1373          if (size == 0)
1351 1374                  size = SPA_MINBLOCKSIZE;
1352 1375          else
1353 1376                  size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1354 1377  
1355 1378          if (ibs == dn->dn_indblkshift)
1356 1379                  ibs = 0;
1357 1380  
1358 1381          if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
1359 1382                  return (0);
1360 1383  
1361 1384          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1362 1385  
1363 1386          /* Check for any allocated blocks beyond the first */
1364 1387          if (dn->dn_maxblkid != 0)
1365 1388                  goto fail;
1366 1389  
1367 1390          mutex_enter(&dn->dn_dbufs_mtx);
1368 1391          for (db = avl_first(&dn->dn_dbufs); db != NULL;
1369 1392              db = AVL_NEXT(&dn->dn_dbufs, db)) {
1370 1393                  if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1371 1394                      db->db_blkid != DMU_SPILL_BLKID) {
1372 1395                          mutex_exit(&dn->dn_dbufs_mtx);
1373 1396                          goto fail;
1374 1397                  }
1375 1398          }
1376 1399          mutex_exit(&dn->dn_dbufs_mtx);
1377 1400  
1378 1401          if (ibs && dn->dn_nlevels != 1)
1379 1402                  goto fail;
1380 1403  
1381 1404          /* resize the old block */
1382 1405          err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
1383 1406          if (err == 0)
1384 1407                  dbuf_new_size(db, size, tx);
1385 1408          else if (err != ENOENT)
1386 1409                  goto fail;
1387 1410  
1388 1411          dnode_setdblksz(dn, size);
1389 1412          dnode_setdirty(dn, tx);
1390 1413          dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1391 1414          if (ibs) {
1392 1415                  dn->dn_indblkshift = ibs;
1393 1416                  dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1394 1417          }
1395 1418          /* rele after we have fixed the blocksize in the dnode */
1396 1419          if (db)
1397 1420                  dbuf_rele(db, FTAG);
1398 1421  
1399 1422          rw_exit(&dn->dn_struct_rwlock);
1400 1423          return (0);
1401 1424  
1402 1425  fail:
1403 1426          rw_exit(&dn->dn_struct_rwlock);
1404 1427          return (SET_ERROR(ENOTSUP));
1405 1428  }
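
dnode_set_blksz() normalizes the requested size with P2ROUNDUP() before comparing it against the current dn_datablkszsec. A standalone check of that rounding, using the P2ROUNDUP() definition from illumos sysmacros.h:

#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSIZE        512ULL
/* From sysmacros.h: round x up to a multiple of align (a power of 2). */
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        uint64_t sizes[] = { 1, 512, 513, 4000 };
        int i;

        for (i = 0; i < 4; i++)
                printf("%4llu -> %llu\n",
                    (unsigned long long)sizes[i],
                    (unsigned long long)P2ROUNDUP(sizes[i],
                    SPA_MINBLOCKSIZE));
        return (0);     /* prints 512, 512, 1024, 4096 */
}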
1406 1429  
1407 1430  /* read-holding callers must not rely on the lock being continuously held */
1408 1431  void
1409 1432  dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
1410 1433  {
1411 1434          uint64_t txgoff = tx->tx_txg & TXG_MASK;
1412 1435          int epbs, new_nlevels;
1413 1436          uint64_t sz;
1414 1437  
1415 1438          ASSERT(blkid != DMU_BONUS_BLKID);
1416 1439  
1417 1440          ASSERT(have_read ?
1418 1441              RW_READ_HELD(&dn->dn_struct_rwlock) :
1419 1442              RW_WRITE_HELD(&dn->dn_struct_rwlock));
1420 1443  
1421 1444          /*
1422 1445           * if we have a read-lock, check to see if we need to do any work
1423 1446           * before upgrading to a write-lock.
1424 1447           */
1425 1448          if (have_read) {
1426 1449                  if (blkid <= dn->dn_maxblkid)
1427 1450                          return;
1428 1451  
1429 1452                  if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1430 1453                          rw_exit(&dn->dn_struct_rwlock);
1431 1454                          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1432 1455                  }
1433 1456          }
1434 1457  
1435 1458          if (blkid <= dn->dn_maxblkid)
1436 1459                  goto out;
1437 1460  
1438 1461          dn->dn_maxblkid = blkid;
1439 1462  
1440 1463          /*
1441 1464           * Compute the number of levels necessary to support the new maxblkid.
1442 1465           */
1443 1466          new_nlevels = 1;
1444 1467          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1445 1468          for (sz = dn->dn_nblkptr;
1446 1469              sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1447 1470                  new_nlevels++;
1448 1471  
1449 1472          if (new_nlevels > dn->dn_nlevels) {
1450 1473                  int old_nlevels = dn->dn_nlevels;
1451 1474                  dmu_buf_impl_t *db;
1452 1475                  list_t *list;
1453 1476                  dbuf_dirty_record_t *new, *dr, *dr_next;
1454 1477  
1455 1478                  dn->dn_nlevels = new_nlevels;
1456 1479  
1457 1480                  ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1458 1481                  dn->dn_next_nlevels[txgoff] = new_nlevels;
1459 1482  
1460 1483                  /* dirty the left indirects */
1461 1484                  db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1462 1485                  ASSERT(db != NULL);
1463 1486                  new = dbuf_dirty(db, tx);
1464 1487                  dbuf_rele(db, FTAG);
1465 1488  
1466 1489                  /* transfer the dirty records to the new indirect */
1467 1490                  mutex_enter(&dn->dn_mtx);
1468 1491                  mutex_enter(&new->dt.di.dr_mtx);
1469 1492                  list = &dn->dn_dirty_records[txgoff];
1470 1493                  for (dr = list_head(list); dr; dr = dr_next) {
1471 1494                          dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1472 1495                          if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1473 1496                              dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1474 1497                              dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1475 1498                                  ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1476 1499                                  list_remove(&dn->dn_dirty_records[txgoff], dr);
1477 1500                                  list_insert_tail(&new->dt.di.dr_children, dr);
1478 1501                                  dr->dr_parent = new;
1479 1502                          }
1480 1503                  }
1481 1504                  mutex_exit(&new->dt.di.dr_mtx);
1482 1505                  mutex_exit(&dn->dn_mtx);
1483 1506          }
1484 1507  
1485 1508  out:
1486 1509          if (have_read)
1487 1510                  rw_downgrade(&dn->dn_struct_rwlock);
1488 1511  }
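
The new_nlevels loop in dnode_new_blkid() grows the addressable range by a factor of 2^epbs per level, starting from the dn_nblkptr block pointers embedded in the dnode; the "sz >= dn->dn_nblkptr" clause stops the loop if the shift ever overflows. A worked example of the same loop; the nblkptr and epbs values below are illustrative, not read from a real dnode:

#include <stdio.h>
#include <stdint.h>

/*
 * Levels needed to address 'blkid': nblkptr block pointers live in
 * the dnode itself, and each additional level multiplies the range
 * by 2^epbs (pointers per indirect block).
 */
static int
levels_for(uint64_t blkid, uint64_t nblkptr, int epbs)
{
        uint64_t sz;
        int nlevels = 1;

        for (sz = nblkptr; sz <= blkid && sz >= nblkptr; sz <<= epbs)
                nlevels++;
        return (nlevels);
}

int
main(void)
{
        /* 3 blkptrs in the dnode; 16K indirects => epbs = 14 - 7 = 7 */
        printf("%d\n", levels_for(2, 3, 7));    /* 1: fits in the dnode */
        printf("%d\n", levels_for(3, 3, 7));    /* 2: needs one indirect */
        printf("%d\n", levels_for(384, 3, 7));  /* 3: 2 levels cover 0..383 */
        return (0);
}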
1489 1512  
1490 1513  void
1491 1514  dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1492 1515  {
1493 1516          dmu_buf_impl_t *db;
1494 1517          uint64_t blkoff, blkid, nblks;
1495 1518          int blksz, blkshift, head, tail;
1496 1519          int trunc = FALSE;
1497 1520          int epbs;
1498 1521  
1499 1522          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1500 1523          blksz = dn->dn_datablksz;
1501 1524          blkshift = dn->dn_datablkshift;
1502 1525          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1503 1526  
1504 1527          if (len == DMU_OBJECT_END) {
1505 1528                  len = UINT64_MAX - off;
1506 1529                  trunc = TRUE;
1507 1530          }
1508 1531  
1509 1532          /*
1510 1533           * First, block align the region to free:
1511 1534           */
1512 1535          if (ISP2(blksz)) {
1513 1536                  head = P2NPHASE(off, blksz);
1514 1537                  blkoff = P2PHASE(off, blksz);
1515 1538                  if ((off >> blkshift) > dn->dn_maxblkid)
1516 1539                          goto out;
1517 1540          } else {
1518 1541                  ASSERT(dn->dn_maxblkid == 0);
1519 1542                  if (off == 0 && len >= blksz) {
1520 1543                          /*
1521 1544                           * Freeing the whole block; fast-track this request.
1522 1545                           * Note that we won't dirty any indirect blocks,
1523 1546                           * which is fine because we will be freeing the entire
1524 1547                           * file and thus all indirect blocks will be freed
1525 1548                           * by free_children().
1526 1549                           */
1527 1550                          blkid = 0;
1528 1551                          nblks = 1;
1529 1552                          goto done;
1530 1553                  } else if (off >= blksz) {
1531 1554                          /* Freeing past end-of-data */
1532 1555                          goto out;
1533 1556                  } else {
1534 1557                          /* Freeing part of the block. */
1535 1558                          head = blksz - off;
1536 1559                          ASSERT3U(head, >, 0);
1537 1560                  }
1538 1561                  blkoff = off;
1539 1562          }
1540 1563          /* zero out any partial block data at the start of the range */
1541 1564          if (head) {
1542 1565                  ASSERT3U(blkoff + head, ==, blksz);
1543 1566                  if (len < head)
1544 1567                          head = len;
1545 1568                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
1546 1569                      FTAG, &db) == 0) {
1547 1570                          caddr_t data;
1548 1571  
1549 1572                          /* don't dirty if it isn't on disk and isn't dirty */
1550 1573                          if (db->db_last_dirty ||
1551 1574                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1552 1575                                  rw_exit(&dn->dn_struct_rwlock);
1553 1576                                  dmu_buf_will_dirty(&db->db, tx);
1554 1577                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1555 1578                                  data = db->db.db_data;
1556 1579                                  bzero(data + blkoff, head);
1557 1580                          }
1558 1581                          dbuf_rele(db, FTAG);
1559 1582                  }
1560 1583                  off += head;
1561 1584                  len -= head;
1562 1585          }
1563 1586  
1564 1587          /* If the range was less than one block, we're done */
1565 1588          if (len == 0)
1566 1589                  goto out;
1567 1590  
1568 1591          /* If the remaining range is past end of file, we're done */
1569 1592          if ((off >> blkshift) > dn->dn_maxblkid)
1570 1593                  goto out;
1571 1594  
1572 1595          ASSERT(ISP2(blksz));
1573 1596          if (trunc)
1574 1597                  tail = 0;
1575 1598          else
1576 1599                  tail = P2PHASE(len, blksz);
1577 1600  
1578 1601          ASSERT0(P2PHASE(off, blksz));
1579 1602          /* zero out any partial block data at the end of the range */
1580 1603          if (tail) {
1581 1604                  if (len < tail)
1582 1605                          tail = len;
1583 1606                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
1584 1607                      TRUE, FTAG, &db) == 0) {
1585 1608                          /* don't dirty if not on disk and not dirty */
1586 1609                          if (db->db_last_dirty ||
1587 1610                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1588 1611                                  rw_exit(&dn->dn_struct_rwlock);
1589 1612                                  dmu_buf_will_dirty(&db->db, tx);
1590 1613                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1591 1614                                  bzero(db->db.db_data, tail);
1592 1615                          }
1593 1616                          dbuf_rele(db, FTAG);
1594 1617                  }
1595 1618                  len -= tail;
1596 1619          }
1597 1620  
1598 1621          /* If the range did not include a full block, we are done */
1599 1622          if (len == 0)
1600 1623                  goto out;
1601 1624  
1602 1625          ASSERT(IS_P2ALIGNED(off, blksz));
1603 1626          ASSERT(trunc || IS_P2ALIGNED(len, blksz));
1604 1627          blkid = off >> blkshift;
1605 1628          nblks = len >> blkshift;
1606 1629          if (trunc)
1607 1630                  nblks += 1;
1608 1631  
1609 1632          /*
1610 1633           * Dirty the first and last indirect blocks, as they (and/or their
1611 1634           * parents) will need to be written out if they were only
1612 1635           * partially freed.  Interior indirect blocks will be themselves freed,
1613 1636           * by free_children(), so they need not be dirtied.  Note that these
1614 1637           * interior blocks have already been prefetched by dmu_tx_hold_free().
1615 1638           */
1616 1639          if (dn->dn_nlevels > 1) {
1617 1640                  uint64_t first, last;
1618 1641  
1619 1642                  first = blkid >> epbs;
1620 1643                  if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
1621 1644                          dmu_buf_will_dirty(&db->db, tx);
1622 1645                          dbuf_rele(db, FTAG);
1623 1646                  }
1624 1647                  if (trunc)
1625 1648                          last = dn->dn_maxblkid >> epbs;
1626 1649                  else
1627 1650                          last = (blkid + nblks - 1) >> epbs;
1628 1651                  if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1629 1652                          dmu_buf_will_dirty(&db->db, tx);
1630 1653                          dbuf_rele(db, FTAG);
1631 1654                  }
1632 1655          }
1633 1656  
1634 1657  done:
1635 1658          /*
1636 1659           * Add this range to the dnode range list.
1637 1660           * We will finish up this free operation in the syncing phase.
1638 1661           */
1639 1662          mutex_enter(&dn->dn_mtx);
1640 1663          int txgoff = tx->tx_txg & TXG_MASK;
1641 1664          if (dn->dn_free_ranges[txgoff] == NULL) {
1642 1665                  dn->dn_free_ranges[txgoff] =
1643 1666                      range_tree_create(NULL, NULL, &dn->dn_mtx);
1644 1667          }
1645 1668          range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1646 1669          range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1647 1670          dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1648 1671              blkid, nblks, tx->tx_txg);
1649 1672          mutex_exit(&dn->dn_mtx);
1650 1673  
1651 1674          dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1652 1675          dnode_setdirty(dn, tx);
1653 1676  out:
1654 1677  
1655 1678          rw_exit(&dn->dn_struct_rwlock);
1656 1679  }
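
For a power-of-two block size, dnode_free_range() splits the region into a partial head block, zero or more full blocks, and a partial tail block using the power-of-two phase macros. A standalone illustration of the head/tail arithmetic with the sysmacros.h definitions; the offsets are arbitrary example values:

#include <stdio.h>
#include <stdint.h>

/* From illumos sysmacros.h, for power-of-2 'align'. */
#define P2PHASE(x, align)       ((x) & ((align) - 1))
#define P2NPHASE(x, align)      (-(x) & ((align) - 1))

int
main(void)
{
        uint64_t blksz = 4096, off = 1000, len = 10000;
        uint64_t head, tail;

        /* bytes from off up to the next block boundary */
        head = P2NPHASE(off, blksz);
        /* after advancing past the head, bytes past the last boundary */
        tail = P2PHASE(len - head, blksz);

        printf("head=%llu tail=%llu full=%llu\n",
            (unsigned long long)head, (unsigned long long)tail,
            (unsigned long long)(len - head - tail));
        return (0);     /* head=3096 tail=2808 full=4096 */
}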
1657 1680  
1658 1681  static boolean_t
1659 1682  dnode_spill_freed(dnode_t *dn)
1660 1683  {
1661 1684          int i;
1662 1685  
1663 1686          mutex_enter(&dn->dn_mtx);
1664 1687          for (i = 0; i < TXG_SIZE; i++) {
1665 1688                  if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
1666 1689                          break;
1667 1690          }
1668 1691          mutex_exit(&dn->dn_mtx);
1669 1692          return (i < TXG_SIZE);
1670 1693  }
1671 1694  
1672 1695  /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
1673 1696  uint64_t
1674 1697  dnode_block_freed(dnode_t *dn, uint64_t blkid)
1675 1698  {
1676 1699          void *dp = spa_get_dsl(dn->dn_objset->os_spa);
1677 1700          int i;
1678 1701  
1679 1702          if (blkid == DMU_BONUS_BLKID)
1680 1703                  return (FALSE);
1681 1704  
1682 1705          /*
1683 1706           * If we're in the process of opening the pool, dp will not be
1684 1707           * set yet, but there shouldn't be anything dirty.
1685 1708           */
1686 1709          if (dp == NULL)
1687 1710                  return (FALSE);
1688 1711  
1689 1712          if (dn->dn_free_txg)
1690 1713                  return (TRUE);
1691 1714  
1692 1715          if (blkid == DMU_SPILL_BLKID)
1693 1716                  return (dnode_spill_freed(dn));
1694 1717  
1695 1718          mutex_enter(&dn->dn_mtx);
1696 1719          for (i = 0; i < TXG_SIZE; i++) {
1697 1720                  if (dn->dn_free_ranges[i] != NULL &&
1698 1721                      range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
1699 1722                          break;
1700 1723          }
1701 1724          mutex_exit(&dn->dn_mtx);
1702 1725          return (i < TXG_SIZE);
1703 1726  }
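
Both dnode_spill_freed() and dnode_block_freed() use the same idiom: break out of the per-txg scan on the first hit and let "i < TXG_SIZE" report whether the loop broke early. A compact sketch of that idiom, with freed[] standing in for the real per-txg lookups:

#include <stdbool.h>
#include <stdio.h>

#define TXG_SIZE        4

/*
 * Break on the first hit; the final index then doubles as the found
 * flag.  freed[] stands in for the dn_free_ranges[] / dn_rm_spillblk[]
 * checks in the real functions.
 */
static bool
any_txg_freed(const bool freed[TXG_SIZE])
{
        int i;

        for (i = 0; i < TXG_SIZE; i++) {
                if (freed[i])
                        break;
        }
        return (i < TXG_SIZE);
}

int
main(void)
{
        bool freed[TXG_SIZE] = { false, false, true, false };

        printf("%d\n", any_txg_freed(freed));   /* 1 */
        return (0);
}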
1704 1727  
1705 1728  /* call from syncing context when we actually write/free space for this dnode */
1706 1729  void
1707 1730  dnode_diduse_space(dnode_t *dn, int64_t delta)
1708 1731  {
1709 1732          uint64_t space;
1710 1733          dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
1711 1734              dn, dn->dn_phys,
1712 1735              (u_longlong_t)dn->dn_phys->dn_used,
1713 1736              (longlong_t)delta);
1714 1737  
1715 1738          mutex_enter(&dn->dn_mtx);
1716 1739          space = DN_USED_BYTES(dn->dn_phys);
1717 1740          if (delta > 0) {
1718 1741                  ASSERT3U(space + delta, >=, space); /* no overflow */
1719 1742          } else {
1720 1743                  ASSERT3U(space, >=, -delta); /* no underflow */
1721 1744          }
1722 1745          space += delta;
1723 1746          if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
1724 1747                  ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
1725 1748                  ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
1726 1749                  dn->dn_phys->dn_used = space >> DEV_BSHIFT;
1727 1750          } else {
1728 1751                  dn->dn_phys->dn_used = space;
1729 1752                  dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
1730 1753          }
1731 1754          mutex_exit(&dn->dn_mtx);
1732 1755  }
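
dnode_diduse_space() copes with two on-disk encodings of dn_used: pools older than SPA_VERSION_DNODE_BYTES store 512-byte sectors, newer ones store bytes and set DNODE_FLAG_USED_BYTES. A sketch of the decode side, mirroring the logic of the DN_USED_BYTES() accessor; the flag constant here is a stand-in, not the real definition:

#include <stdio.h>
#include <stdint.h>

#define DEV_BSHIFT              9       /* 512-byte sectors */
#define FLAG_USED_BYTES         0x01    /* stand-in for DNODE_FLAG_USED_BYTES */

/* Decode dn_used: sectors on old pools, bytes (flag set) on new ones. */
static uint64_t
used_bytes(uint64_t dn_used, uint8_t flags)
{
        return ((flags & FLAG_USED_BYTES) ?
            dn_used : dn_used << DEV_BSHIFT);
}

int
main(void)
{
        printf("%llu\n", (unsigned long long)used_bytes(16, 0));    /* 8192 */
        printf("%llu\n", (unsigned long long)used_bytes(8192, 1));  /* 8192 */
        return (0);
}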
1733 1756  
1734 1757  /*
1735 1758   * Call when we think we're going to write/free space in open context to track
1736 1759   * the amount of memory in use by the currently open txg.
1737 1760   */
1738 1761  void
1739 1762  dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
1740 1763  {
1741 1764          objset_t *os = dn->dn_objset;
1742 1765          dsl_dataset_t *ds = os->os_dsl_dataset;
1743 1766          int64_t aspace = spa_get_asize(os->os_spa, space);
1744 1767  
1745 1768          if (ds != NULL) {
1746 1769                  dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
1747 1770                  dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
1748 1771          }
1749 1772  
1750 1773          dmu_tx_willuse_space(tx, aspace);
1751 1774  }
1752 1775  
1753 1776  /*
1754 1777   * Scans a block at the indicated "level" looking for a hole or data,
1755 1778   * depending on 'flags'.
1756 1779   *
1757 1780   * If level > 0, then we are scanning an indirect block looking at its
1758 1781   * pointers.  If level == 0, then we are looking at a block of dnodes.
1759 1782   *
1760 1783   * If we don't find what we are looking for in the block, we return ESRCH.
1761 1784   * Otherwise, return with *offset pointing to the beginning (if searching
1762 1785   * forwards) or end (if searching backwards) of the range covered by the
1763 1786   * block pointer we matched on (or dnode).
1764 1787   *
1765 1788   * The basic search algorithm used below by dnode_next_offset() is to
1766 1789   * use this function to search up the block tree (widen the search) until
1767 1790   * we find something (i.e., we don't return ESRCH) and then search back
1768 1791   * down the tree (narrow the search) until we reach our original search
1769 1792   * level.
1770 1793   */
1771 1794  static int
1772 1795  dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
1773 1796          int lvl, uint64_t blkfill, uint64_t txg)
1774 1797  {
1775 1798          dmu_buf_impl_t *db = NULL;
1776 1799          void *data = NULL;
1777 1800          uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1778 1801          uint64_t epb = 1ULL << epbs;
1779 1802          uint64_t minfill, maxfill;
1780 1803          boolean_t hole;
1781 1804          int i, inc, error, span;
1782 1805  
1783 1806          dprintf("probing object %llu offset %llx level %d of %u\n",
1784 1807              dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
1785 1808  
1786 1809          hole = ((flags & DNODE_FIND_HOLE) != 0);
1787 1810          inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
1788 1811          ASSERT(txg == 0 || !hole);
1789 1812  
1790 1813          if (lvl == dn->dn_phys->dn_nlevels) {
1791 1814                  error = 0;
1792 1815                  epb = dn->dn_phys->dn_nblkptr;
1793 1816                  data = dn->dn_phys->dn_blkptr;
1794 1817          } else {
1795 1818                  uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
1796 1819                  error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
1797 1820                  if (error) {
1798 1821                          if (error != ENOENT)
1799 1822                                  return (error);
1800 1823                          if (hole)
1801 1824                                  return (0);
1802 1825                          /*
1803 1826                           * This can only happen when we are searching up
1804 1827                           * the block tree for data.  We don't really need to
1805 1828                           * adjust the offset, as we will just end up looking
1806 1829                           * at the pointer to this block in its parent, and it's
1807 1830                           * going to be unallocated, so we will skip over it.
1808 1831                           */
1809 1832                          return (SET_ERROR(ESRCH));
1810 1833                  }
1811 1834                  error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
1812 1835                  if (error) {
1813 1836                          dbuf_rele(db, FTAG);
1814 1837                          return (error);
1815 1838                  }
1816 1839                  data = db->db.db_data;
1817 1840          }
1818 1841  
1819 1842  
1820 1843          if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
1821 1844              db->db_blkptr->blk_birth <= txg ||
1822 1845              BP_IS_HOLE(db->db_blkptr))) {
1823 1846                  /*
1824 1847                   * This can only happen when we are searching up the tree
1825 1848                   * and these conditions mean that we need to keep climbing.
1826 1849                   */
1827 1850                  error = SET_ERROR(ESRCH);
1828 1851          } else if (lvl == 0) {
1829 1852                  dnode_phys_t *dnp = data;
1830 1853                  span = DNODE_SHIFT;
1831 1854                  ASSERT(dn->dn_type == DMU_OT_DNODE);
1832 1855  
1833 1856                  for (i = (*offset >> span) & (blkfill - 1);
1834 1857                      i >= 0 && i < blkfill; i += inc) {
1835 1858                          if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
1836 1859                                  break;
1837 1860                          *offset += (1ULL << span) * inc;
1838 1861                  }
1839 1862                  if (i < 0 || i == blkfill)
1840 1863                          error = SET_ERROR(ESRCH);
1841 1864          } else {
1842 1865                  blkptr_t *bp = data;
1843 1866                  uint64_t start = *offset;
1844 1867                  span = (lvl - 1) * epbs + dn->dn_datablkshift;
1845 1868                  minfill = 0;
1846 1869                  maxfill = blkfill << ((lvl - 1) * epbs);
1847 1870  
1848 1871                  if (hole)
1849 1872                          maxfill--;
1850 1873                  else
1851 1874                          minfill++;
1852 1875  
1853 1876                  *offset = *offset >> span;
1854 1877                  for (i = BF64_GET(*offset, 0, epbs);
1855 1878                      i >= 0 && i < epb; i += inc) {
1856 1879                          if (BP_GET_FILL(&bp[i]) >= minfill &&
1857 1880                              BP_GET_FILL(&bp[i]) <= maxfill &&
1858 1881                              (hole || bp[i].blk_birth > txg))
1859 1882                                  break;
1860 1883                          if (inc > 0 || *offset > 0)
1861 1884                                  *offset += inc;
1862 1885                  }
1863 1886                  *offset = *offset << span;
1864 1887                  if (inc < 0) {
1865 1888                          /* traversing backwards; position offset at the end */
1866 1889                          ASSERT3U(*offset, <=, start);
1867 1890                          *offset = MIN(*offset + (1ULL << span) - 1, start);
1868 1891                  } else if (*offset < start) {
1869 1892                          *offset = start;
1870 1893                  }
1871 1894                  if (i < 0 || i >= epb)
1872 1895                          error = SET_ERROR(ESRCH);
1873 1896          }
1874 1897  
1875 1898          if (db)
1876 1899                  dbuf_rele(db, FTAG);
1877 1900  
1878 1901          return (error);
1879 1902  }
1880 1903  
1881 1904  /*
1882 1905   * Find the next hole, data, or sparse region at or after *offset.
1883 1906   * The value 'blkfill' tells us how many items we expect to find
1884 1907   * in an L0 data block; this value is 1 for normal objects,
1885 1908   * DNODES_PER_BLOCK for the meta dnode, and some fraction of
1886 1909   * DNODES_PER_BLOCK when searching for sparse regions thereof.
1887 1910   *
1888 1911   * Examples:
1889 1912   *
1890 1913   * dnode_next_offset(dn, flags, offset, 1, 1, 0);
1891 1914   *      Finds the next/previous hole/data in a file.
1892 1915   *      Used in dmu_offset_next().
1893 1916   *
1894 1917   * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
1895 1918   *      Finds the next free/allocated dnode in an objset's meta-dnode.
1896 1919   *      Only finds objects that have new contents since txg (i.e.
1897 1920   *      bonus buffer changes and content removal are ignored).
1898 1921   *      Used in dmu_object_next().
1899 1922   *
1900 1923   * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
1901 1924   *      Finds the next L2 meta-dnode bp that's at most 1/4 full.
1902 1925   *      Used in dmu_object_alloc().
1903 1926   */
1904 1927  int
1905 1928  dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
1906 1929      int minlvl, uint64_t blkfill, uint64_t txg)
1907 1930  {
1908 1931          uint64_t initial_offset = *offset;
1909 1932          int lvl, maxlvl;
1910 1933          int error = 0;
1911 1934  
1912 1935          if (!(flags & DNODE_FIND_HAVELOCK))
1913 1936                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
1914 1937  
1915 1938          if (dn->dn_phys->dn_nlevels == 0) {
1916 1939                  error = SET_ERROR(ESRCH);
1917 1940                  goto out;
1918 1941          }
1919 1942  
1920 1943          if (dn->dn_datablkshift == 0) {
1921 1944                  if (*offset < dn->dn_datablksz) {
1922 1945                          if (flags & DNODE_FIND_HOLE)
1923 1946                                  *offset = dn->dn_datablksz;
1924 1947                  } else {
1925 1948                          error = SET_ERROR(ESRCH);
1926 1949                  }
1927 1950                  goto out;
1928 1951          }
1929 1952  
1930 1953          maxlvl = dn->dn_phys->dn_nlevels;
1931 1954  
1932 1955          for (lvl = minlvl; lvl <= maxlvl; lvl++) {
1933 1956                  error = dnode_next_offset_level(dn,
1934 1957                      flags, offset, lvl, blkfill, txg);
1935 1958                  if (error != ESRCH)
1936 1959                          break;
1937 1960          }
1938 1961  
1939 1962          while (error == 0 && --lvl >= minlvl) {
1940 1963                  error = dnode_next_offset_level(dn,
1941 1964                      flags, offset, lvl, blkfill, txg);
1942 1965          }
1943 1966  
1944 1967          /*
1945 1968           * There's always a "virtual hole" at the end of the object, even
1946 1969           * if all BP's which physically exist are non-holes.
1947 1970           */
1948 1971          if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
1949 1972              minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
1950 1973                  error = 0;
1951 1974          }
1952 1975  
1953 1976          if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
1954 1977              initial_offset < *offset : initial_offset > *offset))
1955 1978                  error = SET_ERROR(ESRCH);
1956 1979  out:
1957 1980          if (!(flags & DNODE_FIND_HAVELOCK))
1958 1981                  rw_exit(&dn->dn_struct_rwlock);
1959 1982  
1960 1983          return (error);
1961 1984  }
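
dnode_next_offset() drives dnode_next_offset_level() in two phases: widen the search up the tree until some level stops returning ESRCH, then narrow back down to minlvl, refining *offset at each step. A control-flow sketch of that loop structure; search_fn_t and search_up_then_down() are hypothetical, not ZFS interfaces:

#include <errno.h>
#include <stdint.h>

typedef int (*search_fn_t)(void *arg, int lvl, uint64_t *offset);

/*
 * Widen: climb from minlvl toward maxlvl until some level stops
 * returning ESRCH.  Narrow: descend again, refining *offset at each
 * level on the way back down to minlvl.
 */
int
search_up_then_down(void *arg, search_fn_t search_level,
    int minlvl, int maxlvl, uint64_t *offset)
{
        int lvl, error = ESRCH;

        for (lvl = minlvl; lvl <= maxlvl; lvl++) {
                error = search_level(arg, lvl, offset);
                if (error != ESRCH)
                        break;
        }

        while (error == 0 && --lvl >= minlvl)
                error = search_level(arg, lvl, offset);

        return (error);
}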
  