5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
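
Reviewer's note on the fix: the change of substance is in dnode_evict_dbufs(). The old code walked dn_dbufs in repeated passes, and whenever it met a dbuf in DB_EVICTING state it dropped dn_dbufs_mtx, slept via delay(1), and retried, bounded only by ASSERT(pass < 100). The new code makes a single pass: before clearing an evictable dbuf it parks a stack-allocated marker (db_state = DB_SEARCH, carrying the same db_level/db_blkid keys) immediately before that dbuf in the dn_dbufs AVL tree, calls dbuf_clear() (which exits db_mtx on its own and may take the dbuf out of the tree), then resumes the walk from the marker's successor and unlinks the marker. The iterator's position survives whatever happens while locks are dropped, so no retry loop or timed sleep is needed.

Below is a minimal, self-contained sketch of that marker-node iteration pattern. A mutex-protected circular doubly linked list stands in for the kernel AVL tree, and every name in it (node_t, evict_node(), and so on) is illustrative, not a ZFS or illumos API.

/*
 * Minimal sketch of the marker-node iteration pattern introduced in
 * dnode_evict_dbufs() below.  A plain list stands in for the dn_dbufs
 * AVL tree; all names here are illustrative, not ZFS APIs.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
        struct node *prev, *next;
        int key;
        int is_marker;          /* analogue of db_state == DB_SEARCH */
} node_t;

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static node_t head = { &head, &head, 0, 0 };    /* circular sentinel */

static void
insert_before(node_t *pos, node_t *n)
{
        n->prev = pos->prev;
        n->next = pos;
        pos->prev->next = n;
        pos->prev = n;
}

static void
unlink_node(node_t *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

/*
 * Analogue of dbuf_clear(): takes the node out of the list, then drops
 * the list lock for the expensive part, so the caller's neighbours may
 * change underneath its iterator.
 */
static void
evict_node(node_t *n)
{
        unlink_node(n);
        pthread_mutex_unlock(&list_lock);
        free(n);                /* "eviction" happens unlocked */
        pthread_mutex_lock(&list_lock);
}

static void
evict_all(void)
{
        node_t marker = { 0 };
        node_t *n, *next;

        marker.is_marker = 1;
        pthread_mutex_lock(&list_lock);
        for (n = head.next; n != &head; n = next) {
                if (n->is_marker) {     /* another walker's marker */
                        next = n->next;
                        continue;
                }
                /*
                 * Park the marker just before n (AVL_BEFORE in the
                 * patch) so we can recover our position after
                 * evict_node() drops and retakes the lock.
                 */
                insert_before(n, &marker);
                evict_node(n);
                next = marker.next;
                unlink_node(&marker);
        }
        pthread_mutex_unlock(&list_lock);
}

int
main(void)
{
        for (int i = 0; i < 4; i++) {
                node_t *n = calloc(1, sizeof (node_t));
                n->key = i;
                pthread_mutex_lock(&list_lock);
                insert_before(&head, n);        /* append in key order */
                pthread_mutex_unlock(&list_lock);
        }
        evict_all();
        printf("list empty: %s\n", head.next == &head ? "yes" : "no");
        return (0);
}

Note that the marker owns no data: it only needs the sort keys plus a state flag (DB_SEARCH in the patch) that distinguishes it from a live dbuf, which is why it can live on the walker's stack.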
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dnode_sync.c
          +++ new/usr/src/uts/common/fs/zfs/dnode_sync.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       25 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/dbuf.h>
  29   30  #include <sys/dnode.h>
  30   31  #include <sys/dmu.h>
  31   32  #include <sys/dmu_tx.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dataset.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/range_tree.h>
  36   37  #include <sys/zfeature.h>
  37   38  
  38   39  static void
  39   40  dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
  40   41  {
  41   42          dmu_buf_impl_t *db;
  42   43          int txgoff = tx->tx_txg & TXG_MASK;
  43   44          int nblkptr = dn->dn_phys->dn_nblkptr;
  44   45          int old_toplvl = dn->dn_phys->dn_nlevels - 1;
  45   46          int new_level = dn->dn_next_nlevels[txgoff];
  46   47          int i;
  47   48  
  48   49          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
  49   50  
  50   51          /* this dnode can't be paged out because it's dirty */
  51   52          ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
  52   53          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
  53   54          ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
  54   55  
  55   56          db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
  56   57          ASSERT(db != NULL);
  57   58  
  58   59          dn->dn_phys->dn_nlevels = new_level;
  59   60          dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
  60   61              dn->dn_object, dn->dn_phys->dn_nlevels);
  61   62  
  62   63          /* check for existing blkptrs in the dnode */
  63   64          for (i = 0; i < nblkptr; i++)
  64   65                  if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
  65   66                          break;
  66   67          if (i != nblkptr) {
  67   68                  /* transfer dnode's block pointers to new indirect block */
  68   69                  (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
  69   70                  ASSERT(db->db.db_data);
  70   71                  ASSERT(arc_released(db->db_buf));
  71   72                  ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
  72   73                  bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
  73   74                      sizeof (blkptr_t) * nblkptr);
  74   75                  arc_buf_freeze(db->db_buf);
  75   76          }
  76   77  
  77   78          /* set dbuf's parent pointers to new indirect buf */
  78   79          for (i = 0; i < nblkptr; i++) {
  79   80                  dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
  80   81  
  81   82                  if (child == NULL)
  82   83                          continue;
  83   84  #ifdef  DEBUG
  84   85                  DB_DNODE_ENTER(child);
  85   86                  ASSERT3P(DB_DNODE(child), ==, dn);
  86   87                  DB_DNODE_EXIT(child);
  87   88  #endif  /* DEBUG */
  88   89                  if (child->db_parent && child->db_parent != dn->dn_dbuf) {
  89   90                          ASSERT(child->db_parent->db_level == db->db_level);
  90   91                          ASSERT(child->db_blkptr !=
  91   92                              &dn->dn_phys->dn_blkptr[child->db_blkid]);
  92   93                          mutex_exit(&child->db_mtx);
  93   94                          continue;
  94   95                  }
  95   96                  ASSERT(child->db_parent == NULL ||
  96   97                      child->db_parent == dn->dn_dbuf);
  97   98  
  98   99                  child->db_parent = db;
  99  100                  dbuf_add_ref(db, child);
 100  101                  if (db->db.db_data)
 101  102                          child->db_blkptr = (blkptr_t *)db->db.db_data + i;
 102  103                  else
 103  104                          child->db_blkptr = NULL;
 104  105                  dprintf_dbuf_bp(child, child->db_blkptr,
 105  106                      "changed db_blkptr to new indirect %s", "");
 106  107  
 107  108                  mutex_exit(&child->db_mtx);
 108  109          }
 109  110  
 110  111          bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
 111  112  
 112  113          dbuf_rele(db, FTAG);
 113  114  
 114  115          rw_exit(&dn->dn_struct_rwlock);
 115  116  }
 116  117  
 117  118  static void
 118  119  free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 119  120  {
 120  121          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 121  122          uint64_t bytesfreed = 0;
 122  123  
 123  124          dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
 124  125  
 125  126          for (int i = 0; i < num; i++, bp++) {
 126  127                  if (BP_IS_HOLE(bp))
 127  128                          continue;
 128  129  
 129  130                  bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 130  131                  ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
 131  132  
 132  133                  /*
 133  134                   * Save some useful information on the holes being
 134  135                   * punched, including logical size, type, and indirection
 135  136                   * level. Retaining birth time enables detection of when
 136  137                   * holes are punched for reducing the number of free
 137  138                   * records transmitted during a zfs send.
 138  139                   */
 139  140  
 140  141                  uint64_t lsize = BP_GET_LSIZE(bp);
 141  142                  dmu_object_type_t type = BP_GET_TYPE(bp);
 142  143                  uint64_t lvl = BP_GET_LEVEL(bp);
 143  144  
 144  145                  bzero(bp, sizeof (blkptr_t));
 145  146  
 146  147                  if (spa_feature_is_active(dn->dn_objset->os_spa,
 147  148                      SPA_FEATURE_HOLE_BIRTH)) {
 148  149                          BP_SET_LSIZE(bp, lsize);
 149  150                          BP_SET_TYPE(bp, type);
 150  151                          BP_SET_LEVEL(bp, lvl);
 151  152                          BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
 152  153                  }
 153  154          }
 154  155          dnode_diduse_space(dn, -bytesfreed);
 155  156  }
 156  157  
 157  158  #ifdef ZFS_DEBUG
 158  159  static void
 159  160  free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 160  161  {
 161  162          int off, num;
 162  163          int i, err, epbs;
 163  164          uint64_t txg = tx->tx_txg;
 164  165          dnode_t *dn;
 165  166  
 166  167          DB_DNODE_ENTER(db);
 167  168          dn = DB_DNODE(db);
 168  169          epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 169  170          off = start - (db->db_blkid * 1<<epbs);
 170  171          num = end - start + 1;
 171  172  
 172  173          ASSERT3U(off, >=, 0);
 173  174          ASSERT3U(num, >=, 0);
 174  175          ASSERT3U(db->db_level, >, 0);
 175  176          ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 176  177          ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
 177  178          ASSERT(db->db_blkptr != NULL);
 178  179  
 179  180          for (i = off; i < off+num; i++) {
 180  181                  uint64_t *buf;
 181  182                  dmu_buf_impl_t *child;
 182  183                  dbuf_dirty_record_t *dr;
 183  184                  int j;
 184  185  
 185  186                  ASSERT(db->db_level == 1);
 186  187  
 187  188                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 188  189                  err = dbuf_hold_impl(dn, db->db_level-1,
 189  190                      (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
 190  191                  rw_exit(&dn->dn_struct_rwlock);
 191  192                  if (err == ENOENT)
 192  193                          continue;
 193  194                  ASSERT(err == 0);
 194  195                  ASSERT(child->db_level == 0);
 195  196                  dr = child->db_last_dirty;
 196  197                  while (dr && dr->dr_txg > txg)
 197  198                          dr = dr->dr_next;
 198  199                  ASSERT(dr == NULL || dr->dr_txg == txg);
 199  200  
 200  201                  /* data_old better be zeroed */
 201  202                  if (dr) {
 202  203                          buf = dr->dt.dl.dr_data->b_data;
 203  204                          for (j = 0; j < child->db.db_size >> 3; j++) {
 204  205                                  if (buf[j] != 0) {
 205  206                                          panic("freed data not zero: "
 206  207                                              "child=%p i=%d off=%d num=%d\n",
 207  208                                              (void *)child, i, off, num);
 208  209                                  }
 209  210                          }
 210  211                  }
 211  212  
 212  213                  /*
 213  214                   * db_data better be zeroed unless it's dirty in a
 214  215                   * future txg.
 215  216                   */
 216  217                  mutex_enter(&child->db_mtx);
 217  218                  buf = child->db.db_data;
 218  219                  if (buf != NULL && child->db_state != DB_FILL &&
 219  220                      child->db_last_dirty == NULL) {
 220  221                          for (j = 0; j < child->db.db_size >> 3; j++) {
 221  222                                  if (buf[j] != 0) {
 222  223                                          panic("freed data not zero: "
 223  224                                              "child=%p i=%d off=%d num=%d\n",
 224  225                                              (void *)child, i, off, num);
 225  226                                  }
 226  227                          }
 227  228                  }
 228  229                  mutex_exit(&child->db_mtx);
 229  230  
 230  231                  dbuf_rele(child, FTAG);
 231  232          }
 232  233          DB_DNODE_EXIT(db);
 233  234  }
 234  235  #endif
 235  236  
 236  237  static void
 237  238  free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 238  239      dmu_tx_t *tx)
 239  240  {
 240  241          dnode_t *dn;
 241  242          blkptr_t *bp;
 242  243          dmu_buf_impl_t *subdb;
 243  244          uint64_t start, end, dbstart, dbend, i;
 244  245          int epbs, shift;
 245  246  
 246  247          /*
 247  248           * There is a small possibility that this block will not be cached:
 248  249           *   1 - if level > 1 and there are no children with level <= 1
 249  250           *   2 - if this block was evicted since we read it from
 250  251           *       dmu_tx_hold_free().
 251  252           */
 252  253          if (db->db_state != DB_CACHED)
 253  254                  (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 254  255  
 255  256          dbuf_release_bp(db);
 256  257          bp = db->db.db_data;
 257  258  
 258  259          DB_DNODE_ENTER(db);
 259  260          dn = DB_DNODE(db);
 260  261          epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 261  262          shift = (db->db_level - 1) * epbs;
 262  263          dbstart = db->db_blkid << epbs;
 263  264          start = blkid >> shift;
 264  265          if (dbstart < start) {
 265  266                  bp += start - dbstart;
 266  267          } else {
 267  268                  start = dbstart;
 268  269          }
 269  270          dbend = ((db->db_blkid + 1) << epbs) - 1;
 270  271          end = (blkid + nblks - 1) >> shift;
 271  272          if (dbend <= end)
 272  273                  end = dbend;
 273  274  
 274  275          ASSERT3U(start, <=, end);
 275  276  
 276  277          if (db->db_level == 1) {
 277  278                  FREE_VERIFY(db, start, end, tx);
 278  279                  free_blocks(dn, bp, end-start+1, tx);
 279  280          } else {
 280  281                  for (i = start; i <= end; i++, bp++) {
 281  282                          if (BP_IS_HOLE(bp))
 282  283                                  continue;
 283  284                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 284  285                          VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
 285  286                              i, B_TRUE, FTAG, &subdb));
 286  287                          rw_exit(&dn->dn_struct_rwlock);
 287  288                          ASSERT3P(bp, ==, subdb->db_blkptr);
 288  289  
 289  290                          free_children(subdb, blkid, nblks, tx);
 290  291                          dbuf_rele(subdb, FTAG);
 291  292                  }
 292  293          }
 293  294  
 294  295          /* If this whole block is free, free ourself too. */
 295  296          for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
 296  297                  if (!BP_IS_HOLE(bp))
 297  298                          break;
 298  299          }
 299  300          if (i == 1 << epbs) {
 300  301                  /* didn't find any non-holes */
 301  302                  bzero(db->db.db_data, db->db.db_size);
 302  303                  free_blocks(dn, db->db_blkptr, 1, tx);
 303  304          } else {
 304  305                  /*
 305  306                   * Partial block free; must be marked dirty so that it
 306  307                   * will be written out.
 307  308                   */
 308  309                  ASSERT(db->db_dirtycnt > 0);
 309  310          }
 310  311  
 311  312          DB_DNODE_EXIT(db);
 312  313          arc_buf_freeze(db->db_buf);
 313  314  }
 314  315  
 315  316  /*
 316  317   * Traverse the indicated range of the provided file
 317  318   * and "free" all the blocks contained there.
 318  319   */
 319  320  static void
 320  321  dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
 321  322      dmu_tx_t *tx)
 322  323  {
 323  324          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 324  325          int dnlevel = dn->dn_phys->dn_nlevels;
 325  326          boolean_t trunc = B_FALSE;
 326  327  
 327  328          if (blkid > dn->dn_phys->dn_maxblkid)
 328  329                  return;
 329  330  
 330  331          ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
 331  332          if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
 332  333                  nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
 333  334                  trunc = B_TRUE;
 334  335          }
 335  336  
 336  337          /* There are no indirect blocks in the object */
 337  338          if (dnlevel == 1) {
 338  339                  if (blkid >= dn->dn_phys->dn_nblkptr) {
 339  340                          /* this range was never made persistent */
 340  341                          return;
 341  342                  }
 342  343                  ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
 343  344                  free_blocks(dn, bp + blkid, nblks, tx);
 344  345          } else {
 345  346                  int shift = (dnlevel - 1) *
 346  347                      (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
 347  348                  int start = blkid >> shift;
 348  349                  int end = (blkid + nblks - 1) >> shift;
 349  350                  dmu_buf_impl_t *db;
 350  351  
 351  352                  ASSERT(start < dn->dn_phys->dn_nblkptr);
 352  353                  bp += start;
 353  354                  for (int i = start; i <= end; i++, bp++) {
 354  355                          if (BP_IS_HOLE(bp))
 355  356                                  continue;
 356  357                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 357  358                          VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
 358  359                              TRUE, FTAG, &db));
 359  360                          rw_exit(&dn->dn_struct_rwlock);
 360  361  
 361  362                          free_children(db, blkid, nblks, tx);
 362  363                          dbuf_rele(db, FTAG);
 363  364                  }
 364  365          }
 365  366  
 366  367          if (trunc) {
 367  368                  dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
 368  369  
 369  370                  uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 370  371                      (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 371  372                  ASSERT(off < dn->dn_phys->dn_maxblkid ||
 372  373                      dn->dn_phys->dn_maxblkid == 0 ||
 373  374                      dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 374  375          }
 375  376  }
 376  377  
 377  378  typedef struct dnode_sync_free_range_arg {
 378  379          dnode_t *dsfra_dnode;
 379  380          dmu_tx_t *dsfra_tx;
 380  381  } dnode_sync_free_range_arg_t;
 381  382  
 382  383  static void
 383  384  dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
 384  385  {
 385  386          dnode_sync_free_range_arg_t *dsfra = arg;
 386  387          dnode_t *dn = dsfra->dsfra_dnode;
 387  388  
 388  389          mutex_exit(&dn->dn_mtx);
  
  
 389  390          dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
 390  391          mutex_enter(&dn->dn_mtx);
 391  392  }
 392  393  
 393  394  /*
 394  395   * Try to kick all the dnode's dbufs out of the cache...
 395  396   */
 396  397  void
 397  398  dnode_evict_dbufs(dnode_t *dn)
 398  399  {
 399      -        int progress;
 400      -        int pass = 0;
      400 +        dmu_buf_impl_t db_marker;
      401 +        dmu_buf_impl_t *db, *db_next;
 401  402  
 402      -        do {
 403      -                dmu_buf_impl_t *db, *db_next;
 404      -                int evicting = FALSE;
      403 +        mutex_enter(&dn->dn_dbufs_mtx);
      404 +        for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 405  405  
 406      -                progress = FALSE;
 407      -                mutex_enter(&dn->dn_dbufs_mtx);
 408      -                for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 409      -                        db_next = AVL_NEXT(&dn->dn_dbufs, db);
 410  406  #ifdef  DEBUG
 411      -                        DB_DNODE_ENTER(db);
 412      -                        ASSERT3P(DB_DNODE(db), ==, dn);
 413      -                        DB_DNODE_EXIT(db);
      407 +                DB_DNODE_ENTER(db);
      408 +                ASSERT3P(DB_DNODE(db), ==, dn);
      409 +                DB_DNODE_EXIT(db);
 414  410  #endif  /* DEBUG */
 415  411  
 416      -                        mutex_enter(&db->db_mtx);
 417      -                        if (db->db_state == DB_EVICTING) {
 418      -                                progress = TRUE;
 419      -                                evicting = TRUE;
 420      -                                mutex_exit(&db->db_mtx);
 421      -                        } else if (refcount_is_zero(&db->db_holds)) {
 422      -                                progress = TRUE;
 423      -                                dbuf_clear(db); /* exits db_mtx for us */
 424      -                        } else {
 425      -                                mutex_exit(&db->db_mtx);
 426      -                        }
      412 +                mutex_enter(&db->db_mtx);
      413 +                if (db->db_state != DB_EVICTING &&
      414 +                    refcount_is_zero(&db->db_holds)) {
      415 +                        db_marker.db_level = db->db_level;
      416 +                        db_marker.db_blkid = db->db_blkid;
      417 +                        db_marker.db_state = DB_SEARCH;
      418 +                        avl_insert_here(&dn->dn_dbufs, &db_marker, db,
      419 +                            AVL_BEFORE);
 427  420  
      421 +                        dbuf_clear(db);
      422 +
      423 +                        db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
      424 +                        avl_remove(&dn->dn_dbufs, &db_marker);
      425 +                } else {
      426 +                        mutex_exit(&db->db_mtx);
      427 +                        db_next = AVL_NEXT(&dn->dn_dbufs, db);
 428  428                  }
 429      -                /*
 430      -                 * NB: we need to drop dn_dbufs_mtx between passes so
 431      -                 * that any DB_EVICTING dbufs can make progress.
 432      -                 * Ideally, we would have some cv we could wait on, but
 433      -                 * since we don't, just wait a bit to give the other
 434      -                 * thread a chance to run.
 435      -                 */
 436      -                mutex_exit(&dn->dn_dbufs_mtx);
 437      -                if (evicting)
 438      -                        delay(1);
 439      -                pass++;
 440      -                ASSERT(pass < 100); /* sanity check */
 441      -        } while (progress);
      429 +        }
      430 +        mutex_exit(&dn->dn_dbufs_mtx);
 442  431  
 443  432          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 444  433          if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 445  434                  mutex_enter(&dn->dn_bonus->db_mtx);
 446  435                  dbuf_evict(dn->dn_bonus);
 447  436                  dn->dn_bonus = NULL;
 448  437          }
 449  438          rw_exit(&dn->dn_struct_rwlock);
 450  439  }
 451  440  
 452  441  static void
 453  442  dnode_undirty_dbufs(list_t *list)
 454  443  {
 455  444          dbuf_dirty_record_t *dr;
 456  445  
 457  446          while (dr = list_head(list)) {
 458  447                  dmu_buf_impl_t *db = dr->dr_dbuf;
 459  448                  uint64_t txg = dr->dr_txg;
 460  449  
 461  450                  if (db->db_level != 0)
 462  451                          dnode_undirty_dbufs(&dr->dt.di.dr_children);
 463  452  
 464  453                  mutex_enter(&db->db_mtx);
 465  454                  /* XXX - use dbuf_undirty()? */
 466  455                  list_remove(list, dr);
 467  456                  ASSERT(db->db_last_dirty == dr);
 468  457                  db->db_last_dirty = NULL;
 469  458                  db->db_dirtycnt -= 1;
 470  459                  if (db->db_level == 0) {
 471  460                          ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 472  461                              dr->dt.dl.dr_data == db->db_buf);
 473  462                          dbuf_unoverride(dr);
 474  463                  } else {
 475  464                          mutex_destroy(&dr->dt.di.dr_mtx);
 476  465                          list_destroy(&dr->dt.di.dr_children);
 477  466                  }
 478  467                  kmem_free(dr, sizeof (dbuf_dirty_record_t));
 479  468                  dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 480  469          }
 481  470  }
 482  471  
 483  472  static void
 484  473  dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 485  474  {
 486  475          int txgoff = tx->tx_txg & TXG_MASK;
 487  476  
 488  477          ASSERT(dmu_tx_is_syncing(tx));
 489  478  
  
  
 490  479          /*
 491  480           * Our contents should have been freed in dnode_sync() by the
 492  481           * free range record inserted by the caller of dnode_free().
 493  482           */
 494  483          ASSERT0(DN_USED_BYTES(dn->dn_phys));
 495  484          ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 496  485  
 497  486          dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 498  487          dnode_evict_dbufs(dn);
 499  488          ASSERT(avl_is_empty(&dn->dn_dbufs));
 500      -        ASSERT3P(dn->dn_bonus, ==, NULL);
 501  489  
 502  490          /*
 503  491           * XXX - It would be nice to assert this, but we may still
 504  492           * have residual holds from async evictions from the arc...
 505  493           *
 506  494           * zfs_obj_to_path() also depends on this being
 507  495           * commented out.
 508  496           *
 509  497           * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
 510  498           */
 511  499  
 512  500          /* Undirty next bits */
 513  501          dn->dn_next_nlevels[txgoff] = 0;
 514  502          dn->dn_next_indblkshift[txgoff] = 0;
 515  503          dn->dn_next_blksz[txgoff] = 0;
 516  504  
 517  505          /* ASSERT(blkptrs are zero); */
 518  506          ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 519  507          ASSERT(dn->dn_type != DMU_OT_NONE);
 520  508  
 521  509          ASSERT(dn->dn_free_txg > 0);
 522  510          if (dn->dn_allocated_txg != dn->dn_free_txg)
 523  511                  dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
 524  512          bzero(dn->dn_phys, sizeof (dnode_phys_t));
 525  513  
 526  514          mutex_enter(&dn->dn_mtx);
 527  515          dn->dn_type = DMU_OT_NONE;
 528  516          dn->dn_maxblkid = 0;
 529  517          dn->dn_allocated_txg = 0;
 530  518          dn->dn_free_txg = 0;
 531  519          dn->dn_have_spill = B_FALSE;
 532  520          mutex_exit(&dn->dn_mtx);
 533  521  
 534  522          ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 535  523  
 536  524          dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
 537  525          /*
 538  526           * Now that we've released our hold, the dnode may
  539  527           * be evicted, so we mustn't access it.
 540  528           */
 541  529  }
 542  530  
 543  531  /*
 544  532   * Write out the dnode's dirty buffers.
 545  533   */
 546  534  void
 547  535  dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 548  536  {
 549  537          dnode_phys_t *dnp = dn->dn_phys;
 550  538          int txgoff = tx->tx_txg & TXG_MASK;
 551  539          list_t *list = &dn->dn_dirty_records[txgoff];
 552  540          static const dnode_phys_t zerodn = { 0 };
 553  541          boolean_t kill_spill = B_FALSE;
 554  542  
 555  543          ASSERT(dmu_tx_is_syncing(tx));
 556  544          ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
 557  545          ASSERT(dnp->dn_type != DMU_OT_NONE ||
 558  546              bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
 559  547          DNODE_VERIFY(dn);
 560  548  
 561  549          ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
 562  550  
 563  551          if (dmu_objset_userused_enabled(dn->dn_objset) &&
 564  552              !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 565  553                  mutex_enter(&dn->dn_mtx);
 566  554                  dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
 567  555                  dn->dn_oldflags = dn->dn_phys->dn_flags;
 568  556                  dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
 569  557                  mutex_exit(&dn->dn_mtx);
 570  558                  dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
 571  559          } else {
 572  560                  /* Once we account for it, we should always account for it. */
 573  561                  ASSERT(!(dn->dn_phys->dn_flags &
 574  562                      DNODE_FLAG_USERUSED_ACCOUNTED));
 575  563          }
 576  564  
 577  565          mutex_enter(&dn->dn_mtx);
 578  566          if (dn->dn_allocated_txg == tx->tx_txg) {
 579  567                  /* The dnode is newly allocated or reallocated */
 580  568                  if (dnp->dn_type == DMU_OT_NONE) {
 581  569                          /* this is a first alloc, not a realloc */
 582  570                          dnp->dn_nlevels = 1;
 583  571                          dnp->dn_nblkptr = dn->dn_nblkptr;
 584  572                  }
 585  573  
 586  574                  dnp->dn_type = dn->dn_type;
 587  575                  dnp->dn_bonustype = dn->dn_bonustype;
 588  576                  dnp->dn_bonuslen = dn->dn_bonuslen;
 589  577          }
 590  578          ASSERT(dnp->dn_nlevels > 1 ||
 591  579              BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 592  580              BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
 593  581              BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
 594  582              dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 595  583          ASSERT(dnp->dn_nlevels < 2 ||
 596  584              BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 597  585              BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
 598  586  
 599  587          if (dn->dn_next_type[txgoff] != 0) {
 600  588                  dnp->dn_type = dn->dn_type;
 601  589                  dn->dn_next_type[txgoff] = 0;
 602  590          }
 603  591  
 604  592          if (dn->dn_next_blksz[txgoff] != 0) {
 605  593                  ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 606  594                      SPA_MINBLOCKSIZE) == 0);
 607  595                  ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 608  596                      dn->dn_maxblkid == 0 || list_head(list) != NULL ||
 609  597                      dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
 610  598                      dnp->dn_datablkszsec ||
 611  599                      range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
 612  600                  dnp->dn_datablkszsec =
 613  601                      dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
 614  602                  dn->dn_next_blksz[txgoff] = 0;
 615  603          }
 616  604  
 617  605          if (dn->dn_next_bonuslen[txgoff] != 0) {
 618  606                  if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
 619  607                          dnp->dn_bonuslen = 0;
 620  608                  else
 621  609                          dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
 622  610                  ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
 623  611                  dn->dn_next_bonuslen[txgoff] = 0;
 624  612          }
 625  613  
 626  614          if (dn->dn_next_bonustype[txgoff] != 0) {
 627  615                  ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
 628  616                  dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
 629  617                  dn->dn_next_bonustype[txgoff] = 0;
 630  618          }
 631  619  
 632  620          boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
 633  621              dn->dn_free_txg <= tx->tx_txg;
 634  622  
 635  623          /*
 636  624           * Remove the spill block if we have been explicitly asked to
 637  625           * remove it, or if the object is being removed.
 638  626           */
 639  627          if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
 640  628                  if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 641  629                          kill_spill = B_TRUE;
 642  630                  dn->dn_rm_spillblk[txgoff] = 0;
 643  631          }
 644  632  
 645  633          if (dn->dn_next_indblkshift[txgoff] != 0) {
 646  634                  ASSERT(dnp->dn_nlevels == 1);
 647  635                  dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
 648  636                  dn->dn_next_indblkshift[txgoff] = 0;
 649  637          }
 650  638  
 651  639          /*
 652  640           * Just take the live (open-context) values for checksum and compress.
 653  641           * Strictly speaking it's a future leak, but nothing bad happens if we
 654  642           * start using the new checksum or compress algorithm a little early.
 655  643           */
 656  644          dnp->dn_checksum = dn->dn_checksum;
 657  645          dnp->dn_compress = dn->dn_compress;
 658  646  
 659  647          mutex_exit(&dn->dn_mtx);
 660  648  
 661  649          if (kill_spill) {
 662  650                  free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
 663  651                  mutex_enter(&dn->dn_mtx);
 664  652                  dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
 665  653                  mutex_exit(&dn->dn_mtx);
 666  654          }
 667  655  
 668  656          /* process all the "freed" ranges in the file */
 669  657          if (dn->dn_free_ranges[txgoff] != NULL) {
 670  658                  dnode_sync_free_range_arg_t dsfra;
 671  659                  dsfra.dsfra_dnode = dn;
 672  660                  dsfra.dsfra_tx = tx;
 673  661                  mutex_enter(&dn->dn_mtx);
 674  662                  range_tree_vacate(dn->dn_free_ranges[txgoff],
 675  663                      dnode_sync_free_range, &dsfra);
 676  664                  range_tree_destroy(dn->dn_free_ranges[txgoff]);
 677  665                  dn->dn_free_ranges[txgoff] = NULL;
 678  666                  mutex_exit(&dn->dn_mtx);
 679  667          }
 680  668  
 681  669          if (freeing_dnode) {
 682  670                  dnode_sync_free(dn, tx);
 683  671                  return;
 684  672          }
 685  673  
 686  674          if (dn->dn_next_nlevels[txgoff]) {
 687  675                  dnode_increase_indirection(dn, tx);
 688  676                  dn->dn_next_nlevels[txgoff] = 0;
 689  677          }
 690  678  
 691  679          if (dn->dn_next_nblkptr[txgoff]) {
 692  680                  /* this should only happen on a realloc */
 693  681                  ASSERT(dn->dn_allocated_txg == tx->tx_txg);
 694  682                  if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
 695  683                          /* zero the new blkptrs we are gaining */
 696  684                          bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
 697  685                              sizeof (blkptr_t) *
 698  686                              (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
 699  687  #ifdef ZFS_DEBUG
 700  688                  } else {
 701  689                          int i;
 702  690                          ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
 703  691                          /* the blkptrs we are losing better be unallocated */
 704  692                          for (i = dn->dn_next_nblkptr[txgoff];
 705  693                              i < dnp->dn_nblkptr; i++)
 706  694                                  ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
 707  695  #endif
 708  696                  }
 709  697                  mutex_enter(&dn->dn_mtx);
 710  698                  dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
 711  699                  dn->dn_next_nblkptr[txgoff] = 0;
 712  700                  mutex_exit(&dn->dn_mtx);
 713  701          }
 714  702  
 715  703          dbuf_sync_list(list, tx);
 716  704  
 717  705          if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 718  706                  ASSERT3P(list_head(list), ==, NULL);
 719  707                  dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
 720  708          }
 721  709  
 722  710          /*
 723  711           * Although we have dropped our reference to the dnode, it
  724  712   * can't be evicted until it's written, and we haven't yet
 725  713           * initiated the IO for the dnode's dbuf.
 726  714           */
 727  715  }
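
A side note for readers of the unchanged context above: the start/end arithmetic in free_children() is easiest to follow with concrete numbers. The sketch below works one case through in userland C; the constants are illustrative assumptions (128-byte blkptr_t, 16K indirect blocks, i.e. the common defaults), not values fixed by this patch.

/*
 * Worked example of the start/end clamping in free_children(); the
 * constants are illustrative assumptions, not values from the patch.
 */
#include <stdio.h>
#include <stdint.h>

#define SPA_BLKPTRSHIFT 7       /* assume blkptr_t is 128 bytes */

int
main(void)
{
        int indblkshift = 14;           /* assume 16K indirect blocks */
        int epbs = indblkshift - SPA_BLKPTRSHIFT; /* 7: 128 bps/block */
        int level = 1;                  /* an L1 indirect block */
        int shift = (level - 1) * epbs; /* 0: each bp = 1 data block */
        uint64_t db_blkid = 2;          /* the third L1 block */
        uint64_t blkid = 300, nblks = 50;       /* range being freed */

        uint64_t dbstart = db_blkid << epbs;            /* 256 */
        uint64_t dbend = ((db_blkid + 1) << epbs) - 1;  /* 383 */
        uint64_t start = blkid >> shift;                /* 300 */
        uint64_t end = (blkid + nblks - 1) >> shift;    /* 349 */

        /*
         * Mirrors free_children(): it advances bp by (start - dbstart)
         * when dbstart < start, else raises start to dbstart, and caps
         * end at dbend.
         */
        if (start < dbstart)
                start = dbstart;
        if (end > dbend)
                end = dbend;

        printf("this block frees bps [%llu, %llu]\n",
            (unsigned long long)start, (unsigned long long)end);
        return (0);
}

With those numbers each L1 block covers 128 data blocks, so the block with db_blkid == 2 spans data blocks 256..383 and clamps the freed range to [300, 349], exactly the 50 pointers it holds for that range; sibling indirect blocks see only their own slice.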
  