Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 - * Copyright (c) 2013 by Delphix. All rights reserved.
24 + * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/dmu.h>
31 31 #include <sys/dmu_send.h>
32 32 #include <sys/dmu_impl.h>
33 33 #include <sys/dbuf.h>
34 34 #include <sys/dmu_objset.h>
35 35 #include <sys/dsl_dataset.h>
36 36 #include <sys/dsl_dir.h>
37 37 #include <sys/dmu_tx.h>
38 38 #include <sys/spa.h>
39 39 #include <sys/zio.h>
40 40 #include <sys/dmu_zfetch.h>
41 41 #include <sys/sa.h>
42 42 #include <sys/sa_impl.h>
43 +#include <sys/range_tree.h>
43 44
44 45 /*
45 46 * Number of times that zfs_free_range() took the slow path while doing
46 47 * a zfs receive. A nonzero value indicates a potential performance problem.
47 48 */
48 49 uint64_t zfs_free_range_recv_miss;
49 50
50 51 static void dbuf_destroy(dmu_buf_impl_t *db);
51 52 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
52 53 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
53 54
54 55 /*
55 56 * Global data structures and functions for the dbuf cache.
56 57 */
57 58 static kmem_cache_t *dbuf_cache;
58 59
59 60 /* ARGSUSED */
60 61 static int
61 62 dbuf_cons(void *vdb, void *unused, int kmflag)
62 63 {
63 64 dmu_buf_impl_t *db = vdb;
64 65 bzero(db, sizeof (dmu_buf_impl_t));
65 66
66 67 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
67 68 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
68 69 refcount_create(&db->db_holds);
69 70 return (0);
70 71 }
71 72
72 73 /* ARGSUSED */
73 74 static void
74 75 dbuf_dest(void *vdb, void *unused)
75 76 {
76 77 dmu_buf_impl_t *db = vdb;
77 78 mutex_destroy(&db->db_mtx);
78 79 cv_destroy(&db->db_changed);
79 80 refcount_destroy(&db->db_holds);
80 81 }
81 82
82 83 /*
83 84 * dbuf hash table routines
84 85 */
85 86 static dbuf_hash_table_t dbuf_hash_table;
86 87
87 88 static uint64_t dbuf_hash_count;
88 89
89 90 static uint64_t
90 91 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
91 92 {
92 93 uintptr_t osv = (uintptr_t)os;
93 94 uint64_t crc = -1ULL;
94 95
95 96 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
96 97 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
97 98 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
98 99 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
99 100 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
100 101 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
101 102 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
102 103
103 104 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
104 105
105 106 return (crc);
106 107 }
107 108
108 109 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
109 110
110 111 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
111 112 ((dbuf)->db.db_object == (obj) && \
112 113 (dbuf)->db_objset == (os) && \
113 114 (dbuf)->db_level == (level) && \
114 115 (dbuf)->db_blkid == (blkid))
115 116
116 117 dmu_buf_impl_t *
117 118 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
118 119 {
119 120 dbuf_hash_table_t *h = &dbuf_hash_table;
120 121 objset_t *os = dn->dn_objset;
121 122 uint64_t obj = dn->dn_object;
122 123 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
123 124 uint64_t idx = hv & h->hash_table_mask;
124 125 dmu_buf_impl_t *db;
125 126
126 127 mutex_enter(DBUF_HASH_MUTEX(h, idx));
127 128 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
128 129 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
129 130 mutex_enter(&db->db_mtx);
130 131 if (db->db_state != DB_EVICTING) {
131 132 mutex_exit(DBUF_HASH_MUTEX(h, idx));
132 133 return (db);
133 134 }
134 135 mutex_exit(&db->db_mtx);
135 136 }
136 137 }
137 138 mutex_exit(DBUF_HASH_MUTEX(h, idx));
138 139 return (NULL);
139 140 }
140 141
141 142 /*
142 143 * Insert an entry into the hash table. If there is already an element
143 144 * equal to elem in the hash table, then the already existing element
144 145 * will be returned and the new element will not be inserted.
145 146 * Otherwise returns NULL.
146 147 */
147 148 static dmu_buf_impl_t *
148 149 dbuf_hash_insert(dmu_buf_impl_t *db)
149 150 {
150 151 dbuf_hash_table_t *h = &dbuf_hash_table;
151 152 objset_t *os = db->db_objset;
152 153 uint64_t obj = db->db.db_object;
153 154 int level = db->db_level;
154 155 uint64_t blkid = db->db_blkid;
155 156 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
156 157 uint64_t idx = hv & h->hash_table_mask;
157 158 dmu_buf_impl_t *dbf;
158 159
159 160 mutex_enter(DBUF_HASH_MUTEX(h, idx));
160 161 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
161 162 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
162 163 mutex_enter(&dbf->db_mtx);
163 164 if (dbf->db_state != DB_EVICTING) {
164 165 mutex_exit(DBUF_HASH_MUTEX(h, idx));
165 166 return (dbf);
166 167 }
167 168 mutex_exit(&dbf->db_mtx);
168 169 }
169 170 }
170 171
171 172 mutex_enter(&db->db_mtx);
172 173 db->db_hash_next = h->hash_table[idx];
173 174 h->hash_table[idx] = db;
174 175 mutex_exit(DBUF_HASH_MUTEX(h, idx));
175 176 atomic_add_64(&dbuf_hash_count, 1);
176 177
177 178 return (NULL);
178 179 }
179 180
180 181 /*
181 182 * Remove an entry from the hash table. This operation will
182 183 * fail if there are any existing holds on the db.
183 184 */
184 185 static void
185 186 dbuf_hash_remove(dmu_buf_impl_t *db)
186 187 {
187 188 dbuf_hash_table_t *h = &dbuf_hash_table;
188 189 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
189 190 db->db_level, db->db_blkid);
190 191 uint64_t idx = hv & h->hash_table_mask;
191 192 dmu_buf_impl_t *dbf, **dbp;
192 193
193 194 /*
 194 195 	 * We mustn't hold db_mtx to maintain lock ordering:
195 196 * DBUF_HASH_MUTEX > db_mtx.
196 197 */
197 198 ASSERT(refcount_is_zero(&db->db_holds));
198 199 ASSERT(db->db_state == DB_EVICTING);
199 200 ASSERT(!MUTEX_HELD(&db->db_mtx));
200 201
201 202 mutex_enter(DBUF_HASH_MUTEX(h, idx));
202 203 dbp = &h->hash_table[idx];
203 204 while ((dbf = *dbp) != db) {
204 205 dbp = &dbf->db_hash_next;
205 206 ASSERT(dbf != NULL);
206 207 }
207 208 *dbp = db->db_hash_next;
208 209 db->db_hash_next = NULL;
209 210 mutex_exit(DBUF_HASH_MUTEX(h, idx));
210 211 atomic_add_64(&dbuf_hash_count, -1);
211 212 }
212 213
213 214 static arc_evict_func_t dbuf_do_evict;
214 215
215 216 static void
216 217 dbuf_evict_user(dmu_buf_impl_t *db)
217 218 {
218 219 ASSERT(MUTEX_HELD(&db->db_mtx));
219 220
220 221 if (db->db_level != 0 || db->db_evict_func == NULL)
221 222 return;
222 223
223 224 if (db->db_user_data_ptr_ptr)
224 225 *db->db_user_data_ptr_ptr = db->db.db_data;
225 226 db->db_evict_func(&db->db, db->db_user_ptr);
226 227 db->db_user_ptr = NULL;
227 228 db->db_user_data_ptr_ptr = NULL;
228 229 db->db_evict_func = NULL;
229 230 }
230 231
231 232 boolean_t
232 233 dbuf_is_metadata(dmu_buf_impl_t *db)
233 234 {
234 235 if (db->db_level > 0) {
235 236 return (B_TRUE);
236 237 } else {
237 238 boolean_t is_metadata;
238 239
239 240 DB_DNODE_ENTER(db);
240 241 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
241 242 DB_DNODE_EXIT(db);
242 243
243 244 return (is_metadata);
244 245 }
245 246 }
246 247
247 248 void
248 249 dbuf_evict(dmu_buf_impl_t *db)
249 250 {
250 251 ASSERT(MUTEX_HELD(&db->db_mtx));
251 252 ASSERT(db->db_buf == NULL);
252 253 ASSERT(db->db_data_pending == NULL);
253 254
254 255 dbuf_clear(db);
255 256 dbuf_destroy(db);
256 257 }
257 258
258 259 void
259 260 dbuf_init(void)
260 261 {
261 262 uint64_t hsize = 1ULL << 16;
262 263 dbuf_hash_table_t *h = &dbuf_hash_table;
263 264 int i;
264 265
265 266 /*
266 267 * The hash table is big enough to fill all of physical memory
267 268 * with an average 4K block size. The table will take up
268 269 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
269 270 */
270 271 while (hsize * 4096 < physmem * PAGESIZE)
271 272 hsize <<= 1;
272 273
273 274 retry:
274 275 h->hash_table_mask = hsize - 1;
275 276 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
276 277 if (h->hash_table == NULL) {
277 278 /* XXX - we should really return an error instead of assert */
278 279 ASSERT(hsize > (1ULL << 10));
279 280 hsize >>= 1;
280 281 goto retry;
281 282 }
282 283
283 284 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
284 285 sizeof (dmu_buf_impl_t),
285 286 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
286 287
287 288 for (i = 0; i < DBUF_MUTEXES; i++)
288 289 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
289 290 }
290 291
291 292 void
292 293 dbuf_fini(void)
293 294 {
294 295 dbuf_hash_table_t *h = &dbuf_hash_table;
295 296 int i;
296 297
297 298 for (i = 0; i < DBUF_MUTEXES; i++)
298 299 mutex_destroy(&h->hash_mutexes[i]);
299 300 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
300 301 kmem_cache_destroy(dbuf_cache);
301 302 }
302 303
303 304 /*
304 305 * Other stuff.
305 306 */
306 307
307 308 #ifdef ZFS_DEBUG
308 309 static void
309 310 dbuf_verify(dmu_buf_impl_t *db)
310 311 {
311 312 dnode_t *dn;
312 313 dbuf_dirty_record_t *dr;
313 314
314 315 ASSERT(MUTEX_HELD(&db->db_mtx));
315 316
316 317 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
317 318 return;
318 319
319 320 ASSERT(db->db_objset != NULL);
320 321 DB_DNODE_ENTER(db);
321 322 dn = DB_DNODE(db);
322 323 if (dn == NULL) {
323 324 ASSERT(db->db_parent == NULL);
324 325 ASSERT(db->db_blkptr == NULL);
325 326 } else {
326 327 ASSERT3U(db->db.db_object, ==, dn->dn_object);
327 328 ASSERT3P(db->db_objset, ==, dn->dn_objset);
328 329 ASSERT3U(db->db_level, <, dn->dn_nlevels);
329 330 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
330 331 db->db_blkid == DMU_SPILL_BLKID ||
331 332 !list_is_empty(&dn->dn_dbufs));
332 333 }
333 334 if (db->db_blkid == DMU_BONUS_BLKID) {
334 335 ASSERT(dn != NULL);
335 336 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
336 337 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
337 338 } else if (db->db_blkid == DMU_SPILL_BLKID) {
338 339 ASSERT(dn != NULL);
339 340 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
340 341 ASSERT0(db->db.db_offset);
341 342 } else {
342 343 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
343 344 }
344 345
345 346 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
346 347 ASSERT(dr->dr_dbuf == db);
347 348
348 349 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
349 350 ASSERT(dr->dr_dbuf == db);
350 351
351 352 /*
352 353 * We can't assert that db_size matches dn_datablksz because it
353 354 * can be momentarily different when another thread is doing
354 355 * dnode_set_blksz().
355 356 */
356 357 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
357 358 dr = db->db_data_pending;
358 359 /*
359 360 * It should only be modified in syncing context, so
360 361 * make sure we only have one copy of the data.
361 362 */
362 363 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
363 364 }
364 365
365 366 /* verify db->db_blkptr */
366 367 if (db->db_blkptr) {
367 368 if (db->db_parent == dn->dn_dbuf) {
368 369 /* db is pointed to by the dnode */
369 370 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
370 371 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
371 372 ASSERT(db->db_parent == NULL);
372 373 else
373 374 ASSERT(db->db_parent != NULL);
374 375 if (db->db_blkid != DMU_SPILL_BLKID)
375 376 ASSERT3P(db->db_blkptr, ==,
376 377 &dn->dn_phys->dn_blkptr[db->db_blkid]);
377 378 } else {
378 379 /* db is pointed to by an indirect block */
379 380 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
380 381 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
381 382 ASSERT3U(db->db_parent->db.db_object, ==,
382 383 db->db.db_object);
383 384 /*
384 385 * dnode_grow_indblksz() can make this fail if we don't
385 386 * have the struct_rwlock. XXX indblksz no longer
386 387 * grows. safe to do this now?
387 388 */
388 389 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
389 390 ASSERT3P(db->db_blkptr, ==,
390 391 ((blkptr_t *)db->db_parent->db.db_data +
391 392 db->db_blkid % epb));
392 393 }
393 394 }
394 395 }
395 396 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
396 397 (db->db_buf == NULL || db->db_buf->b_data) &&
397 398 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
398 399 db->db_state != DB_FILL && !dn->dn_free_txg) {
399 400 /*
400 401 * If the blkptr isn't set but they have nonzero data,
401 402 * it had better be dirty, otherwise we'll lose that
402 403 * data when we evict this buffer.
403 404 */
404 405 if (db->db_dirtycnt == 0) {
405 406 uint64_t *buf = db->db.db_data;
406 407 int i;
407 408
408 409 for (i = 0; i < db->db.db_size >> 3; i++) {
409 410 ASSERT(buf[i] == 0);
410 411 }
411 412 }
412 413 }
413 414 DB_DNODE_EXIT(db);
414 415 }
415 416 #endif
416 417
417 418 static void
418 419 dbuf_update_data(dmu_buf_impl_t *db)
419 420 {
420 421 ASSERT(MUTEX_HELD(&db->db_mtx));
421 422 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
422 423 ASSERT(!refcount_is_zero(&db->db_holds));
423 424 *db->db_user_data_ptr_ptr = db->db.db_data;
424 425 }
425 426 }
426 427
427 428 static void
428 429 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
429 430 {
430 431 ASSERT(MUTEX_HELD(&db->db_mtx));
431 432 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
432 433 db->db_buf = buf;
433 434 if (buf != NULL) {
434 435 ASSERT(buf->b_data != NULL);
435 436 db->db.db_data = buf->b_data;
436 437 if (!arc_released(buf))
437 438 arc_set_callback(buf, dbuf_do_evict, db);
438 439 dbuf_update_data(db);
439 440 } else {
440 441 dbuf_evict_user(db);
441 442 db->db.db_data = NULL;
442 443 if (db->db_state != DB_NOFILL)
443 444 db->db_state = DB_UNCACHED;
444 445 }
445 446 }
446 447
447 448 /*
448 449 * Loan out an arc_buf for read. Return the loaned arc_buf.
449 450 */
450 451 arc_buf_t *
451 452 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
452 453 {
453 454 arc_buf_t *abuf;
454 455
455 456 mutex_enter(&db->db_mtx);
456 457 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
457 458 int blksz = db->db.db_size;
458 459 spa_t *spa = db->db_objset->os_spa;
459 460
460 461 mutex_exit(&db->db_mtx);
461 462 abuf = arc_loan_buf(spa, blksz);
462 463 bcopy(db->db.db_data, abuf->b_data, blksz);
463 464 } else {
464 465 abuf = db->db_buf;
465 466 arc_loan_inuse_buf(abuf, db);
466 467 dbuf_set_data(db, NULL);
467 468 mutex_exit(&db->db_mtx);
468 469 }
469 470 return (abuf);
470 471 }
471 472
472 473 uint64_t
473 474 dbuf_whichblock(dnode_t *dn, uint64_t offset)
474 475 {
475 476 if (dn->dn_datablkshift) {
476 477 return (offset >> dn->dn_datablkshift);
477 478 } else {
478 479 ASSERT3U(offset, <, dn->dn_datablksz);
479 480 return (0);
480 481 }
481 482 }
482 483
483 484 static void
484 485 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
485 486 {
486 487 dmu_buf_impl_t *db = vdb;
487 488
488 489 mutex_enter(&db->db_mtx);
489 490 ASSERT3U(db->db_state, ==, DB_READ);
490 491 /*
491 492 * All reads are synchronous, so we must have a hold on the dbuf
492 493 */
493 494 ASSERT(refcount_count(&db->db_holds) > 0);
494 495 ASSERT(db->db_buf == NULL);
495 496 ASSERT(db->db.db_data == NULL);
496 497 if (db->db_level == 0 && db->db_freed_in_flight) {
497 498 /* we were freed in flight; disregard any error */
498 499 arc_release(buf, db);
499 500 bzero(buf->b_data, db->db.db_size);
500 501 arc_buf_freeze(buf);
501 502 db->db_freed_in_flight = FALSE;
502 503 dbuf_set_data(db, buf);
503 504 db->db_state = DB_CACHED;
504 505 } else if (zio == NULL || zio->io_error == 0) {
505 506 dbuf_set_data(db, buf);
506 507 db->db_state = DB_CACHED;
507 508 } else {
508 509 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
509 510 ASSERT3P(db->db_buf, ==, NULL);
510 511 VERIFY(arc_buf_remove_ref(buf, db));
511 512 db->db_state = DB_UNCACHED;
512 513 }
513 514 cv_broadcast(&db->db_changed);
514 515 dbuf_rele_and_unlock(db, NULL);
515 516 }
516 517
517 518 static void
518 519 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
519 520 {
520 521 dnode_t *dn;
521 522 zbookmark_t zb;
522 523 uint32_t aflags = ARC_NOWAIT;
523 524
524 525 DB_DNODE_ENTER(db);
525 526 dn = DB_DNODE(db);
526 527 ASSERT(!refcount_is_zero(&db->db_holds));
527 528 /* We need the struct_rwlock to prevent db_blkptr from changing. */
528 529 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
529 530 ASSERT(MUTEX_HELD(&db->db_mtx));
530 531 ASSERT(db->db_state == DB_UNCACHED);
531 532 ASSERT(db->db_buf == NULL);
532 533
533 534 if (db->db_blkid == DMU_BONUS_BLKID) {
534 535 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
535 536
536 537 ASSERT3U(bonuslen, <=, db->db.db_size);
537 538 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
538 539 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
539 540 if (bonuslen < DN_MAX_BONUSLEN)
540 541 bzero(db->db.db_data, DN_MAX_BONUSLEN);
541 542 if (bonuslen)
542 543 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
543 544 DB_DNODE_EXIT(db);
544 545 dbuf_update_data(db);
545 546 db->db_state = DB_CACHED;
546 547 mutex_exit(&db->db_mtx);
547 548 return;
548 549 }
549 550
550 551 /*
551 552 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
552 553 * processes the delete record and clears the bp while we are waiting
553 554 * for the dn_mtx (resulting in a "no" from block_freed).
554 555 */
555 556 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
556 557 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
557 558 BP_IS_HOLE(db->db_blkptr)))) {
558 559 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
559 560
560 561 DB_DNODE_EXIT(db);
561 562 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
562 563 db->db.db_size, db, type));
563 564 bzero(db->db.db_data, db->db.db_size);
564 565 db->db_state = DB_CACHED;
565 566 *flags |= DB_RF_CACHED;
566 567 mutex_exit(&db->db_mtx);
567 568 return;
568 569 }
569 570
570 571 DB_DNODE_EXIT(db);
571 572
572 573 db->db_state = DB_READ;
573 574 mutex_exit(&db->db_mtx);
574 575
575 576 if (DBUF_IS_L2CACHEABLE(db))
576 577 aflags |= ARC_L2CACHE;
577 578 if (DBUF_IS_L2COMPRESSIBLE(db))
578 579 aflags |= ARC_L2COMPRESS;
579 580
580 581 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
581 582 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
582 583 db->db.db_object, db->db_level, db->db_blkid);
583 584
584 585 dbuf_add_ref(db, NULL);
585 586
586 587 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
587 588 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
588 589 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
589 590 &aflags, &zb);
590 591 if (aflags & ARC_CACHED)
591 592 *flags |= DB_RF_CACHED;
592 593 }
593 594
594 595 int
595 596 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
596 597 {
597 598 int err = 0;
598 599 boolean_t havepzio = (zio != NULL);
599 600 boolean_t prefetch;
600 601 dnode_t *dn;
601 602
602 603 /*
603 604 * We don't have to hold the mutex to check db_state because it
604 605 * can't be freed while we have a hold on the buffer.
605 606 */
606 607 ASSERT(!refcount_is_zero(&db->db_holds));
607 608
608 609 if (db->db_state == DB_NOFILL)
609 610 return (SET_ERROR(EIO));
610 611
611 612 DB_DNODE_ENTER(db);
612 613 dn = DB_DNODE(db);
613 614 if ((flags & DB_RF_HAVESTRUCT) == 0)
614 615 rw_enter(&dn->dn_struct_rwlock, RW_READER);
615 616
616 617 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
617 618 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
618 619 DBUF_IS_CACHEABLE(db);
619 620
620 621 mutex_enter(&db->db_mtx);
621 622 if (db->db_state == DB_CACHED) {
622 623 mutex_exit(&db->db_mtx);
623 624 if (prefetch)
624 625 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
625 626 db->db.db_size, TRUE);
626 627 if ((flags & DB_RF_HAVESTRUCT) == 0)
627 628 rw_exit(&dn->dn_struct_rwlock);
628 629 DB_DNODE_EXIT(db);
629 630 } else if (db->db_state == DB_UNCACHED) {
630 631 spa_t *spa = dn->dn_objset->os_spa;
631 632
632 633 if (zio == NULL)
633 634 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
634 635 dbuf_read_impl(db, zio, &flags);
635 636
636 637 /* dbuf_read_impl has dropped db_mtx for us */
637 638
638 639 if (prefetch)
639 640 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
640 641 db->db.db_size, flags & DB_RF_CACHED);
641 642
642 643 if ((flags & DB_RF_HAVESTRUCT) == 0)
643 644 rw_exit(&dn->dn_struct_rwlock);
644 645 DB_DNODE_EXIT(db);
645 646
646 647 if (!havepzio)
647 648 err = zio_wait(zio);
648 649 } else {
649 650 /*
650 651 * Another reader came in while the dbuf was in flight
651 652 * between UNCACHED and CACHED. Either a writer will finish
652 653 * writing the buffer (sending the dbuf to CACHED) or the
653 654 * first reader's request will reach the read_done callback
654 655 * and send the dbuf to CACHED. Otherwise, a failure
655 656 * occurred and the dbuf went to UNCACHED.
656 657 */
657 658 mutex_exit(&db->db_mtx);
658 659 if (prefetch)
659 660 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
660 661 db->db.db_size, TRUE);
661 662 if ((flags & DB_RF_HAVESTRUCT) == 0)
662 663 rw_exit(&dn->dn_struct_rwlock);
663 664 DB_DNODE_EXIT(db);
664 665
665 666 /* Skip the wait per the caller's request. */
666 667 mutex_enter(&db->db_mtx);
667 668 if ((flags & DB_RF_NEVERWAIT) == 0) {
668 669 while (db->db_state == DB_READ ||
669 670 db->db_state == DB_FILL) {
670 671 ASSERT(db->db_state == DB_READ ||
671 672 (flags & DB_RF_HAVESTRUCT) == 0);
672 673 cv_wait(&db->db_changed, &db->db_mtx);
673 674 }
674 675 if (db->db_state == DB_UNCACHED)
675 676 err = SET_ERROR(EIO);
676 677 }
677 678 mutex_exit(&db->db_mtx);
678 679 }
679 680
680 681 ASSERT(err || havepzio || db->db_state == DB_CACHED);
681 682 return (err);
682 683 }
683 684
684 685 static void
685 686 dbuf_noread(dmu_buf_impl_t *db)
686 687 {
687 688 ASSERT(!refcount_is_zero(&db->db_holds));
688 689 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
689 690 mutex_enter(&db->db_mtx);
690 691 while (db->db_state == DB_READ || db->db_state == DB_FILL)
691 692 cv_wait(&db->db_changed, &db->db_mtx);
692 693 if (db->db_state == DB_UNCACHED) {
693 694 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
694 695 spa_t *spa = db->db_objset->os_spa;
695 696
696 697 ASSERT(db->db_buf == NULL);
697 698 ASSERT(db->db.db_data == NULL);
698 699 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
699 700 db->db_state = DB_FILL;
700 701 } else if (db->db_state == DB_NOFILL) {
701 702 dbuf_set_data(db, NULL);
702 703 } else {
703 704 ASSERT3U(db->db_state, ==, DB_CACHED);
704 705 }
705 706 mutex_exit(&db->db_mtx);
706 707 }
707 708
708 709 /*
709 710 * This is our just-in-time copy function. It makes a copy of
710 711 * buffers, that have been modified in a previous transaction
711 712 * group, before we modify them in the current active group.
712 713 *
713 714 * This function is used in two places: when we are dirtying a
714 715 * buffer for the first time in a txg, and when we are freeing
715 716 * a range in a dnode that includes this buffer.
716 717 *
717 718 * Note that when we are called from dbuf_free_range() we do
718 719 * not put a hold on the buffer, we just traverse the active
719 720 * dbuf list for the dnode.
720 721 */
721 722 static void
722 723 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
723 724 {
724 725 dbuf_dirty_record_t *dr = db->db_last_dirty;
725 726
726 727 ASSERT(MUTEX_HELD(&db->db_mtx));
727 728 ASSERT(db->db.db_data != NULL);
728 729 ASSERT(db->db_level == 0);
729 730 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
730 731
731 732 if (dr == NULL ||
732 733 (dr->dt.dl.dr_data !=
733 734 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
734 735 return;
735 736
736 737 /*
737 738 * If the last dirty record for this dbuf has not yet synced
 738 739 	 * and it's referencing the dbuf data, either:
739 740 * reset the reference to point to a new copy,
 740 741 	 * or (if there are no active holders)
741 742 * just null out the current db_data pointer.
742 743 */
743 744 ASSERT(dr->dr_txg >= txg - 2);
744 745 if (db->db_blkid == DMU_BONUS_BLKID) {
745 746 /* Note that the data bufs here are zio_bufs */
746 747 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
747 748 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
748 749 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
749 750 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
750 751 int size = db->db.db_size;
751 752 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
752 753 spa_t *spa = db->db_objset->os_spa;
753 754
754 755 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
755 756 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
756 757 } else {
757 758 dbuf_set_data(db, NULL);
758 759 }
759 760 }
760 761
761 762 void
762 763 dbuf_unoverride(dbuf_dirty_record_t *dr)
763 764 {
764 765 dmu_buf_impl_t *db = dr->dr_dbuf;
765 766 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
766 767 uint64_t txg = dr->dr_txg;
767 768
768 769 ASSERT(MUTEX_HELD(&db->db_mtx));
769 770 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
770 771 ASSERT(db->db_level == 0);
771 772
772 773 if (db->db_blkid == DMU_BONUS_BLKID ||
773 774 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
774 775 return;
775 776
776 777 ASSERT(db->db_data_pending != dr);
777 778
778 779 /* free this block */
779 780 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
780 781 zio_free(db->db_objset->os_spa, txg, bp);
781 782
782 783 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
783 784 dr->dt.dl.dr_nopwrite = B_FALSE;
784 785
785 786 /*
786 787 * Release the already-written buffer, so we leave it in
787 788 * a consistent dirty state. Note that all callers are
788 789 * modifying the buffer, so they will immediately do
789 790 * another (redundant) arc_release(). Therefore, leave
790 791 * the buf thawed to save the effort of freezing &
791 792 * immediately re-thawing it.
792 793 */
793 794 arc_release(dr->dt.dl.dr_data, db);
794 795 }
795 796
796 797 /*
 797 798  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
798 799 * data blocks in the free range, so that any future readers will find
799 800 * empty blocks.
800 801 *
801 802 * This is a no-op if the dataset is in the middle of an incremental
802 803 * receive; see comment below for details.
803 804 */
804 805 void
805 806 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
806 807 {
807 808 dmu_buf_impl_t *db, *db_next;
808 809 uint64_t txg = tx->tx_txg;
809 810
810 811 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
811 812 end = dn->dn_maxblkid;
812 813 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
813 814
814 815 mutex_enter(&dn->dn_dbufs_mtx);
815 816 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
816 817 /* There can't be any dbufs in this range; no need to search. */
817 818 mutex_exit(&dn->dn_dbufs_mtx);
818 819 return;
819 820 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
820 821 /*
821 822 * If we are receiving, we expect there to be no dbufs in
822 823 * the range to be freed, because receive modifies each
823 824 * block at most once, and in offset order. If this is
824 825 * not the case, it can lead to performance problems,
825 826 * so note that we unexpectedly took the slow path.
826 827 */
827 828 atomic_inc_64(&zfs_free_range_recv_miss);
828 829 }
829 830
830 831 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
831 832 db_next = list_next(&dn->dn_dbufs, db);
832 833 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
833 834
834 835 if (db->db_level != 0)
835 836 continue;
836 837 if (db->db_blkid < start || db->db_blkid > end)
837 838 continue;
838 839
839 840 /* found a level 0 buffer in the range */
840 841 mutex_enter(&db->db_mtx);
841 842 if (dbuf_undirty(db, tx)) {
842 843 /* mutex has been dropped and dbuf destroyed */
843 844 continue;
844 845 }
845 846
846 847 if (db->db_state == DB_UNCACHED ||
847 848 db->db_state == DB_NOFILL ||
848 849 db->db_state == DB_EVICTING) {
849 850 ASSERT(db->db.db_data == NULL);
850 851 mutex_exit(&db->db_mtx);
851 852 continue;
852 853 }
853 854 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
854 855 /* will be handled in dbuf_read_done or dbuf_rele */
855 856 db->db_freed_in_flight = TRUE;
856 857 mutex_exit(&db->db_mtx);
857 858 continue;
858 859 }
859 860 if (refcount_count(&db->db_holds) == 0) {
860 861 ASSERT(db->db_buf);
861 862 dbuf_clear(db);
862 863 continue;
863 864 }
864 865 /* The dbuf is referenced */
865 866
866 867 if (db->db_last_dirty != NULL) {
867 868 dbuf_dirty_record_t *dr = db->db_last_dirty;
868 869
869 870 if (dr->dr_txg == txg) {
870 871 /*
871 872 * This buffer is "in-use", re-adjust the file
872 873 * size to reflect that this buffer may
873 874 * contain new data when we sync.
874 875 */
875 876 if (db->db_blkid != DMU_SPILL_BLKID &&
876 877 db->db_blkid > dn->dn_maxblkid)
877 878 dn->dn_maxblkid = db->db_blkid;
878 879 dbuf_unoverride(dr);
879 880 } else {
880 881 /*
881 882 * This dbuf is not dirty in the open context.
 882 883 			 * Either uncache it (if it's not referenced in
883 884 * the open context) or reset its contents to
884 885 * empty.
885 886 */
886 887 dbuf_fix_old_data(db, txg);
887 888 }
888 889 }
 889 890 		/* clear the contents if it's cached */
890 891 if (db->db_state == DB_CACHED) {
891 892 ASSERT(db->db.db_data != NULL);
892 893 arc_release(db->db_buf, db);
893 894 bzero(db->db.db_data, db->db.db_size);
894 895 arc_buf_freeze(db->db_buf);
895 896 }
896 897
897 898 mutex_exit(&db->db_mtx);
898 899 }
899 900 mutex_exit(&dn->dn_dbufs_mtx);
900 901 }
901 902
902 903 static int
903 904 dbuf_block_freeable(dmu_buf_impl_t *db)
904 905 {
905 906 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
906 907 uint64_t birth_txg = 0;
907 908
908 909 /*
909 910 * We don't need any locking to protect db_blkptr:
910 911 * If it's syncing, then db_last_dirty will be set
911 912 * so we'll ignore db_blkptr.
912 913 *
913 914 * This logic ensures that only block births for
914 915 * filled blocks are considered.
915 916 */
916 917 ASSERT(MUTEX_HELD(&db->db_mtx));
917 918 if (db->db_last_dirty && (db->db_blkptr == NULL ||
918 919 !BP_IS_HOLE(db->db_blkptr))) {
919 920 birth_txg = db->db_last_dirty->dr_txg;
920 921 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
921 922 birth_txg = db->db_blkptr->blk_birth;
922 923 }
923 924
924 925 /*
 925 926 	 * If this block doesn't exist or is in a snapshot, it can't be freed.
926 927 * Don't pass the bp to dsl_dataset_block_freeable() since we
927 928 * are holding the db_mtx lock and might deadlock if we are
928 929 * prefetching a dedup-ed block.
929 930 */
930 931 if (birth_txg != 0)
931 932 return (ds == NULL ||
932 933 dsl_dataset_block_freeable(ds, NULL, birth_txg));
933 934 else
934 935 return (B_FALSE);
935 936 }
936 937
/*
 * Resize a (non-bonus) dbuf to 'size' bytes in transaction 'tx':
 * dirty the buffer, allocate a new ARC buffer of the new size, copy the
 * old contents, zero any grown tail, and swap it in as the dbuf's data.
 * Caller must hold dn_struct_rwlock as writer (asserted below).
 */
937 938 void
938 939 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
939 940 {
940 941 	arc_buf_t *buf, *obuf;
941 942 	int osize = db->db.db_size;
942 943 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
943 944 	dnode_t *dn;
944 945 
945 946 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
946 947 
947 948 	DB_DNODE_ENTER(db);
948 949 	dn = DB_DNODE(db);
949 950 
950 951 	/* XXX does *this* func really need the lock? */
951 952 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
952 953 
953 954 	/*
954 955 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
955 956 	 * is OK, because there can be no other references to the db
956 957 	 * when we are changing its size, so no concurrent DB_FILL can
957 958 	 * be happening.
958 959 	 */
959 960 	/*
960 961 	 * XXX we should be doing a dbuf_read, checking the return
961 962 	 * value and returning that up to our callers
962 963 	 */
963 964 	dmu_buf_will_dirty(&db->db, tx);
964 965 
965 966 	/* create the data buffer for the new block */
966 967 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
967 968 
968 969 	/* copy old block data to the new block */
969 970 	obuf = db->db_buf;
970 971 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
971 972 	/* zero the remainder */
972 973 	if (size > osize)
973 974 		bzero((uint8_t *)buf->b_data + osize, size - osize);
974 975 
975 976 	mutex_enter(&db->db_mtx);
976 977 	dbuf_set_data(db, buf);
977 978 	VERIFY(arc_buf_remove_ref(obuf, db));
978 979 	db->db.db_size = size;
979 980 
980 981 	if (db->db_level == 0) {
981 982 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
982 983 		db->db_last_dirty->dt.dl.dr_data = buf;
983 984 	}
984 985 	mutex_exit(&db->db_mtx);
985 986 
	/* account for the size delta (may be negative when shrinking) */
986 987 	dnode_willuse_space(dn, size-osize, tx);
987 988 	DB_DNODE_EXIT(db);
988 989 }
989 990
/*
 * Release this dbuf's ARC buffer so its block pointer may be
 * overwritten.  Only legal in syncing context, once the objset's
 * phys buf (or the dataset's synced link) shows the sync has begun;
 * the parent's buffer must already be released (asserted below).
 */
990 991 void
991 992 dbuf_release_bp(dmu_buf_impl_t *db)
992 993 {
993 994 	objset_t *os = db->db_objset;
994 995 
995 996 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
996 997 	ASSERT(arc_released(os->os_phys_buf) ||
997 998 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
998 999 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
999 1000 
1000 1001 	(void) arc_release(db->db_buf, db);
1001 1002 }
1002 1003
/*
 * Mark this dbuf dirty in transaction 'tx'.  If the dbuf is already
 * dirty in tx's txg the existing dirty record is reused (after
 * resetting any override state); otherwise a new dirty record is
 * created, old data is preserved for earlier txgs, the record is
 * linked into the parent's (or dnode's) per-txg dirty list, and the
 * dnode itself is marked dirty.  Returns the dirty record; a hold on
 * db is added for the txg.
 */
1003 1004 dbuf_dirty_record_t *
1004 1005 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1005 1006 {
1006 1007 	dnode_t *dn;
1007 1008 	objset_t *os;
1008 1009 	dbuf_dirty_record_t **drp, *dr;
1009 1010 	int drop_struct_lock = FALSE;
1010 1011 	boolean_t do_free_accounting = B_FALSE;
1011 1012 	int txgoff = tx->tx_txg & TXG_MASK;
1012 1013 
1013 1014 	ASSERT(tx->tx_txg != 0);
1014 1015 	ASSERT(!refcount_is_zero(&db->db_holds));
1015 1016 	DMU_TX_DIRTY_BUF(tx, db);
1016 1017 
1017 1018 	DB_DNODE_ENTER(db);
1018 1019 	dn = DB_DNODE(db);
1019 1020 	/*
1020 1021 	 * Shouldn't dirty a regular buffer in syncing context.  Private
1021 1022 	 * objects may be dirtied in syncing context, but only if they
1022 1023 	 * were already pre-dirtied in open context.
1023 1024 	 */
1024 1025 	ASSERT(!dmu_tx_is_syncing(tx) ||
1025 1026 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1026 1027 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1027 1028 	    dn->dn_objset->os_dsl_dataset == NULL);
1028 1029 	/*
1029 1030 	 * We make this assert for private objects as well, but after we
1030 1031 	 * check if we're already dirty.  They are allowed to re-dirty
1031 1032 	 * in syncing context.
1032 1033 	 */
1033 1034 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1034 1035 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1035 1036 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1036 1037 
1037 1038 	mutex_enter(&db->db_mtx);
1038 1039 	/*
1039 1040 	 * XXX make this true for indirects too?  The problem is that
1040 1041 	 * transactions created with dmu_tx_create_assigned() from
1041 1042 	 * syncing context don't bother holding ahead.
1042 1043 	 */
1043 1044 	ASSERT(db->db_level != 0 ||
1044 1045 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1045 1046 	    db->db_state == DB_NOFILL);
1046 1047 
1047 1048 	mutex_enter(&dn->dn_mtx);
1048 1049 	/*
1049 1050 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1050 1051 	 * initialize the objset.
1051 1052 	 */
1052 1053 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1053 1054 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1054 1055 		dn->dn_dirtyctx =
1055 1056 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1056 1057 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1057 1058 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1058 1059 	}
1059 1060 	mutex_exit(&dn->dn_mtx);
1060 1061 
1061 1062 	if (db->db_blkid == DMU_SPILL_BLKID)
1062 1063 		dn->dn_have_spill = B_TRUE;
1063 1064 
1064 1065 	/*
1065 1066 	 * If this buffer is already dirty, we're done.
1066 1067 	 */
1067 1068 	drp = &db->db_last_dirty;
1068 1069 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1069 1070 	    db->db.db_object == DMU_META_DNODE_OBJECT);
	/* walk the dirty-record list (newest first) to tx's txg */
1070 1071 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1071 1072 		drp = &dr->dr_next;
1072 1073 	if (dr && dr->dr_txg == tx->tx_txg) {
1073 1074 		DB_DNODE_EXIT(db);
1074 1075 
1075 1076 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1076 1077 			/*
1077 1078 			 * If this buffer has already been written out,
1078 1079 			 * we now need to reset its state.
1079 1080 			 */
1080 1081 			dbuf_unoverride(dr);
1081 1082 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1082 1083 			    db->db_state != DB_NOFILL)
1083 1084 				arc_buf_thaw(db->db_buf);
1084 1085 		}
1085 1086 		mutex_exit(&db->db_mtx);
1086 1087 		return (dr);
1087 1088 	}
1088 1089 
1089 1090 	/*
1090 1091 	 * Only valid if not already dirty.
1091 1092 	 */
1092 1093 	ASSERT(dn->dn_object == 0 ||
1093 1094 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1094 1095 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1095 1096 
1096 1097 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1097 1098 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1098 1099 	    dn->dn_phys->dn_nlevels > db->db_level ||
1099 1100 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1100 1101 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1101 1102 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1102 1103 
1103 1104 	/*
1104 1105 	 * We should only be dirtying in syncing context if it's the
1105 1106 	 * mos or we're initializing the os or it's a special object.
1106 1107 	 * However, we are allowed to dirty in syncing context provided
1107 1108 	 * we already dirtied it in open context.  Hence we must make
1108 1109 	 * this assertion only if we're not already dirty.
1109 1110 	 */
1110 1111 	os = dn->dn_objset;
1111 1112 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1112 1113 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1113 1114 	ASSERT(db->db.db_size != 0);
1114 1115 
1115 1116 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1116 1117 
1117 1118 	if (db->db_blkid != DMU_BONUS_BLKID) {
1118 1119 		/*
1119 1120 		 * Update the accounting.
1120 1121 		 * Note: we delay "free accounting" until after we drop
1121 1122 		 * the db_mtx.  This keeps us from grabbing other locks
1122 1123 		 * (and possibly deadlocking) in bp_get_dsize() while
1123 1124 		 * also holding the db_mtx.
1124 1125 		 */
1125 1126 		dnode_willuse_space(dn, db->db.db_size, tx);
1126 1127 		do_free_accounting = dbuf_block_freeable(db);
1127 1128 	}
1128 1129 
1129 1130 	/*
1130 1131 	 * If this buffer is dirty in an old transaction group we need
1131 1132 	 * to make a copy of it so that the changes we make in this
1132 1133 	 * transaction group won't leak out when we sync the older txg.
1133 1134 	 */
1134 1135 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1135 1136 	if (db->db_level == 0) {
1136 1137 		void *data_old = db->db_buf;
1137 1138 
1138 1139 		if (db->db_state != DB_NOFILL) {
1139 1140 			if (db->db_blkid == DMU_BONUS_BLKID) {
1140 1141 				dbuf_fix_old_data(db, tx->tx_txg);
1141 1142 				data_old = db->db.db_data;
1142 1143 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1143 1144 				/*
1144 1145 				 * Release the data buffer from the cache so
1145 1146 				 * that we can modify it without impacting
1146 1147 				 * possible other users of this cached data
1147 1148 				 * block.  Note that indirect blocks and
1148 1149 				 * private objects are not released until the
1149 1150 				 * syncing state (since they are only modified
1150 1151 				 * then).
1151 1152 				 */
1152 1153 				arc_release(db->db_buf, db);
1153 1154 				dbuf_fix_old_data(db, tx->tx_txg);
1154 1155 				data_old = db->db_buf;
1155 1156 			}
1156 1157 			ASSERT(data_old != NULL);
1157 1158 		}
1158 1159 		dr->dt.dl.dr_data = data_old;
1159 1160 	} else {
1160 1161 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1161 1162 		list_create(&dr->dt.di.dr_children,
1162 1163 		    sizeof (dbuf_dirty_record_t),
1163 1164 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1164 1165 	}
1165 1166 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1166 1167 		dr->dr_accounted = db->db.db_size;
1167 1168 	dr->dr_dbuf = db;
1168 1169 	dr->dr_txg = tx->tx_txg;
1169 1170 	dr->dr_next = *drp;
|
↓ open down ↓ |
1117 lines elided |
↑ open up ↑ |
1170 1171 	*drp = dr;
1171 1172 
1172 1173 	/*
1173 1174 	 * We could have been freed_in_flight between the dbuf_noread
1174 1175 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1175 1176 	 * happened after the free.
1176 1177 	 */
1177 1178 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1178 1179 	    db->db_blkid != DMU_SPILL_BLKID) {
1179 1180 		mutex_enter(&dn->dn_mtx);
	/*
	 * 4374: dn_free_ranges is now a range_tree_t per txg; remove
	 * this block from the pending-free range since it is being
	 * rewritten in this txg.
	 */
1180 - 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1181 + 		if (dn->dn_free_ranges[txgoff] != NULL) {
1182 + 			range_tree_clear(dn->dn_free_ranges[txgoff],
1183 + 			    db->db_blkid, 1);
1184 + 		}
1181 1185 		mutex_exit(&dn->dn_mtx);
1182 1186 		db->db_freed_in_flight = FALSE;
1183 1187 	}
1184 1188 
1185 1189 	/*
1186 1190 	 * This buffer is now part of this txg
1187 1191 	 */
1188 1192 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1189 1193 	db->db_dirtycnt += 1;
1190 1194 	ASSERT3U(db->db_dirtycnt, <=, 3);
1191 1195 
1192 1196 	mutex_exit(&db->db_mtx);
1193 1197 
1194 1198 	if (db->db_blkid == DMU_BONUS_BLKID ||
1195 1199 	    db->db_blkid == DMU_SPILL_BLKID) {
1196 1200 		mutex_enter(&dn->dn_mtx);
1197 1201 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1198 1202 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1199 1203 		mutex_exit(&dn->dn_mtx);
1200 1204 		dnode_setdirty(dn, tx);
1201 1205 		DB_DNODE_EXIT(db);
1202 1206 		return (dr);
1203 1207 	} else if (do_free_accounting) {
1204 1208 		blkptr_t *bp = db->db_blkptr;
1205 1209 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1206 1210 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1207 1211 		/*
1208 1212 		 * This is only a guess -- if the dbuf is dirty
1209 1213 		 * in a previous txg, we don't know how much
1210 1214 		 * space it will use on disk yet.  We should
1211 1215 		 * really have the struct_rwlock to access
1212 1216 		 * db_blkptr, but since this is just a guess,
1213 1217 		 * it's OK if we get an odd answer.
1214 1218 		 */
1215 1219 		ddt_prefetch(os->os_spa, bp);
1216 1220 		dnode_willuse_space(dn, -willfree, tx);
1217 1221 	}
1218 1222 
1219 1223 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1220 1224 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1221 1225 		drop_struct_lock = TRUE;
1222 1226 	}
1223 1227 
1224 1228 	if (db->db_level == 0) {
1225 1229 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1226 1230 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1227 1231 	}
1228 1232 
1229 1233 	if (db->db_level+1 < dn->dn_nlevels) {
1230 1234 		dmu_buf_impl_t *parent = db->db_parent;
1231 1235 		dbuf_dirty_record_t *di;
1232 1236 		int parent_held = FALSE;
1233 1237 
1234 1238 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1235 1239 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1236 1240 
1237 1241 			parent = dbuf_hold_level(dn, db->db_level+1,
1238 1242 			    db->db_blkid >> epbs, FTAG);
1239 1243 			ASSERT(parent != NULL);
1240 1244 			parent_held = TRUE;
1241 1245 		}
1242 1246 		if (drop_struct_lock)
1243 1247 			rw_exit(&dn->dn_struct_rwlock);
1244 1248 		ASSERT3U(db->db_level+1, ==, parent->db_level);
		/* recursively dirty the parent indirect block */
1245 1249 		di = dbuf_dirty(parent, tx);
1246 1250 		if (parent_held)
1247 1251 			dbuf_rele(parent, FTAG);
1248 1252 
1249 1253 		mutex_enter(&db->db_mtx);
1250 1254 		/*
1251 1255 		 * Since we've dropped the mutex, it's possible that
1252 1256 		 * dbuf_undirty() might have changed this out from under us.
1253 1257 		 */
1254 1258 		if (db->db_last_dirty == dr ||
1255 1259 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1256 1260 			mutex_enter(&di->dt.di.dr_mtx);
1257 1261 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1258 1262 			ASSERT(!list_link_active(&dr->dr_dirty_node));
1259 1263 			list_insert_tail(&di->dt.di.dr_children, dr);
1260 1264 			mutex_exit(&di->dt.di.dr_mtx);
1261 1265 			dr->dr_parent = di;
1262 1266 		}
1263 1267 		mutex_exit(&db->db_mtx);
1264 1268 	} else {
1265 1269 		ASSERT(db->db_level+1 == dn->dn_nlevels);
1266 1270 		ASSERT(db->db_blkid < dn->dn_nblkptr);
1267 1271 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1268 1272 		mutex_enter(&dn->dn_mtx);
1269 1273 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1270 1274 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1271 1275 		mutex_exit(&dn->dn_mtx);
1272 1276 		if (drop_struct_lock)
1273 1277 			rw_exit(&dn->dn_struct_rwlock);
1274 1278 	}
1275 1279 
1276 1280 	dnode_setdirty(dn, tx);
1277 1281 	DB_DNODE_EXIT(db);
1278 1282 	return (dr);
1279 1283 }
1280 1284
1281 1285 /*
1282 1286  * Undirty a buffer in the transaction group referenced by the given
1283 1287  * transaction.  Return whether this evicted the dbuf.
1284 1288  */
/*
 * Caller must hold db_mtx (asserted); only level-0, non-bonus dbufs
 * may be undirtied.  Removes the dirty record from whichever dirty
 * list dbuf_dirty() put it on, releases the preserved data buffer,
 * and drops the per-txg hold; if that was the last hold the dbuf is
 * evicted and B_TRUE is returned.
 */
1285 1289 static boolean_t
1286 1290 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1287 1291 {
1288 1292 	dnode_t *dn;
1289 1293 	uint64_t txg = tx->tx_txg;
1290 1294 	dbuf_dirty_record_t *dr, **drp;
1291 1295 
1292 1296 	ASSERT(txg != 0);
1293 1297 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1294 1298 	ASSERT0(db->db_level);
1295 1299 	ASSERT(MUTEX_HELD(&db->db_mtx));
1296 1300 
1297 1301 	/*
1298 1302 	 * If this buffer is not dirty, we're done.
1299 1303 	 */
1300 1304 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1301 1305 		if (dr->dr_txg <= txg)
1302 1306 			break;
1303 1307 	if (dr == NULL || dr->dr_txg < txg)
1304 1308 		return (B_FALSE);
1305 1309 	ASSERT(dr->dr_txg == txg);
1306 1310 	ASSERT(dr->dr_dbuf == db);
1307 1311 
1308 1312 	DB_DNODE_ENTER(db);
1309 1313 	dn = DB_DNODE(db);
1310 1314 
1311 1315 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1312 1316 
1313 1317 	ASSERT(db->db.db_size != 0);
1314 1318 
1315 1319 	/*
1316 1320 	 * Any space we accounted for in dp_dirty_* will be cleaned up by
1317 1321 	 * dsl_pool_sync().  This is relatively rare so the discrepancy
1318 1322 	 * is not a big deal.
1319 1323 	 */
1320 1324 
	/* unlink the dirty record from the dbuf's list */
1321 1325 	*drp = dr->dr_next;
1322 1326 
1323 1327 	/*
1324 1328 	 * Note that there are three places in dbuf_dirty()
1325 1329 	 * where this dirty record may be put on a list.
1326 1330 	 * Make sure to do a list_remove corresponding to
1327 1331 	 * every one of those list_insert calls.
1328 1332 	 */
1329 1333 	if (dr->dr_parent) {
1330 1334 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1331 1335 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1332 1336 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1333 1337 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1334 1338 	    db->db_level+1 == dn->dn_nlevels) {
1335 1339 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1336 1340 		mutex_enter(&dn->dn_mtx);
1337 1341 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1338 1342 		mutex_exit(&dn->dn_mtx);
1339 1343 	}
1340 1344 	DB_DNODE_EXIT(db);
1341 1345 
1342 1346 	if (db->db_state != DB_NOFILL) {
1343 1347 		dbuf_unoverride(dr);
1344 1348 
1345 1349 		ASSERT(db->db_buf != NULL);
1346 1350 		ASSERT(dr->dt.dl.dr_data != NULL);
1347 1351 		if (dr->dt.dl.dr_data != db->db_buf)
1348 1352 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1349 1353 	}
1350 1354 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1351 1355 
1352 1356 	ASSERT(db->db_dirtycnt > 0);
1353 1357 	db->db_dirtycnt -= 1;
1354 1358 
1355 1359 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1356 1360 		arc_buf_t *buf = db->db_buf;
1357 1361 
1358 1362 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1359 1363 		dbuf_set_data(db, NULL);
1360 1364 		VERIFY(arc_buf_remove_ref(buf, db));
1361 1365 		dbuf_evict(db);
1362 1366 		return (B_TRUE);
1363 1367 	}
1364 1368 
1365 1369 	return (B_FALSE);
1366 1370 }
1367 1371
/*
 * Public entry point: read the buffer in (must succeed) and then mark
 * it dirty in 'tx'.  Passes DB_RF_HAVESTRUCT when the caller already
 * holds dn_struct_rwlock as writer.
 */
1368 1372 void
1369 1373 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1370 1374 {
1371 1375 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1372 1376 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1373 1377 
1374 1378 	ASSERT(tx->tx_txg != 0);
1375 1379 	ASSERT(!refcount_is_zero(&db->db_holds));
1376 1380 
1377 1381 	DB_DNODE_ENTER(db);
1378 1382 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1379 1383 		rf |= DB_RF_HAVESTRUCT;
1380 1384 	DB_DNODE_EXIT(db);
1381 1385 	(void) dbuf_read(db, NULL, rf);
1382 1386 	(void) dbuf_dirty(db, tx);
1383 1387 }
1384 1388
/*
 * Mark the buffer DB_NOFILL (its contents will not be read or
 * initialized) and then run the normal will-fill path to dirty it.
 */
1385 1389 void
1386 1390 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1387 1391 {
1388 1392 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1389 1393 
1390 1394 	db->db_state = DB_NOFILL;
1391 1395 
1392 1396 	dmu_buf_will_fill(db_fake, tx);
1393 1397 }
1394 1398
/*
 * Prepare a level-0, non-bonus buffer to be completely overwritten:
 * skip reading the old contents (dbuf_noread) and dirty it in 'tx'.
 * The caller must later call dmu_buf_fill_done().
 */
1395 1399 void
1396 1400 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1397 1401 {
1398 1402 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1399 1403 
1400 1404 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1401 1405 	ASSERT(tx->tx_txg != 0);
1402 1406 	ASSERT(db->db_level == 0);
1403 1407 	ASSERT(!refcount_is_zero(&db->db_holds));
1404 1408 
1405 1409 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1406 1410 	    dmu_tx_private_ok(tx));
1407 1411 
1408 1412 	dbuf_noread(db);
1409 1413 	(void) dbuf_dirty(db, tx);
1410 1414 }
1411 1415
/*
 * Complete a fill started by dmu_buf_will_fill(): if the block was
 * freed while the fill was in flight, zero it; then mark the dbuf
 * DB_CACHED and wake any waiters blocked on db_changed.
 */
1412 1416 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1413 1417 /* ARGSUSED */
1414 1418 void
1415 1419 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1416 1420 {
1417 1421 	mutex_enter(&db->db_mtx);
1418 1422 	DBUF_VERIFY(db);
1419 1423 
1420 1424 	if (db->db_state == DB_FILL) {
1421 1425 		if (db->db_level == 0 && db->db_freed_in_flight) {
1422 1426 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1423 1427 			/* we were freed while filling */
1424 1428 			/* XXX dbuf_undirty? */
1425 1429 			bzero(db->db.db_data, db->db.db_size);
1426 1430 			db->db_freed_in_flight = FALSE;
1427 1431 		}
1428 1432 		db->db_state = DB_CACHED;
1429 1433 		cv_broadcast(&db->db_changed);
1430 1434 	}
1431 1435 	mutex_exit(&db->db_mtx);
1432 1436 }
1433 1437
1434 1438 /*
1435 1439  * Directly assign a provided arc buf to a given dbuf if it's not referenced
1436 1440  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1437 1441  */
/*
 * 'buf' must be the same size as the dbuf and must have been returned
 * to the ARC (arc_return_buf).  On the copy path the provided buf is
 * released; on the assign path any previously cached buf is released
 * and replaced by 'buf', then the fill is completed.
 */
1438 1442 void
1439 1443 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1440 1444 {
1441 1445 	ASSERT(!refcount_is_zero(&db->db_holds));
1442 1446 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1443 1447 	ASSERT(db->db_level == 0);
1444 1448 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1445 1449 	ASSERT(buf != NULL);
1446 1450 	ASSERT(arc_buf_size(buf) == db->db.db_size);
1447 1451 	ASSERT(tx->tx_txg != 0);
1448 1452 
1449 1453 	arc_return_buf(buf, db);
1450 1454 	ASSERT(arc_released(buf));
1451 1455 
1452 1456 	mutex_enter(&db->db_mtx);
1453 1457 
	/* wait for any in-flight read or fill to finish */
1454 1458 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1455 1459 		cv_wait(&db->db_changed, &db->db_mtx);
1456 1460 
1457 1461 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1458 1462 
	/* other holders exist: fall back to copying into the dbuf */
1459 1463 	if (db->db_state == DB_CACHED &&
1460 1464 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1461 1465 		mutex_exit(&db->db_mtx);
1462 1466 		(void) dbuf_dirty(db, tx);
1463 1467 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1464 1468 		VERIFY(arc_buf_remove_ref(buf, db));
1465 1469 		xuio_stat_wbuf_copied();
1466 1470 		return;
1467 1471 	}
1468 1472 
1469 1473 	xuio_stat_wbuf_nocopy();
1470 1474 	if (db->db_state == DB_CACHED) {
1471 1475 		dbuf_dirty_record_t *dr = db->db_last_dirty;
1472 1476 
1473 1477 		ASSERT(db->db_buf != NULL);
1474 1478 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1475 1479 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1476 1480 			if (!arc_released(db->db_buf)) {
1477 1481 				ASSERT(dr->dt.dl.dr_override_state ==
1478 1482 				    DR_OVERRIDDEN);
1479 1483 				arc_release(db->db_buf, db);
1480 1484 			}
1481 1485 			dr->dt.dl.dr_data = buf;
1482 1486 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1483 1487 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1484 1488 			arc_release(db->db_buf, db);
1485 1489 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1486 1490 		}
1487 1491 		db->db_buf = NULL;
1488 1492 	}
1489 1493 	ASSERT(db->db_buf == NULL);
1490 1494 	dbuf_set_data(db, buf);
1491 1495 	db->db_state = DB_FILL;
1492 1496 	mutex_exit(&db->db_mtx);
1493 1497 	(void) dbuf_dirty(db, tx);
1494 1498 	dmu_buf_fill_done(&db->db, tx);
1495 1499 }
1496 1500
1497 1501 /*
1498 1502  * "Clear" the contents of this dbuf.  This will mark the dbuf
1499 1503  * EVICTING and clear *most* of its references.  Unfortunately,
1500 1504  * when we are not holding the dn_dbufs_mtx, we can't clear the
1501 1505  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1502 1506  * in this case.  For callers from the DMU we will usually see:
1503 1507  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1504 1508  * For the arc callback, we will usually see:
1505 1509  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1506 1510  * Sometimes, though, we will get a mix of these two:
1507 1511  *	DMU: dbuf_clear()->arc_buf_evict()
1508 1512  *	ARC: dbuf_do_evict()->dbuf_destroy()
1509 1513  */
/* Caller holds db_mtx; db must have no holds (asserted below). */
1510 1514 void
1511 1515 dbuf_clear(dmu_buf_impl_t *db)
1512 1516 {
1513 1517 	dnode_t *dn;
1514 1518 	dmu_buf_impl_t *parent = db->db_parent;
1515 1519 	dmu_buf_impl_t *dndb;
1516 1520 	int dbuf_gone = FALSE;
1517 1521 
1518 1522 	ASSERT(MUTEX_HELD(&db->db_mtx));
1519 1523 	ASSERT(refcount_is_zero(&db->db_holds));
1520 1524 
1521 1525 	dbuf_evict_user(db);
1522 1526 
1523 1527 	if (db->db_state == DB_CACHED) {
1524 1528 		ASSERT(db->db.db_data != NULL);
1525 1529 		if (db->db_blkid == DMU_BONUS_BLKID) {
1526 1530 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1527 1531 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1528 1532 		}
1529 1533 		db->db.db_data = NULL;
1530 1534 		db->db_state = DB_UNCACHED;
1531 1535 	}
1532 1536 
1533 1537 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1534 1538 	ASSERT(db->db_data_pending == NULL);
1535 1539 
1536 1540 	db->db_state = DB_EVICTING;
1537 1541 	db->db_blkptr = NULL;
1538 1542 
1539 1543 	DB_DNODE_ENTER(db);
1540 1544 	dn = DB_DNODE(db);
1541 1545 	dndb = dn->dn_dbuf;
1542 1546 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1543 1547 		list_remove(&dn->dn_dbufs, db);
1544 1548 		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1545 1549 		membar_producer();
1546 1550 		DB_DNODE_EXIT(db);
1547 1551 		/*
1548 1552 		 * Decrementing the dbuf count means that the hold corresponding
1549 1553 		 * to the removed dbuf is no longer discounted in dnode_move(),
1550 1554 		 * so the dnode cannot be moved until after we release the hold.
1551 1555 		 * The membar_producer() ensures visibility of the decremented
1552 1556 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1553 1557 		 * release any lock.
1554 1558 		 */
1555 1559 		dnode_rele(dn, db);
1556 1560 		db->db_dnode_handle = NULL;
1557 1561 	} else {
1558 1562 		DB_DNODE_EXIT(db);
1559 1563 	}
1560 1564 
1561 1565 	if (db->db_buf)
1562 1566 		dbuf_gone = arc_buf_evict(db->db_buf);
1563 1567 
	/* if arc_buf_evict() already destroyed db, its mutex is gone */
1564 1568 	if (!dbuf_gone)
1565 1569 		mutex_exit(&db->db_mtx);
1566 1570 
1567 1571 	/*
1568 1572 	 * If this dbuf is referenced from an indirect dbuf,
1569 1573 	 * decrement the ref count on the indirect dbuf.
1570 1574 	 */
1571 1575 	if (parent && parent != dndb)
1572 1576 		dbuf_rele(parent, db);
1573 1577 }
1574 1578
/*
 * Locate the parent dbuf and block pointer for (level, blkid) of dnode
 * 'dn'.  On success *parentp holds a referenced parent dbuf (or NULL
 * when the bp lives directly in the dnode phys) and *bpp points at the
 * block pointer.  Returns ENOENT when no parent exists yet, or the
 * error from reading the parent indirect block.  Caller holds
 * dn_struct_rwlock (asserted below).
 */
1575 1579 static int
1576 1580 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1577 1581     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1578 1582 {
1579 1583 	int nlevels, epbs;
1580 1584 
1581 1585 	*parentp = NULL;
1582 1586 	*bpp = NULL;
1583 1587 
1584 1588 	ASSERT(blkid != DMU_BONUS_BLKID);
1585 1589 
1586 1590 	if (blkid == DMU_SPILL_BLKID) {
1587 1591 		mutex_enter(&dn->dn_mtx);
1588 1592 		if (dn->dn_have_spill &&
1589 1593 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1590 1594 			*bpp = &dn->dn_phys->dn_spill;
1591 1595 		else
1592 1596 			*bpp = NULL;
1593 1597 		dbuf_add_ref(dn->dn_dbuf, NULL);
1594 1598 		*parentp = dn->dn_dbuf;
1595 1599 		mutex_exit(&dn->dn_mtx);
1596 1600 		return (0);
1597 1601 	}
1598 1602 
1599 1603 	if (dn->dn_phys->dn_nlevels == 0)
1600 1604 		nlevels = 1;
1601 1605 	else
1602 1606 		nlevels = dn->dn_phys->dn_nlevels;
1603 1607 
	/* entries per indirect block, as a shift */
1604 1608 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1605 1609 
1606 1610 	ASSERT3U(level * epbs, <, 64);
1607 1611 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1608 1612 	if (level >= nlevels ||
1609 1613 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1610 1614 		/* the buffer has no parent yet */
1611 1615 		return (SET_ERROR(ENOENT));
1612 1616 	} else if (level < nlevels-1) {
1613 1617 		/* this block is referenced from an indirect block */
1614 1618 		int err = dbuf_hold_impl(dn, level+1,
1615 1619 		    blkid >> epbs, fail_sparse, NULL, parentp);
1616 1620 		if (err)
1617 1621 			return (err);
1618 1622 		err = dbuf_read(*parentp, NULL,
1619 1623 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1620 1624 		if (err) {
1621 1625 			dbuf_rele(*parentp, NULL);
1622 1626 			*parentp = NULL;
1623 1627 			return (err);
1624 1628 		}
1625 1629 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1626 1630 		    (blkid & ((1ULL << epbs) - 1));
1627 1631 		return (0);
1628 1632 	} else {
1629 1633 		/* the block is referenced from the dnode */
1630 1634 		ASSERT3U(level, ==, nlevels-1);
1631 1635 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1632 1636 		    blkid < dn->dn_phys->dn_nblkptr);
1633 1637 		if (dn->dn_dbuf) {
1634 1638 			dbuf_add_ref(dn->dn_dbuf, NULL);
1635 1639 			*parentp = dn->dn_dbuf;
1636 1640 		}
1637 1641 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1638 1642 		return (0);
1639 1643 	}
1640 1644 }
1641 1645
/*
 * Allocate and initialize a new dbuf for (level, blkid) of dnode 'dn',
 * insert it into the dbuf hash table and the dnode's dn_dbufs list,
 * and take a hold on the dnode.  Bonus dbufs skip the hash table.  If
 * another thread inserts the same dbuf first, the duplicate is freed
 * and the existing one returned.  Caller holds dn_struct_rwlock.
 */
1642 1646 static dmu_buf_impl_t *
1643 1647 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1644 1648     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1645 1649 {
1646 1650 	objset_t *os = dn->dn_objset;
1647 1651 	dmu_buf_impl_t *db, *odb;
1648 1652 
1649 1653 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1650 1654 	ASSERT(dn->dn_type != DMU_OT_NONE);
1651 1655 
1652 1656 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1653 1657 
1654 1658 	db->db_objset = os;
1655 1659 	db->db.db_object = dn->dn_object;
1656 1660 	db->db_level = level;
1657 1661 	db->db_blkid = blkid;
1658 1662 	db->db_last_dirty = NULL;
1659 1663 	db->db_dirtycnt = 0;
1660 1664 	db->db_dnode_handle = dn->dn_handle;
1661 1665 	db->db_parent = parent;
1662 1666 	db->db_blkptr = blkptr;
1663 1667 
1664 1668 	db->db_user_ptr = NULL;
1665 1669 	db->db_user_data_ptr_ptr = NULL;
1666 1670 	db->db_evict_func = NULL;
1667 1671 	db->db_immediate_evict = 0;
1668 1672 	db->db_freed_in_flight = 0;
1669 1673 
1670 1674 	if (blkid == DMU_BONUS_BLKID) {
1671 1675 		ASSERT3P(parent, ==, dn->dn_dbuf);
1672 1676 		db->db.db_size = DN_MAX_BONUSLEN -
1673 1677 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1674 1678 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1675 1679 		db->db.db_offset = DMU_BONUS_BLKID;
1676 1680 		db->db_state = DB_UNCACHED;
1677 1681 		/* the bonus dbuf is not placed in the hash table */
1678 1682 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1679 1683 		return (db);
1680 1684 	} else if (blkid == DMU_SPILL_BLKID) {
1681 1685 		db->db.db_size = (blkptr != NULL) ?
1682 1686 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1683 1687 		db->db.db_offset = 0;
1684 1688 	} else {
1685 1689 		int blocksize =
1686 1690 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1687 1691 		db->db.db_size = blocksize;
1688 1692 		db->db.db_offset = db->db_blkid * blocksize;
1689 1693 	}
1690 1694 
1691 1695 	/*
1692 1696 	 * Hold the dn_dbufs_mtx while we get the new dbuf
1693 1697 	 * in the hash table *and* added to the dbufs list.
1694 1698 	 * This prevents a possible deadlock with someone
1695 1699 	 * trying to look up this dbuf before its added to the
1696 1700 	 * dn_dbufs list.
1697 1701 	 */
1698 1702 	mutex_enter(&dn->dn_dbufs_mtx);
1699 1703 	db->db_state = DB_EVICTING;
1700 1704 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1701 1705 		/* someone else inserted it first */
1702 1706 		kmem_cache_free(dbuf_cache, db);
1703 1707 		mutex_exit(&dn->dn_dbufs_mtx);
1704 1708 		return (odb);
1705 1709 	}
1706 1710 	list_insert_head(&dn->dn_dbufs, db);
	/* track the highest level-0 blkid not yet on dn_dbufs */
1707 1711 	if (db->db_level == 0 && db->db_blkid >=
1708 1712 	    dn->dn_unlisted_l0_blkid)
1709 1713 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1710 1714 	db->db_state = DB_UNCACHED;
1711 1715 	mutex_exit(&dn->dn_dbufs_mtx);
1712 1716 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1713 1717 
1714 1718 	if (parent && parent != dn->dn_dbuf)
1715 1719 		dbuf_add_ref(parent, db);
1716 1720 
1717 1721 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1718 1722 	    refcount_count(&dn->dn_holds) > 0);
1719 1723 	(void) refcount_add(&dn->dn_holds, db);
1720 1724 	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1721 1725 
1722 1726 	dprintf_dbuf(db, "db=%p\n", db);
1723 1727 
1724 1728 	return (db);
1725 1729 }
1726 1730
/*
 * ARC eviction callback for a dbuf's buffer.  If the dbuf is still
 * CACHED, detach the buffer and evict the dbuf; if it is already
 * EVICTING (dbuf_clear ran first), just destroy it.
 */
1727 1731 static int
1728 1732 dbuf_do_evict(void *private)
1729 1733 {
1730 1734 	arc_buf_t *buf = private;
1731 1735 	dmu_buf_impl_t *db = buf->b_private;
1732 1736 
1733 1737 	if (!MUTEX_HELD(&db->db_mtx))
1734 1738 		mutex_enter(&db->db_mtx);
1735 1739 
1736 1740 	ASSERT(refcount_is_zero(&db->db_holds));
1737 1741 
1738 1742 	if (db->db_state != DB_EVICTING) {
1739 1743 		ASSERT(db->db_state == DB_CACHED);
1740 1744 		DBUF_VERIFY(db);
1741 1745 		db->db_buf = NULL;
1742 1746 		dbuf_evict(db);
1743 1747 	} else {
1744 1748 		mutex_exit(&db->db_mtx);
1745 1749 		dbuf_destroy(db);
1746 1750 	}
1747 1751 	return (0);
1748 1752 }
1749 1753
/*
 * Final teardown of a dbuf with no holds: remove it from the dnode's
 * dn_dbufs list (if dbuf_clear could not, i.e. db_dnode_handle is
 * still set) and from the hash table, drop the dnode hold, and return
 * the structure to the kmem cache.
 */
1750 1754 static void
1751 1755 dbuf_destroy(dmu_buf_impl_t *db)
1752 1756 {
1753 1757 	ASSERT(refcount_is_zero(&db->db_holds));
1754 1758 
1755 1759 	if (db->db_blkid != DMU_BONUS_BLKID) {
1756 1760 		/*
1757 1761 		 * If this dbuf is still on the dn_dbufs list,
1758 1762 		 * remove it from that list.
1759 1763 		 */
1760 1764 		if (db->db_dnode_handle != NULL) {
1761 1765 			dnode_t *dn;
1762 1766 
1763 1767 			DB_DNODE_ENTER(db);
1764 1768 			dn = DB_DNODE(db);
1765 1769 			mutex_enter(&dn->dn_dbufs_mtx);
1766 1770 			list_remove(&dn->dn_dbufs, db);
1767 1771 			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1768 1772 			mutex_exit(&dn->dn_dbufs_mtx);
1769 1773 			DB_DNODE_EXIT(db);
1770 1774 			/*
1771 1775 			 * Decrementing the dbuf count means that the hold
1772 1776 			 * corresponding to the removed dbuf is no longer
1773 1777 			 * discounted in dnode_move(), so the dnode cannot be
1774 1778 			 * moved until after we release the hold.
1775 1779 			 */
1776 1780 			dnode_rele(dn, db);
1777 1781 			db->db_dnode_handle = NULL;
1778 1782 		}
1779 1783 		dbuf_hash_remove(db);
1780 1784 	}
1781 1785 	db->db_parent = NULL;
1782 1786 	db->db_buf = NULL;
1783 1787 
1784 1788 	ASSERT(!list_link_active(&db->db_link));
1785 1789 	ASSERT(db->db.db_data == NULL);
1786 1790 	ASSERT(db->db_hash_next == NULL);
1787 1791 	ASSERT(db->db_blkptr == NULL);
1788 1792 	ASSERT(db->db_data_pending == NULL);
1789 1793 
1790 1794 	kmem_cache_free(dbuf_cache, db);
1791 1795 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1792 1796 }
1793 1797
/*
 * Issue a speculative ARC read for level-0 block 'blkid' of dnode
 * 'dn' at the given priority.  A no-op if the block was freed, is
 * already cached as a dbuf, or is a hole.  Caller holds
 * dn_struct_rwlock (asserted below).
 */
1794 1798 void
1795 1799 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1796 1800 {
1797 1801 	dmu_buf_impl_t *db = NULL;
1798 1802 	blkptr_t *bp = NULL;
1799 1803 
1800 1804 	ASSERT(blkid != DMU_BONUS_BLKID);
1801 1805 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1802 1806 
1803 1807 	if (dnode_block_freed(dn, blkid))
1804 1808 		return;
1805 1809 
1806 1810 	/* dbuf_find() returns with db_mtx held */
1807 1811 	if (db = dbuf_find(dn, 0, blkid)) {
1808 1812 		/*
1809 1813 		 * This dbuf is already in the cache.  We assume that
1810 1814 		 * it is already CACHED, or else about to be either
1811 1815 		 * read or filled.
1812 1816 		 */
1813 1817 		mutex_exit(&db->db_mtx);
1814 1818 		return;
1815 1819 	}
1816 1820 
1817 1821 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1818 1822 		if (bp && !BP_IS_HOLE(bp)) {
1819 1823 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1820 1824 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1821 1825 			zbookmark_t zb;
1822 1826 
1823 1827 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1824 1828 			    dn->dn_object, 0, blkid);
1825 1829 
			/* fire-and-forget read; done callback is NULL */
1826 1830 			(void) arc_read(NULL, dn->dn_objset->os_spa,
1827 1831 			    bp, NULL, NULL, prio,
1828 1832 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1829 1833 			    &aflags, &zb);
1830 1834 		}
1831 1835 		if (db)
1832 1836 			dbuf_rele(db, NULL);
1833 1837 	}
1834 1838 }
1835 1839
/*
 * Look up (creating if necessary) the dbuf for block "blkid" at
 * indirection level "level" of dnode "dn".  If fail_sparse is set,
 * holes produce ENOENT instead of a dbuf.  On success, *dbp holds
 * the dbuf.
 *
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		/* Locate the parent and bp, then instantiate the dbuf. */
		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	/*
	 * If the dbuf has no holds, try to re-reference its ARC buffer.
	 * If the data has already been reclaimed, clear the dbuf and
	 * retry the lookup from scratch.
	 */
	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
1927 1931
1928 1932 dmu_buf_impl_t *
1929 1933 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1930 1934 {
1931 1935 dmu_buf_impl_t *db;
1932 1936 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1933 1937 return (err ? NULL : db);
1934 1938 }
1935 1939
1936 1940 dmu_buf_impl_t *
1937 1941 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1938 1942 {
1939 1943 dmu_buf_impl_t *db;
1940 1944 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1941 1945 return (err ? NULL : db);
1942 1946 }
1943 1947
/*
 * Create the bonus-buffer dbuf for "dn".  The caller must hold
 * dn_struct_rwlock as writer, and the dnode must not already have
 * a bonus dbuf.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
1952 1956
1953 1957 int
1954 1958 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1955 1959 {
1956 1960 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1957 1961 dnode_t *dn;
1958 1962
1959 1963 if (db->db_blkid != DMU_SPILL_BLKID)
1960 1964 return (SET_ERROR(ENOTSUP));
1961 1965 if (blksz == 0)
1962 1966 blksz = SPA_MINBLOCKSIZE;
1963 1967 if (blksz > SPA_MAXBLOCKSIZE)
1964 1968 blksz = SPA_MAXBLOCKSIZE;
1965 1969 else
1966 1970 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1967 1971
1968 1972 DB_DNODE_ENTER(db);
1969 1973 dn = DB_DNODE(db);
1970 1974 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1971 1975 dbuf_new_size(db, blksz, tx);
1972 1976 rw_exit(&dn->dn_struct_rwlock);
1973 1977 DB_DNODE_EXIT(db);
1974 1978
1975 1979 return (0);
1976 1980 }
1977 1981
/* Free the dnode's spill block, if any, in transaction "tx". */
void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}
1983 1987
#pragma weak dmu_buf_add_ref = dbuf_add_ref
/*
 * Add a hold on an already-held dbuf (the assertion requires that at
 * least one other hold exists, so the dbuf cannot be evicted from
 * under us).
 */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}
1991 1995
/*
 * Release a hold on a dbuf.
 *
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
2005 2009
/* Public (dmu_buf_t) wrapper around dbuf_rele(). */
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}
2011 2015
/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 *
 * Entered with db_mtx held; the lock is dropped on every return path
 * (the explicit paths call mutex_exit(); the dbuf_evict()/dbuf_clear()
 * paths are presumed to release it — confirm against those functions).
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	/*
	 * Once only dirty-record holds remain, a level-0 dbuf marked for
	 * immediate eviction may run its user eviction callback.
	 */
	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move(). The dnode cannot move until after
			 * the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
2104 2108
#pragma weak dmu_buf_refcount = dbuf_refcount
/* Return the current number of holds on the dbuf. */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}
2111 2115
/*
 * Attach user data and an eviction callback to a dbuf.  Implemented as
 * dmu_buf_update_user() with a NULL expected old pointer, so this only
 * takes effect if no user data is currently set; otherwise the current
 * (unchanged) user pointer is returned.
 */
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2119 2123
/*
 * Like dmu_buf_set_user(), but additionally marks the dbuf for
 * "immediate eviction": with db_immediate_evict set, the user eviction
 * callback fires as soon as the hold count drops to the dirty count
 * (see dbuf_rele_and_unlock()).
 */
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}
2130 2134
/*
 * Compare-and-swap of a dbuf's user data, performed under db_mtx:
 * if the current user pointer equals old_user_ptr, install user_ptr,
 * user_data_ptr_ptr and evict_func and return old_user_ptr; otherwise
 * change nothing and return the current user pointer.  Only valid for
 * level-0 dbufs.
 */
void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	/* A user pointer and its eviction callback come and go together. */
	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}
2155 2159
/*
 * Return the dbuf's user pointer.  The caller must already hold the
 * dbuf (asserted), which keeps the user data from being evicted.
 */
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}
2164 2168
2165 2169 boolean_t
2166 2170 dmu_buf_freeable(dmu_buf_t *dbuf)
2167 2171 {
2168 2172 boolean_t res = B_FALSE;
2169 2173 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2170 2174
2171 2175 if (db->db_blkptr)
2172 2176 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2173 2177 db->db_blkptr, db->db_blkptr->blk_birth);
2174 2178
2175 2179 return (res);
2176 2180 }
2177 2181
/*
 * Return the block pointer backing this buffer (may be NULL if one has
 * not yet been assigned — see dbuf_check_blkptr()).
 */
blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}
2184 2188
/*
 * Ensure db->db_blkptr points at the right location for this dbuf:
 * the dnode's spill blkptr, one of the blkptrs embedded in the dnode
 * phys (for top-level blocks), or a slot inside the parent indirect
 * block, which is held (and read in) here if necessary.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/*
			 * db_mtx must be dropped while taking a hold on
			 * the parent indirect block one level up.
			 */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
2229 2233
/*
 * Sync an indirect-block dirty record: make sure the block is in
 * memory and linked to its parent, issue its write zio, then sync all
 * of its dirty children into that zio before starting it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/*
	 * All children must be synced (their zios chained under ours)
	 * before our zio is allowed to run.
	 */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2275 2279
/*
 * Sync a level-0 dirty record: copy bonus-buffer data directly into
 * the dnode phys, or issue the write zio for a regular/spill block.
 * Called from dbuf_sync_list() in syncing context.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* A dirty spill block means the dnode now carries a spill bp. */
	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* Free the snapshot copy made when the bonus was dirtied. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this dirty record from db_last_dirty and free it. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
2410 2414
2411 2415 void
2412 2416 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2413 2417 {
2414 2418 dbuf_dirty_record_t *dr;
2415 2419
2416 2420 while (dr = list_head(list)) {
2417 2421 if (dr->dr_zio != NULL) {
2418 2422 /*
2419 2423 * If we find an already initialized zio then we
2420 2424 * are processing the meta-dnode, and we have finished.
2421 2425 * The dbufs for all dnodes are put back on the list
2422 2426 * during processing, so that we can zio_wait()
2423 2427 * these IOs after initiating all child IOs.
2424 2428 */
2425 2429 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2426 2430 DMU_META_DNODE_OBJECT);
2427 2431 break;
2428 2432 }
2429 2433 list_remove(list, dr);
2430 2434 if (dr->dr_dbuf->db_level > 0)
2431 2435 dbuf_sync_indirect(dr, tx);
2432 2436 else
2433 2437 dbuf_sync_leaf(dr, tx);
2434 2438 }
2435 2439 }
2436 2440
/*
 * zio "ready" callback for a dbuf write: account for the space delta
 * of the newly allocated block pointer and recompute bp->blk_fill
 * from the block's contents (count of in-use dnodes, 1/0 for a leaf
 * block, or the sum of the children's fill counts for an indirect).
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in allocated size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Count the dnodes in this block that are in use. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Indirect block: sum the fill counts of the children. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
2512 2516
/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times). This
 * allows the DMU to monitor the progress of each logical i/o. For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block. There may be a long delay before all copies/fragments are completed,
 * so this callback allows us to retire dirty space gradually, as the physical
 * i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times. Retire one
	 * portion of our dirty space each time we are called. Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
2544 2548
/*
 * zio "done" callback for a dbuf write: perform block birth/kill
 * accounting (skipped for nopwrites and rewrites), unlink and free the
 * completed dirty record, and drop the hold that was taken when the
 * dbuf was dirtied in this txg.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Find and unlink the dirty record this write completed. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/*
			 * Release the syncing copy if one was made, or
			 * arrange for ARC eviction of the shared buffer.
			 */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
2634 2638
/* zio callback shim: adapt dbuf_write_ready() for NOFILL writes. */
static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}
2640 2644
/* zio callback shim: adapt dbuf_write_done() for NOFILL writes. */
static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}
2646 2650
/*
 * zio callback shim for overridden (e.g. dmu_sync) writes: the zio's
 * private data is the dirty record, not the dbuf, so unwrap it first.
 */
static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}
2655 2659
/*
 * zio "done" callback for overridden writes: if the block actually
 * written differs from the override bp, free the override block (it
 * was superseded) and release the data buffer, then run the common
 * completion path.
 */
static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}
2673 2677
/*
 * Issue I/O to commit a dirty buffer to disk.  The dirty record's
 * dr_zio is created as a child of the parent indirect block's zio (or
 * the dnode's zio for top-level blocks), using one of three paths:
 * an overridden write (dmu_sync), a NOFILL write, or a normal
 * arc_write().  Only the override path starts its zio here; callers
 * start the other two (see dbuf_sync_leaf()/dbuf_sync_indirect()).
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	/* Derive checksum/compression/copies policy for this write. */
	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/* The data was already written by dmu_sync(); reuse its bp. */
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
|
↓ open down ↓ |
1583 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX