Print this page
6288 dmu_buf_will_dirty could be faster
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dbuf.c
          +++ new/usr/src/uts/common/fs/zfs/dbuf.c
↓ open down ↓ 264 lines elided ↑ open up ↑
 265  265                   * Immediate eviction occurs when holds == dirtycnt.
 266  266                   * For normal eviction buffers, holds is zero on
 267  267                   * eviction, except when dbuf_fix_old_data() calls
 268  268                   * dbuf_clear_data().  However, the hold count can grow
 269  269                   * during eviction even though db_mtx is held (see
 270  270                   * dmu_bonus_hold() for an example), so we can only
 271  271                   * test the generic invariant that holds >= dirtycnt.
 272  272                   */
 273  273                  ASSERT3U(holds, >=, db->db_dirtycnt);
 274  274          } else {
 275      -                if (db->db_immediate_evict == TRUE)
      275 +                if (db->db_user_immediate_evict == TRUE)
 276  276                          ASSERT3U(holds, >=, db->db_dirtycnt);
 277  277                  else
 278  278                          ASSERT3U(holds, >, 0);
 279  279          }
 280  280  #endif
 281  281  }
 282  282  
 283  283  static void
 284  284  dbuf_evict_user(dmu_buf_impl_t *db)
 285  285  {
↓ open down ↓ 804 lines elided ↑ open up ↑
1090 1090          objset_t *os = db->db_objset;
1091 1091  
1092 1092          ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1093 1093          ASSERT(arc_released(os->os_phys_buf) ||
1094 1094              list_link_active(&os->os_dsl_dataset->ds_synced_link));
1095 1095          ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1096 1096  
1097 1097          (void) arc_release(db->db_buf, db);
1098 1098  }
1099 1099  
     1100 +/*
     1101 + * We already have a dirty record for this TXG, and we are being
     1102 + * dirtied again.
     1103 + */
     1104 +static void
     1105 +dbuf_redirty(dbuf_dirty_record_t *dr)
     1106 +{
     1107 +        dmu_buf_impl_t *db = dr->dr_dbuf;
     1108 +
     1109 +        ASSERT(MUTEX_HELD(&db->db_mtx));
     1110 +
     1111 +        if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
     1112 +                /*
     1113 +                 * If this buffer has already been written out,
     1114 +                 * we now need to reset its state.
     1115 +                 */
     1116 +                dbuf_unoverride(dr);
     1117 +                if (db->db.db_object != DMU_META_DNODE_OBJECT &&
     1118 +                    db->db_state != DB_NOFILL) {
     1119 +                        /* Already released on initial dirty, so just thaw. */
     1120 +                        ASSERT(arc_released(db->db_buf));
     1121 +                        arc_buf_thaw(db->db_buf);
     1122 +                }
     1123 +        }
     1124 +}
     1125 +
1100 1126  dbuf_dirty_record_t *
1101 1127  dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1102 1128  {
1103 1129          dnode_t *dn;
1104 1130          objset_t *os;
1105 1131          dbuf_dirty_record_t **drp, *dr;
1106 1132          int drop_struct_lock = FALSE;
1107 1133          boolean_t do_free_accounting = B_FALSE;
1108 1134          int txgoff = tx->tx_txg & TXG_MASK;
1109 1135  
↓ open down ↓ 52 lines elided ↑ open up ↑
1162 1188           * If this buffer is already dirty, we're done.
1163 1189           */
1164 1190          drp = &db->db_last_dirty;
1165 1191          ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1166 1192              db->db.db_object == DMU_META_DNODE_OBJECT);
1167 1193          while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1168 1194                  drp = &dr->dr_next;
1169 1195          if (dr && dr->dr_txg == tx->tx_txg) {
1170 1196                  DB_DNODE_EXIT(db);
1171 1197  
1172      -                if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1173      -                        /*
1174      -                         * If this buffer has already been written out,
1175      -                         * we now need to reset its state.
1176      -                         */
1177      -                        dbuf_unoverride(dr);
1178      -                        if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1179      -                            db->db_state != DB_NOFILL)
1180      -                                arc_buf_thaw(db->db_buf);
1181      -                }
     1198 +                dbuf_redirty(dr);
1182 1199                  mutex_exit(&db->db_mtx);
1183 1200                  return (dr);
1184 1201          }
1185 1202  
1186 1203          /*
1187 1204           * Only valid if not already dirty.
1188 1205           */
1189 1206          ASSERT(dn->dn_object == 0 ||
1190 1207              dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1191 1208              (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
↓ open down ↓ 289 lines elided ↑ open up ↑
1481 1498  
1482 1499  void
1483 1500  dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1484 1501  {
1485 1502          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1486 1503          int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1487 1504  
1488 1505          ASSERT(tx->tx_txg != 0);
1489 1506          ASSERT(!refcount_is_zero(&db->db_holds));
1490 1507  
     1508 +        /*
     1509 +         * Quick check for dirtiness.  For already dirty blocks, this
     1510 +         * reduces runtime of this function by >90%, and improves overall
     1511 +         * performance by 50% for some workloads (e.g. file deletion with
     1512 +         * indirect blocks cached).
     1513 +         */
     1514 +        mutex_enter(&db->db_mtx);
     1515 +        dbuf_dirty_record_t *dr;
     1516 +        for (dr = db->db_last_dirty;
     1517 +            dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
     1518 +                /*
     1519 +                 * It's possible that it is already dirty but not cached,
     1520 +                 * because there are some calls to dbuf_dirty() that don't
     1521 +                 * go through dmu_buf_will_dirty().
     1522 +                 */
     1523 +                if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
     1524 +                        /* This dbuf is already dirty and cached. */
     1525 +                        dbuf_redirty(dr);
     1526 +                        mutex_exit(&db->db_mtx);
     1527 +                        return;
     1528 +                }
     1529 +        }
     1530 +        mutex_exit(&db->db_mtx);
     1531 +
1491 1532          DB_DNODE_ENTER(db);
1492 1533          if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1493 1534                  rf |= DB_RF_HAVESTRUCT;
1494 1535          DB_DNODE_EXIT(db);
1495 1536          (void) dbuf_read(db, NULL, rf);
1496 1537          (void) dbuf_dirty(db, tx);
1497 1538  }
1498 1539  
1499 1540  void
1500 1541  dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
↓ open down ↓ 303 lines elided ↑ open up ↑
1804 1845          db->db.db_object = dn->dn_object;
1805 1846          db->db_level = level;
1806 1847          db->db_blkid = blkid;
1807 1848          db->db_last_dirty = NULL;
1808 1849          db->db_dirtycnt = 0;
1809 1850          db->db_dnode_handle = dn->dn_handle;
1810 1851          db->db_parent = parent;
1811 1852          db->db_blkptr = blkptr;
1812 1853  
1813 1854          db->db_user = NULL;
1814      -        db->db_immediate_evict = 0;
1815      -        db->db_freed_in_flight = 0;
     1855 +        db->db_user_immediate_evict = FALSE;
     1856 +        db->db_freed_in_flight = FALSE;
     1857 +        db->db_pending_evict = FALSE;
1816 1858  
1817 1859          if (blkid == DMU_BONUS_BLKID) {
1818 1860                  ASSERT3P(parent, ==, dn->dn_dbuf);
1819 1861                  db->db.db_size = DN_MAX_BONUSLEN -
1820 1862                      (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1821 1863                  ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1822 1864                  db->db.db_offset = DMU_BONUS_BLKID;
1823 1865                  db->db_state = DB_UNCACHED;
1824 1866                  /* the bonus dbuf is not placed in the hash table */
1825 1867                  arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
↓ open down ↓ 371 lines elided ↑ open up ↑
2197 2239          ASSERT(holds >= 0);
2198 2240  
2199 2241          /*
2200 2242           * We can't freeze indirects if there is a possibility that they
2201 2243           * may be modified in the current syncing context.
2202 2244           */
2203 2245          if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2204 2246                  arc_buf_freeze(db->db_buf);
2205 2247  
2206 2248          if (holds == db->db_dirtycnt &&
2207      -            db->db_level == 0 && db->db_immediate_evict)
     2249 +            db->db_level == 0 && db->db_user_immediate_evict)
2208 2250                  dbuf_evict_user(db);
2209 2251  
2210 2252          if (holds == 0) {
2211 2253                  if (db->db_blkid == DMU_BONUS_BLKID) {
2212 2254                          dnode_t *dn;
     2255 +                        boolean_t evict_dbuf = db->db_pending_evict;
2213 2256  
2214 2257                          /*
2215 2258                           * If the dnode moves here, we cannot cross this
2216 2259                           * barrier until the move completes.
2217 2260                           */
2218 2261                          DB_DNODE_ENTER(db);
2219 2262  
2220 2263                          dn = DB_DNODE(db);
2221 2264                          atomic_dec_32(&dn->dn_dbufs_count);
2222 2265  
2223 2266                          /*
2224 2267                           * Decrementing the dbuf count means that the bonus
2225 2268                           * buffer's dnode hold is no longer discounted in
2226 2269                           * dnode_move(). The dnode cannot move until after
2227      -                         * the dnode_rele_and_unlock() below.
     2270 +                         * the dnode_rele() below.
2228 2271                           */
2229 2272                          DB_DNODE_EXIT(db);
2230 2273  
2231 2274                          /*
2232 2275                           * Do not reference db after its lock is dropped.
2233 2276                           * Another thread may evict it.
2234 2277                           */
2235 2278                          mutex_exit(&db->db_mtx);
2236 2279  
2237      -                        /*
2238      -                         * If the dnode has been freed, evict the bonus
2239      -                         * buffer immediately.  The data in the bonus
2240      -                         * buffer is no longer relevant and this prevents
2241      -                         * a stale bonus buffer from being associated
2242      -                         * with this dnode_t should the dnode_t be reused
2243      -                         * prior to being destroyed.
2244      -                         */
2245      -                        mutex_enter(&dn->dn_mtx);
2246      -                        if (dn->dn_type == DMU_OT_NONE ||
2247      -                            dn->dn_free_txg != 0) {
2248      -                                /*
2249      -                                 * Drop dn_mtx.  It is a leaf lock and
2250      -                                 * cannot be held when dnode_evict_bonus()
2251      -                                 * acquires other locks in order to
2252      -                                 * perform the eviction.
2253      -                                 *
2254      -                                 * Freed dnodes cannot be reused until the
2255      -                                 * last hold is released.  Since this bonus
2256      -                                 * buffer has a hold, the dnode will remain
2257      -                                 * in the free state, even without dn_mtx
2258      -                                 * held, until the dnode_rele_and_unlock()
2259      -                                 * below.
2260      -                                 */
2261      -                                mutex_exit(&dn->dn_mtx);
     2280 +                        if (evict_dbuf)
2262 2281                                  dnode_evict_bonus(dn);
2263      -                                mutex_enter(&dn->dn_mtx);
2264      -                        }
2265      -                        dnode_rele_and_unlock(dn, db);
     2282 +
     2283 +                        dnode_rele(dn, db);
2266 2284                  } else if (db->db_buf == NULL) {
2267 2285                          /*
2268 2286                           * This is a special case: we never associated this
2269 2287                           * dbuf with any data allocated from the ARC.
2270 2288                           */
2271 2289                          ASSERT(db->db_state == DB_UNCACHED ||
2272 2290                              db->db_state == DB_NOFILL);
2273 2291                          dbuf_evict(db);
2274 2292                  } else if (arc_released(db->db_buf)) {
2275 2293                          arc_buf_t *buf = db->db_buf;
↓ open down ↓ 26 lines elided ↑ open up ↑
2302 2320                                      !BP_IS_HOLE(db->db_blkptr) &&
2303 2321                                      !BP_IS_EMBEDDED(db->db_blkptr)) {
2304 2322                                          spa_t *spa =
2305 2323                                              dmu_objset_spa(db->db_objset);
2306 2324                                          blkptr_t bp = *db->db_blkptr;
2307 2325                                          dbuf_clear(db);
2308 2326                                          arc_freed(spa, &bp);
2309 2327                                  } else {
2310 2328                                          dbuf_clear(db);
2311 2329                                  }
2312      -                        } else if (db->db_objset->os_evicting ||
     2330 +                        } else if (db->db_pending_evict ||
2313 2331                              arc_buf_eviction_needed(db->db_buf)) {
2314 2332                                  dbuf_clear(db);
2315 2333                          } else {
2316 2334                                  mutex_exit(&db->db_mtx);
2317 2335                          }
2318 2336                  }
2319 2337          } else {
2320 2338                  mutex_exit(&db->db_mtx);
2321 2339          }
2322 2340  }
↓ open down ↓ 27 lines elided ↑ open up ↑
2350 2368  dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2351 2369  {
2352 2370          return (dmu_buf_replace_user(db_fake, NULL, user));
2353 2371  }
2354 2372  
2355 2373  void *
2356 2374  dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2357 2375  {
2358 2376          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2359 2377  
2360      -        db->db_immediate_evict = TRUE;
     2378 +        db->db_user_immediate_evict = TRUE;
2361 2379          return (dmu_buf_set_user(db_fake, user));
2362 2380  }
2363 2381  
2364 2382  void *
2365 2383  dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2366 2384  {
2367 2385          return (dmu_buf_replace_user(db_fake, user, NULL));
2368 2386  }
2369 2387  
2370 2388  void *
↓ open down ↓ 647 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX