Print this page
NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
 Conflicts:
  usr/src/uts/common/fs/zfs/dbuf.c
  usr/src/uts/common/fs/zfs/dmu.c
  usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3165 segregate ddt in arc
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Make special vdev subtree topology the same as regular vdev subtree to simplify testcase setup
Fixup merge issues
Issue #7: add cacheability to the properties
          Contributors: Boris Protopopov
DDT is placed either on the special vdev or in L2ARC, but not in both
Support for secondarycache=data option
Align mutex tables in arc.c and dbuf.c to 64 bytes (cache line), place each kmutex_t on cache line by itself to avoid false sharing
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

@@ -21,10 +21,11 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
 #ifndef _SYS_DBUF_H
 #define _SYS_DBUF_H
 

@@ -53,10 +54,12 @@
 #define DB_RF_HAVESTRUCT        (1 << 2)
 #define DB_RF_NOPREFETCH        (1 << 3)
 #define DB_RF_NEVERWAIT         (1 << 4)
 #define DB_RF_CACHED            (1 << 5)
 
+#define DBUF_EVICT_ALL          (-1)    /* sentinel (TODO confirm use) */
+
 /*
  * The simplified state transition diagram for dbufs looks like:
  *
  *              +----> READ ----+
  *              |               |

@@ -81,10 +84,17 @@
         DB_READ,
         DB_CACHED,
         DB_EVICTING
 } dbuf_states_t;
 
+typedef enum dbuf_cached_state {        /* type of db_caching_status */
+        DB_NO_CACHE = -1,               /* presumably: in neither cache */
+        DB_DBUF_CACHE,                  /* 0 */
+        DB_DBUF_METADATA_CACHE,         /* 1 */
+        DB_CACHE_MAX                    /* 2: number of cache ids above */
+} dbuf_cached_state_t;
+
 struct dnode;
 struct dmu_tx;
 
 /*
  * level = 0 means the user data

@@ -123,10 +133,13 @@
         unsigned int dr_accounted;
 
         /* A copy of the bp that points to us */
         blkptr_t dr_bp_copy;
 
+        /* B_TRUE if this dirty entry uses the special class */
+        boolean_t dr_usesc;
+
         union dirty_types {
                 struct dirty_indirect {
 
                         /* protect access to list */
                         kmutex_t dr_mtx;

@@ -227,15 +240,16 @@
          * Our link on the owner dnodes's dn_dbufs list.
          * Protected by its dn_dbufs_mtx.
          */
         avl_node_t db_link;
 
-        /*
-         * Link in dbuf_cache.
-         */
+        /* Link in dbuf_cache or dbuf_metadata_cache */
         multilist_node_t db_cache_link;
 
+        /* Tells us which dbuf cache this dbuf is in, if any */
+        dbuf_cached_state_t db_caching_status;
+
         /* Data which is unique to data (leaf) blocks: */
 
         /* User callback information. */
         dmu_buf_user_t *db_user;
 

@@ -261,15 +275,23 @@
         uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
 #define DBUF_MUTEXES 256
-#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+#define DBUF_LOCK_PAD   64      /* bytes per padded lock slot (cache line) */
+typedef struct {
+        kmutex_t mtx;
+#ifdef _KERNEL  /* pad mtx out to DBUF_LOCK_PAD bytes, kernel builds only */
+        unsigned char pad[(DBUF_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+} dbuf_mutex_t; /* pad[] requires sizeof (kmutex_t) < DBUF_LOCK_PAD */
+#define DBUF_HASH_MUTEX(h, idx) /* yields kmutex_t *; idx is masked */ \
+        (&((h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)].mtx))
 typedef struct dbuf_hash_table {
         uint64_t hash_table_mask;
         dmu_buf_impl_t **hash_table;
-        kmutex_t hash_mutexes[DBUF_MUTEXES];
+        dbuf_mutex_t hash_mutexes[DBUF_MUTEXES]; /* use DBUF_HASH_MUTEX() */
 } dbuf_hash_table_t;
 
 uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);

@@ -304,10 +326,12 @@
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx,
+    boolean_t usesc);  /* NOTE(review): usesc presumably feeds dr_usesc */
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 

@@ -316,12 +340,10 @@
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
-boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
-
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 

@@ -333,23 +355,39 @@
 
 void dbuf_init(void);
 void dbuf_fini(void);
 
 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+boolean_t dbuf_is_ddt(dmu_buf_impl_t *db);
+boolean_t dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db);
+boolean_t dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db);
 
 #define DBUF_GET_BUFC_TYPE(_db) \
-        (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+        (dbuf_is_ddt(_db) ? ARC_BUFC_DDT : /* DDT test runs first */ \
+        (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA))
 
 #define DBUF_IS_CACHEABLE(_db)                                          \
         ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
         (dbuf_is_metadata(_db) &&                                       \
         ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
+/*
+ * Decides whether a dbuf should be cached in L2ARC.
+ * Metadata is L2-cacheable if it is not placed on a special device,
+ * or if it is placed on a special device in "dual" mode. The DDT must be
+ * checked for both ZFS_CACHE_ALL and ZFS_CACHE_METADATA because it lives
+ * in the MOS. ZFS_CACHE_DATA actually means "cache both data and
+ * cacheable metadata". Note: _db is evaluated more than once.
+ */
 #define DBUF_IS_L2CACHEABLE(_db)                                        \
-        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
-        (dbuf_is_metadata(_db) &&                                       \
-        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+        (((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL &&      \
+        (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) ||                    \
+        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA &&  \
+        (dbuf_is_metadata(_db)) &&                                      \
+        (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) ||                    \
+        ((dbuf_meta_is_l2cacheable(_db) == B_TRUE) &&                   \
+        ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_DATA)))
 
 #define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level)                         \
         ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
         (((_level) > 0 ||                                               \
         DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) &&    \