Print this page
NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Conflicts:
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3165 segregate ddt in arc
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Make special vdev subtree topology the same as regular vdev subtree to simplify testcase setup
Fixup merge issues
Issue #7: add cacheability to the properties
Contributors: Boris Protopopov
DDT is placed either into special or to L2ARC but not in both
Support for secondarycache=data option
Align mutex tables in arc.c and dbuf.c to 64 bytes (cache line), place each kmutex_t on cache line by itself to avoid false sharing
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties
@@ -21,10 +21,11 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _SYS_DBUF_H
#define _SYS_DBUF_H
@@ -53,10 +54,12 @@
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
+#define DBUF_EVICT_ALL -1
+
/*
* The simplified state transition diagram for dbufs looks like:
*
* +----> READ ----+
* | |
@@ -81,10 +84,17 @@
DB_READ,
DB_CACHED,
DB_EVICTING
} dbuf_states_t;
+typedef enum dbuf_cached_state {
+ DB_NO_CACHE = -1,
+ DB_DBUF_CACHE,
+ DB_DBUF_METADATA_CACHE,
+ DB_CACHE_MAX
+} dbuf_cached_state_t;
+
struct dnode;
struct dmu_tx;
/*
* level = 0 means the user data
@@ -123,10 +133,13 @@
unsigned int dr_accounted;
/* A copy of the bp that points to us */
blkptr_t dr_bp_copy;
+ /* use special class of dirty entry */
+ boolean_t dr_usesc;
+
union dirty_types {
struct dirty_indirect {
/* protect access to list */
kmutex_t dr_mtx;
@@ -227,15 +240,16 @@
* Our link on the owner dnodes's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
avl_node_t db_link;
- /*
- * Link in dbuf_cache.
- */
+ /* Link in dbuf_cache or dbuf_metadata_cache */
multilist_node_t db_cache_link;
+ /* Tells us which dbuf cache this dbuf is in, if any */
+ dbuf_cached_state_t db_caching_status;
+
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
dmu_buf_user_t *db_user;
@@ -261,15 +275,23 @@
uint8_t db_dirtycnt;
} dmu_buf_impl_t;
/* Note: the dbuf hash table is exposed only for the mdb module */
#define DBUF_MUTEXES 256
-#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+#define DBUF_LOCK_PAD 64
+typedef struct {
+ kmutex_t mtx;
+#ifdef _KERNEL
+ unsigned char pad[(DBUF_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+} dbuf_mutex_t;
+#define DBUF_HASH_MUTEX(h, idx) \
+ (&((h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)].mtx))
typedef struct dbuf_hash_table {
uint64_t hash_table_mask;
dmu_buf_impl_t **hash_table;
- kmutex_t hash_mutexes[DBUF_MUTEXES];
+ dbuf_mutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
@@ -304,10 +326,12 @@
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx,
+ boolean_t usesc);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
@@ -316,12 +340,10 @@
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
-boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
-
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
@@ -333,23 +355,39 @@
void dbuf_init(void);
void dbuf_fini(void);
boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+boolean_t dbuf_is_ddt(dmu_buf_impl_t *db);
+boolean_t dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db);
+boolean_t dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db);
#define DBUF_GET_BUFC_TYPE(_db) \
- (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+ (dbuf_is_ddt(_db) ? ARC_BUFC_DDT :\
+ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA))
#define DBUF_IS_CACHEABLE(_db) \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+/*
+ * Checks whether we need to cache dbuf in l2arc.
+ * Metadata is l2cacheable if it is not placed on special device
+ * or it is placed on special device in "dual" mode. We need to check
+ * for ddt in ZFS_CACHE_ALL and ZFS_CACHE_METADATA because it is in MOS.
+ * ZFS_CACHE_DATA mode actually means to cache both data and cacheable
+ * metadata.
+ */
#define DBUF_IS_L2CACHEABLE(_db) \
- ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (dbuf_is_metadata(_db) && \
- ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+ (((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL && \
+ (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) || \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA && \
+ (dbuf_is_metadata(_db)) && \
+ (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) || \
+ ((dbuf_meta_is_l2cacheable(_db) == B_TRUE) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_DATA)))
#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
(((_level) > 0 || \
DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \