Print this page
NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
 Conflicts:
  usr/src/uts/common/fs/zfs/dbuf.c
  usr/src/uts/common/fs/zfs/dmu.c
  usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3165 segregate ddt in arc
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Make special vdev subtree topology the same as regular vdev subtree to simplify testcase setup
Fixup merge issues
Issue #7: add cacheability to the properties
          Contributors: Boris Protopopov
DDT is placed either into special or to L2ARC but not in both
Support for secondarycache=data option
Align mutex tables in arc.c and dbuf.c to 64 bytes (cache line), place each kmutex_t on cache line by itself to avoid false sharing
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties
        
*** 21,30 ****
--- 21,31 ----
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
   */
  
  #ifndef _SYS_DBUF_H
  #define _SYS_DBUF_H
  
*** 53,62 ****
--- 54,65 ----
  #define DB_RF_HAVESTRUCT        (1 << 2)
  #define DB_RF_NOPREFETCH        (1 << 3)
  #define DB_RF_NEVERWAIT         (1 << 4)
  #define DB_RF_CACHED            (1 << 5)
  
+ #define DBUF_EVICT_ALL          -1
+ 
  /*
   * The simplified state transition diagram for dbufs looks like:
   *
   *              +----> READ ----+
   *              |               |
*** 81,90 ****
--- 84,100 ----
          DB_READ,
          DB_CACHED,
          DB_EVICTING
  } dbuf_states_t;
  
+ typedef enum dbuf_cached_state {
+         DB_NO_CACHE = -1,
+         DB_DBUF_CACHE,
+         DB_DBUF_METADATA_CACHE,
+         DB_CACHE_MAX
+ } dbuf_cached_state_t;
+ 
  struct dnode;
  struct dmu_tx;
  
  /*
   * level = 0 means the user data
*** 123,132 ****
--- 133,145 ----
          unsigned int dr_accounted;
  
          /* A copy of the bp that points to us */
          blkptr_t dr_bp_copy;
  
+         /* use special class of dirty entry */
+         boolean_t dr_usesc;
+ 
          union dirty_types {
                  struct dirty_indirect {
  
                          /* protect access to list */
                          kmutex_t dr_mtx;
*** 227,241 ****
           * Our link on the owner dnode's dn_dbufs list.
           * Protected by its dn_dbufs_mtx.
           */
          avl_node_t db_link;
  
!         /*
!          * Link in dbuf_cache.
!          */
          multilist_node_t db_cache_link;
  
          /* Data which is unique to data (leaf) blocks: */
  
          /* User callback information. */
          dmu_buf_user_t *db_user;
  
--- 240,255 ----
           * Our link on the owner dnode's dn_dbufs list.
           * Protected by its dn_dbufs_mtx.
           */
          avl_node_t db_link;
  
!         /* Link in dbuf_cache or dbuf_metadata_cache */
          multilist_node_t db_cache_link;
  
+         /* Tells us which dbuf cache this dbuf is in, if any */
+         dbuf_cached_state_t db_caching_status;
+ 
          /* Data which is unique to data (leaf) blocks: */
  
          /* User callback information. */
          dmu_buf_user_t *db_user;
  
*** 261,275 ****
          uint8_t db_dirtycnt;
  } dmu_buf_impl_t;
  
  /* Note: the dbuf hash table is exposed only for the mdb module */
  #define DBUF_MUTEXES 256
! #define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
  typedef struct dbuf_hash_table {
          uint64_t hash_table_mask;
          dmu_buf_impl_t **hash_table;
!         kmutex_t hash_mutexes[DBUF_MUTEXES];
  } dbuf_hash_table_t;
  
  uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
  
  dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
--- 275,297 ----
          uint8_t db_dirtycnt;
  } dmu_buf_impl_t;
  
  /* Note: the dbuf hash table is exposed only for the mdb module */
  #define DBUF_MUTEXES    256
! #define DBUF_LOCK_PAD   64
! typedef struct {
!         kmutex_t mtx;
! #ifdef _KERNEL
!         unsigned char pad[(DBUF_LOCK_PAD - sizeof (kmutex_t))];
! #endif
! } dbuf_mutex_t;
! #define DBUF_HASH_MUTEX(h, idx) \
!         (&((h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)].mtx))
  typedef struct dbuf_hash_table {
          uint64_t hash_table_mask;
          dmu_buf_impl_t **hash_table;
!         dbuf_mutex_t hash_mutexes[DBUF_MUTEXES];
  } dbuf_hash_table_t;
  
  uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
  
  dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
*** 304,313 ****
--- 326,337 ----
  void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
  void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
  void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
  void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
  dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+ dbuf_dirty_record_t *dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx,
+     boolean_t usesc);
  arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
  void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
      bp_embedded_type_t etype, enum zio_compress comp,
      int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
  
*** 316,327 ****
  void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
  void dbuf_unoverride(dbuf_dirty_record_t *dr);
  void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
  void dbuf_release_bp(dmu_buf_impl_t *db);
  
- boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
- 
  void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
      struct dmu_tx *);
  
  void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
  
--- 340,349 ----
*** 333,355 ****
  
  void dbuf_init(void);
  void dbuf_fini(void);
  
  boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
  
  #define DBUF_GET_BUFC_TYPE(_db) \
!         (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
  
  #define DBUF_IS_CACHEABLE(_db)                                          \
          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
          (dbuf_is_metadata(_db) &&                                       \
          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
  
  #define DBUF_IS_L2CACHEABLE(_db)                                        \
!         ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
!         (dbuf_is_metadata(_db) &&                                       \
!         ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
  
  #define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level)                         \
          ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
          (((_level) > 0 ||                                               \
          DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) &&    \
--- 355,393 ----
  
  void dbuf_init(void);
  void dbuf_fini(void);
  
  boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+ boolean_t dbuf_is_ddt(dmu_buf_impl_t *db);
+ boolean_t dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db);
+ boolean_t dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db);
  
  #define DBUF_GET_BUFC_TYPE(_db) \
!         (dbuf_is_ddt(_db) ? ARC_BUFC_DDT :\
!         (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA))
  
  #define DBUF_IS_CACHEABLE(_db)                                          \
          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
          (dbuf_is_metadata(_db) &&                                       \
          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
  
+ /*
+  * Checks whether a dbuf should be cached in L2ARC.
+  * Metadata is L2-cacheable if it is not placed on a special device,
+  * or if it is placed on a special device in "dual" mode. The DDT must
+  * be checked in both ZFS_CACHE_ALL and ZFS_CACHE_METADATA modes because
+  * it lives in the MOS. ZFS_CACHE_DATA mode actually means caching both
+  * data and cacheable metadata.
+  */
  #define DBUF_IS_L2CACHEABLE(_db)                                        \
!         (((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL &&      \
!         (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) ||                    \
!         ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA &&  \
!         (dbuf_is_metadata(_db)) &&                                      \
!         (dbuf_ddt_is_l2cacheable(_db) == B_TRUE)) ||                    \
!         ((dbuf_meta_is_l2cacheable(_db) == B_TRUE) &&                   \
!         ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_DATA)))
  
  #define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level)                         \
          ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
          (((_level) > 0 ||                                               \
          DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) &&    \