Print this page
usr/src/uts/common/fs/zfs/autosnap.c

@@ -18,13 +18,14 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017 Datto Inc.
  */

@@ -33,11 +34,10 @@
 #define _SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/nvpair.h>
-#include <sys/sysevent.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 

@@ -59,10 +59,12 @@
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;
 struct dsl_dataset;
 
+struct zfs_autosnap;
+
 /*
  * General-purpose 32-bit and 64-bit bitfield encodings.
  */
 #define BF32_DECODE(x, low, len)        P2PHASE((x) >> (low), 1U << (len))
 #define BF64_DECODE(x, low, len)        P2PHASE((x) >> (low), 1ULL << (len))

@@ -178,11 +180,11 @@
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  * 4    |               vdev3           | GRID  |         ASIZE         |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  * 5    |G|                      offset3                                |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6    |BDX|lvl| type  | cksum |E| comp|    PSIZE      |     LSIZE     |
+ * 6    |BDS|lvl| type  | cksum |E| comp|    PSIZE      |     LSIZE     |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  * 7    |                       padding                                 |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  * 8    |                       padding                                 |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+

@@ -212,18 +214,15 @@
  * cksum        checksum function
  * comp         compression function
  * G            gang block indicator
  * B            byteorder (endianness)
  * D            dedup
- * X            encryption (on version 30, which is not supported)
+ * S            special WBC block (unused for embedded blocks)
  * E            blkptr_t contains embedded data (see below)
  * lvl          level of indirection
  * type         DMU object type
- * phys birth   txg when dva[0] was written; zero if same as logical birth txg
- *              note that typically all the dva's would be written in this
- *              txg, but they could be different if they were moved by
- *              device removal.
+ * phys birth   txg of block allocation; zero if same as logical birth txg
  * log. birth   transaction group in which the block was logically born
  * fill count   number of non-zero blocks under this bp
  * checksum[4]  256-bit checksum of the data this bp describes
  */
 

@@ -241,11 +240,11 @@
  * 2    |      payload                                                  |
  * 3    |      payload                                                  |
  * 4    |      payload                                                  |
  * 5    |      payload                                                  |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6    |BDX|lvl| type  | etype |E| comp| PSIZE|              LSIZE     |
+ * 6    |BDS|lvl| type  | etype |E| comp| PSIZE|              LSIZE     |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  * 7    |      payload                                                  |
  * 8    |      payload                                                  |
  * 9    |      payload                                                  |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+

@@ -261,11 +260,11 @@
  * Legend:
  *
  * payload              contains the embedded data
  * B (byteorder)        byteorder (endianness)
  * D (dedup)            padding (set to zero)
- * X                    encryption (set to zero; see above)
+ * S                    special WBC block (unused for embedded blocks)
  * E (embedded)         set to one
  * lvl                  indirection level
  * type                 DMU object type
  * etype                how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp                 compression function of payload

@@ -320,11 +319,10 @@
 #define BPE_IS_PAYLOADWORD(bp, wp) \
         ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
 
 #define SPA_BLKPTRSHIFT 7               /* blkptr_t is 128 bytes        */
 #define SPA_DVAS_PER_BP 3               /* Number of DVAs in a bp       */
-#define SPA_SYNC_MIN_VDEVS 3            /* min vdevs to update during sync */
 
 /*
  * A block is a hole when it has either 1) never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the

@@ -405,10 +403,13 @@
 #define BP_SET_TYPE(bp, x)              BF64_SET((bp)->blk_prop, 48, 8, x)
 
 #define BP_GET_LEVEL(bp)                BF64_GET((bp)->blk_prop, 56, 5)
 #define BP_SET_LEVEL(bp, x)             BF64_SET((bp)->blk_prop, 56, 5, x)
 
+#define BP_IS_SPECIAL(bp)               BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_SPECIAL(bp, x)   BF64_SET((bp)->blk_prop, 61, 1, x)
+
 #define BP_GET_DEDUP(bp)                BF64_GET((bp)->blk_prop, 62, 1)
 #define BP_SET_DEDUP(bp, x)             BF64_SET((bp)->blk_prop, 62, 1, x)
 
 #define BP_GET_BYTEORDER(bp)            BF64_GET((bp)->blk_prop, 63, 1)
 #define BP_SET_BYTEORDER(bp, x)         BF64_SET((bp)->blk_prop, 63, 1, x)

@@ -465,23 +466,25 @@
         (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
         ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
         ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
         ((zc1).zc_word[3] - (zc2).zc_word[3])))
 
+#define ZIO_CHECKSUM_BSWAP(_zc) \
+        do { /* temp is _zcp, not zc: must not capture a caller's "zc" */ \
+                zio_cksum_t *_zcp = (_zc); \
+                _zcp->zc_word[0] = BSWAP_64(_zcp->zc_word[0]); \
+                _zcp->zc_word[1] = BSWAP_64(_zcp->zc_word[1]); \
+                _zcp->zc_word[2] = BSWAP_64(_zcp->zc_word[2]); \
+                _zcp->zc_word[3] = BSWAP_64(_zcp->zc_word[3]); \
+                _NOTE(NOTREACHED) \
+                _NOTE(CONSTCOND) \
+        } while (0)
+
 #define ZIO_CHECKSUM_IS_ZERO(zc) \
         (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
         (zc)->zc_word[2] | (zc)->zc_word[3]))
 
-#define ZIO_CHECKSUM_BSWAP(zcp)                                 \
-{                                                               \
-        (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);        \
-        (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);        \
-        (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);        \
-        (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);        \
-}
-
-
 #define DVA_IS_VALID(dva)       (DVA_GET_ASIZE(dva) != 0)
 
 #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)   \
 {                                               \
         (zcp)->zc_word[0] = w0;                 \

@@ -604,32 +607,56 @@
         }                                                               \
         ASSERT(len < size);                                             \
 }
 
 #define BP_GET_BUFC_TYPE(bp)                                            \
-        (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+        ((BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP || \
+        BP_GET_TYPE(bp) == DMU_OT_DDT_STATS) ? ARC_BUFC_DDT : \
+        (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA))
 
 typedef enum spa_import_type {
         SPA_IMPORT_EXISTING,
         SPA_IMPORT_ASSEMBLE
 } spa_import_type_t;
 
+/*
+ * Should we force sending TRIM commands even to devices which evidently
+ * don't support them?
+ *      OFF: no, only send to devices which indicated support
+ *      ON: yes, force send to all devices
+ */
+typedef enum {
+        SPA_FORCE_TRIM_OFF = 0, /* default */
+        SPA_FORCE_TRIM_ON
+} spa_force_trim_t;
+
+/*
+ * Should we send TRIM commands in-line during normal pool operation while
+ * deleting data?
+ *      OFF: no
+ *      ON: yes
+ */
+typedef enum {
+        SPA_AUTO_TRIM_OFF = 0,  /* default */
+        SPA_AUTO_TRIM_ON
+} spa_auto_trim_t;
+
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
-extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
-    size_t buflen);
+extern int spa_get_stats(const char *name, nvlist_t **config,
+    char *altroot, size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     nvlist_t *zplprops);
 extern int spa_import_rootpool(char *devpath, char *devid);
 extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
-    boolean_t hardforce);
+    boolean_t hardforce, boolean_t saveconfig);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);

@@ -644,10 +671,12 @@
 #define SPA_ASYNC_RESILVER_DONE 0x08
 #define SPA_ASYNC_RESILVER      0x10
 #define SPA_ASYNC_AUTOEXPAND    0x20
 #define SPA_ASYNC_REMOVE_DONE   0x40
 #define SPA_ASYNC_REMOVE_STOP   0x80
+#define SPA_ASYNC_L2CACHE_REBUILD               0x100
+#define SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY        0x200
 
 /*
  * Controls the behavior of spa_vdev_remove().
  */
 #define SPA_REMOVE_UNSPARE      0x01

@@ -659,15 +688,24 @@
     int replacing);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_setl2adddt(spa_t *spa, uint64_t guid, const char *newval);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp);
 
+extern int spa_load_vdev_props(spa_t *spa);
+
+extern int spa_vdev_prop_validate(spa_t *spa, uint64_t vdev_guid,
+    nvlist_t *nvp);
+extern int spa_vdev_prop_set(spa_t *spa, uint64_t vdev_guid, nvlist_t *nvp);
+extern int spa_vdev_prop_get(spa_t *spa, uint64_t vdev_guid, nvlist_t **nvp);
+extern int spa_vdev_props_sync_task_do(spa_t *spa);
+
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);

@@ -682,10 +720,17 @@
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
 
+/* trimming */
+extern void spa_man_trim(spa_t *spa, uint64_t rate);
+extern void spa_man_trim_stop(spa_t *spa);
+extern void spa_get_trim_prog(spa_t *spa, uint64_t *prog, uint64_t *rate,
+    uint64_t *start_time, uint64_t *stop_time);
+extern void spa_trim_stop_wait(spa_t *spa);
+
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 /* spa namespace global mutex */

@@ -696,11 +741,11 @@
  */
 
 #define SPA_CONFIG_UPDATE_POOL  0
 #define SPA_CONFIG_UPDATE_VDEVS 1
 
-extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
+extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern nvlist_t *spa_all_configs(uint64_t *);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);

@@ -708,10 +753,15 @@
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
+/* dedup ceiling helper functions */
+extern uint64_t spa_get_ddts_size(spa_t *spa, boolean_t phys);
+extern int spa_get_l2arc_ddt_utilization(spa_t *spa);
+extern boolean_t spa_enable_dedup_cap(spa_t *spa);
+
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);

@@ -759,20 +809,19 @@
         SPA_LOG_GOOD,           /* log(s) are good */
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
-extern int spa_reset_logs(spa_t *spa);
+extern int spa_offline_log(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
-extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);

@@ -781,43 +830,48 @@
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
+extern int spa_get_obj_mtx_sz(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
+extern void spa_update_latency(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
+extern uint64_t spa_class_alloc_percentage(metaslab_class_t *mc);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
+extern spa_force_trim_t spa_get_force_trim(spa_t *spa);
+extern spa_auto_trim_t spa_get_auto_trim(spa_t *spa);
 
 /* Miscellaneous support routines */
-extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
-extern void spa_load_note(spa_t *spa, const char *fmt, ...);
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern int spa_rename(const char *oldname, const char *newname);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern boolean_t spa_config_guid_exists(uint64_t pool_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);

@@ -833,26 +887,28 @@
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern boolean_t spa_has_special(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
-extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
-    const blkptr_t *bp);
-typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
-    void *arg);
-extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
-    spa_remap_cb_t callback, void *arg);
-extern uint64_t spa_get_last_removal_txg(spa_t *spa);
-extern boolean_t spa_trust_config(spa_t *spa);
-extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
-extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern boolean_t spa_wbc_present(spa_t *spa);
+extern boolean_t spa_wbc_active(spa_t *spa);
+extern struct zfs_autosnap *spa_get_autosnap(spa_t *spa);
+extern void wbc_purge_window(spa_t *spa, dmu_tx_t *tx);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
+/* Selector for dynamic I/O balancing between special and regular vdevs */
+extern boolean_t spa_use_special_class(spa_t *spa);
+
+/* Pool perfmon thread management */
+extern void spa_start_perfmon_thread(spa_t *spa);
+extern boolean_t spa_stop_perfmon_thread(spa_t *spa);
+
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);

@@ -897,15 +953,24 @@
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
     const char *name);
-extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl,
-    const char *name);
-extern void spa_event_post(sysevent_t *ev);
-extern void spa_event_discard(sysevent_t *ev);
 
+extern int spa_wbc_mode(const char *name);
+
+typedef enum spa_wbc_mode {
+        WBC_MODE_OFF,
+        WBC_MODE_ACTIVE,
+        WBC_MODE_PASSIVE
+} spa_wbc_mode_t;
+
+/* TRIM/UNMAP kstat update */
+extern void spa_trimstats_update(spa_t *spa, uint64_t extents, uint64_t bytes,
+    uint64_t extents_skipped, uint64_t bytes_skipped);
+extern void spa_trimstats_auto_slow_incr(spa_t *spa);
+
 #ifdef ZFS_DEBUG
 #define dprintf_bp(bp, fmt, ...) do {                           \
         if (zfs_flags & ZFS_DEBUG_DPRINTF) {                    \
         char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);  \
         snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));        \