1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
14 */
15
16 #ifndef _SYS_WRCACHE_H
17 #define _SYS_WRCACHE_H
18
19 #include <sys/zfs_context.h>
20 #include <sys/sysmacros.h>
21 #include <sys/types.h>
22 #include <sys/fs/zfs.h>
23 #include <sys/spa.h>
24 #include <sys/dmu.h>
25 #include <sys/dmu_traverse.h>
26 #include <sys/dsl_dataset.h>
27 #include <sys/dsl_pool.h>
28
29 #ifdef __cplusplus
30 extern "C" {
31 #endif
32
/*
 * Layout of the 'blk_prop' field of wbc_block_t
 *
 *  63                   40  39  38    32 31            16 15             0
 * +-----------------------+----+--------+----------------+----------------+
 * |       RESERVED        |D(1)|comp(7) |   PSIZE(16)    |   LSIZE(16)    |
 * +-----------------------+----+--------+----------------+----------------+
 *
 * Legend:
 * D		deleted
 * comp		compression function of payload
 * PSIZE	size of payload after compression, in bytes
 * LSIZE	logical size of payload, in bytes
 *
 * The two size fields are read and written through BF64_GET_SB() and
 * BF64_SET_SB() with a shift of SPA_MINBLOCKSHIFT and a bias of 1.
 *
 * A deleted block is a block whose DVA was freed, so it must not be
 * used by wbc_move_logic; once the move has finished, only the block
 * structure itself needs to be freed.
 *
 * Every macro takes a pointer to a structure that has a 'blk_prop'
 * member. Value arguments are parenthesized before being forwarded, so
 * that an expression can be passed safely.
 */

/* LSIZE: bits 0..15 */
#define	WBCBP_GET_LSIZE(wbcbp)	\
	BF64_GET_SB((wbcbp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	WBCBP_SET_LSIZE(wbcbp, x)	do { \
	BF64_SET_SB((wbcbp)->blk_prop, \
	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, (x)); \
_NOTE(CONSTCOND) } while (0)

/* PSIZE: bits 16..31 */
#define	WBCBP_GET_PSIZE(wbcbp)	\
	BF64_GET_SB((wbcbp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	WBCBP_SET_PSIZE(wbcbp, x)	do { \
	BF64_SET_SB((wbcbp)->blk_prop, \
	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, (x)); \
_NOTE(CONSTCOND) } while (0)

/* comp: bits 32..38 */
#define	WBCBP_GET_COMPRESS(wbcbp)	BF64_GET((wbcbp)->blk_prop, 32, 7)
#define	WBCBP_SET_COMPRESS(wbcbp, x)	\
	BF64_SET((wbcbp)->blk_prop, 32, 7, (x))

/* D: bit 39; there is a setter only, no macro clears the flag */
#define	WBCBP_IS_DELETED(wbcbp)		BF64_GET((wbcbp)->blk_prop, 39, 1)
#define	WBCBP_MARK_DELETED(wbcbp)	BF64_SET((wbcbp)->blk_prop, 39, 1, 1)
70
71 typedef struct wbc_data wbc_data_t;
72
73 /*
74 * WBC Instance is a dataset (DS) and
75 * all the children DSs of that DS.
76 */
77 typedef struct wbc_instance {
78 avl_node_t node;
79
80 wbc_data_t *wbc_data;
81 void *wbc_autosnap_hdl;
82 char ds_name[ZFS_MAX_DATASET_NAME_LEN];
83
84 /* copy of dsl_dataset_t->ds_object */
85 uint64_t ds_object;
86
87 /*
88 * TXG of the right boundary WBC-window
89 * can be 0 if this instance is 'idle'
90 */
91 uint64_t txg_to_rele;
92
93 /*
94 * txg of the specific TXG sync that
95 * executed 'off' on this instance
96 */
97 uint64_t txg_off;
98
99 boolean_t fini_migration;
100 boolean_t fini_done;
101 } wbc_instance_t;
102
103 /*
104 * WBC statistics
105 */
106 typedef struct wbc_stat {
107 uint64_t wbc_spa_util; /* spa average utilization */
108 clock_t wbc_stat_lbolt; /* last statistics update */
109 boolean_t wbc_stat_update; /* statstics update flag */
110 } wbc_stat_t;
111
112 /*
113 * wbc_data is a global per ZFS pool structure contains all
114 * information associated with write cache and
115 * is attached to spa structure.
116 */
117 struct wbc_data {
118 kthread_t *wbc_init_thread;
119 kthread_t *wbc_thread; /* move thread */
120 kthread_t *wbc_walk_thread; /* collector thread */
121
122 kmutex_t wbc_lock;
123 kcondvar_t wbc_cv;
124
125 /* TASKQ that does async finalization of wbc_instances */
126 taskq_t *wbc_instance_fini;
127
128 uint64_t wbc_instance_fini_cnt;
129
130 avl_tree_t wbc_blocks; /* collected blocks */
131 avl_tree_t wbc_moved_blocks; /* moved blocks */
132
133 uint64_t wbc_window_bytes; /* bytes in current wnd */
134 uint64_t wbc_altered_bytes; /* bytes altered in new wnd */
135 uint64_t wbc_roll_threshold; /* max percent can be altered */
136 uint64_t wbc_altered_limit; /* max bytes can be altered */
137
138 uint64_t wbc_blocks_count; /* amount of blocks */
139
140 taskq_t *wbc_move_taskq; /* pending blocks taskq */
141 uint64_t wbc_move_threads; /* taskq number of threads */
142
143 uint64_t wbc_start_txg; /* left boundary */
144 uint64_t wbc_finish_txg; /* right boundary */
145 uint64_t wbc_txg_to_rele; /* txg to rele */
146 uint64_t wbc_blocks_in; /* collected */
147 uint64_t wbc_blocks_out; /* planned */
148 uint64_t wbc_blocks_mv; /* moved */
149 uint64_t wbc_blocks_mv_last; /* latest number of moved blocks */
150
151 uint64_t wbc_latest_window_time;
152
153 /* Tree of watched datasets and corresponding data */
154 avl_tree_t wbc_instances;
155 boolean_t wbc_ready_to_use;
156
157 spa_t *wbc_spa; /* parent spa */
158 wbc_stat_t wbc_stat; /* WBC statistics */
159
160 uint64_t wbc_fault_moves; /* amount of fault moves */
161
162 boolean_t wbc_purge; /* should purge queued blocks */
163 boolean_t wbc_walk; /* should walk */
164 boolean_t wbc_locked; /* do not walk while locked */
165 boolean_t wbc_walking; /* currently walking */
166 boolean_t wbc_wait_for_window;
167
168 boolean_t wbc_delete; /* delete state */
169
170 boolean_t wbc_thr_exit; /* exit flag */
171 boolean_t wbc_isvalid; /* WBC is inited */
172 boolean_t wbc_first_move; /* TRUE until the 1 WBC-win opened */
173 };
174
175 /* !!! Do not change these constants !!! */
176 #define WBC_SPECIAL_DVA 0
177 #define WBC_NORMAL_DVA 1
178
179 /*
180 * In-core representation of a block which will be moved
181 */
182 typedef struct wbc_block {
183 avl_node_t node;
184
185 /* associated WBC */
186 wbc_data_t *data;
187
188 /*
189 * size, compression
190 */
191 uint64_t blk_prop;
192
193 /* birth txg for arc lookup */
194 uint64_t btxg;
195
196 /* dvas of blocks to move */
197 dva_t dva[2];
198
199 kmutex_t lock;
200 } wbc_block_t;
201
202 typedef struct wbc_parseblock_cb {
203 wbc_data_t *wbc_data;
204
205 /*
206 * A bookmark for resume
207 */
208 zbookmark_phys_t zb;
209
210 /*
211 * Total size of all collected blocks
212 */
213 uint64_t bt_size;
214
215 /*
216 * The time we started traversal process
217 */
218 hrtime_t start_time;
219
220 uint64_t actv_txg;
221 } wbc_parseblock_cb_t;
222
/*
 * This structure describes the value of the ZFS_PROP_WBC_MODE property.
 *
 * All members are uint64_t, which is what allows WBC_MODE_PROP_VAL_SZ
 * to express the size of the structure as a number of uint64_t words.
 * The structure carries a tag, like the other structures in this file,
 * so that it can be forward-declared.
 */
typedef struct wbc_mode_prop_val {
	/*
	 * copy of dsl_dataset_t->ds_object of the dataset
	 * for which the user does wbc_mode=on
	 */
	uint64_t	root_ds_object;

	/*
	 * TXG when the user did 'wbc_mode=off'
	 */
	uint64_t	txg_off;

	/*
	 * Flags. Not used at the moment.
	 */
	uint64_t	flags;
} wbc_mode_prop_val_t;

/* Size of wbc_mode_prop_val_t, in units of uint64_t */
#define	WBC_MODE_PROP_VAL_SZ	\
	(sizeof (wbc_mode_prop_val_t) / sizeof (uint64_t))
245
246 void wbc_init(wbc_data_t *wbc_data, spa_t *spa);
247 void wbc_fini(wbc_data_t *wbc_data);
248
249 void wbc_activate(spa_t *spa, boolean_t pool_creation);
250 void wbc_deactivate(spa_t *spa);
251
252 int wbc_select_dva(wbc_data_t *wbc_data, zio_t *zio);
253 boolean_t wbc_bp_is_migrated(wbc_data_t *wbc_data, const blkptr_t *bp);
254 int wbc_first_valid_dva(const blkptr_t *bp,
255 wbc_data_t *wbc_data, boolean_t removal);
256
257 wbc_data_t *spa_get_wbc_data(spa_t *spa);
258
259 void wbc_add_bytes(spa_t *spa, uint64_t txg, uint64_t bytes);
260
261 /*
262 * write cache thread.
263 */
264 void wbc_start_thread(spa_t *);
265 boolean_t wbc_stop_thread(spa_t *);
266 void wbc_trigger_wbcthread(spa_t *, uint64_t);
267
268 /*
269 * callback function for traverse_dataset which validates
270 * the block pointer and adds to the list.
271 */
272 blkptr_cb_t wbc_traverse_ds_cb;
273
274 boolean_t wbc_check_parseblocks_hold(spa_t *);
275 void wbc_check_parseblocks_rele(spa_t *spa);
276 boolean_t wbc_try_hold(spa_t *);
277 int wbc_walk_lock(spa_t *);
278 void wbc_walk_unlock(spa_t *);
279
280 void wbc_process_objset(wbc_data_t *wbc_data, objset_t *os, boolean_t destroy);
281 void wbc_mode_changed(void *arg, uint64_t newval);
282 int wbc_check_dataset(const char *ds_name);
283
284 boolean_t wbc_try_disable(wbc_data_t *wbc_data);
285
286 #ifdef __cplusplus
287 }
288 #endif
289
290 #endif /* _SYS_WRCACHE_H */