1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 #ifndef _SYS_DBUF_H
30 #define _SYS_DBUF_H
31
32 #include <sys/dmu.h>
33 #include <sys/spa.h>
34 #include <sys/txg.h>
35 #include <sys/zio.h>
36 #include <sys/arc.h>
37 #include <sys/zfs_context.h>
38 #include <sys/refcount.h>
39 #include <sys/zrlock.h>
40 #include <sys/multilist.h>
41
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45
/*
 * NOTE(review): nothing else in this header refers to IN_DMU_SYNC, and its
 * value differs from DR_IN_DMU_SYNC in override_states_t.  Presumably kept
 * for consumers outside this file -- confirm before touching it.
 */
#define	IN_DMU_SYNC		2

/*
 * Flag bits for the `flags' argument of dbuf_read().
 * Every flag occupies a distinct bit of the mask.
 */
#define	DB_RF_MUST_SUCCEED	0x01
#define	DB_RF_CANFAIL		0x02
#define	DB_RF_HAVESTRUCT	0x04
#define	DB_RF_NOPREFETCH	0x08
#define	DB_RF_NEVERWAIT		0x10
#define	DB_RF_CACHED		0x20
58
/*
 * Sentinel value; judging by the name it requests eviction of everything,
 * but its consumers live outside this header -- confirm there.
 *
 * The replacement list is parenthesized so the negative literal always
 * expands as a single operand, whatever expression surrounds the macro.
 */
#define	DBUF_EVICT_ALL		(-1)
60
/*
 * The simplified state transition diagram for dbufs looks like:
 *
 *              +----> READ ----+
 *              |               |
 *              |               V
 *  (alloc)-->UNCACHED       CACHED-->EVICTING-->(free)
 *              |               ^        ^
 *              |               |        |
 *              +----> FILL ----+        |
 *              |                        |
 *              |                        |
 *              +--------> NOFILL -------+
 *
 * DB_SEARCH is not a valid state for a dbuf.  dbuf_free_range() uses it
 * to find all dbufs in a range of a dnode, and it must compare less than
 * every other dbuf_states_t value (see the comment on dn_dbufs in dnode.h).
 */
typedef enum dbuf_states {
	DB_SEARCH	= -1,	/* search key only, must stay the smallest */
	DB_UNCACHED	= 0,
	DB_FILL		= 1,
	DB_NOFILL	= 2,
	DB_READ		= 3,
	DB_CACHED	= 4,
	DB_EVICTING	= 5
} dbuf_states_t;
88
/*
 * Values for a dbuf's db_caching_status: which dbuf cache it is on, if any.
 * DB_CACHE_MAX follows the last real cache, so it equals the number of
 * caches enumerated here.
 */
typedef enum dbuf_cached_state {
	DB_NO_CACHE		= -1,
	DB_DBUF_CACHE		= 0,
	DB_DBUF_METADATA_CACHE	= 1,
	DB_CACHE_MAX		= 2
} dbuf_cached_state_t;
95
/*
 * Forward declarations, so that the declarations in this header can use
 * pointers to these structures.
 */
struct dnode;
struct dmu_tx;
struct dmu_buf_impl;

/*
 * Meaning of a dbuf's indirection level:
 *	level = 0 means the user data
 *	level = 1 means the single indirect block
 *	etc.
 */
106
/*
 * Values stored in dr_override_state of a leaf dirty record
 * (struct dirty_leaf), alongside its dr_overridden_by block pointer.
 */
typedef enum override_states {
	DR_NOT_OVERRIDDEN	= 0,
	DR_IN_DMU_SYNC		= 1,
	DR_OVERRIDDEN		= 2
} override_states_t;
112
113 typedef struct dbuf_dirty_record {
114 /* link on our parents dirty list */
115 list_node_t dr_dirty_node;
116
117 /* transaction group this data will sync in */
118 uint64_t dr_txg;
119
120 /* zio of outstanding write IO */
121 zio_t *dr_zio;
122
123 /* pointer back to our dbuf */
124 struct dmu_buf_impl *dr_dbuf;
125
126 /* pointer to next dirty record */
127 struct dbuf_dirty_record *dr_next;
128
129 /* pointer to parent dirty record */
130 struct dbuf_dirty_record *dr_parent;
131
132 /* How much space was changed to dsl_pool_dirty_space() for this? */
133 unsigned int dr_accounted;
134
135 /* A copy of the bp that points to us */
136 blkptr_t dr_bp_copy;
137
138 /* use special class of dirty entry */
139 boolean_t dr_usesc;
140
141 union dirty_types {
142 struct dirty_indirect {
143
144 /* protect access to list */
145 kmutex_t dr_mtx;
146
147 /* Our list of dirty children */
148 list_t dr_children;
149 } di;
150 struct dirty_leaf {
151
152 /*
153 * dr_data is set when we dirty the buffer
154 * so that we can retain the pointer even if it
155 * gets COW'd in a subsequent transaction group.
156 */
157 arc_buf_t *dr_data;
158 blkptr_t dr_overridden_by;
159 override_states_t dr_override_state;
160 uint8_t dr_copies;
161 boolean_t dr_nopwrite;
162 } dl;
163 } dt;
164 } dbuf_dirty_record_t;
165
166 typedef struct dmu_buf_impl {
167 /*
168 * The following members are immutable, with the exception of
169 * db.db_data, which is protected by db_mtx.
170 */
171
172 /* the publicly visible structure */
173 dmu_buf_t db;
174
175 /* the objset we belong to */
176 struct objset *db_objset;
177
178 /*
179 * handle to safely access the dnode we belong to (NULL when evicted)
180 */
181 struct dnode_handle *db_dnode_handle;
182
183 /*
184 * our parent buffer; if the dnode points to us directly,
185 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
186 * only accessed by sync thread ???
187 * (NULL when evicted)
188 * May change from NULL to non-NULL under the protection of db_mtx
189 * (see dbuf_check_blkptr())
190 */
191 struct dmu_buf_impl *db_parent;
192
193 /*
194 * link for hash table of all dmu_buf_impl_t's
195 */
196 struct dmu_buf_impl *db_hash_next;
197
198 /* our block number */
199 uint64_t db_blkid;
200
201 /*
202 * Pointer to the blkptr_t which points to us. May be NULL if we
203 * don't have one yet. (NULL when evicted)
204 */
205 blkptr_t *db_blkptr;
206
207 /*
208 * Our indirection level. Data buffers have db_level==0.
209 * Indirect buffers which point to data buffers have
210 * db_level==1. etc. Buffers which contain dnodes have
211 * db_level==0, since the dnodes are stored in a file.
212 */
213 uint8_t db_level;
214
215 /* db_mtx protects the members below */
216 kmutex_t db_mtx;
217
218 /*
219 * Current state of the buffer
220 */
221 dbuf_states_t db_state;
222
223 /*
224 * Refcount accessed by dmu_buf_{hold,rele}.
225 * If nonzero, the buffer can't be destroyed.
226 * Protected by db_mtx.
227 */
228 refcount_t db_holds;
229
230 /* buffer holding our data */
231 arc_buf_t *db_buf;
232
233 kcondvar_t db_changed;
234 dbuf_dirty_record_t *db_data_pending;
235
236 /* pointer to most recent dirty record for this buffer */
237 dbuf_dirty_record_t *db_last_dirty;
238
239 /*
240 * Our link on the owner dnodes's dn_dbufs list.
241 * Protected by its dn_dbufs_mtx.
242 */
243 avl_node_t db_link;
244
245 /* Link in dbuf_cache or dbuf_metadata_cache */
246 multilist_node_t db_cache_link;
247
248 /* Tells us which dbuf cache this dbuf is in, if any */
249 dbuf_cached_state_t db_caching_status;
250
251 /* Data which is unique to data (leaf) blocks: */
252
253 /* User callback information. */
254 dmu_buf_user_t *db_user;
255
256 /*
257 * Evict user data as soon as the dirty and reference
258 * counts are equal.
259 */
260 uint8_t db_user_immediate_evict;
261
262 /*
263 * This block was freed while a read or write was
264 * active.
265 */
266 uint8_t db_freed_in_flight;
267
268 /*
269 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
270 * evict this dbuf, but couldn't due to outstanding
271 * references. Evict once the refcount drops to 0.
272 */
273 uint8_t db_pending_evict;
274
275 uint8_t db_dirtycnt;
276 } dmu_buf_impl_t;
277
278 /* Note: the dbuf hash table is exposed only for the mdb module */
279 #define DBUF_MUTEXES 256
280 #define DBUF_LOCK_PAD 64
281 typedef struct {
282 kmutex_t mtx;
283 #ifdef _KERNEL
284 unsigned char pad[(DBUF_LOCK_PAD - sizeof (kmutex_t))];
285 #endif
286 } dbuf_mutex_t;
/*
 * Evaluates to the address of the mutex that hash table `h' uses for
 * index `idx'.  The index is reduced with a bit mask, so every slot is
 * reachable only while DBUF_MUTEXES is a power of two.
 */
#define	DBUF_HASH_MUTEX(h, idx) \
	(&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES - 1)].mtx)
289 typedef struct dbuf_hash_table {
290 uint64_t hash_table_mask;
291 dmu_buf_impl_t **hash_table;
292 dbuf_mutex_t hash_mutexes[DBUF_MUTEXES];
293 } dbuf_hash_table_t;
294
295 uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
296
297 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
298 void dbuf_create_bonus(struct dnode *dn);
299 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
300 void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
301
302 void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
303
304 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
305 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
306 void *tag);
307 int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
308 boolean_t fail_sparse, boolean_t fail_uncached,
309 void *tag, dmu_buf_impl_t **dbp);
310
311 void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
312 zio_priority_t prio, arc_flags_t aflags);
313
314 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
315 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
316 uint64_t blkid, void *tag);
317 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
318
319 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
320 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
321
322 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
323 uint64_t blkid);
324
325 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
326 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
327 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
328 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
329 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
330 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
331 dbuf_dirty_record_t *dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx,
332 boolean_t usesc);
333 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
334 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
335 bp_embedded_type_t etype, enum zio_compress comp,
336 int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
337
338 void dbuf_destroy(dmu_buf_impl_t *db);
339
340 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
341 void dbuf_unoverride(dbuf_dirty_record_t *dr);
342 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
343 void dbuf_release_bp(dmu_buf_impl_t *db);
344
345 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
346 struct dmu_tx *);
347
348 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
349
/*
 * Accessors for the dnode behind a dbuf, reached through the dbuf's
 * db_dnode_handle:
 *
 *	DB_DNODE(db)		the handle's dnode (dnh_dnode)
 *	DB_DNODE_LOCK(db)	the handle's zrlock (dnh_zrlock), as an lvalue
 *	DB_DNODE_ENTER(db)	zrl_add() on that lock
 *	DB_DNODE_EXIT(db)	zrl_remove() on that lock
 *	DB_DNODE_HELD(db)	true unless zrl_is_zero() reports the lock zero
 *
 * Each macro dereferences db_dnode_handle, which dmu_buf_impl documents
 * as NULL once the dbuf has been evicted.
 */
#define	DB_DNODE(_db) \
	((_db)->db_dnode_handle->dnh_dnode)
#define	DB_DNODE_LOCK(_db) \
	((_db)->db_dnode_handle->dnh_zrlock)
#define	DB_DNODE_ENTER(_db) \
	(zrl_add(&DB_DNODE_LOCK(_db)))
#define	DB_DNODE_EXIT(_db) \
	(zrl_remove(&DB_DNODE_LOCK(_db)))
#define	DB_DNODE_HELD(_db) \
	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
355
356 void dbuf_init(void);
357 void dbuf_fini(void);
358
359 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
360 boolean_t dbuf_is_ddt(dmu_buf_impl_t *db);
361 boolean_t dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db);
362 boolean_t dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db);
363
/*
 * Selects the ARC buffer content type for a dbuf.  The DDT check comes
 * first, then the metadata check; anything else is ARC_BUFC_DATA.
 * `_db' may be evaluated twice.
 */
#define	DBUF_GET_BUFC_TYPE(_db) \
	(dbuf_is_ddt(_db) ? ARC_BUFC_DDT : \
	    dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : \
	    ARC_BUFC_DATA)
367
/*
 * True when the objset's os_primary_cache setting is ZFS_CACHE_ALL, or
 * when the dbuf is metadata and the setting is ZFS_CACHE_METADATA.
 * dbuf_is_metadata() is only called if the first test fails.
 */
#define	DBUF_IS_CACHEABLE(_db) \
	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
	    (dbuf_is_metadata(_db) && \
	    (_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))
372
/*
 * Decides whether a dbuf should be cached in the L2ARC.
 *
 * Metadata is L2-cacheable if it is not placed on a special device, or if
 * it is placed on a special device in "dual" mode.  The DDT has to be
 * checked in both the ZFS_CACHE_ALL and ZFS_CACHE_METADATA cases because
 * it lives in the MOS.  ZFS_CACHE_DATA mode actually means caching both
 * data and cacheable metadata.
 *
 * The three alternatives are tried in the order written, so later helper
 * calls are skipped once an earlier alternative succeeds.
 */
#define	DBUF_IS_L2CACHEABLE(_db) \
	(((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL && \
	    dbuf_ddt_is_l2cacheable(_db) == B_TRUE) || \
	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA && \
	    dbuf_is_metadata(_db) && \
	    dbuf_ddt_is_l2cacheable(_db) == B_TRUE) || \
	(dbuf_meta_is_l2cacheable(_db) == B_TRUE && \
	    (_db)->db_objset->os_secondary_cache == ZFS_CACHE_DATA))
389
/*
 * L2ARC eligibility for a block of dnode `_dn' at indirection level
 * `_level': true when the objset's os_secondary_cache is ZFS_CACHE_ALL,
 * or when it is ZFS_CACHE_METADATA and the block is either above level 0
 * or belongs to a dnode whose type satisfies DMU_OT_IS_METADATA().
 *
 * NOTE(review): the type is read through (_dn)->dn_handle->dnh_dnode
 * rather than from (_dn) directly; presumably both name the same dnode --
 * confirm.
 */
#define	DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
	    (((_level) > 0 || \
	    DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
	    (_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))
395
#ifdef ZFS_DEBUG

/*
 * dprintf_dbuf(dbuf, fmt, ...)
 *
 * When ZFS_DEBUG_DPRINTF is set in zfs_flags, forwards the message to
 * dprintf_ds() for the dbuf's dataset, prefixed with the dbuf's object
 * ("mdn" for DMU_META_DNODE_OBJECT, the object number otherwise), level
 * and block id.  `dbuf' is evaluated several times.  Because __VA_ARGS__
 * follows a comma, callers must supply at least one variadic argument.
 *
 * There should be a ## between the string literal and fmt, to make it
 * clear that we're joining two strings together, but gcc does not
 * support that preprocessor token.
 */
#define	dprintf_dbuf(dbuf, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
		char __db_buf[32]; \
		uint64_t __db_obj = (dbuf)->db.db_object; \
		if (__db_obj != DMU_META_DNODE_OBJECT) \
			(void) snprintf(__db_buf, sizeof (__db_buf), \
			    "%lld", (u_longlong_t)__db_obj); \
		else \
			(void) strcpy(__db_buf, "mdn"); \
		dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
		    "obj=%s lvl=%u blkid=%lld " fmt, \
		    __db_buf, (dbuf)->db_level, \
		    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
	} \
_NOTE(CONSTCOND) } while (0)

/*
 * dprintf_dbuf_bp(db, bp, fmt, ...)
 *
 * Same as dprintf_dbuf(), with the block pointer `bp' rendered by
 * snprintf_blkptr() and appended to the message.  The BP_SPRINTF_LEN
 * scratch buffer is allocated with KM_SLEEP and freed before the macro
 * finishes.
 */
#define	dprintf_dbuf_bp(db, bp, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
		char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
		snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
		dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
		kmem_free(__blkbuf, BP_SPRINTF_LEN); \
	} \
_NOTE(CONSTCOND) } while (0)

/* dbuf_verify() is not declared in this header. */
#define	DBUF_VERIFY(db)	dbuf_verify(db)

#else	/* !ZFS_DEBUG */

/* Without ZFS_DEBUG all three expand to nothing; arguments are dropped. */
#define	dprintf_dbuf(db, fmt, ...)
#define	dprintf_dbuf_bp(db, bp, fmt, ...)
#define	DBUF_VERIFY(db)

#endif	/* ZFS_DEBUG */
437
438
439 #ifdef __cplusplus
440 }
441 #endif
442
443 #endif /* _SYS_DBUF_H */