6281 prefetching should apply to 1MB reads
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Alexander Motin <mav@freebsd.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
--- old/usr/src/uts/common/fs/zfs/dmu_zfetch.c
+++ new/usr/src/uts/common/fs/zfs/dmu_zfetch.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
28 28 */
29 29
30 30 #include <sys/zfs_context.h>
31 31 #include <sys/dnode.h>
32 32 #include <sys/dmu_objset.h>
33 33 #include <sys/dmu_zfetch.h>
34 34 #include <sys/dmu.h>
35 35 #include <sys/dbuf.h>
36 36 #include <sys/kstat.h>
37 37
38 38 /*
39 39 * This tunable disables predictive prefetch. Note that it leaves "prescient"
40 40 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
41 41 * prescient prefetch never issues i/os that end up not being needed,
42 42 * so it can't hurt performance.
43 43 */
44 44 boolean_t zfs_prefetch_disable = B_FALSE;
45 45
46 46 /* max # of streams per zfetch */
47 47 uint32_t zfetch_max_streams = 8;
48 48 /* min time before stream reclaim */
49 49 uint32_t zfetch_min_sec_reap = 2;
50 50 /* max bytes to prefetch per stream (default 8MB) */
51 51 uint32_t zfetch_max_distance = 8 * 1024 * 1024;
52 52 /* max bytes to prefetch indirects for per stream (default 64MB) */
53 53 uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
54 54 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
55 55 uint64_t zfetch_array_rd_sz = 1024 * 1024;
56 56
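[Editor's note, for scale: with the defaults above and 128 KB data blocks, a stream prefetches at most 8 MB / 128 KB = 64 data blocks ahead, and indirect prefetch reaches 64 MB / 128 KB = 512 data blocks ahead of the reader. On illumos these globals can be tuned at boot, e.g. via /etc/system: set zfs:zfetch_max_distance = 0x1000000 (16 MB).]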
57 57 typedef struct zfetch_stats {
58 58 kstat_named_t zfetchstat_hits;
59 59 kstat_named_t zfetchstat_misses;
60 60 kstat_named_t zfetchstat_max_streams;
61 61 } zfetch_stats_t;
62 62
63 63 static zfetch_stats_t zfetch_stats = {
64 64 { "hits", KSTAT_DATA_UINT64 },
65 65 { "misses", KSTAT_DATA_UINT64 },
66 66 { "max_streams", KSTAT_DATA_UINT64 },
67 67 };
68 68
69 69 #define ZFETCHSTAT_BUMP(stat) \
70 70 atomic_inc_64(&zfetch_stats.stat.value.ui64);
71 71
72 72 kstat_t *zfetch_ksp;
73 73
74 74 void
75 75 zfetch_init(void)
76 76 {
77 77 zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
78 78 KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
79 79 KSTAT_FLAG_VIRTUAL);
80 80
81 81 if (zfetch_ksp != NULL) {
82 82 zfetch_ksp->ks_data = &zfetch_stats;
83 83 kstat_install(zfetch_ksp);
84 84 }
85 85 }
86 86
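[Editor's note: the counters installed here are visible from user space as the kstat zfs:0:zfetchstats; for example, "kstat -p zfs:0:zfetchstats" prints the hits, misses, and max_streams values that ZFETCHSTAT_BUMP() maintains.]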
87 87 void
88 88 zfetch_fini(void)
89 89 {
90 90 if (zfetch_ksp != NULL) {
91 91 kstat_delete(zfetch_ksp);
92 92 zfetch_ksp = NULL;
93 93 }
94 94 }
95 95
96 96 /*
97 97 * This takes a pointer to a zfetch structure and a dnode. It performs the
98 98 * necessary setup for the zfetch structure, grokking data from the
99 99 * associated dnode.
100 100 */
101 101 void
102 102 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
103 103 {
104 104 if (zf == NULL)
105 105 return;
106 106
107 107 zf->zf_dnode = dno;
108 108
109 109 list_create(&zf->zf_stream, sizeof (zstream_t),
110 110 offsetof(zstream_t, zs_node));
111 111
112 112 rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
113 113 }
114 114
115 115 static void
116 116 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
117 117 {
118 118 ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
119 119 list_remove(&zf->zf_stream, zs);
120 120 mutex_destroy(&zs->zs_lock);
121 121 kmem_free(zs, sizeof (*zs));
122 122 }
123 123
124 124 /*
125 125 * Clean up state associated with a zfetch structure (e.g. destroy the
126 126 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
127 127 */
128 128 void
129 129 dmu_zfetch_fini(zfetch_t *zf)
130 130 {
131 131 zstream_t *zs;
132 132
133 133 ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
134 134
135 135 rw_enter(&zf->zf_rwlock, RW_WRITER);
136 136 while ((zs = list_head(&zf->zf_stream)) != NULL)
137 137 dmu_zfetch_stream_remove(zf, zs);
138 138 rw_exit(&zf->zf_rwlock);
139 139 list_destroy(&zf->zf_stream);
140 140 rw_destroy(&zf->zf_rwlock);
141 141
142 142 zf->zf_dnode = NULL;
143 143 }
144 144
145 145 /*
146 146 * If there aren't too many streams already, create a new stream.
147 147 * The "blkid" argument is the next block that we expect this stream to access.
148 148 * While we're here, clean up old streams (which haven't been
149 149 * accessed for at least zfetch_min_sec_reap seconds).
150 150 */
151 151 static void
152 152 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
153 153 {
154 154 zstream_t *zs_next;
155 155 int numstreams = 0;
156 156
157 157 ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
158 158
159 159 /*
160 160 * Clean up old streams.
161 161 */
162 162 for (zstream_t *zs = list_head(&zf->zf_stream);
163 163 zs != NULL; zs = zs_next) {
164 164 zs_next = list_next(&zf->zf_stream, zs);
165 165 if (((gethrtime() - zs->zs_atime) / NANOSEC) >
166 166 zfetch_min_sec_reap)
167 167 dmu_zfetch_stream_remove(zf, zs);
168 168 else
169 169 numstreams++;
170 170 }
171 171
172 172 /*
173 173 * The maximum number of streams is normally zfetch_max_streams,
174 174 * but for small files we lower it such that it's at least possible
175 175 * for all the streams to be non-overlapping.
176 176 *
177 177 * If we are already at the maximum number of streams for this file,
178 178 * even after removing old streams, then don't create this stream.
179 179 */
180 180 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
181 181 zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
182 182 zfetch_max_distance));
183 183 if (numstreams >= max_streams) {
184 184 ZFETCHSTAT_BUMP(zfetchstat_max_streams);
185 185 return;
186 186 }
187 187
188 188 zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
189 189 zs->zs_blkid = blkid;
190 190 zs->zs_pf_blkid = blkid;
191 191 zs->zs_ipf_blkid = blkid;
192 192 zs->zs_atime = gethrtime();
193 193 mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
194 194
195 195 list_insert_head(&zf->zf_stream, zs);
196 196 }
197 197
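[Editor's note: to make the stream cap concrete, here is a stand-alone user-space sketch (an illustration only, not part of the patch; the dn_maxblkid and dn_datablksz values are hypothetical) that reproduces the computation in dmu_zfetch_stream_create():

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint32_t zfetch_max_streams = 8;
	uint32_t zfetch_max_distance = 8 * 1024 * 1024;
	uint64_t dn_maxblkid = 127;		/* a ~16 MB file */
	uint64_t dn_datablksz = 128 * 1024;	/* 128 KB records */

	/* Same expression as the kernel code above. */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    dn_maxblkid * dn_datablksz / zfetch_max_distance));

	/* Prints 1: a 16 MB file has room for one 8 MB stream. */
	printf("max_streams = %u\n", max_streams);
	return (0);
}

The point of the clamp is that a file smaller than zfetch_max_distance cannot host several non-overlapping 8 MB prefetch windows, so the cap degrades toward a single stream.]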
198 198 /*
199 199 * This is the predictive prefetch entry point. It associates dnode access
200 200 * specified with blkid and nblks arguments with prefetch stream, predicts
201 201 * further accesses based on that stats and initiates speculative prefetch.
202 202 * fetch_data argument specifies whether actual data blocks should be fetched:
203 203 * FALSE -- prefetch only indirect blocks for predicted data blocks;
204 204 * TRUE -- prefetch predicted data blocks plus following indirect blocks.
205 205 */
206 206 void
207 207 dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
208 208 {
209 209 zstream_t *zs;
210 210 int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
211 211 int64_t pf_ahead_blks, max_blks;
212 212 int epbs, max_dist_blks, pf_nblks, ipf_nblks;
213 213 uint64_t end_of_access_blkid = blkid + nblks;
214 - spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
215 214
216 215 if (zfs_prefetch_disable)
217 216 return;
218 217
219 218 /*
220 - * If we haven't yet loaded the indirect vdevs' mappings, we
221 - * can only read from blocks that we carefully ensure are on
222 - * concrete vdevs (or previously-loaded indirect vdevs). So we
223 - * can't allow the predictive prefetcher to attempt reads of other
224 - * blocks (e.g. of the MOS's dnode object).
225 - */
226 - if (!spa_indirect_vdevs_loaded(spa))
227 - return;
228 -
229 - /*
230 219 * As a fast path for small (single-block) files, ignore access
231 220 * to the first block.
232 221 */
233 222 if (blkid == 0)
234 223 return;
235 224
236 225 rw_enter(&zf->zf_rwlock, RW_READER);
237 226
238 - /*
239 - * Find matching prefetch stream. Depending on whether the accesses
240 - * are block-aligned, first block of the new access may either follow
241 - * the last block of the previous access, or be equal to it.
242 - */
243 227 for (zs = list_head(&zf->zf_stream); zs != NULL;
244 228 zs = list_next(&zf->zf_stream, zs)) {
245 - if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
229 + if (blkid == zs->zs_blkid) {
246 230 mutex_enter(&zs->zs_lock);
247 231 /*
248 232 * zs_blkid could have changed before we
249 233 * acquired zs_lock; re-check it here.
250 234 */
251 - if (blkid == zs->zs_blkid) {
252 - break;
253 - } else if (blkid + 1 == zs->zs_blkid) {
254 - blkid++;
255 - nblks--;
256 - if (nblks == 0) {
257 - /* Already prefetched this before. */
258 - mutex_exit(&zs->zs_lock);
259 - rw_exit(&zf->zf_rwlock);
260 - return;
261 - }
262 - break;
235 + if (blkid != zs->zs_blkid) {
236 + mutex_exit(&zs->zs_lock);
237 + continue;
263 238 }
264 - mutex_exit(&zs->zs_lock);
239 + break;
265 240 }
266 241 }
267 242
268 243 if (zs == NULL) {
269 244 /*
270 245 * This access is not part of any existing stream. Create
271 246 * a new stream for it.
272 247 */
273 248 ZFETCHSTAT_BUMP(zfetchstat_misses);
274 249 if (rw_tryupgrade(&zf->zf_rwlock))
275 250 dmu_zfetch_stream_create(zf, end_of_access_blkid);
276 251 rw_exit(&zf->zf_rwlock);
277 252 return;
278 253 }
279 254
280 255 /*
281 256 * This access was to a block that we issued a prefetch for on
282 257 * behalf of this stream. Issue further prefetches for this stream.
283 258 *
284 259 * Normally, we start prefetching where we stopped
285 260 * prefetching last (zs_pf_blkid). But when we get our first
286 261 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
287 262 * want to prefetch the block we just accessed. In this case,
288 263 * start just after the block we just accessed.
289 264 */
290 265 pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
291 266
292 267 /*
293 268 * Double our amount of prefetched data, but don't let the
294 269 * prefetch get further ahead than zfetch_max_distance.
295 270 */
296 271 if (fetch_data) {
297 272 max_dist_blks =
298 273 zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
299 274 /*
300 275 * Previously, we were (zs_pf_blkid - blkid) ahead. We
301 276 * want to now be double that, so read that amount again,
302 277 * plus the amount we are catching up by (i.e. the amount
303 278 * read just now).
304 279 */
305 280 pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
306 281 max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
307 282 pf_nblks = MIN(pf_ahead_blks, max_blks);
308 283 } else {
309 284 pf_nblks = 0;
310 285 }
311 286
312 287 zs->zs_pf_blkid = pf_start + pf_nblks;
313 288
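[Editor's note, worked example: if the stream was previously 4 blocks ahead (zs_pf_blkid - blkid == 4) and the application just read nblks == 4 blocks, then pf_ahead_blks == 8, so the prefetch front ends up 8 blocks past the end of this access; the lead doubles on every hit until max_blks caps it at zfetch_max_distance.]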
314 289 /*
315 290 * Do the same for indirects, starting from where we stopped last,
316 291 * or where we will stop reading data blocks (and the indirects
317 292 * that point to them).
318 293 */
319 294 ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
320 295 max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
321 296 /*
322 297 * We want to double our distance ahead of the data prefetch
323 298 * (or reader, if we are not prefetching data). Previously, we
324 299 * were (zs_ipf_blkid - blkid) ahead. To double that, we read
325 300 * that amount again, plus the amount we are catching up by
326 301 * (i.e. the amount read now + the amount of data prefetched now).
327 302 */
328 303 pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
329 304 max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
330 305 ipf_nblks = MIN(pf_ahead_blks, max_blks);
331 306 zs->zs_ipf_blkid = ipf_start + ipf_nblks;
332 307
333 308 epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
334 309 ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
335 310 ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
336 311
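[Editor's note: the shift arithmetic above can be sanity-checked in isolation. A minimal user-space sketch, assuming the common case of 128 KB indirect blocks (dn_indblkshift == 17) and 128-byte block pointers (SPA_BLKPTRSHIFT == 7), with P2ROUNDUP copied from sys/sysmacros.h:

#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	int epbs = 17 - 7;		/* 2^10 = 1024 blkptrs per indirect */
	int64_t ipf_start = 100;	/* first data block still to cover */
	int64_t zs_ipf_blkid = 5000;	/* one past the last data block */

	int64_t ipf_istart = P2ROUNDUP(ipf_start, 1LL << epbs) >> epbs;
	int64_t ipf_iend = P2ROUNDUP(zs_ipf_blkid, 1LL << epbs) >> epbs;

	/* Prints [1, 5): L1 blocks 1-4 map data blocks 1024-5119. */
	printf("L1 range [%lld, %lld)\n",
	    (long long)ipf_istart, (long long)ipf_iend);
	return (0);
}

Rounding up on both ends means a partially covered leading indirect block is assumed to be resident already, while the trailing one is fetched in full.]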
337 312 zs->zs_atime = gethrtime();
338 313 zs->zs_blkid = end_of_access_blkid;
339 314 mutex_exit(&zs->zs_lock);
340 315 rw_exit(&zf->zf_rwlock);
341 316
342 317 /*
343 318 * dbuf_prefetch() is asynchronous (even when it needs to read
344 319 * indirect blocks), but we still prefer to drop our locks before
345 320 * calling it to reduce the time we hold them.
346 321 */
347 322
348 323 for (int i = 0; i < pf_nblks; i++) {
349 324 dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
350 325 ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
351 326 }
352 327 for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
353 328 dbuf_prefetch(zf->zf_dnode, 1, iblk,
354 329 ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
355 330 }
356 331 ZFETCHSTAT_BUMP(zfetchstat_hits);
357 332 }
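[Editor's note, context beyond this file (an assumption about the surrounding DMU code, stated for orientation): the predictive prefetcher is driven from the DMU read path, where dmu_buf_hold_array_by_dnode() calls dmu_zfetch() once per logical read, so a large (e.g. 1 MB) read arrives as a single call with nblks > 1 rather than as per-block calls.]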