Print this page
6319 assertion failed in zio_ddt_write: bp->blk_birth == txg
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/zio.c
+++ new/usr/src/uts/common/fs/zfs/zio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2013 Joyent, Inc. All rights reserved.
26 26 */
27 27
28 28 #include <sys/sysmacros.h>
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/fm/fs/zfs.h>
31 31 #include <sys/spa.h>
32 32 #include <sys/txg.h>
33 33 #include <sys/spa_impl.h>
34 34 #include <sys/vdev_impl.h>
35 35 #include <sys/zio_impl.h>
36 36 #include <sys/zio_compress.h>
37 37 #include <sys/zio_checksum.h>
38 38 #include <sys/dmu_objset.h>
39 39 #include <sys/arc.h>
40 40 #include <sys/ddt.h>
41 41 #include <sys/zfs_zone.h>
42 42 #include <sys/blkptr.h>
43 43 #include <sys/zfeature.h>
44 44
/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;		/* cache of zio_t structures */
kmem_cache_t *zio_link_cache;		/* cache of parent/child link nodes */
/* One buffer cache per size class; metadata vs. data (see zio_init()). */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/* Sentinel return values used by pipeline stage functions. */
#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/*
 * The following actions directly effect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/* NOTE(review): presumably biases requeued zios to dispatch first; consumer not visible in this chunk — confirm at use site. */
boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
/* Buffers larger than this skip kmem debugging (see cflags in zio_init()). */
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
101 101
/*
 * Create the zio and zio_link kmem caches plus the per-size I/O buffer
 * caches.  Caches are created for every multiple of SPA_MINBLOCKSIZE up
 * to 4 * SPA_MINBLOCKSIZE, and for each quarter-power of 2 above that;
 * the trailing loop backfills the remaining size classes with the next
 * larger cache so every class has a usable cache pointer.
 */
void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		/* Skip expensive kmem debugging for large buffers. */
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		/* Reduce p2 to the largest power of 2 <= size. */
		while (!ISP2(p2))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			/* size is a quarter-power of 2: give it a cache. */
			align = MIN(p2 >> 2, PAGESIZE);
		}

		/* align != 0 marks the size classes that get their own cache. */
		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	/*
	 * Backfill: size classes with no cache of their own alias the
	 * next larger cache (walking downward from the largest).
	 */
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}
176 176
/*
 * Destroy the caches created by zio_init().  Because backfilled size
 * classes alias the same kmem cache across contiguous slots, remember
 * the last cache destroyed and skip repeats to avoid destroying the
 * same cache twice.
 */
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		/* Aliased slots are contiguous, so one-back comparison suffices. */
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}
203 203
204 204 /*
205 205 * ==========================================================================
206 206 * Allocate and free I/O buffers
207 207 * ==========================================================================
208 208 */
209 209
210 210 /*
211 211 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
212 212 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
213 213 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
214 214 * excess / transient data in-core during a crashdump.
215 215 */
216 216 void *
217 217 zio_buf_alloc(size_t size)
218 218 {
219 219 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
220 220
221 221 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
222 222
223 223 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
224 224 }
225 225
226 226 /*
227 227 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
228 228 * crashdump if the kernel panics. This exists so that we will limit the amount
229 229 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
230 230 * of kernel heap dumped to disk when the kernel panics)
231 231 */
232 232 void *
233 233 zio_data_buf_alloc(size_t size)
234 234 {
235 235 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
236 236
237 237 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
238 238
239 239 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
240 240 }
241 241
242 242 void
243 243 zio_buf_free(void *buf, size_t size)
244 244 {
245 245 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
246 246
247 247 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
248 248
249 249 kmem_cache_free(zio_buf_cache[c], buf);
250 250 }
251 251
252 252 void
253 253 zio_data_buf_free(void *buf, size_t size)
254 254 {
255 255 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
256 256
257 257 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
258 258
259 259 kmem_cache_free(zio_data_buf_cache[c], buf);
260 260 }
261 261
262 262 /*
263 263 * ==========================================================================
264 264 * Push and pop I/O transform buffers
265 265 * ==========================================================================
266 266 */
/*
 * Push a transform frame onto the zio's transform stack, saving the
 * current data/size so zio_pop_transforms() can restore them.  A
 * nonzero bufsize means the stack owns 'data' and will free it on pop;
 * 'transform' (may be NULL) is applied when the frame is popped.
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	/* Save the zio's current view for restoration on pop. */
	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	/* Insert at the head: transforms pop in LIFO order. */
	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	/* The zio now operates on the transformed buffer. */
	zio->io_data = data;
	zio->io_size = size;
}
284 284
/*
 * Pop every frame on the zio's transform stack, applying each frame's
 * transform callback (if any) and restoring the original data/size,
 * then freeing stack-owned buffers and the frames themselves.
 */
static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		/* Run the callback before the buffer is released. */
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		/* Nonzero bufsize means the stack owns io_data. */
		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
305 305
306 306 /*
307 307 * ==========================================================================
308 308 * I/O transform callbacks for subblocks and decompression
309 309 * ==========================================================================
310 310 */
/*
 * Transform callback for subblock I/O: on read, copy the smaller
 * logical payload out of the larger physical buffer.  Writes require
 * no copy-back.
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}
319 319
/*
 * Transform callback: decompress io_data into 'data' using the bp's
 * compression algorithm.  Decompression failure surfaces as EIO on
 * the zio; skipped entirely if the zio already carries an error.
 */
static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
328 328
329 329 /*
330 330 * ==========================================================================
331 331 * I/O parent/child relationships and pipeline interlocks
332 332 * ==========================================================================
333 333 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state. (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	/* NULL cursor means start at the list head; otherwise advance. */
	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	/* End of list: cursor resets to NULL for the next walk. */
	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}
357 357
358 358 zio_t *
359 359 zio_walk_children(zio_t *pio)
360 360 {
361 361 zio_link_t *zl = pio->io_walk_link;
362 362 list_t *cl = &pio->io_child_list;
363 363
364 364 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
365 365 pio->io_walk_link = zl;
366 366
367 367 if (zl == NULL)
368 368 return (NULL);
369 369
370 370 ASSERT(zl->zl_parent == pio);
371 371 return (zl->zl_child);
372 372 }
373 373
374 374 zio_t *
375 375 zio_unique_parent(zio_t *cio)
376 376 {
377 377 zio_t *pio = zio_walk_parents(cio);
378 378
379 379 VERIFY(zio_walk_parents(cio) == NULL);
380 380 return (pio);
381 381 }
382 382
/*
 * Link cio as a child of pio.  The child is counted against the
 * parent's per-wait-type outstanding counts, but only for wait states
 * the child has not already passed.  Lock order is child before
 * parent, matching zio_remove_child().
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	/* Can't add children to a parent that has already completed. */
	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/* Count only the wait states the child hasn't reached yet. */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}
416 416
/*
 * Unlink the parent/child relationship described by zl and free the
 * link node.  Lock order (child then parent) mirrors zio_add_child().
 */
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}
437 437
/*
 * If the zio still has outstanding children of the given type in the
 * given wait class, mark it stalled on that count and rewind the
 * pipeline one stage so the current stage re-executes when the last
 * child notifies us (see zio_notify_parent()).  Returns B_TRUE if the
 * caller must stop and wait.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		/* Back up one stage; it retries when we are resumed. */
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}
455 455
/*
 * Notify pio that child zio reached the given wait state: fold the
 * child's error (unless DONT_PROPAGATE) and reexecute flags into the
 * parent, decrement the outstanding count, and if the parent was
 * stalled on exactly this count, resume its pipeline.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		/* Last outstanding child: wake the stalled parent. */
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
478 478
479 479 static void
480 480 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
481 481 {
482 482 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
483 483 zio->io_error = zio->io_child_error[c];
484 484 }
485 485
486 486 /*
487 487 * ==========================================================================
488 488 * Create the various types of I/O (read, write, free, etc)
489 489 * ==========================================================================
490 490 */
/*
 * Common constructor for all zio types.  Allocates and zeroes a zio,
 * classifies it (vdev/gang/ddt/logical), snapshots the bp, records the
 * original data/flags/stage/pipeline for reexecution, and links it to
 * its parent (inheriting logical/gang-leader/zone context).
 *
 * 'stage' is the starting pipeline stage; 'pipeline' is the bitmask of
 * stages the zio will run.  Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	/* Classify by the most specific attribute present. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/*
		 * Non-writes (and DDT children) operate on the private
		 * copy so the caller's bp can be freed immediately.
		 */
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	/* Keep the 'orig' copies so the zio can be reexecuted from scratch. */
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	/* A zio born past READY/DONE is already in those wait states. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit zone, logical, and gang-leader context. */
		zio->io_zoneid = pio->io_zoneid;
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	} else {
		zfs_zone_zio_init(zio);
	}

	return (zio);
}
574 574
/*
 * Tear down a zio's lists and synchronization primitives and return it
 * to the zio cache.  Counterpart of zio_create().
 */
static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}
584 584
585 585 zio_t *
586 586 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
587 587 void *private, enum zio_flag flags)
588 588 {
589 589 zio_t *zio;
590 590
591 591 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
592 592 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
593 593 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
594 594
595 595 return (zio);
596 596 }
597 597
598 598 zio_t *
599 599 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
600 600 {
601 601 return (zio_null(NULL, spa, NULL, done, private, flags));
602 602 }
603 603
604 604 void
605 605 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
606 606 {
607 607 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
608 608 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
609 609 bp, (longlong_t)BP_GET_TYPE(bp));
610 610 }
611 611 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
612 612 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
613 613 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
614 614 bp, (longlong_t)BP_GET_CHECKSUM(bp));
615 615 }
616 616 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
617 617 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
618 618 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
619 619 bp, (longlong_t)BP_GET_COMPRESS(bp));
620 620 }
621 621 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
622 622 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
623 623 bp, (longlong_t)BP_GET_LSIZE(bp));
624 624 }
625 625 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
626 626 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
627 627 bp, (longlong_t)BP_GET_PSIZE(bp));
628 628 }
629 629
630 630 if (BP_IS_EMBEDDED(bp)) {
631 631 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
632 632 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
633 633 bp, (longlong_t)BPE_GET_ETYPE(bp));
634 634 }
635 635 }
636 636
637 637 /*
638 638 * Pool-specific checks.
639 639 *
640 640 * Note: it would be nice to verify that the blk_birth and
641 641 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
642 642 * allows the birth time of log blocks (and dmu_sync()-ed blocks
643 643 * that are in the log) to be arbitrarily large.
644 644 */
645 645 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
646 646 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
647 647 if (vdevid >= spa->spa_root_vdev->vdev_children) {
648 648 zfs_panic_recover("blkptr at %p DVA %u has invalid "
649 649 "VDEV %llu",
650 650 bp, i, (longlong_t)vdevid);
651 651 }
652 652 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
653 653 if (vd == NULL) {
654 654 zfs_panic_recover("blkptr at %p DVA %u has invalid "
655 655 "VDEV %llu",
656 656 bp, i, (longlong_t)vdevid);
657 657 }
658 658 if (vd->vdev_ops == &vdev_hole_ops) {
659 659 zfs_panic_recover("blkptr at %p DVA %u has hole "
660 660 "VDEV %llu",
661 661 bp, i, (longlong_t)vdevid);
662 662
663 663 }
664 664 if (vd->vdev_ops == &vdev_missing_ops) {
665 665 /*
666 666 * "missing" vdevs are valid during import, but we
667 667 * don't have their detailed info (e.g. asize), so
668 668 * we can't perform any more checks on them.
669 669 */
670 670 continue;
671 671 }
672 672 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
673 673 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
674 674 if (BP_IS_GANG(bp))
675 675 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
676 676 if (offset + asize > vd->vdev_asize) {
677 677 zfs_panic_recover("blkptr at %p DVA %u has invalid "
678 678 "OFFSET %llu",
679 679 bp, i, (longlong_t)offset);
680 680 }
681 681 }
682 682 }
683 683
/*
 * Create a logical read zio for the block described by bp.  The bp is
 * sanity-checked first; the zio's txg is the block's physical birth
 * time.  DDT children use the DDT read pipeline.
 */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}
701 701
/*
 * Create a logical write zio governed by the write policy 'zp'
 * (checksum, compression, type, level, copies, dedup).  'ready' fires
 * when the block is allocated, 'physdone' when physical child I/O
 * completes, 'done' when the zio finishes.  DDT children use the DDT
 * write pipeline.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	/* Validate the caller's write policy up front. */
	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP. But we may need the data to
	 * verify a dedup hit (if requested). In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}
741 741
742 742 zio_t *
743 743 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
744 744 uint64_t size, zio_done_func_t *done, void *private,
745 745 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
746 746 {
747 747 zio_t *zio;
748 748
749 749 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
750 750 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
751 751 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
752 752
753 753 return (zio);
754 754 }
755 755
/*
 * Supply an already-allocated bp for a logical write (the dmu_sync()
 * path).  Must be called while the zio is still OPEN and in the
 * syncing txg; resets the write properties to match how the bp was
 * originally written.
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
774 774
/*
 * Free the block described by bp in txg: either immediately via a
 * synchronous free zio, or by queueing it on the pool's deferred-free
 * list when immediate processing isn't possible (see below).
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization. We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately. Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}
802 802
/*
 * Create a free zio for bp in the currently-syncing txg.  Embedded
 * bps carry no allocated space, so they degenerate to a null zio.
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
834 834
/*
 * Create a claim zio for bp.  Embedded bps occupy no space and reduce
 * to a null zio.  See the block comment below for claim semantics.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}
868 868
/*
 * Create an ioctl zio for vd.  Leaf vdevs get a real ioctl zio; for an
 * interior vdev, recursively fan the ioctl out to every child under a
 * null parent zio that collects their completion.
 */
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}
892 892
/*
 * Create a physical read zio against a leaf vdev at a raw byte offset,
 * bypassing DVA translation.  'labels' allows offsets within the vdev
 * label regions; 'checksum' selects the verification algorithm.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
913 913
914 914 zio_t *
915 915 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
916 916 void *data, int checksum, zio_done_func_t *done, void *private,
917 917 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
918 918 {
919 919 zio_t *zio;
920 920
921 921 ASSERT(vd->vdev_children == 0);
922 922 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
923 923 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
924 924 ASSERT3U(offset + size, <=, vd->vdev_psize);
925 925
926 926 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
927 927 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
928 928 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
929 929
930 930 zio->io_prop.zp_checksum = checksum;
931 931
932 932 if (zio_checksum_table[checksum].ci_eck) {
933 933 /*
934 934 * zec checksums are necessarily destructive -- they modify
935 935 * the end of the write buffer to hold the verifier/checksum.
936 936 * Therefore, we must make a local copy in case the data is
937 937 * being written to multiple places in parallel.
938 938 */
939 939 void *wbuf = zio_buf_alloc(size);
940 940 bcopy(data, wbuf, size);
941 941 zio_push_transform(zio, wbuf, size, size, NULL);
942 942 }
943 943
944 944 return (zio);
945 945 }
946 946
/*
 * Create a child I/O to do some work for us.
 *
 * The child targets vdev 'vd' (which must be one level below the
 * parent's vdev, or below the root vdev for a parentless pio) and runs
 * the vdev-child pipeline starting at VDEV_IO_START.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Leaf vdev offsets are relative to the end of the front labels. */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	/*
	 * Propagate the parent's physdone callback and count this leaf
	 * child on the logical zio so physical completions can be tracked.
	 */
	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}
994 994
995 995 zio_t *
996 996 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
997 997 int type, zio_priority_t priority, enum zio_flag flags,
998 998 zio_done_func_t *done, void *private)
999 999 {
1000 1000 zio_t *zio;
1001 1001
1002 1002 ASSERT(vd->vdev_ops->vdev_op_leaf);
1003 1003
1004 1004 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1005 1005 data, size, done, private, type, priority,
1006 1006 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1007 1007 vd, offset, NULL,
1008 1008 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1009 1009
1010 1010 return (zio);
1011 1011 }
1012 1012
1013 1013 void
1014 1014 zio_flush(zio_t *zio, vdev_t *vd)
1015 1015 {
1016 1016 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1017 1017 NULL, NULL,
1018 1018 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1019 1019 }
1020 1020
1021 1021 void
1022 1022 zio_shrink(zio_t *zio, uint64_t size)
1023 1023 {
1024 1024 ASSERT(zio->io_executor == NULL);
1025 1025 ASSERT(zio->io_orig_size == zio->io_size);
1026 1026 ASSERT(size <= zio->io_size);
1027 1027
1028 1028 /*
1029 1029 * We don't shrink for raidz because of problems with the
1030 1030 * reconstruction when reading back less than the block size.
1031 1031 * Note, BP_IS_RAIDZ() assumes no compression.
1032 1032 */
1033 1033 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1034 1034 if (!BP_IS_RAIDZ(zio->io_bp))
1035 1035 zio->io_orig_size = zio->io_size = size;
1036 1036 }
1037 1037
1038 1038 /*
1039 1039 * ==========================================================================
1040 1040 * Prepare to read and write logical blocks
1041 1041 * ==========================================================================
1042 1042 */
1043 1043
/*
 * Read pipeline setup: arrange decompression, decode embedded bps,
 * set caching policy, and route dedup reads to the DDT read pipeline.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/*
	 * For compressed logical reads (unless the caller asked for the
	 * raw physical bytes), push a transform so the pipeline inflates
	 * the data into the caller's buffer via zio_decompress.
	 */
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	/*
	 * Embedded data is stored in the bp itself, so decode it here and
	 * reduce the pipeline to the interlock stages only.
	 */
	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	/* Don't cache level-0 user data, nor DDT ZAP blocks. */
	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	/* Deduped logical reads go through the DDT read pipeline. */
	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1077 1077
/*
 * Write pipeline setup: handle bp overrides (WRLOG/dedup-to), decide on
 * compression and embedded-data encoding, choose rewrite vs. fresh
 * allocation during sync-to-convergence, and fill in the bp fields.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * The override bp's checksum doesn't match the dedup
		 * policy, so we can't dedup to it after all.  Discard
		 * the override and fall through to write a fresh block
		 * (illumos 6319: keeping the stale bp here tripped the
		 * bp->blk_birth == txg assertion in zio_ddt_write()).
		 */
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			/* Incompressible (or compression failed): store raw. */
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			/*
			 * Small enough to embed directly in the bp: no
			 * device write needed, so reduce the pipeline to
			 * the interlock stages.
			 */
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size to MINBLOCKSIZE and
			 * zero the tail.
			 */
			size_t rounded =
			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
			if (rounded > psize) {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
			}
			if (psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else {
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/*
		 * Zero-sized write == hole.  Record the birth info when
		 * overwriting an existing block and hole_birth is active.
		 */
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		/* Fill in the bp from the write properties. */
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1256 1258
1257 1259 static int
1258 1260 zio_free_bp_init(zio_t *zio)
1259 1261 {
1260 1262 blkptr_t *bp = zio->io_bp;
1261 1263
1262 1264 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1263 1265 if (BP_GET_DEDUP(bp))
1264 1266 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1265 1267 }
1266 1268
1267 1269 return (ZIO_PIPELINE_CONTINUE);
1268 1270 }
1269 1271
1270 1272 /*
1271 1273 * ==========================================================================
1272 1274 * Execute the I/O pipeline
1273 1275 * ==========================================================================
1274 1276 */
1275 1277
/*
 * Dispatch 'zio' to the appropriate spa taskq for asynchronous pipeline
 * execution.  'q' selects issue vs. interrupt; 'cutinline' puts the zio
 * at the front of the taskq (TQ_FRONT).
 */
static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio->io_tqent.tqent_next == NULL);
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}
1316 1318
1317 1319 static boolean_t
1318 1320 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1319 1321 {
1320 1322 kthread_t *executor = zio->io_executor;
1321 1323 spa_t *spa = zio->io_spa;
1322 1324
1323 1325 for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1324 1326 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1325 1327 uint_t i;
1326 1328 for (i = 0; i < tqs->stqs_count; i++) {
1327 1329 if (taskq_member(tqs->stqs_taskq[i], executor))
1328 1330 return (B_TRUE);
1329 1331 }
1330 1332 }
1331 1333
1332 1334 return (B_FALSE);
1333 1335 }
1334 1336
/*
 * Pipeline stage that hands the zio off to an ISSUE taskq instead of
 * running the remaining stages in the current context.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	/* The taskq thread will resume the pipeline via zio_execute(). */
	return (ZIO_PIPELINE_STOP);
}
1342 1344
/*
 * Continue pipeline execution of 'zio' asynchronously on an INTERRUPT
 * taskq thread.
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
1348 1350
1349 1351 /*
1350 1352 * Execute the I/O pipeline until one of the following occurs:
1351 1353 *
1352 1354 * (1) the I/O completes
1353 1355 * (2) the pipeline stalls waiting for dependent child I/Os
1354 1356 * (3) the I/O issues, so we're waiting for an I/O completion interrupt
1355 1357 * (4) the I/O is delegated by vdev-level caching or aggregation
1356 1358 * (5) the I/O is deferred due to vdev-level queueing
1357 1359 * (6) the I/O is handed off to another thread.
1358 1360 *
1359 1361 * In all cases, the pipeline stops whenever there's no CPU work; it never
1360 1362 * burns a thread in cv_wait().
1361 1363 *
1362 1364 * There's no locking on io_stage because there's no legitimate way
1363 1365 * for multiple threads to be attempting to process the same I/O.
1364 1366 */
1365 1367 static zio_pipe_stage_t *zio_pipeline[];
1366 1368
/*
 * Drive 'zio' through its pipeline, running each enabled stage in turn
 * until the zio completes, stalls on children, or is handed off to a
 * taskq.  Stages are power-of-two bits in io_pipeline; io_stage records
 * the last stage executed.
 */
void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		/* Advance to the next stage enabled in this zio's pipeline. */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		/* Stage functions are indexed by bit position in the mask. */
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		/* STOP means the stage took ownership (async or stalled). */
		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
1413 1415
1414 1416 /*
1415 1417 * ==========================================================================
1416 1418 * Initiate I/O, either sync or async
1417 1419 * ==========================================================================
1418 1420 */
/*
 * Execute 'zio' synchronously: run the pipeline, sleep until io_executor
 * is cleared (i.e. the zio has completed), then destroy the zio and
 * return its error.  The zio must not have been issued yet.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	/* Completion clears io_executor and signals io_cv. */
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}
1441 1443
/*
 * Launch 'zio' asynchronously; the caller relinquishes ownership.
 * A parentless logical zio is adopted by the per-CPU "Godfather" root
 * zio so the pool can wait for its completion before unloading.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}
1461 1463
1462 1464 /*
1463 1465 * ==========================================================================
1464 1466 * Reexecute or suspend/resume failed I/O
1465 1467 * ==========================================================================
1466 1468 */
1467 1469
/*
 * Reset a previously failed logical zio (and, recursively, all of its
 * children) back to its original pre-execution state, then run it
 * through the pipeline again.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	/* Restore the zio's original flags/stage/pipeline and clear errors. */
	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	/* An allocating zio must start over with a fresh bp. */
	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}
1516 1518
/*
 * Suspend I/O on the pool after an uncorrectable failure (or panic,
 * if that is the pool's failmode).  The failed logical zio, if any,
 * is parked under spa_suspend_zio_root until zio_resume().
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	/* Lazily create the godfather root that holds suspended zios. */
	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}
1547 1549
/*
 * Resume a suspended pool: clear the suspended state, wake waiters, and
 * reexecute everything parked under the suspend root.  Returns the error
 * from waiting on the reexecuted I/O (0 if nothing was suspended).
 */
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}
1569 1571
1570 1572 void
1571 1573 zio_resume_wait(spa_t *spa)
1572 1574 {
1573 1575 mutex_enter(&spa->spa_suspend_lock);
1574 1576 while (spa_suspended(spa))
1575 1577 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1576 1578 mutex_exit(&spa->spa_suspend_lock);
1577 1579 }
1578 1580
1579 1581 /*
1580 1582 * ==========================================================================
1581 1583 * Gang blocks.
1582 1584 *
1583 1585 * A gang block is a collection of small blocks that looks to the DMU
1584 1586 * like one large block. When zio_dva_allocate() cannot find a block
1585 1587 * of the requested size, due to either severe fragmentation or the pool
1586 1588 * being nearly full, it calls zio_write_gang_block() to construct the
1587 1589 * block from smaller fragments.
1588 1590 *
1589 1591 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1590 1592 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
1591 1593 * an indirect block: it's an array of block pointers. It consumes
1592 1594 * only one sector and hence is allocatable regardless of fragmentation.
1593 1595 * The gang header's bps point to its gang members, which hold the data.
1594 1596 *
1595 1597 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1596 1598 * as the verifier to ensure uniqueness of the SHA256 checksum.
1597 1599 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1598 1600 * not the gang header. This ensures that data block signatures (needed for
1599 1601 * deduplication) are independent of how the block is physically stored.
1600 1602 *
1601 1603 * Gang blocks can be nested: a gang member may itself be a gang block.
1602 1604 * Thus every gang block is a tree in which root and all interior nodes are
1603 1605 * gang headers, and the leaves are normal blocks that contain user data.
1604 1606 * The root of the gang tree is called the gang leader.
1605 1607 *
1606 1608 * To perform any operation (read, rewrite, free, claim) on a gang block,
1607 1609 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1608 1610 * in the io_gang_tree field of the original logical i/o by recursively
1609 1611 * reading the gang leader and all gang headers below it. This yields
1610 1612 * an in-core tree containing the contents of every gang header and the
1611 1613 * bps for every constituent of the gang block.
1612 1614 *
1613 1615 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1614 1616 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
1615 1617 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1616 1618 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1617 1619 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1618 1620 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
1619 1621 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1620 1622 * of the gang header plus zio_checksum_compute() of the data to update the
1621 1623 * gang header's blk_cksum as described above.
1622 1624 *
1623 1625 * The two-phase assemble/issue model solves the problem of partial failure --
1624 1626 * what if you'd freed part of a gang block but then couldn't read the
1625 1627 * gang header for another part? Assembling the entire gang tree first
1626 1628 * ensures that all the necessary gang header I/O has succeeded before
1627 1629 * starting the actual work of free, claim, or write. Once the gang tree
1628 1630 * is assembled, free and claim are in-memory operations that cannot fail.
1629 1631 *
1630 1632 * In the event that a gang write fails, zio_dva_unallocate() walks the
1631 1633 * gang tree to immediately free (i.e. insert back into the space map)
1632 1634 * everything we've allocated. This ensures that we don't get ENOSPC
1633 1635 * errors during repeated suspend/resume cycles due to a flaky device.
1634 1636 *
1635 1637 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
1636 1638 * the gang tree, we won't modify the block, so we can safely defer the free
1637 1639 * (knowing that the block is still intact). If we *can* assemble the gang
1638 1640 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1639 1641 * each constituent bp and we can allocate a new block on the next sync pass.
1640 1642 *
1641 1643 * In all cases, the gang tree allows complete recovery from partial failure.
1642 1644 * ==========================================================================
1643 1645 */
1644 1646
1645 1647 static zio_t *
1646 1648 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1647 1649 {
1648 1650 if (gn != NULL)
1649 1651 return (pio);
1650 1652
1651 1653 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1652 1654 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1653 1655 &pio->io_bookmark));
1654 1656 }
1655 1657
1656 1658 zio_t *
1657 1659 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1658 1660 {
1659 1661 zio_t *zio;
1660 1662
1661 1663 if (gn != NULL) {
1662 1664 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1663 1665 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1664 1666 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1665 1667 /*
1666 1668 * As we rewrite each gang header, the pipeline will compute
1667 1669 * a new gang block header checksum for it; but no one will
1668 1670 * compute a new data checksum, so we do that here. The one
1669 1671 * exception is the gang leader: the pipeline already computed
1670 1672 * its data checksum because that stage precedes gang assembly.
1671 1673 * (Presently, nothing actually uses interior data checksums;
1672 1674 * this is just good hygiene.)
1673 1675 */
1674 1676 if (gn != pio->io_gang_leader->io_gang_tree) {
1675 1677 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1676 1678 data, BP_GET_PSIZE(bp));
1677 1679 }
1678 1680 /*
1679 1681 * If we are here to damage data for testing purposes,
1680 1682 * leave the GBH alone so that we can detect the damage.
1681 1683 */
1682 1684 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1683 1685 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1684 1686 } else {
1685 1687 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1686 1688 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1687 1689 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1688 1690 }
1689 1691
1690 1692 return (zio);
1691 1693 }
1692 1694
/*
 * Gang-tree callback for frees: free one constituent bp.  The gang node
 * and data buffer are unused -- a free needs only the bp itself.
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}
1700 1702
/*
 * Gang-tree callback for claims: claim one constituent bp.  The gang
 * node and data buffer are unused -- a claim needs only the bp itself.
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}
1708 1710
/*
 * Per-zio-type callbacks invoked by zio_gang_issue() on each bp of the
 * assembled gang tree; NULL entries are zio types with no gang work.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};
1717 1719
1718 1720 static void zio_gang_tree_assemble_done(zio_t *zio);
1719 1721
1720 1722 static zio_gang_node_t *
1721 1723 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1722 1724 {
1723 1725 zio_gang_node_t *gn;
1724 1726
1725 1727 ASSERT(*gnpp == NULL);
1726 1728
1727 1729 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1728 1730 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1729 1731 *gnpp = gn;
1730 1732
1731 1733 return (gn);
1732 1734 }
1733 1735
1734 1736 static void
1735 1737 zio_gang_node_free(zio_gang_node_t **gnpp)
1736 1738 {
1737 1739 zio_gang_node_t *gn = *gnpp;
1738 1740
1739 1741 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1740 1742 ASSERT(gn->gn_child[g] == NULL);
1741 1743
1742 1744 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1743 1745 kmem_free(gn, sizeof (*gn));
1744 1746 *gnpp = NULL;
1745 1747 }
1746 1748
1747 1749 static void
1748 1750 zio_gang_tree_free(zio_gang_node_t **gnpp)
1749 1751 {
1750 1752 zio_gang_node_t *gn = *gnpp;
1751 1753
1752 1754 if (gn == NULL)
1753 1755 return;
1754 1756
1755 1757 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1756 1758 zio_gang_tree_free(&gn->gn_child[g]);
1757 1759
1758 1760 zio_gang_node_free(gnpp);
1759 1761 }
1760 1762
/*
 * Read the gang header at 'bp' into a freshly allocated gang-tree
 * node hung off *gnpp; zio_gang_tree_assemble_done() then recurses
 * into any child gang headers it discovers.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	/* Only the gang leader assembles the tree. */
	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
1773 1775
/*
 * Read-done callback for zio_gang_tree_assemble(): sanity-check the
 * gang header we just read and recursively assemble any child gang
 * headers referenced from it.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	/* A failed header read leaves this subtree unassembled. */
	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	/* Descend into any gang-block children of this header. */
	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
1801 1803
/*
 * Recursively walk an assembled gang tree, dispatching the per-type
 * issue function (zio_gang_issue_func[]) on the header and on every
 * non-hole gang member; 'data' is advanced through the caller's
 * buffer as members are consumed in order.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, we must have walked exactly the leader's buffer. */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}
1836 1838
/*
 * Pipeline stage: this zio's bp is a gang block, so become the gang
 * leader and start assembling the gang tree rooted at bp.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}
1851 1853
/*
 * Pipeline stage: once all gang children have completed, either issue
 * the per-block work across the assembled tree (on success) or tear
 * the tree down (on assembly error).  Either way, reduce the rest of
 * this zio's pipeline to the interlock stages.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/* Wait for the assembly reads kicked off by zio_gang_assemble(). */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1872 1874
/*
 * Ready callback for gang-member writes: once this member has its
 * DVAs, fold each DVA's asize into the matching DVA of the parent
 * gang header bp, so the header's asize accounts for the whole
 * gang subtree beneath it.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	/* A member that allocated nothing contributes nothing. */
	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	/* io_lock serializes sibling members updating the parent's bp. */
	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
1902 1904
/*
 * Fall back to a gang write when a contiguous allocation failed (see
 * zio_dva_allocate()): allocate a gang header, split pio's data into
 * up to SPA_GBH_NBLKPTRS smaller member writes, and rewrite the
 * header once the members are ready.  pio's own pipeline is reduced
 * to just waiting on the header zio.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	/* The header gets one extra copy, capped at the pool maximum. */
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Hang the new node off the leader's tree root, or -- for a
	 * nested gang (pio is itself a gang member) -- off the slot
	 * pio's creator passed via io_private.
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		/*
		 * Split the residual evenly across the remaining bp
		 * slots, rounded up to SPA_MINBLOCKSIZE.
		 */
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
1978 1980
1979 1981 /*
1980 1982 * The zio_nop_write stage in the pipeline determines if allocating
1981 1983 * a new bp is necessary. By leveraging a cryptographically secure checksum,
1982 1984 * such as SHA256, we can compare the checksums of the new data and the old
1983 1985 * to determine if allocating a new block is required. The nopwrite
1984 1986 * feature can handle writes in either syncing or open context (i.e. zil
1985 1987 * writes) and as a result is mutually exclusive with dedup.
1986 1988 */
1987 1989 static int
1988 1990 zio_nop_write(zio_t *zio)
1989 1991 {
1990 1992 blkptr_t *bp = zio->io_bp;
1991 1993 blkptr_t *bp_orig = &zio->io_bp_orig;
1992 1994 zio_prop_t *zp = &zio->io_prop;
1993 1995
1994 1996 ASSERT(BP_GET_LEVEL(bp) == 0);
1995 1997 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1996 1998 ASSERT(zp->zp_nopwrite);
1997 1999 ASSERT(!zp->zp_dedup);
1998 2000 ASSERT(zio->io_bp_override == NULL);
1999 2001 ASSERT(IO_IS_ALLOCATING(zio));
2000 2002
2001 2003 /*
2002 2004 * Check to see if the original bp and the new bp have matching
2003 2005 * characteristics (i.e. same checksum, compression algorithms, etc).
2004 2006 * If they don't then just continue with the pipeline which will
2005 2007 * allocate a new bp.
2006 2008 */
2007 2009 if (BP_IS_HOLE(bp_orig) ||
2008 2010 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2009 2011 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2010 2012 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2011 2013 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2012 2014 zp->zp_copies != BP_GET_NDVAS(bp_orig))
2013 2015 return (ZIO_PIPELINE_CONTINUE);
2014 2016
2015 2017 /*
2016 2018 * If the checksums match then reset the pipeline so that we
2017 2019 * avoid allocating a new bp and issuing any I/O.
2018 2020 */
2019 2021 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2020 2022 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2021 2023 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2022 2024 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2023 2025 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2024 2026 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2025 2027 sizeof (uint64_t)) == 0);
2026 2028
2027 2029 *bp = *bp_orig;
2028 2030 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2029 2031 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2030 2032 }
2031 2033
2032 2034 return (ZIO_PIPELINE_CONTINUE);
2033 2035 }
2034 2036
2035 2037 /*
2036 2038 * ==========================================================================
2037 2039 * Dedup
2038 2040 * ==========================================================================
2039 2041 */
/*
 * Done callback for the repair reads issued by zio_ddt_read_start():
 * the first successfully read copy becomes dde_repair_data (and its
 * phys is cleared so it is not itself repaired); every other buffer
 * is freed.  The parent's io_lock serializes the sibling callbacks.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}
2058 2060
/*
 * Start a dedup read.  On the first pass this simply reads bp.  If a
 * previous attempt failed (io_child_error[ZIO_CHILD_DDT] is set), we
 * instead look up the block's DDT entry and read every *other* copy
 * recorded there, so zio_ddt_read_done() can repair the data from
 * whichever copy is good.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		/* Stash the repair entry for zio_ddt_read_done(). */
		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		/* Read every on-disk copy other than the one that failed. */
		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}
2101 2103
/*
 * Completion for zio_ddt_read_start().  If the read (and any repair
 * reads) failed, either re-dispatch from the DDT_READ_START stage
 * when no repair entry was set up yet, or copy the recovered data
 * into the caller's buffer and clear the child error.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* No DDT yet; only possible during pool load. */
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/*
			 * No repair entry yet: back the stage up so the
			 * pipeline re-runs DDT_READ_START in repair mode.
			 */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			/* A good copy was found; hand it to the caller. */
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2138 2140
/*
 * Return B_TRUE if this zio's data differs from the data already
 * recorded (in flight or on disk) for its DDT entry -- i.e. a true
 * checksum collision -- so the caller can fall back to an ordinary
 * write.  Called with the ddt lock held; drops and re-acquires it
 * around the arc_read() of an on-disk copy.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		/* Compare against an in-flight write if there is one. */
		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	/* Otherwise read an existing on-disk copy and compare to it. */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			arc_flags_t aflags = ARC_FLAG_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			/* A failed read also counts as a collision. */
			return (error != 0);
		}
	}

	return (B_FALSE);
}
2193 2195
/*
 * Ready callback for the leading dedup write of a DDT entry: once the
 * child has its DVAs, publish them into the entry's phys (indexed by
 * copy count) and into the bp of every parent waiting on this write.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	/* On error, leave the entry untouched for zio_ddt_child_write_done(). */
	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}
2217 2219
/*
 * Done callback for the leading dedup write: relinquish lead-zio
 * status, then either take one DDT reference per parent (success) or
 * clear the phys so the entry is not believed to be on disk (failure).
 */
static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		/* One reference for each parent that shares this block. */
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}
2241 2243
/*
 * Done callback for a ditto (extra-copies) dedup write: install the
 * new ditto phys in the DDT, freeing any previous ditto block it
 * replaces.
 */
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		/* Replace any older ditto block with the one just written. */
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}
2270 2272
/*
 * Pipeline stage: resolve a dedup write against the DDT.  Outcomes:
 *  - dedup-verify collision: restart as an ordinary write (upgrading
 *    to a strong checksum first if a weak one was in use);
 *  - the block is already on disk or being written by a lead zio:
 *    reference the existing copy instead of writing;
 *  - otherwise: become the lead zio and issue the actual write.
 * A ditto write may additionally be issued if this entry now needs
 * more copies than it has.  All DDT state is manipulated under the
 * ddt lock.
 */
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		/* The block already exists (or is in flight): share it. */
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		/* An override bp must carry this txg's birth time. */
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		/* First writer of this entry: become the lead zio. */
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
2380 2382
/* DDT entry touched by the most recent zio_ddt_free(), kept for debugging. */
ddt_entry_t *freedde;			/* for debugging */
2382 2384
/*
 * Pipeline stage: "free" one reference to a dedup block.  Only the
 * DDT refcount is dropped here; the block itself is not returned to
 * the metaslab by this stage.
 */
static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}
2403 2405
2404 2406 /*
2405 2407 * ==========================================================================
2406 2408 * Allocate and free blocks
2407 2409 * ==========================================================================
2408 2410 */
/*
 * Pipeline stage: allocate DVAs for a write from the normal class.
 * On ENOSPC for a block larger than the minimum, fall back to a gang
 * write rather than failing outright.
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	/* A zio that isn't a gang child leads its own (possible) gang. */
	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
	    METASLAB_GANG_CHILD : 0;
	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

	if (error) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2451 2453
2452 2454 static int
2453 2455 zio_dva_free(zio_t *zio)
2454 2456 {
2455 2457 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2456 2458
2457 2459 return (ZIO_PIPELINE_CONTINUE);
2458 2460 }
2459 2461
2460 2462 static int
2461 2463 zio_dva_claim(zio_t *zio)
2462 2464 {
2463 2465 int error;
2464 2466
2465 2467 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2466 2468 if (error)
2467 2469 zio->io_error = error;
2468 2470
2469 2471 return (ZIO_PIPELINE_CONTINUE);
2470 2472 }
2471 2473
2472 2474 /*
2473 2475 * Undo an allocation. This is used by zio_done() when an I/O fails
2474 2476 * and we want to give back the block we just allocated.
2475 2477 * This handles both normal blocks and gang blocks.
2476 2478 */
2477 2479 static void
2478 2480 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2479 2481 {
2480 2482 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2481 2483 ASSERT(zio->io_bp_override == NULL);
2482 2484
2483 2485 if (!BP_IS_HOLE(bp))
2484 2486 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2485 2487
2486 2488 if (gn != NULL) {
2487 2489 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2488 2490 zio_dva_unallocate(zio, gn->gn_child[g],
2489 2491 &gn->gn_gbh->zg_blkptr[g]);
2490 2492 }
2491 2493 }
2492 2494 }
2493 2495
2494 2496 /*
2495 2497 * Try to allocate an intent log block. Return 0 on success, errno on failure.
2496 2498 */
2497 2499 int
2498 2500 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2499 2501 uint64_t size, boolean_t use_slog)
2500 2502 {
2501 2503 int error = 1;
2502 2504
2503 2505 ASSERT(txg > spa_syncing_txg(spa));
2504 2506
2505 2507 /*
2506 2508 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2507 2509 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2508 2510 * when allocating them.
2509 2511 */
2510 2512 if (use_slog) {
2511 2513 error = metaslab_alloc(spa, spa_log_class(spa), size,
2512 2514 new_bp, 1, txg, old_bp,
2513 2515 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2514 2516 }
2515 2517
2516 2518 if (error) {
2517 2519 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2518 2520 new_bp, 1, txg, old_bp,
2519 2521 METASLAB_HINTBP_AVOID);
2520 2522 }
2521 2523
2522 2524 if (error == 0) {
2523 2525 BP_SET_LSIZE(new_bp, size);
2524 2526 BP_SET_PSIZE(new_bp, size);
2525 2527 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2526 2528 BP_SET_CHECKSUM(new_bp,
2527 2529 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2528 2530 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2529 2531 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2530 2532 BP_SET_LEVEL(new_bp, 0);
2531 2533 BP_SET_DEDUP(new_bp, 0);
2532 2534 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2533 2535 }
2534 2536
2535 2537 return (error);
2536 2538 }
2537 2539
2538 2540 /*
2539 2541 * Free an intent log block.
2540 2542 */
2541 2543 void
2542 2544 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2543 2545 {
2544 2546 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2545 2547 ASSERT(!BP_IS_GANG(bp));
2546 2548
2547 2549 zio_free(spa, txg, bp);
2548 2550 }
2549 2551
2550 2552 /*
2551 2553 * ==========================================================================
2552 2554 * Read and write to physical devices
2553 2555 * ==========================================================================
2554 2556 */
2555 2557
2556 2558
2557 2559 /*
2558 2560 * Issue an I/O to the underlying vdev. Typically the issue pipeline
2559 2561 * stops after this stage and will resume upon I/O completion.
2560 2562 * However, there are instances where the vdev layer may need to
2561 2563 * continue the pipeline when an I/O was not issued. Since the I/O
2562 2564 * that was sent to the vdev layer might be different than the one
2563 2565 * currently active in the pipeline (see vdev_queue_io()), we explicitly
2564 2566 * force the underlying vdev layers to call either zio_execute() or
2565 2567 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2566 2568 */
2567 2569 static int
2568 2570 zio_vdev_io_start(zio_t *zio)
2569 2571 {
2570 2572 vdev_t *vd = zio->io_vd;
2571 2573 uint64_t align;
2572 2574 spa_t *spa = zio->io_spa;
2573 2575
2574 2576 ASSERT(zio->io_error == 0);
2575 2577 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2576 2578
2577 2579 if (vd == NULL) {
2578 2580 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2579 2581 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2580 2582
2581 2583 /*
2582 2584 * The mirror_ops handle multiple DVAs in a single BP.
2583 2585 */
2584 2586 vdev_mirror_ops.vdev_op_io_start(zio);
2585 2587 return (ZIO_PIPELINE_STOP);
2586 2588 }
2587 2589
2588 2590 /*
2589 2591 * We keep track of time-sensitive I/Os so that the scan thread
2590 2592 * can quickly react to certain workloads. In particular, we care
2591 2593 * about non-scrubbing, top-level reads and writes with the following
2592 2594 * characteristics:
2593 2595 * - synchronous writes of user data to non-slog devices
2594 2596 * - any reads of user data
2595 2597 * When these conditions are met, adjust the timestamp of spa_last_io
2596 2598 * which allows the scan thread to adjust its workload accordingly.
2597 2599 */
2598 2600 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2599 2601 vd == vd->vdev_top && !vd->vdev_islog &&
2600 2602 zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2601 2603 zio->io_txg != spa_syncing_txg(spa)) {
2602 2604 uint64_t old = spa->spa_last_io;
2603 2605 uint64_t new = ddi_get_lbolt64();
2604 2606 if (old != new)
2605 2607 (void) atomic_cas_64(&spa->spa_last_io, old, new);
2606 2608 }
2607 2609
2608 2610 align = 1ULL << vd->vdev_top->vdev_ashift;
2609 2611
2610 2612 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2611 2613 P2PHASE(zio->io_size, align) != 0) {
2612 2614 /* Transform logical writes to be a full physical block size. */
2613 2615 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2614 2616 char *abuf = zio_buf_alloc(asize);
2615 2617 ASSERT(vd == vd->vdev_top);
2616 2618 if (zio->io_type == ZIO_TYPE_WRITE) {
2617 2619 bcopy(zio->io_data, abuf, zio->io_size);
2618 2620 bzero(abuf + zio->io_size, asize - zio->io_size);
2619 2621 }
2620 2622 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2621 2623 }
2622 2624
2623 2625 /*
2624 2626 * If this is not a physical io, make sure that it is properly aligned
2625 2627 * before proceeding.
2626 2628 */
2627 2629 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2628 2630 ASSERT0(P2PHASE(zio->io_offset, align));
2629 2631 ASSERT0(P2PHASE(zio->io_size, align));
2630 2632 } else {
2631 2633 /*
2632 2634 * For physical writes, we allow 512b aligned writes and assume
2633 2635 * the device will perform a read-modify-write as necessary.
2634 2636 */
2635 2637 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2636 2638 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2637 2639 }
2638 2640
2639 2641 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2640 2642
2641 2643 /*
2642 2644 * If this is a repair I/O, and there's no self-healing involved --
2643 2645 * that is, we're just resilvering what we expect to resilver --
2644 2646 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2645 2647 * This prevents spurious resilvering with nested replication.
2646 2648 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2647 2649 * A is out of date, we'll read from C+D, then use the data to
2648 2650 * resilver A+B -- but we don't actually want to resilver B, just A.
2649 2651 * The top-level mirror has no way to know this, so instead we just
2650 2652 * discard unnecessary repairs as we work our way down the vdev tree.
2651 2653 * The same logic applies to any form of nested replication:
2652 2654 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
2653 2655 */
2654 2656 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2655 2657 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2656 2658 zio->io_txg != 0 && /* not a delegated i/o */
2657 2659 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2658 2660 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2659 2661 zio_vdev_io_bypass(zio);
2660 2662 return (ZIO_PIPELINE_CONTINUE);
2661 2663 }
2662 2664
2663 2665 if (vd->vdev_ops->vdev_op_leaf &&
2664 2666 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2665 2667
2666 2668 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2667 2669 return (ZIO_PIPELINE_CONTINUE);
2668 2670
2669 2671 if ((zio = vdev_queue_io(zio)) == NULL)
2670 2672 return (ZIO_PIPELINE_STOP);
2671 2673
2672 2674 if (!vdev_accessible(vd, zio)) {
2673 2675 zio->io_error = SET_ERROR(ENXIO);
2674 2676 zio_interrupt(zio);
2675 2677 return (ZIO_PIPELINE_STOP);
2676 2678 }
2677 2679 }
2678 2680
2679 2681 vd->vdev_ops->vdev_op_io_start(zio);
2680 2682 return (ZIO_PIPELINE_STOP);
2681 2683 }
2682 2684
/*
 * Pipeline stage: completion processing for a vdev I/O.  For leaf vdevs
 * this drains the vdev queue, feeds write data to the vdev cache, applies
 * any configured fault injection, and classifies the error; then the
 * vdev's own io_done method runs.  An unexpected error on a leaf that
 * still looks accessible triggers a probe of the device.
 */
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	/* With no vdev attached, behave as the root (mirror) vdev. */
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	/* Repeat this stage until all child vdev I/Os have completed. */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		/* Fault injection only overrides an otherwise clean I/O. */
		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				/* Device seems fine, yet the I/O failed. */
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2725 2727
/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 *
 * good_buf is the expected data; zcr->zcr_cbdata holds the bad copy
 * saved earlier by zio_vsd_default_cksum_report().
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}
2737 2739
2738 2740 /*ARGSUSED*/
2739 2741 void
2740 2742 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2741 2743 {
2742 2744 void *buf = zio_buf_alloc(zio->io_size);
2743 2745
2744 2746 bcopy(zio->io_data, buf, zio->io_size);
2745 2747
2746 2748 zcr->zcr_cbinfo = zio->io_size;
2747 2749 zcr->zcr_cbdata = buf;
2748 2750 zcr->zcr_finish = zio_vsd_default_cksum_finish;
2749 2751 zcr->zcr_free = zio_buf_free;
2750 2752 }
2751 2753
/*
 * Pipeline stage: post-I/O assessment.  Releases per-zio vdev state,
 * applies fault injection, decides whether a failed vdev-less I/O should
 * be retried (requeueing it at the head of the issue queue if so),
 * normalizes leaf errors to ENXIO for inaccessible devices, and invokes
 * the physical-done callback for leaf I/Os.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/*
	 * Release the SCL_ZIO config lock for vdev-less I/Os.
	 * NOTE(review): pairs with an spa_config_enter() in
	 * zio_vdev_io_start() -- confirm if changing lock scope.
	 */
	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	/* Free any vdev-specific data attached to this zio. */
	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		/* Back up the pipeline so VDEV_IO_START runs again. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	/* Any error collapses the remaining pipeline to the interlock set. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    zio->io_physdone != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
		zio->io_physdone(zio->io_logical);
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2819 2821
/*
 * Rewind the pipeline so that the VDEV_IO_START stage executes again.
 * (io_stage holds a single stage bit; shifting right steps back one
 * stage, so the pipeline re-enters VDEV_IO_START on resumption.)
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}
2828 2830
/*
 * Rewind the pipeline so that the VDEV_IO_DONE stage executes again.
 */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}
2836 2838
/*
 * Skip the vdev I/O stages entirely: mark the zio as bypassed and
 * advance the pipeline so the next stage executed is VDEV_IO_ASSESS.
 */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
2846 2848
2847 2849 /*
2848 2850 * ==========================================================================
2849 2851 * Generate and verify checksums
2850 2852 * ==========================================================================
2851 2853 */
2852 2854 static int
2853 2855 zio_checksum_generate(zio_t *zio)
2854 2856 {
2855 2857 blkptr_t *bp = zio->io_bp;
2856 2858 enum zio_checksum checksum;
2857 2859
2858 2860 if (bp == NULL) {
2859 2861 /*
2860 2862 * This is zio_write_phys().
2861 2863 * We're either generating a label checksum, or none at all.
2862 2864 */
2863 2865 checksum = zio->io_prop.zp_checksum;
2864 2866
2865 2867 if (checksum == ZIO_CHECKSUM_OFF)
2866 2868 return (ZIO_PIPELINE_CONTINUE);
2867 2869
2868 2870 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2869 2871 } else {
2870 2872 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2871 2873 ASSERT(!IO_IS_ALLOCATING(zio));
2872 2874 checksum = ZIO_CHECKSUM_GANG_HEADER;
2873 2875 } else {
2874 2876 checksum = BP_GET_CHECKSUM(bp);
2875 2877 }
2876 2878 }
2877 2879
2878 2880 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2879 2881
2880 2882 return (ZIO_PIPELINE_CONTINUE);
2881 2883 }
2882 2884
/*
 * Pipeline stage: verify the checksum of data just read.  For physical
 * label reads (bp == NULL) the expected checksum type comes from io_prop
 * and may be OFF; otherwise zio_checksum_error() determines and checks
 * it.  A checksum mismatch on a non-speculative I/O starts a deferred
 * checksum ereport.
 */
static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		/* Speculative (prefetch-style) I/Os do not raise ereports. */
		if (error == ECKSUM &&
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2915 2917
/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	/* Remove the verify stage from this zio's remaining pipeline. */
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
2924 2926
/*
 * ==========================================================================
 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success. ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
2934 2936 int
2935 2937 zio_worst_error(int e1, int e2)
2936 2938 {
2937 2939 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2938 2940 int r1, r2;
2939 2941
2940 2942 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2941 2943 if (e1 == zio_error_rank[r1])
2942 2944 break;
2943 2945
2944 2946 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2945 2947 if (e2 == zio_error_rank[r2])
2946 2948 break;
2947 2949
2948 2950 return (r1 > r2 ? e1 : e2);
2949 2951 }
2950 2952
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
/*
 * Pipeline stage: mark this zio "ready" and notify waiting parents.
 * For allocating I/Os this runs the io_ready callback, then snapshots
 * the block pointer into io_bp_copy and wakes every parent currently
 * waiting on ZIO_WAIT_READY.
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	/* Gang and ddt children must be ready before we are. */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	/* An error collapses the remaining pipeline to the interlock set. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them. The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
3013 3015
/*
 * Pipeline stage: final completion for a zio.  Waits for all children,
 * validates the resulting block pointer, inherits child errors, finishes
 * checksum reports and posts FMA ereports, decides whether the I/O must
 * be reexecuted or the pool suspended, and finally notifies parents and
 * either wakes the synchronous waiter or destroys the zio.
 */
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;	/* size before transforms are popped */
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/* Sanity-check the block pointer produced by this I/O. */
	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			/* Pad out to the report's alignment if needed. */
			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			ASSERT(zio->io_tqent.tqent_next == NULL);
			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
			    0, &zio->io_tqent);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		/* Synchronous caller (zio_wait) frees the zio itself. */
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}
3287 3289
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
/*
 * Table of pipeline stage functions, one entry per stage in execution
 * order.  NOTE(review): the table order appears to mirror the
 * ZIO_STAGE_* bit order (io_stage is advanced/rewound with shifts
 * elsewhere in this file) -- confirm against zio_impl.h before
 * inserting or reordering entries.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
3317 3319
3318 3320 /* dnp is the dnode for zb1->zb_object */
3319 3321 boolean_t
3320 3322 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3321 3323 const zbookmark_phys_t *zb2)
3322 3324 {
3323 3325 uint64_t zb1nextL0, zb2thisobj;
3324 3326
3325 3327 ASSERT(zb1->zb_objset == zb2->zb_objset);
3326 3328 ASSERT(zb2->zb_level == 0);
3327 3329
3328 3330 /* The objset_phys_t isn't before anything. */
3329 3331 if (dnp == NULL)
3330 3332 return (B_FALSE);
3331 3333
3332 3334 zb1nextL0 = (zb1->zb_blkid + 1) <<
3333 3335 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3334 3336
3335 3337 zb2thisobj = zb2->zb_object ? zb2->zb_object :
3336 3338 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3337 3339
3338 3340 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3339 3341 uint64_t nextobj = zb1nextL0 *
3340 3342 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3341 3343 return (nextobj <= zb2thisobj);
3342 3344 }
3343 3345
3344 3346 if (zb1->zb_object < zb2thisobj)
3345 3347 return (B_TRUE);
3346 3348 if (zb1->zb_object > zb2thisobj)
3347 3349 return (B_FALSE);
3348 3350 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3349 3351 return (B_FALSE);
3350 3352 return (zb1nextL0 <= zb2->zb_blkid);
3351 3353 }
|
↓ open down ↓ |
2206 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX