/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/sysevent.h>

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload depends on the class and on which arguments are
 * supplied to the function:
 *
 *      EREPORT                 POOL    VDEV    IO
 *      block                   X       X       X
 *      data                    X               X
 *      device                  X       X
 *      pool                    X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
 * to chain together all ereports associated with a logical piece of data. For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *      +---------------+
 *      | Aggregate I/O |        No associated logical data or device
 *      +---------------+
 *               |
 *               V
 *      +---------------+        Reads associated with a piece of logical data.
 *      |   Read I/O    |        This includes reads on behalf of RAID-Z,
 *      +---------------+        mirrors, gang blocks, retries, etc.
 *               |
 *               V
 *      +---------------+        Reads associated with a particular device, but
 *      | Physical I/O  |        no logical data. Issued as part of vdev caching
 *      +---------------+        and I/O aggregation.
 *
 * Note that 'physical I/O' here does not mean the same thing as it does in the
 * rest of ZIO.  There, 'physical I/O' simply means that there is no attached
 * block pointer.  But I/O with no associated block pointer can still be
 * related to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child
 * I/Os then inherit this pointer, so that once it is set, subsequent failures
 * will use the same ENA.  For vdev cache fill and queue aggregation I/O, this
 * pointer is set to NULL, and no ereport will be generated (since it doesn't
 * actually correspond to any particular device or piece of data, and the
 * caller will always retry without caching or queueing anyway).
 *
 * For checksum errors, we want to include more information about the actual
 * error which occurs.  Accordingly, we build an ereport when the error is
 * noticed, but instead of sending it in immediately, we hang it off of the
 * io_cksum_report field of the logical IO.  When the logical IO completes
 * (successfully or not), zfs_ereport_finish_checksum() is called with the
 * good and bad versions of the buffer (if available), and we annotate the
 * ereport with information about the differences.
 */
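
/*
 * Rough illustration (an assumption drawn from the constants used below, not
 * a full specification of the event format): for a checksum error, the
 * subclass FM_EREPORT_ZFS_CHECKSUM ("checksum") is prefixed with
 * ZFS_ERROR_CLASS ("fs.zfs"), so the event posted to FMA carries the class
 * "ereport.fs.zfs.checksum", an ENA chained as described above, a zfs-scheme
 * detector naming the pool and vdev GUIDs, and whatever payload members
 * zfs_ereport_start() assembles for the supplied arguments.
 */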
#ifdef _KERNEL
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
    const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
        nvlist_t *ereport, *detector;

        uint64_t ena;
        char class[64];

        /*
         * If we are doing a spa_tryimport() or in recovery mode,
         * ignore errors.
         */
        if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
            spa_load_state(spa) == SPA_LOAD_RECOVER)
                return;

        /*
         * If we are in the middle of opening a pool, and the previous attempt
         * failed, don't bother logging any new ereports - we're just going to
         * get the same diagnosis anyway.
         */
        if (spa_load_state(spa) != SPA_LOAD_NONE &&
            spa->spa_last_open_failed)
                return;

        if (zio != NULL) {
                /*
                 * If this is not a read or write zio, ignore the error. This
                 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
                 */
                if (zio->io_type != ZIO_TYPE_READ &&
                    zio->io_type != ZIO_TYPE_WRITE)
                        return;

                /*
                 * Ignore any errors from speculative I/Os, as failure is an
                 * expected result.
                 */
                if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
                        return;

                /*
                 * If the I/O failed with EIO but is not a retry, don't post
                 * an ereport.  Otherwise, we risk making bad diagnoses based
                 * on B_FAILFAST I/Os.
                 */
                if (zio->io_error == EIO &&
                    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
                        return;

                if (vd != NULL) {
                        /*
                         * If the vdev has already been marked as failing due
                         * to a failed probe, then ignore any subsequent I/O
                         * errors, as the DE will automatically fault the vdev
                         * on the first such failure. This also catches cases
                         * where vdev_remove_wanted is set and the device has
                         * not yet been asynchronously placed into the REMOVED
                         * state.
                         */
                        if (zio->io_vd == vd && !vdev_accessible(vd, zio))
                                return;

                        /*
                         * Ignore checksum errors for reads from DTL regions of
                         * leaf vdevs.
                         */
                        if (zio->io_type == ZIO_TYPE_READ &&
                            zio->io_error == ECKSUM &&
                            vd->vdev_ops->vdev_op_leaf &&
                            vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
                                return;
                }
        }


        /*
         * For probe failure, we want to avoid posting ereports if we've
         * already removed the device in the meantime.
         */
        if (vd != NULL &&
            strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
            (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
                return;

        if ((ereport = fm_nvlist_create(NULL)) == NULL)
                return;

        if ((detector = fm_nvlist_create(NULL)) == NULL) {
                fm_nvlist_destroy(ereport, FM_NVA_FREE);
                return;
        }

        /*
         * Serialize ereport generation
         */
        mutex_enter(&spa->spa_errlist_lock);

        /*
         * Determine the ENA to use for this event. If we are in a loading
         * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
         * a root zio-wide ENA. Otherwise, simply use a unique ENA.
         */
        if (spa_load_state(spa) != SPA_LOAD_NONE) {
                if (spa->spa_ena == 0)
                        spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
                ena = spa->spa_ena;
        } else if (zio != NULL && zio->io_logical != NULL) {
                if (zio->io_logical->io_ena == 0)
                        zio->io_logical->io_ena =
                            fm_ena_generate(0, FM_ENA_FMT1);
                ena = zio->io_logical->io_ena;
        } else {
                ena = fm_ena_generate(0, FM_ENA_FMT1);
        }

        /*
         * Construct the full class, detector, and other standard FMA fields.
         */
        (void) snprintf(class, sizeof (class), "%s.%s",
            ZFS_ERROR_CLASS, subclass);

        fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
            vd != NULL ? vd->vdev_guid : 0);

        fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

        /*
         * Construct the per-ereport payload, depending on which parameters are
         * passed in.
         */

        /*
         * Generic payload members common to all ereports.
         */
        fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
            DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
            DATA_TYPE_UINT64, spa_guid(spa),
            FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
            spa_load_state(spa), NULL);

        if (spa != NULL) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
                    DATA_TYPE_STRING,
                    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
                    FM_EREPORT_FAILMODE_WAIT :
                    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
                    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
                    NULL);
        }

        if (vd != NULL) {
                vdev_t *pvd = vd->vdev_parent;

                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
                    DATA_TYPE_UINT64, vd->vdev_guid,
                    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
                    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
                if (vd->vdev_path != NULL)
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
                            DATA_TYPE_STRING, vd->vdev_path, NULL);
                if (vd->vdev_devid != NULL)
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
                            DATA_TYPE_STRING, vd->vdev_devid, NULL);
                if (vd->vdev_fru != NULL)
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
                            DATA_TYPE_STRING, vd->vdev_fru, NULL);

                if (pvd != NULL) {
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
                            DATA_TYPE_UINT64, pvd->vdev_guid,
                            FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
                            DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
                            NULL);
                        if (pvd->vdev_path)
                                fm_payload_set(ereport,
                                    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
                                    DATA_TYPE_STRING, pvd->vdev_path, NULL);
                        if (pvd->vdev_devid)
                                fm_payload_set(ereport,
                                    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
                                    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
                }
        }

        if (zio != NULL) {
                /*
                 * Payload common to all I/Os.
                 */
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
                    DATA_TYPE_INT32, zio->io_error, NULL);

                /*
                 * If the 'size' parameter is non-zero, it indicates this is a
                 * RAID-Z or other I/O where the physical offset and length are
                 * provided for us, instead of within the zio_t.
                 */
                if (vd != NULL) {
                        /*
                         * The 'stateoroffset' and 'size' parameters are
                         * overloaded to represent the timeout and latency,
                         * respectively, in a timeout report.
                         */
                        if (strcmp(subclass, FM_EREPORT_ZFS_TIMEOUT) == 0)
                                fm_payload_set(ereport,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMEOUT,
                                    DATA_TYPE_UINT64, stateoroffset,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_LATENCY,
                                    DATA_TYPE_UINT64, size, NULL);
                        else if (size)
                                fm_payload_set(ereport,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
                                    DATA_TYPE_UINT64, stateoroffset,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
                                    DATA_TYPE_UINT64, size, NULL);
                        else
                                fm_payload_set(ereport,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
                                    DATA_TYPE_UINT64, zio->io_offset,
                                    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
                                    DATA_TYPE_UINT64, zio->io_size, NULL);
                }

                /*
                 * Payload for I/Os with corresponding logical information.
                 */
                if (zio->io_logical != NULL)
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
                            DATA_TYPE_UINT64,
                            zio->io_logical->io_bookmark.zb_objset,
                            FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
                            DATA_TYPE_UINT64,
                            zio->io_logical->io_bookmark.zb_object,
                            FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
                            DATA_TYPE_INT64,
                            zio->io_logical->io_bookmark.zb_level,
                            FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
                            DATA_TYPE_UINT64,
                            zio->io_logical->io_bookmark.zb_blkid, NULL);
        } else if (vd != NULL) {
                /*
                 * If we have a vdev but no zio, this is a device fault, and
                 * the 'stateoroffset' parameter indicates the previous state
                 * of the vdev.
                 */
                fm_payload_set(ereport,
                    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
                    DATA_TYPE_UINT64, stateoroffset, NULL);
        }

        mutex_exit(&spa->spa_errlist_lock);

        *ereport_out = ereport;
        *detector_out = detector;
}

/* if it's <= 128 bytes, save the corruption directly */
#define ZFM_MAX_INLINE          (128 / sizeof (uint64_t))

#define MAX_RANGES              16
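
/*
 * Worked out from the definitions above: ZFM_MAX_INLINE is 128 / 8 = 16
 * 64-bit words, so at most 128 bytes of raw set/cleared bits are recorded
 * inline, and at most MAX_RANGES (16) differing ranges are tracked before
 * shrink_ranges() starts merging neighbors.  Larger corruptions fall back
 * to the per-bit histograms only.
 */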

typedef struct zfs_ecksum_info {
        /* histograms of set and cleared bits by bit number in a 64-bit word */
        uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
        uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];

        /* inline arrays of bits set and cleared. */
        uint64_t zei_bits_set[ZFM_MAX_INLINE];
        uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

        /*
         * for each range, the number of bits set and cleared. The Hamming
         * distance between the good and bad buffers is the sum of them all.
         */
        uint32_t zei_range_sets[MAX_RANGES];
        uint32_t zei_range_clears[MAX_RANGES];

        struct zei_ranges {
                uint32_t zr_start;
                uint32_t zr_end;
        } zei_ranges[MAX_RANGES];

        size_t zei_range_count;
        uint32_t zei_mingap;
        uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;

static void
update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
{
        size_t i;
        size_t bits = 0;
        uint64_t value = BE_64(value_arg);

        /* We store the bits in big-endian (largest-first) order */
        for (i = 0; i < 64; i++) {
                if (value & (1ull << i)) {
                        hist[63 - i]++;
                        ++bits;
                }
        }
        /* update the count of bits changed */
        *count += bits;
}

/*
 * We've now filled up the range array, and need to increase "mingap" and
 * shrink the range list accordingly.  zei_mingap is always the smallest
 * distance between array entries, so we set the new_allowed_gap to be
 * one greater than that.  We then go through the list, joining together
 * any ranges which are closer than the new_allowed_gap.
 *
 * By construction, at least one pair of adjacent ranges will be merged.
 * We also update zei_mingap to the new smallest gap, to prepare for our
 * next invocation.
 */
static void
shrink_ranges(zfs_ecksum_info_t *eip)
{
        uint32_t mingap = UINT32_MAX;
        uint32_t new_allowed_gap = eip->zei_mingap + 1;

        size_t idx, output;
        size_t max = eip->zei_range_count;

        struct zei_ranges *r = eip->zei_ranges;

        ASSERT3U(eip->zei_range_count, >, 0);
        ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);

        output = idx = 0;
        while (idx < max - 1) {
                uint32_t start = r[idx].zr_start;
                uint32_t end = r[idx].zr_end;

                while (idx < max - 1) {
                        idx++;

                        uint32_t nstart = r[idx].zr_start;
                        uint32_t nend = r[idx].zr_end;

                        uint32_t gap = nstart - end;
                        if (gap < new_allowed_gap) {
                                end = nend;
                                continue;
                        }
                        if (gap < mingap)
                                mingap = gap;
                        break;
                }
                r[output].zr_start = start;
                r[output].zr_end = end;
                output++;
        }
        ASSERT3U(output, <, eip->zei_range_count);
        eip->zei_range_count = output;
        eip->zei_mingap = mingap;
        eip->zei_allowed_mingap = new_allowed_gap;
}

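/*
 * Append the half-open word range [start, end) of differing uint64_ts to the
 * range list, merging with the previous range when the gap is below the
 * currently allowed minimum.  A small traced example (values hypothetical):
 * after add_range(eip, 0, 2) with zei_allowed_mingap == 1, a subsequent
 * add_range(eip, 3, 5) sees a gap of 1, which is not less than the allowed
 * minimum, so a second range is appended and zei_mingap drops to 1.
 */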
static void
add_range(zfs_ecksum_info_t *eip, int start, int end)
{
        struct zei_ranges *r = eip->zei_ranges;
        size_t count = eip->zei_range_count;

        if (count >= MAX_RANGES) {
                shrink_ranges(eip);
                count = eip->zei_range_count;
        }
        if (count == 0) {
                eip->zei_mingap = UINT32_MAX;
                eip->zei_allowed_mingap = 1;
        } else {
                int gap = start - r[count - 1].zr_end;

                if (gap < eip->zei_allowed_mingap) {
                        r[count - 1].zr_end = end;
                        return;
                }
                if (gap < eip->zei_mingap)
                        eip->zei_mingap = gap;
        }
        r[count].zr_start = start;
        r[count].zr_end = end;
        eip->zei_range_count++;
}

static size_t
range_total_size(zfs_ecksum_info_t *eip)
{
        struct zei_ranges *r = eip->zei_ranges;
        size_t count = eip->zei_range_count;
        size_t result = 0;
        size_t idx;

        for (idx = 0; idx < count; idx++)
                result += (r[idx].zr_end - r[idx].zr_start);

        return (result);
}

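/*
 * Compare the good and bad buffers 64 bits at a time, record the ranges of
 * words that differ, and attach the results to the checksum ereport.  If the
 * total number of differing words fits in ZFM_MAX_INLINE, the raw set/cleared
 * bits are included inline; otherwise only the per-bit-position histograms
 * are attached.  Returns NULL (and frees the info) when the buffers are
 * identical and drop_if_identical is set, so the caller can suppress the
 * ereport entirely.
 */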
static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
    boolean_t drop_if_identical)
{
        const uint64_t *good = (const uint64_t *)goodbuf;
        const uint64_t *bad = (const uint64_t *)badbuf;

        uint64_t allset = 0;
        uint64_t allcleared = 0;

        size_t nui64s = size / sizeof (uint64_t);

        size_t inline_size;
        int no_inline = 0;
        size_t idx;
        size_t range;

        size_t offset = 0;
        ssize_t start = -1;

        zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);

        /* don't do any annotation for injected checksum errors */
        if (info != NULL && info->zbc_injected)
                return (eip);

        if (info != NULL && info->zbc_has_cksum) {
                fm_payload_set(ereport,
                    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
                    DATA_TYPE_UINT64_ARRAY,
                    sizeof (info->zbc_expected) / sizeof (uint64_t),
                    (uint64_t *)&info->zbc_expected,
                    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
                    DATA_TYPE_UINT64_ARRAY,
                    sizeof (info->zbc_actual) / sizeof (uint64_t),
                    (uint64_t *)&info->zbc_actual,
                    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
                    DATA_TYPE_STRING,
                    info->zbc_checksum_name,
                    NULL);

                if (info->zbc_byteswapped) {
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
                            DATA_TYPE_BOOLEAN, 1,
                            NULL);
                }
        }

        if (badbuf == NULL || goodbuf == NULL)
                return (eip);

        ASSERT3U(nui64s, <=, UINT16_MAX);
        ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
        ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
        ASSERT3U(size, <=, UINT32_MAX);

        /* build up the range list by comparing the two buffers. */
        for (idx = 0; idx < nui64s; idx++) {
                if (good[idx] == bad[idx]) {
                        if (start == -1)
                                continue;

                        add_range(eip, start, idx);
                        start = -1;
                } else {
                        if (start != -1)
                                continue;

                        start = idx;
                }
        }
        if (start != -1)
                add_range(eip, start, idx);

        /* See if it will fit in our inline buffers */
        inline_size = range_total_size(eip);
        if (inline_size > ZFM_MAX_INLINE)
                no_inline = 1;

        /*
         * If there is no change and we want to drop if the buffers are
         * identical, do so.
         */
        if (inline_size == 0 && drop_if_identical) {
                kmem_free(eip, sizeof (*eip));
                return (NULL);
        }


        /*
         * Now walk through the ranges, filling in the details of the
         * differences. Also convert our uint64_t-array offsets to byte
         * offsets.
         */
        for (range = 0; range < eip->zei_range_count; range++) {
                size_t start = eip->zei_ranges[range].zr_start;
                size_t end = eip->zei_ranges[range].zr_end;

                for (idx = start; idx < end; idx++) {
                        uint64_t set, cleared;

                        /* bits set in bad, but not in good */
                        set = ((~good[idx]) & bad[idx]);
                        /* bits set in good, but not in bad */
                        cleared = (good[idx] & (~bad[idx]));

                        allset |= set;
                        allcleared |= cleared;

                        if (!no_inline) {
                                ASSERT3U(offset, <, inline_size);
                                eip->zei_bits_set[offset] = set;
                                eip->zei_bits_cleared[offset] = cleared;
                                offset++;
                        }

                        update_histogram(set, eip->zei_histogram_set,
                            &eip->zei_range_sets[range]);
                        update_histogram(cleared, eip->zei_histogram_cleared,
                            &eip->zei_range_clears[range]);
                }

                /* convert to byte offsets */
                eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
                eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
        }
        eip->zei_allowed_mingap *= sizeof (uint64_t);
        inline_size *= sizeof (uint64_t);

        /* fill in ereport */
        fm_payload_set(ereport,
            FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
            DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
            (uint32_t *)eip->zei_ranges,
            FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
            DATA_TYPE_UINT32, eip->zei_allowed_mingap,
            FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
            DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
            FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
            DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
            NULL);

        if (!no_inline) {
                fm_payload_set(ereport,
                    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
                    DATA_TYPE_UINT8_ARRAY,
                    inline_size, (uint8_t *)eip->zei_bits_set,
                    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
                    DATA_TYPE_UINT8_ARRAY,
                    inline_size, (uint8_t *)eip->zei_bits_cleared,
                    NULL);
        } else {
                fm_payload_set(ereport,
                    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
                    DATA_TYPE_UINT16_ARRAY,
                    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
                    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
                    DATA_TYPE_UINT16_ARRAY,
                    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
                    NULL);
        }
        return (eip);
}
#endif

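/*
 * A minimal usage sketch (an assumption based on how the ZFS error paths
 * invoke this interface, not a definitive list of callers):
 *
 *      zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
 *
 * posts an I/O ereport against 'vd', while a device-state report passes a
 * NULL zio and the previous vdev state in 'stateoroffset'.
 */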
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
        nvlist_t *ereport = NULL;
        nvlist_t *detector = NULL;

        zfs_ereport_start(&ereport, &detector,
            subclass, spa, vd, zio, stateoroffset, size);

        if (ereport == NULL)
                return;

        fm_ereport_post(ereport, EVCH_SLEEP);

        fm_nvlist_destroy(ereport, FM_NVA_FREE);
        fm_nvlist_destroy(detector, FM_NVA_FREE);
#endif
}

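/*
 * Begin a deferred checksum ereport: allocate a zio_cksum_report_t, copy in
 * the (optional) zio_bad_cksum_t details, build the ereport skeleton via
 * zfs_ereport_start(), and chain the report onto the logical zio's
 * io_cksum_report list.  The report is later completed by
 * zfs_ereport_finish_checksum() or discarded by zfs_ereport_free_checksum().
 */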
void
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
    zio_bad_cksum_t *info)
{
        zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);

        if (zio->io_vsd != NULL)
                zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
        else
                zio_vsd_default_cksum_report(zio, report, arg);

        /* copy the checksum failure information if it was provided */
        if (info != NULL) {
                report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
                bcopy(info, report->zcr_ckinfo, sizeof (*info));
        }

        report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
        report->zcr_length = length;

#ifdef _KERNEL
        zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
            FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

        if (report->zcr_ereport == NULL) {
                report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
                if (report->zcr_ckinfo != NULL) {
                        kmem_free(report->zcr_ckinfo,
                            sizeof (*report->zcr_ckinfo));
                }
                kmem_free(report, sizeof (*report));
                return;
        }
#endif

        mutex_enter(&spa->spa_errlist_lock);
        report->zcr_next = zio->io_logical->io_cksum_report;
        zio->io_logical->io_cksum_report = report;
        mutex_exit(&spa->spa_errlist_lock);
}

void
zfs_ereport_finish_checksum(zio_cksum_report_t *report,
    const void *good_data, const void *bad_data, boolean_t drop_if_identical)
{
#ifdef _KERNEL
        zfs_ecksum_info_t *info = NULL;
        info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
            good_data, bad_data, report->zcr_length, drop_if_identical);

        if (info != NULL)
                fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);

        fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
        fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
        report->zcr_ereport = report->zcr_detector = NULL;

        if (info != NULL)
                kmem_free(info, sizeof (*info));
#endif
}

void
zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
{
#ifdef _KERNEL
        if (rpt->zcr_ereport != NULL) {
                fm_nvlist_destroy(rpt->zcr_ereport,
                    FM_NVA_FREE);
                fm_nvlist_destroy(rpt->zcr_detector,
                    FM_NVA_FREE);
        }
#endif
        rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);

        if (rpt->zcr_ckinfo != NULL)
                kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));

        kmem_free(rpt, sizeof (*rpt));
}

void
zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
{
#ifdef _KERNEL
        fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
#endif
}

void
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length,
    const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
{
#ifdef _KERNEL
        nvlist_t *ereport = NULL;
        nvlist_t *detector = NULL;
        zfs_ecksum_info_t *info;

        zfs_ereport_start(&ereport, &detector,
            FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

        if (ereport == NULL)
                return;

        info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
            B_FALSE);

        if (info != NULL)
                fm_ereport_post(ereport, EVCH_SLEEP);

        fm_nvlist_destroy(ereport, FM_NVA_FREE);
        fm_nvlist_destroy(detector, FM_NVA_FREE);

        if (info != NULL)
                kmem_free(info, sizeof (*info));
#endif
}

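/*
 * Common helper for the resource events below: build a sysevent-style nvlist
 * whose class is "resource.fs.zfs.<name>" (e.g. "resource.fs.zfs.removed"),
 * tag it with the pool GUID and, if supplied, the vdev GUID, and hand it to
 * the FMA event transport.
 */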
static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
        nvlist_t *resource;
        char class[64];

        if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
                return;

        if ((resource = fm_nvlist_create(NULL)) == NULL)
                return;

        (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
            ZFS_ERROR_CLASS, name);
        VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
        VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
        VERIFY(nvlist_add_uint64(resource,
            FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
        if (vd)
                VERIFY(nvlist_add_uint64(resource,
                    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);

        fm_ereport_post(resource, EVCH_SLEEP);

        fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given
 * vdev has been removed from the system.  This will cause the DE to ignore
 * any recent I/O errors, inferring that they are due to the asynchronous
 * device removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
        zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
        zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}

/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd)
{
        zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
}