1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (c) 2012 by Delphix. All rights reserved.
  25  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/vdev.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zio.h>
  33 #include <sys/zio_checksum.h>
  34 
  35 #include <sys/fm/fs/zfs.h>
  36 #include <sys/fm/protocol.h>
  37 #include <sys/fm/util.h>
  38 #include <sys/sysevent.h>
  39 
  40 /*
  41  * This general routine is responsible for generating all the different ZFS
  42  * ereports.  The payload is dependent on the class, and which arguments are
  43  * supplied to the function:
  44  *
  45  *      EREPORT                 POOL    VDEV    IO
  46  *      block                   X       X       X
  47  *      data                    X               X
  48  *      device                  X       X
  49  *      pool                    X
  50  *
  51  * If we are in a loading state, all errors are chained together by the same
  52  * SPA-wide ENA (Error Numeric Association).
  53  *
  54  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
  55  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
  56  * to chain together all ereports associated with a logical piece of data.  For
  57  * read I/Os, there  are basically three 'types' of I/O, which form a roughly
  58  * layered diagram:
  59  *
  60  *      +---------------+
  61  *      | Aggregate I/O |       No associated logical data or device
  62  *      +---------------+
  63  *              |
  64  *              V
  65  *      +---------------+       Reads associated with a piece of logical data.
  66  *      |   Read I/O    |       This includes reads on behalf of RAID-Z,
  67  *      +---------------+       mirrors, gang blocks, retries, etc.
  68  *              |
  69  *              V
  70  *      +---------------+       Reads associated with a particular device, but
  71  *      | Physical I/O  |       no logical data.  Issued as part of vdev caching
  72  *      +---------------+       and I/O aggregation.
  73  *
  74  * Note that 'physical I/O' here is not the same terminology as used in the rest
  75  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
  76  * blockpointer.  But I/O with no associated block pointer can still be related
  77  * to a logical piece of data (i.e. RAID-Z requests).
  78  *
  79  * Purely physical I/O always have unique ENAs.  They are not related to a
  80  * particular piece of logical data, and therefore cannot be chained together.
  81  * We still generate an ereport, but the DE doesn't correlate it with any
  82  * logical piece of data.  When such an I/O fails, the delegated I/O requests
  83  * will issue a retry, which will trigger the 'real' ereport with the correct
  84  * ENA.
  85  *
  86  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
  87  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
  88  * then inherit this pointer, so that when it is first set subsequent failures
  89  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
  90  * this pointer is set to NULL, and no ereport will be generated (since it
  91  * doesn't actually correspond to any particular device or piece of data,
  92  * and the caller will always retry without caching or queueing anyway).
  93  *
  94  * For checksum errors, we want to include more information about the actual
  95  * error which occurs.  Accordingly, we build an ereport when the error is
  96  * noticed, but instead of sending it in immediately, we hang it off of the
  97  * io_cksum_report field of the logical IO.  When the logical IO completes
  98  * (successfully or not), zfs_ereport_finish_checksum() is called with the
  99  * good and bad versions of the buffer (if available), and we annotate the
 100  * ereport with information about the differences.
 101  */
 102 #ifdef _KERNEL
 103 static void
 104 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 105     const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 106     uint64_t stateoroffset, uint64_t size)
 107 {
 108         nvlist_t *ereport, *detector;
 109 
 110         uint64_t ena;
 111         char class[64];
 112 
 113         /*
 114          * If we are doing a spa_tryimport() or in recovery mode,
 115          * ignore errors.
 116          */
 117         if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
 118             spa_load_state(spa) == SPA_LOAD_RECOVER)
 119                 return;
 120 
 121         /*
 122          * If we are in the middle of opening a pool, and the previous attempt
 123          * failed, don't bother logging any new ereports - we're just going to
 124          * get the same diagnosis anyway.
 125          */
 126         if (spa_load_state(spa) != SPA_LOAD_NONE &&
 127             spa->spa_last_open_failed)
 128                 return;
 129 
 130         if (zio != NULL) {
 131                 /*
 132                  * If this is not a read or write zio, ignore the error.  This
 133                  * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
 134                  */
 135                 if (zio->io_type != ZIO_TYPE_READ &&
 136                     zio->io_type != ZIO_TYPE_WRITE)
 137                         return;
 138 
 139                 /*
 140                  * Ignore any errors from speculative I/Os, as failure is an
 141                  * expected result.
 142                  */
 143                 if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 144                         return;
 145 
 146                 /*
 147                  * If this I/O is not a retry I/O, don't post an ereport.
 148                  * Otherwise, we risk making bad diagnoses based on B_FAILFAST
 149                  * I/Os.
 150                  */
 151                 if (zio->io_error == EIO &&
 152                     !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 153                         return;
 154 
 155                 if (vd != NULL) {
 156                         /*
 157                          * If the vdev has already been marked as failing due
 158                          * to a failed probe, then ignore any subsequent I/O
 159                          * errors, as the DE will automatically fault the vdev
 160                          * on the first such failure.  This also catches cases
 161                          * where vdev_remove_wanted is set and the device has
 162                          * not yet been asynchronously placed into the REMOVED
 163                          * state.
 164                          */
 165                         if (zio->io_vd == vd && !vdev_accessible(vd, zio))
 166                                 return;
 167 
 168                         /*
 169                          * Ignore checksum errors for reads from DTL regions of
 170                          * leaf vdevs.
 171                          */
 172                         if (zio->io_type == ZIO_TYPE_READ &&
 173                             zio->io_error == ECKSUM &&
 174                             vd->vdev_ops->vdev_op_leaf &&
 175                             vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
 176                                 return;
 177                 }
 178         }
 179 
 180         /*
 181          * For probe failure, we want to avoid posting ereports if we've
 182          * already removed the device in the meantime.
 183          */
 184         if (vd != NULL &&
 185             strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
 186             (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
 187                 return;
 188 
 189         if ((ereport = fm_nvlist_create(NULL)) == NULL)
 190                 return;
 191 
 192         if ((detector = fm_nvlist_create(NULL)) == NULL) {
 193                 fm_nvlist_destroy(ereport, FM_NVA_FREE);
 194                 return;
 195         }
 196 
 197         /*
 198          * Serialize ereport generation
 199          */
 200         mutex_enter(&spa->spa_errlist_lock);
 201 
 202         /*
 203          * Determine the ENA to use for this event.  If we are in a loading
 204          * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
 205          * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
 206          */
 207         if (spa_load_state(spa) != SPA_LOAD_NONE) {
 208                 if (spa->spa_ena == 0)
 209                         spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
 210                 ena = spa->spa_ena;
 211         } else if (zio != NULL && zio->io_logical != NULL) {
 212                 if (zio->io_logical->io_ena == 0)
 213                         zio->io_logical->io_ena =
 214                             fm_ena_generate(0, FM_ENA_FMT1);
 215                 ena = zio->io_logical->io_ena;
 216         } else {
 217                 ena = fm_ena_generate(0, FM_ENA_FMT1);
 218         }
 219 
 220         /*
 221          * Construct the full class, detector, and other standard FMA fields.
 222          */
 223         (void) snprintf(class, sizeof (class), "%s.%s",
 224             ZFS_ERROR_CLASS, subclass);
 225 
 226         fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
 227             vd != NULL ? vd->vdev_guid : 0);
 228 
 229         fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
 230 
 231         /*
 232          * Construct the per-ereport payload, depending on which parameters are
 233          * passed in.
 234          */
 235 
 236         /*
 237          * Generic payload members common to all ereports.
 238          */
 239         fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
 240             DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
 241             DATA_TYPE_UINT64, spa_guid(spa),
 242             FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
 243             spa_load_state(spa), NULL);
 244 
 245         if (spa != NULL) {
 246                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
 247                     DATA_TYPE_STRING,
 248                     spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
 249                     FM_EREPORT_FAILMODE_WAIT :
 250                     spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
 251                     FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
 252                     NULL);
 253         }
 254 
 255         if (vd != NULL) {
 256                 vdev_t *pvd = vd->vdev_parent;
 257 
 258                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 259                     DATA_TYPE_UINT64, vd->vdev_guid,
 260                     FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 261                     DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
 262                 if (vd->vdev_path != NULL)
 263                         fm_payload_set(ereport,
 264                             FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
 265                             DATA_TYPE_STRING, vd->vdev_path, NULL);
 266                 if (vd->vdev_devid != NULL)
 267                         fm_payload_set(ereport,
 268                             FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
 269                             DATA_TYPE_STRING, vd->vdev_devid, NULL);
 270                 if (vd->vdev_fru != NULL)
 271                         fm_payload_set(ereport,
 272                             FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
 273                             DATA_TYPE_STRING, vd->vdev_fru, NULL);
 274 
 275                 if (pvd != NULL) {
 276                         fm_payload_set(ereport,
 277                             FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
 278                             DATA_TYPE_UINT64, pvd->vdev_guid,
 279                             FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
 280                             DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
 281                             NULL);
 282                         if (pvd->vdev_path)
 283                                 fm_payload_set(ereport,
 284                                     FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
 285                                     DATA_TYPE_STRING, pvd->vdev_path, NULL);
 286                         if (pvd->vdev_devid)
 287                                 fm_payload_set(ereport,
 288                                     FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
 289                                     DATA_TYPE_STRING, pvd->vdev_devid, NULL);
 290                 }
 291         }
 292 
 293         if (zio != NULL) {
 294                 /*
 295                  * Payload common to all I/Os.
 296                  */
 297                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
 298                     DATA_TYPE_INT32, zio->io_error, NULL);
 299 
 300                 /*
 301                  * If the 'size' parameter is non-zero, it indicates this is a
 302                  * RAID-Z or other I/O where the physical offset and length are
 303                  * provided for us, instead of within the zio_t.
 304                  */
 305                 if (vd != NULL) {
 306                         /*
 307                          * The 'stateoroffset' and 'size' parameters are
 308                          * overloaded to represent the timeout and latency,
 309                          * respectively, in a timeout report.
 310                          */
 311                         if (strcmp(subclass, FM_EREPORT_ZFS_TIMEOUT) == 0)
 312                                 fm_payload_set(ereport,
 313                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMEOUT,
 314                                     DATA_TYPE_UINT64, stateoroffset,
 315                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_LATENCY,
 316                                     DATA_TYPE_UINT64, size, NULL);
 317                         else if (size)
 318                                 fm_payload_set(ereport,
 319                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 320                                     DATA_TYPE_UINT64, stateoroffset,
 321                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 322                                     DATA_TYPE_UINT64, size, NULL);
 323                         else
 324                                 fm_payload_set(ereport,
 325                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 326                                     DATA_TYPE_UINT64, zio->io_offset,
 327                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 328                                     DATA_TYPE_UINT64, zio->io_size, NULL);
 329                 }
 330 
 331                 /*
 332                  * Payload for I/Os with corresponding logical information.
 333                  */
 334                 if (zio->io_logical != NULL)
 335                         fm_payload_set(ereport,
 336                             FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 337                             DATA_TYPE_UINT64,
 338                             zio->io_logical->io_bookmark.zb_objset,
 339                             FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
 340                             DATA_TYPE_UINT64,
 341                             zio->io_logical->io_bookmark.zb_object,
 342                             FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
 343                             DATA_TYPE_INT64,
 344                             zio->io_logical->io_bookmark.zb_level,
 345                             FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 346                             DATA_TYPE_UINT64,
 347                             zio->io_logical->io_bookmark.zb_blkid, NULL);
 348         } else if (vd != NULL) {
 349                 /*
 350                  * If we have a vdev but no zio, this is a device fault, and the
 351                  * 'stateoroffset' parameter indicates the previous state of the
 352                  * vdev.
 353                  */
 354                 fm_payload_set(ereport,
 355                     FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
 356                     DATA_TYPE_UINT64, stateoroffset, NULL);
 357         }
 358 
 359         mutex_exit(&spa->spa_errlist_lock);
 360 
 361         *ereport_out = ereport;
 362         *detector_out = detector;
 363 }
 364 
 365 /* if it's <= 128 bytes, save the corruption directly */
 366 #define ZFM_MAX_INLINE          (128 / sizeof (uint64_t))
 367 
 368 #define MAX_RANGES              16
 369 
/*
 * Scratch state describing how the bad buffer differs from the good copy.
 * Built by annotate_ecksum() to populate the checksum-ereport payload;
 * the caller frees it with kmem_free().
 */
typedef struct zfs_ecksum_info {
	/* histograms of set and cleared bits by bit number in a 64-bit word */
	uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
	uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];

	/* inline arrays of bits set and cleared. */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * for each range, the number of bits set and cleared.  The Hamming
	 * distance between the good and bad buffers is the sum of them all.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	/* half-open [zr_start, zr_end) runs of differing uint64_t words */
	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	/* number of valid entries in zei_ranges[] */
	size_t	zei_range_count;
	/* smallest gap seen between adjacent ranges (in uint64_t words) */
	uint32_t zei_mingap;
	/* ranges closer together than this are coalesced by add_range() */
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;
 396 
 397 static void
 398 update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
 399 {
 400         size_t i;
 401         size_t bits = 0;
 402         uint64_t value = BE_64(value_arg);
 403 
 404         /* We store the bits in big-endian (largest-first) order */
 405         for (i = 0; i < 64; i++) {
 406                 if (value & (1ull << i)) {
 407                         hist[63 - i]++;
 408                         ++bits;
 409                 }
 410         }
 411         /* update the count of bits changed */
 412         *count += bits;
 413 }
 414 
 415 /*
 416  * We've now filled up the range array, and need to increase "mingap" and
 417  * shrink the range list accordingly.  zei_mingap is always the smallest
 418  * distance between array entries, so we set the new_allowed_gap to be
 419  * one greater than that.  We then go through the list, joining together
 420  * any ranges which are closer than the new_allowed_gap.
 421  *
 422  * By construction, there will be at least one.  We also update zei_mingap
 423  * to the new smallest gap, to prepare for our next invocation.
 424  */
 425 static void
 426 shrink_ranges(zfs_ecksum_info_t *eip)
 427 {
 428         uint32_t mingap = UINT32_MAX;
 429         uint32_t new_allowed_gap = eip->zei_mingap + 1;
 430 
 431         size_t idx, output;
 432         size_t max = eip->zei_range_count;
 433 
 434         struct zei_ranges *r = eip->zei_ranges;
 435 
 436         ASSERT3U(eip->zei_range_count, >, 0);
 437         ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
 438 
 439         output = idx = 0;
 440         while (idx < max - 1) {
 441                 uint32_t start = r[idx].zr_start;
 442                 uint32_t end = r[idx].zr_end;
 443 
 444                 while (idx < max - 1) {
 445                         idx++;
 446 
 447                         uint32_t nstart = r[idx].zr_start;
 448                         uint32_t nend = r[idx].zr_end;
 449 
 450                         uint32_t gap = nstart - end;
 451                         if (gap < new_allowed_gap) {
 452                                 end = nend;
 453                                 continue;
 454                         }
 455                         if (gap < mingap)
 456                                 mingap = gap;
 457                         break;
 458                 }
 459                 r[output].zr_start = start;
 460                 r[output].zr_end = end;
 461                 output++;
 462         }
 463         ASSERT3U(output, <, eip->zei_range_count);
 464         eip->zei_range_count = output;
 465         eip->zei_mingap = mingap;
 466         eip->zei_allowed_mingap = new_allowed_gap;
 467 }
 468 
/*
 * Record a half-open range [start, end) of differing uint64_t words.
 * Ranges closer to the previous entry than zei_allowed_mingap are
 * coalesced into it; when the array is full, shrink_ranges() merges the
 * closest-together entries to make room.  Callers pass monotonically
 * increasing, non-overlapping ranges.
 */
static void
add_range(zfs_ecksum_info_t *eip, int start, int end)
{
	struct zei_ranges *r = eip->zei_ranges;
	size_t count = eip->zei_range_count;

	/* out of slots: compact the list (guaranteed to free at least one) */
	if (count >= MAX_RANGES) {
		shrink_ranges(eip);
		count = eip->zei_range_count;
	}
	if (count == 0) {
		/* first range: reset the gap-tracking state */
		eip->zei_mingap = UINT32_MAX;
		eip->zei_allowed_mingap = 1;
	} else {
		int gap = start - r[count - 1].zr_end;

		/* close enough to the previous range: extend it instead */
		if (gap < eip->zei_allowed_mingap) {
			r[count - 1].zr_end = end;
			return;
		}
		if (gap < eip->zei_mingap)
			eip->zei_mingap = gap;
	}
	r[count].zr_start = start;
	r[count].zr_end = end;
	eip->zei_range_count++;
}
 496 
 497 static size_t
 498 range_total_size(zfs_ecksum_info_t *eip)
 499 {
 500         struct zei_ranges *r = eip->zei_ranges;
 501         size_t count = eip->zei_range_count;
 502         size_t result = 0;
 503         size_t idx;
 504 
 505         for (idx = 0; idx < count; idx++)
 506                 result += (r[idx].zr_end - r[idx].zr_start);
 507 
 508         return (result);
 509 }
 510 
/*
 * Annotate a checksum ereport with the details of how the bad buffer
 * differs from the good copy: the expected/actual checksum words, the
 * byte-offset ranges of differing 64-bit words, and either the raw changed
 * bits (when they fit within ZFM_MAX_INLINE words) or per-bit-position
 * histograms.
 *
 * Returns a kmem-allocated zfs_ecksum_info_t which the caller must free,
 * or NULL when the buffers are identical and 'drop_if_identical' is set
 * (signalling the caller to drop the ereport entirely).  Injected errors
 * and missing buffers yield an empty (but non-NULL) result.
 */
static zfs_ecksum_info_t *
annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
    const uint8_t *goodbuf, const uint8_t *badbuf, size_t size,
    boolean_t drop_if_identical)
{
	const uint64_t *good = (const uint64_t *)goodbuf;
	const uint64_t *bad = (const uint64_t *)badbuf;

	uint64_t allset = 0;
	uint64_t allcleared = 0;

	size_t nui64s = size / sizeof (uint64_t);

	size_t inline_size;
	int no_inline = 0;
	size_t idx;
	size_t range;

	size_t offset = 0;
	ssize_t start = -1;	/* word index where the current run began */

	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);

	/* don't do any annotation for injected checksum errors */
	if (info != NULL && info->zbc_injected)
		return (eip);

	/* record the expected and actual checksums, if they were captured */
	if (info != NULL && info->zbc_has_cksum) {
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_expected) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_expected,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
		    DATA_TYPE_UINT64_ARRAY,
		    sizeof (info->zbc_actual) / sizeof (uint64_t),
		    (uint64_t *)&info->zbc_actual,
		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
		    DATA_TYPE_STRING,
		    info->zbc_checksum_name,
		    NULL);

		if (info->zbc_byteswapped) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
			    DATA_TYPE_BOOLEAN, 1,
			    NULL);
		}
	}

	/* without both buffers there is nothing to compare */
	if (badbuf == NULL || goodbuf == NULL)
		return (eip);

	ASSERT3U(nui64s, <=, UINT16_MAX);
	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, <=, UINT32_MAX);

	/* build up the range list by comparing the two buffers. */
	for (idx = 0; idx < nui64s; idx++) {
		if (good[idx] == bad[idx]) {
			if (start == -1)
				continue;

			/* end of a run of differing words */
			add_range(eip, start, idx);
			start = -1;
		} else {
			if (start != -1)
				continue;

			/* start of a new run of differing words */
			start = idx;
		}
	}
	/* close out a run still open at the end of the buffer */
	if (start != -1)
		add_range(eip, start, idx);

	/* See if it will fit in our inline buffers */
	inline_size = range_total_size(eip);
	if (inline_size > ZFM_MAX_INLINE)
		no_inline = 1;

	/*
	 * If there is no change and we want to drop if the buffers are
	 * identical, do so.
	 */
	if (inline_size == 0 && drop_if_identical) {
		kmem_free(eip, sizeof (*eip));
		return (NULL);
	}

	/*
	 * Now walk through the ranges, filling in the details of the
	 * differences.  Also convert our uint64_t-array offsets to byte
	 * offsets.
	 */
	for (range = 0; range < eip->zei_range_count; range++) {
		size_t start = eip->zei_ranges[range].zr_start;
		size_t end = eip->zei_ranges[range].zr_end;

		for (idx = start; idx < end; idx++) {
			uint64_t set, cleared;

			// bits set in bad, but not in good
			set = ((~good[idx]) & bad[idx]);
			// bits set in good, but not in bad
			cleared = (good[idx] & (~bad[idx]));

			/*
			 * NOTE(review): allset/allcleared are accumulated
			 * here but never consumed in this function --
			 * presumably reserved for a future payload member;
			 * confirm before removing.
			 */
			allset |= set;
			allcleared |= cleared;

			if (!no_inline) {
				ASSERT3U(offset, <, inline_size);
				eip->zei_bits_set[offset] = set;
				eip->zei_bits_cleared[offset] = cleared;
				offset++;
			}

			update_histogram(set, eip->zei_histogram_set,
			    &eip->zei_range_sets[range]);
			update_histogram(cleared, eip->zei_histogram_cleared,
			    &eip->zei_range_clears[range]);
		}

		/* convert to byte offsets */
		eip->zei_ranges[range].zr_start		*= sizeof (uint64_t);
		eip->zei_ranges[range].zr_end		*= sizeof (uint64_t);
	}
	eip->zei_allowed_mingap		*= sizeof (uint64_t);
	inline_size			*= sizeof (uint64_t);

	/* fill in ereport */
	fm_payload_set(ereport,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
	    (uint32_t *)eip->zei_ranges,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
	    NULL);

	if (!no_inline) {
		/* small difference: ship the raw changed bits inline */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
		    DATA_TYPE_UINT8_ARRAY,
		    inline_size, (uint8_t *)eip->zei_bits_cleared,
		    NULL);
	} else {
		/* large difference: ship per-bit-position histograms only */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
		    DATA_TYPE_UINT16_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
		    DATA_TYPE_UINT16_ARRAY,
		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
		    NULL);
	}
	return (eip);
}
 675 #endif
 676 
 677 void
 678 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 679     uint64_t stateoroffset, uint64_t size)
 680 {
 681 #ifdef _KERNEL
 682         nvlist_t *ereport = NULL;
 683         nvlist_t *detector = NULL;
 684 
 685         zfs_ereport_start(&ereport, &detector,
 686             subclass, spa, vd, zio, stateoroffset, size);
 687 
 688         if (ereport == NULL)
 689                 return;
 690 
 691         fm_ereport_post(ereport, EVCH_SLEEP);
 692 
 693         fm_nvlist_destroy(ereport, FM_NVA_FREE);
 694         fm_nvlist_destroy(detector, FM_NVA_FREE);
 695 #endif
 696 }
 697 
/*
 * Begin a deferred checksum ereport.  The report is built now but not
 * posted; it is hung off zio->io_logical->io_cksum_report so that
 * zfs_ereport_finish_checksum() can annotate it with the good/bad buffer
 * differences once the logical I/O completes.
 *
 * 'arg' is passed through to the vsd/default cksum-report callback, which
 * fills in the report's buffer-management members (zcr_free et al.).
 * If the ereport is suppressed, the report is torn down immediately.
 */
void
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
    zio_bad_cksum_t *info)
{
	zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);

	/* let the vdev-specific data hook set up buffer callbacks */
	if (zio->io_vsd != NULL)
		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
	else
		zio_vsd_default_cksum_report(zio, report, arg);

	/* copy the checksum failure information if it was provided */
	if (info != NULL) {
		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
		bcopy(info, report->zcr_ckinfo, sizeof (*info));
	}

	/* NOTE(review): assumes vd has a non-NULL vdev_top -- confirm */
	report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
	report->zcr_length = length;

#ifdef _KERNEL
	zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

	/* ereport suppressed: release everything allocated above */
	if (report->zcr_ereport == NULL) {
		report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
		if (report->zcr_ckinfo != NULL) {
			kmem_free(report->zcr_ckinfo,
			    sizeof (*report->zcr_ckinfo));
		}
		kmem_free(report, sizeof (*report));
		return;
	}
#endif

	/* chain onto the logical zio's pending checksum reports */
	mutex_enter(&spa->spa_errlist_lock);
	report->zcr_next = zio->io_logical->io_cksum_report;
	zio->io_logical->io_cksum_report = report;
	mutex_exit(&spa->spa_errlist_lock);
}
 739 
 740 void
 741 zfs_ereport_finish_checksum(zio_cksum_report_t *report,
 742     const void *good_data, const void *bad_data, boolean_t drop_if_identical)
 743 {
 744 #ifdef _KERNEL
 745         zfs_ecksum_info_t *info = NULL;
 746         info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
 747             good_data, bad_data, report->zcr_length, drop_if_identical);
 748 
 749         if (info != NULL)
 750                 fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
 751 
 752         fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE);
 753         fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE);
 754         report->zcr_ereport = report->zcr_detector = NULL;
 755 
 756         if (info != NULL)
 757                 kmem_free(info, sizeof (*info));
 758 #endif
 759 }
 760 
 761 void
 762 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 763 {
 764 #ifdef _KERNEL
 765         if (rpt->zcr_ereport != NULL) {
 766                 fm_nvlist_destroy(rpt->zcr_ereport,
 767                     FM_NVA_FREE);
 768                 fm_nvlist_destroy(rpt->zcr_detector,
 769                     FM_NVA_FREE);
 770         }
 771 #endif
 772         rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
 773 
 774         if (rpt->zcr_ckinfo != NULL)
 775                 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
 776 
 777         kmem_free(rpt, sizeof (*rpt));
 778 }
 779 
/*
 * Post a still-pending checksum ereport without completing or freeing it.
 * The caller retains ownership of the report; the same ereport may be
 * posted again later by zfs_ereport_finish_checksum().
 */
void
zfs_ereport_send_interim_checksum(zio_cksum_report_t *report)
{
#ifdef _KERNEL
        fm_ereport_post(report->zcr_ereport, EVCH_SLEEP);
#endif
}
 787 
 788 void
 789 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
 790     struct zio *zio, uint64_t offset, uint64_t length,
 791     const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc)
 792 {
 793 #ifdef _KERNEL
 794         nvlist_t *ereport = NULL;
 795         nvlist_t *detector = NULL;
 796         zfs_ecksum_info_t *info;
 797 
 798         zfs_ereport_start(&ereport, &detector,
 799             FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
 800 
 801         if (ereport == NULL)
 802                 return;
 803 
 804         info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 805             B_FALSE);
 806 
 807         if (info != NULL)
 808                 fm_ereport_post(ereport, EVCH_SLEEP);
 809 
 810         fm_nvlist_destroy(ereport, FM_NVA_FREE);
 811         fm_nvlist_destroy(detector, FM_NVA_FREE);
 812 
 813         if (info != NULL)
 814                 kmem_free(info, sizeof (*info));
 815 #endif
 816 }
 817 
/*
 * Construct and post a 'resource.fs.zfs.<name>' event identifying the
 * pool and, optionally, a vdev.  Shared helper for the zfs_post_*()
 * wrappers below.  No-op in userland builds.
 */
static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
        nvlist_t *resource;
        char class[64];

        /* a tryimport only probes the pool, so post nothing */
        if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
                return;

        if ((resource = fm_nvlist_create(NULL)) == NULL)
                return;

        /* class is "resource.fs.zfs.<name>" */
        (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
            ZFS_ERROR_CLASS, name);
        VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
        VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
        VERIFY(nvlist_add_uint64(resource,
            FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
        /* vdev guid is optional; pool-wide events pass vd == NULL */
        if (vd)
                VERIFY(nvlist_add_uint64(resource,
                    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);

        fm_ereport_post(resource, EVCH_SLEEP);

        fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}
 846 
 847 /*
 848  * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 849  * has been removed from the system.  This will cause the DE to ignore any
 850  * recent I/O errors, inferring that they are due to the asynchronous device
 851  * removal.
 852  */
 853 void
 854 zfs_post_remove(spa_t *spa, vdev_t *vd)
 855 {
 856         zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
 857 }
 858 
 859 /*
 860  * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 861  * has the 'autoreplace' property set, and therefore any broken vdevs will be
 862  * handled by higher level logic, and no vdev fault should be generated.
 863  */
 864 void
 865 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 866 {
 867         zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
 868 }
 869 
 870 /*
 871  * The 'resource.fs.zfs.statechange' event is an internal signal that the
 872  * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 873  * cause the retire agent to repair any outstanding fault management cases
 874  * open because the device was not found (fault.fs.zfs.device).
 875  */
 876 void
 877 zfs_post_state_change(spa_t *spa, vdev_t *vd)
 878 {
 879         zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
 880 }