1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/cdefs.h>
  27 
/* CRC-64 table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Private errno-style code for checksum failures (not a standard errno). */
#define ECKSUM  666

/*
 * Kernel ASSERT* macros compiled out for the boot environment; kept so
 * shared ZFS sources included below build unchanged.
 */
#define ASSERT3S(x, y, z)       ((void)0)
#define ASSERT3U(x, y, z)       ((void)0)
#define ASSERT3P(x, y, z)       ((void)0)
#define ASSERT0(x)              ((void)0)
#define ASSERT(x)               ((void)0)

/* Boot-time panic: print the message and spin forever (no way to recover). */
#define panic(...)      do {                                            \
        printf(__VA_ARGS__);                                            \
        for (;;) ;                                                      \
} while (0)

/* Map kernel allocator calls onto the loader's zfs_alloc/zfs_free. */
#define kmem_alloc(size, flag)  zfs_alloc((size))
#define kmem_free(ptr, size)    zfs_free((ptr), (size))
  45 
  46 static void
  47 zfs_init_crc(void)
  48 {
  49         int i, j;
  50         uint64_t *ct;
  51 
  52         /*
  53          * Calculate the crc64 table (used for the zap hash
  54          * function).
  55          */
  56         if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
  57                 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
  58                 for (i = 0; i < 256; i++)
  59                         for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
  60                                 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
  61         }
  62 }
  63 
/*
 * Checksum vector for the "off" algorithm: always writes an all-zero
 * checksum into *zcp; buf/size/ctx_template are ignored.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
  70 
  71 /*
  72  * Signature for checksum functions.
  73  */
  74 typedef void zio_checksum_t(const void *data, uint64_t size,
  75     const void *ctx_template, zio_cksum_t *zcp);
  76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
  77 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
  78 
  79 typedef enum zio_checksum_flags {
  80         /* Strong enough for metadata? */
  81         ZCHECKSUM_FLAG_METADATA = (1 << 1),
  82         /* ZIO embedded checksum */
  83         ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
  84         /* Strong enough for dedup (without verification)? */
  85         ZCHECKSUM_FLAG_DEDUP = (1 << 3),
  86         /* Uses salt value */
  87         ZCHECKSUM_FLAG_SALTED = (1 << 4),
  88         /* Strong enough for nopwrite? */
  89         ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
  90 } zio_checksum_flags_t;
  91 
  92 /*
  93  * Information about each checksum function.
  94  */
  95 typedef struct zio_checksum_info {
  96         /* checksum function for each byteorder */
  97         zio_checksum_t                  *ci_func[2];
  98         zio_checksum_tmpl_init_t        *ci_tmpl_init;
  99         zio_checksum_tmpl_free_t        *ci_tmpl_free;
 100         zio_checksum_flags_t            ci_flags;
 101         const char                      *ci_name;       /* descriptive name */
 102 } zio_checksum_info_t;
 103 
 104 #include "blkptr.c"
 105 
 106 #include "fletcher.c"
 107 #include "sha256.c"
 108 
/*
 * Checksum vectors, indexed by the BP_GET_CHECKSUM() value from the block
 * pointer; entry order therefore must not change.  Entries with NULL
 * ci_func slots are rejected by zio_checksum_verify().
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL, NULL}, NULL, NULL, 0, "inherit"},
        {{NULL, NULL}, NULL, NULL, 0, "on"},
        {{zio_checksum_off,     zio_checksum_off}, NULL, NULL, 0, "off"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
        {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
        {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
            0, "fletcher2"},
        {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA, "fletcher4"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
        {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
        {{zio_checksum_off,     zio_checksum_off}, NULL, NULL,
            0, "noparity"},
        {{zio_checksum_SHA512_native,   zio_checksum_SHA512_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
        /* no skein and edonr for now */
        {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
            ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED |
            ZCHECKSUM_FLAG_NOPWRITE, "skein"},
        {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
            ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
 140 
 141 /*
 142  * Common signature for all zio compress/decompress functions.
 143  */
 144 typedef size_t zio_compress_func_t(void *src, void *dst,
 145     size_t s_len, size_t d_len, int);
 146 typedef int zio_decompress_func_t(void *src, void *dst,
 147     size_t s_len, size_t d_len, int);
 148 
 149 extern int gzip_decompress(void *src, void *dst,
 150     size_t s_len, size_t d_len, int);
 151 /*
 152  * Information about each compression function.
 153  */
 154 typedef struct zio_compress_info {
 155         zio_compress_func_t     *ci_compress;   /* compression function */
 156         zio_decompress_func_t   *ci_decompress; /* decompression function */
 157         int                     ci_level;       /* level parameter */
 158         const char              *ci_name;       /* algorithm name */
 159 } zio_compress_info_t;
 160 
 161 #include "lzjb.c"
 162 #include "zle.c"
 163 #include "lz4.c"
 164 
 165 /*
 166  * Compression vectors.
 167  */
 168 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
 169         {NULL,                  NULL,                   0,      "inherit"},
 170         {NULL,                  NULL,                   0,      "on"},
 171         {NULL,                  NULL,                   0,      "uncompressed"},
 172         {NULL,                  lzjb_decompress,        0,      "lzjb"},
 173         {NULL,                  NULL,                   0,      "empty"},
 174         {NULL,                  gzip_decompress,        1,      "gzip-1"},
 175         {NULL,                  gzip_decompress,        2,      "gzip-2"},
 176         {NULL,                  gzip_decompress,        3,      "gzip-3"},
 177         {NULL,                  gzip_decompress,        4,      "gzip-4"},
 178         {NULL,                  gzip_decompress,        5,      "gzip-5"},
 179         {NULL,                  gzip_decompress,        6,      "gzip-6"},
 180         {NULL,                  gzip_decompress,        7,      "gzip-7"},
 181         {NULL,                  gzip_decompress,        8,      "gzip-8"},
 182         {NULL,                  gzip_decompress,        9,      "gzip-9"},
 183         {NULL,                  zle_decompress,         64,     "zle"},
 184         {NULL,                  lz4_decompress,         0,      "lz4"},
 185 };
 186 
 187 static void
 188 byteswap_uint64_array(void *vbuf, size_t size)
 189 {
 190         uint64_t *buf = vbuf;
 191         size_t count = size >> 3;
 192         int i;
 193 
 194         ASSERT((size & 7) == 0);
 195 
 196         for (i = 0; i < count; i++)
 197                 buf[i] = BSWAP_64(buf[i]);
 198 }
 199 
 200 /*
 201  * Set the external verifier for a gang block based on <vdev, offset, txg>,
 202  * a tuple which is guaranteed to be unique for the life of the pool.
 203  */
 204 static void
 205 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 206 {
 207         const dva_t *dva = BP_IDENTITY(bp);
 208         uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 209 
 210         ASSERT(BP_IS_GANG(bp));
 211 
 212         ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
 213 }
 214 
 215 /*
 216  * Set the external verifier for a label block based on its offset.
 217  * The vdev is implicit, and the txg is unknowable at pool open time --
 218  * hence the logic in vdev_uberblock_load() to find the most recent copy.
 219  */
 220 static void
 221 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 222 {
 223         ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
 224 }
 225 
 226 /*
 227  * Calls the template init function of a checksum which supports context
 228  * templates and installs the template into the spa_t.
 229  */
 230 static void
 231 zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa)
 232 {
 233         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 234 
 235         if (ci->ci_tmpl_init == NULL)
 236                 return;
 237 #if 0   /* for now we dont have anything here */
 238         if (spa->spa_cksum_tmpls[checksum] != NULL)
 239                 return;
 240 
 241         VERIFY(ci->ci_tmpl_free != NULL);
 242         mutex_enter(&spa->spa_cksum_tmpls_lock);
 243         if (spa->spa_cksum_tmpls[checksum] == NULL) {
 244                 spa->spa_cksum_tmpls[checksum] =
 245                     ci->ci_tmpl_init(&spa->spa_cksum_salt);
 246                 VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
 247         }
 248         mutex_exit(&spa->spa_cksum_tmpls_lock);
 249 #endif
 250 }
 251 
/*
 * Verify that data (BP_GET_PSIZE(bp) bytes) matches the checksum recorded
 * for bp.  Returns 0 on match, EINVAL if the block pointer names an
 * unknown or unimplemented checksum algorithm, EIO on mismatch.
 */
static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
        uint64_t size;
        unsigned int checksum;
        zio_checksum_info_t *ci;
        zio_cksum_t actual_cksum, expected_cksum, verifier;
        int byteswap;

        checksum = BP_GET_CHECKSUM(bp);
        size = BP_GET_PSIZE(bp);

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
                return (EINVAL);
        ci = &zio_checksum_table[checksum];
        if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
                return (EINVAL);

        zio_checksum_template_init(checksum, NULL);
        if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
                /*
                 * Embedded checksums (gang headers, labels) live in a
                 * zio_eck_t at the end of the buffer, and were computed
                 * with an external "verifier" value occupying the
                 * checksum slot.
                 */
                zio_eck_t *eck;

                ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
                    checksum == ZIO_CHECKSUM_LABEL);

                eck = (zio_eck_t *)((char *)data + size) - 1;

                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
                        zio_checksum_gang_verifier(&verifier, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
                        zio_checksum_label_verifier(&verifier,
                            DVA_GET_OFFSET(BP_IDENTITY(bp)));
                else
                        verifier = bp->blk_cksum; /* defensive; asserted unreachable */

                /* The embedded magic reveals the writer's byte order. */
                byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

                if (byteswap)
                        byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

                /*
                 * Temporarily substitute the verifier for the stored
                 * checksum, recompute, then restore — order matters here.
                 */
                expected_cksum = eck->zec_cksum;
                eck->zec_cksum = verifier;
                ci->ci_func[byteswap](data, size, NULL, &actual_cksum);
                eck->zec_cksum = expected_cksum;

                if (byteswap)
                        byteswap_uint64_array(&expected_cksum,
                            sizeof (zio_cksum_t));
        } else {
                /* Non-embedded: the checksum lives in the block pointer. */
                expected_cksum = bp->blk_cksum;
                ci->ci_func[0](data, size, NULL, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
                /*printf("ZFS: read checksum failed\n");*/
                return (EIO);
        }

        return (0);
}
 312 
 313 static int
 314 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
 315         void *dest, uint64_t destsize)
 316 {
 317         zio_compress_info_t *ci;
 318 
 319         if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
 320                 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
 321                 return (EIO);
 322         }
 323 
 324         ci = &zio_compress_table[cpfunc];
 325         if (!ci->ci_decompress) {
 326                 printf("ZFS: unsupported compression algorithm %s\n",
 327                     ci->ci_name);
 328                 return (EIO);
 329         }
 330 
 331         return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
 332 }
 333 
 334 static uint64_t
 335 zap_hash(uint64_t salt, const char *name)
 336 {
 337         const uint8_t *cp;
 338         uint8_t c;
 339         uint64_t crc = salt;
 340 
 341         ASSERT(crc != 0);
 342         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 343         for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
 344                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
 345 
 346         /*
 347          * Only use 28 bits, since we need 4 bits in the cookie for the
 348          * collision differentiator.  We MUST use the high bits, since
 349          * those are the onces that we first pay attention to when
 350          * chosing the bucket.
 351          */
 352         crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
 353 
 354         return (crc);
 355 }
 356 
 357 static void *zfs_alloc(size_t size);
 358 static void zfs_free(void *ptr, size_t size);
 359 
/* One column (one child device's slice) of a RAID-Z I/O. */
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of one logical I/O spread across a RAID-Z group.  Columns
 * [0, rm_firstdatacol) are parity (P, then Q, then R); the rest are data.
 */
typedef struct raidz_map {
        uint64_t rm_cols;               /* Regular column count */
        uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        uint64_t rm_nskip;              /* Skipped sectors for padding */
        uint64_t rm_skipstart;          /* Column index of padding start */
        uintptr_t rm_reports;           /* # of referencing checksum reports */
        uint8_t rm_freed;               /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;      /* checksum error was injected */
        /* pre-C99 flexible-array idiom; allocated with rm_scols entries */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
} raidz_map_t;
 385 
/* Fixed column indices of the parity columns within rm_col[]. */
#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1
#define VDEV_RAIDZ_R            2

/*
 * Multiply one byte by 2 (resp. 4) in GF(2^8) with the 0x11d reduction
 * polynomial: shift left, then XOR in 0x1d if the high bit fell out.
 */
#define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
        (mask) = (x) & 0x8080808080808080ULL; \
        (mask) = ((mask) << 1) - ((mask) >> 7); \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
        VDEV_RAIDZ_64MUL_2((x), mask); \
        VDEV_RAIDZ_64MUL_2((x), mask); \
}
 412 
 413 /*
 414  * These two tables represent powers and logs of 2 in the Galois field defined
 415  * above. These values were computed by repeatedly multiplying by 2 as above.
 416  */
 417 static const uint8_t vdev_raidz_pow2[256] = {
 418         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 419         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 420         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 421         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 422         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 423         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 424         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 425         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 426         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 427         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 428         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 429         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 430         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 431         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 432         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 433         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 434         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 435         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 436         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 437         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 438         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 439         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 440         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 441         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 442         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 443         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 444         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 445         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 446         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 447         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 448         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 449         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 450 };
 451 static const uint8_t vdev_raidz_log2[256] = {
 452         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 453         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 454         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 455         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 456         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 457         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 458         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 459         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 460         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 461         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 462         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 463         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 464         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 465         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 466         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 467         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 468         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 469         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 470         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 471         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 472         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 473         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 474         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 475         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 476         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 477         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 478         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 479         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 480         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 481         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 482         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 483         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 484 };
 485 
 486 /*
 487  * Multiply a given number by 2 raised to the given power.
 488  */
 489 static uint8_t
 490 vdev_raidz_exp2(uint8_t a, int exp)
 491 {
 492         if (a == 0)
 493                 return (0);
 494 
 495         ASSERT(exp >= 0);
 496         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 497 
 498         exp += vdev_raidz_log2[a];
 499         if (exp > 255)
 500                 exp -= 255;
 501 
 502         return (vdev_raidz_pow2[exp]);
 503 }
 504 
 505 static void
 506 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 507 {
 508         uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
 509         int c;
 510 
 511         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 512 
 513         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 514                 src = rm->rm_col[c].rc_data;
 515                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 516                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 517 
 518                 if (c == rm->rm_firstdatacol) {
 519                         ASSERT(ccount == pcount);
 520                         for (i = 0; i < ccount; i++, src++, p++) {
 521                                 *p = *src;
 522                         }
 523                 } else {
 524                         ASSERT(ccount <= pcount);
 525                         for (i = 0; i < ccount; i++, src++, p++) {
 526                                 *p ^= *src;
 527                         }
 528                 }
 529         }
 530 }
 531 
/*
 * Generate P (plain XOR) and Q (GF(2^8) weighted XOR with coefficient 2)
 * parity over all data columns, one 64-bit word lane at a time.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
        uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        /* First column initializes P and Q directly. */
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p = *src;
                                *q = *src;
                        }
                        /*
                         * Zero-fill the remainder (only reached when
                         * ccnt == 0); src is advanced too but never
                         * dereferenced in this loop.
                         */
                        for (; i < pcnt; i++, src++, p++, q++) {
                                *p = 0;
                                *q = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                        }
                }
        }
}
 583 
/*
 * Generate P, Q, and R parity: plain XOR for P, GF(2^8) weighted XOR
 * with coefficient 2 for Q and coefficient 4 for R, one 64-bit word
 * lane at a time.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
        uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_R].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
                r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        /* First column initializes all three parities. */
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p = *src;
                                *q = *src;
                                *r = *src;
                        }
                        /*
                         * Zero-fill the remainder (only reached when
                         * ccnt == 0); src is advanced too but never
                         * dereferenced in this loop.
                         */
                        for (; i < pcnt; i++, src++, p++, q++, r++) {
                                *p = 0;
                                *q = 0;
                                *r = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;

                                VDEV_RAIDZ_64MUL_4(*r, mask);
                                *r ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++, r++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                VDEV_RAIDZ_64MUL_4(*r, mask);
                        }
                }
        }
}
 644 
 645 /*
 646  * Generate RAID parity in the first virtual columns according to the number of
 647  * parity columns available.
 648  */
 649 static void
 650 vdev_raidz_generate_parity(raidz_map_t *rm)
 651 {
 652         switch (rm->rm_firstdatacol) {
 653         case 1:
 654                 vdev_raidz_generate_parity_p(rm);
 655                 break;
 656         case 2:
 657                 vdev_raidz_generate_parity_pq(rm);
 658                 break;
 659         case 3:
 660                 vdev_raidz_generate_parity_pqr(rm);
 661                 break;
 662         default:
 663                 panic("invalid RAID-Z configuration");
 664         }
 665 }
 666 
 667 /* BEGIN CSTYLED */
 668 /*
 669  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 671  * the contents of the data and parity disks. This can be expressed with
 672  * vectors for the original data (D) and the actual data (d) and parity (p)
 673  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 674  *
 675  *            __   __                     __     __
 676  *            |     |         __     __   |  p_0  |
 677  *            |  V  |         |  D_0  |   | p_m-1 |
 678  *            |     |    x    |   :   | = |  d_0  |
 679  *            |  I  |         | D_n-1 |   |   :   |
 680  *            |     |         ~~     ~~   | d_n-1 |
 681  *            ~~   ~~                     ~~     ~~
 682  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 685  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 686  * computation as well as linear separability.
 687  *
 688  *      __               __               __     __
 689  *      |   1   ..  1 1 1 |               |  p_0  |
 690  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 691  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 692  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 693  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 694  *      |   :       : : : |   |   :   |   |  d_2  |
 695  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 696  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 697  *      |   0   ..  0 0 1 |               | d_n-1 |
 698  *      ~~               ~~               ~~     ~~
 699  *
 700  * Note that I, V, d, and p are known. To compute D, we must invert the
 701  * matrix and use the known data and parity values to reconstruct the unknown
 702  * data values. We begin by removing the rows in V|I and d|p that correspond
 703  * to failed or missing columns; we then make V|I square (n x n) and d|p
 704  * sized n by removing rows corresponding to unused parity from the bottom up
 705  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 706  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 707  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 708  *           __                               __
 709  *           |  1   1   1   1   1   1   1   1  |
 710  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 711  *           |  19 205 116  29  64  16  4   1  |      / /
 712  *           |  1   0   0   0   0   0   0   0  |     / /
 713  *           |  0   1   0   0   0   0   0   0  | <--' /
 714  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 715  *           |  0   0   0   1   0   0   0   0  |
 716  *           |  0   0   0   0   1   0   0   0  |
 717  *           |  0   0   0   0   0   1   0   0  |
 718  *           |  0   0   0   0   0   0   1   0  |
 719  *           |  0   0   0   0   0   0   0   1  |
 720  *           ~~                               ~~
 721  *           __                               __
 722  *           |  1   1   1   1   1   1   1   1  |
 723  *           | 128  64  32  16  8   4   2   1  |
 724  *           |  19 205 116  29  64  16  4   1  |
 725  *           |  1   0   0   0   0   0   0   0  |
 726  *           |  0   1   0   0   0   0   0   0  |
 727  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 728  *           |  0   0   0   1   0   0   0   0  |
 729  *           |  0   0   0   0   1   0   0   0  |
 730  *           |  0   0   0   0   0   1   0   0  |
 731  *           |  0   0   0   0   0   0   1   0  |
 732  *           |  0   0   0   0   0   0   0   1  |
 733  *           ~~                               ~~
 734  *
 735  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 736  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 737  * matrix is not singular.
 738  * __                                                                 __
 739  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 740  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 741  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 742  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 743  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 744  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 745  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 746  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 747  * ~~                                                                 ~~
 748  * __                                                                 __
 749  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 750  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 751  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 752  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 753  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 754  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 755  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 756  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 757  * ~~                                                                 ~~
 758  * __                                                                 __
 759  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 760  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 761  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 762  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 763  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 764  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 765  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 766  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 767  * ~~                                                                 ~~
 768  * __                                                                 __
 769  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 770  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 771  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 772  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 773  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 774  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 775  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 776  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 777  * ~~                                                                 ~~
 778  * __                                                                 __
 779  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 780  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 781  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 782  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 783  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 784  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 785  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 786  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 787  * ~~                                                                 ~~
 788  * __                                                                 __
 789  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 790  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 791  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 792  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 793  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 794  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 795  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 796  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 797  * ~~                                                                 ~~
 798  *                   __                               __
 799  *                   |  0   0   1   0   0   0   0   0  |
 800  *                   | 167 100  5   41 159 169 217 208 |
 801  *                   | 166 100  4   40 158 168 216 209 |
 802  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 803  *                   |  0   0   0   0   1   0   0   0  |
 804  *                   |  0   0   0   0   0   1   0   0  |
 805  *                   |  0   0   0   0   0   0   1   0  |
 806  *                   |  0   0   0   0   0   0   0   1  |
 807  *                   ~~                               ~~
 808  *
 809  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 810  * of the missing data.
 811  *
 812  * As is apparent from the example above, the only non-trivial rows in the
 813  * inverse matrix correspond to the data disks that we're trying to
 814  * reconstruct. Indeed, those are the only rows we need as the others would
 815  * only be useful for reconstructing data known or assumed to be valid. For
 816  * that reason, we only build the coefficients in the rows that correspond to
 817  * targeted columns.
 818  */
 819 /* END CSTYLED */
 820 
 821 static void
 822 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
 823     uint8_t **rows)
 824 {
 825         int i, j;
 826         int pow;
 827 
 828         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
 829 
 830         /*
 831          * Fill in the missing rows of interest.
 832          */
 833         for (i = 0; i < nmap; i++) {
 834                 ASSERT3S(0, <=, map[i]);
 835                 ASSERT3S(map[i], <=, 2);
 836 
 837                 pow = map[i] * n;
 838                 if (pow > 255)
 839                         pow -= 255;
 840                 ASSERT(pow <= 255);
 841 
 842                 for (j = 0; j < n; j++) {
 843                         pow -= map[i];
 844                         if (pow < 0)
 845                                 pow += 255;
 846                         rows[i][j] = vdev_raidz_pow2[pow];
 847                 }
 848         }
 849 }
 850 
/*
 * Compute the inverse of the nmissing x nmissing submatrix needed for
 * reconstruction using Gauss-Jordan elimination over GF(2^8).
 *
 *	rm       - raidz map (used here only for assertions)
 *	n        - number of data columns
 *	nmissing - number of missing data columns
 *	missing  - indices of the missing columns, relative to the first
 *	           data column
 *	rows     - the nmissing coefficient rows built by
 *	           vdev_raidz_matrix_init(); reduced in place to identity
 *	           rows as a side effect
 *	invrows  - on return, the corresponding rows of the inverse matrix
 *	used     - the n surviving columns, parity columns first, then data
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Start from the identity matrix. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/*
			 * In GF(2^8) addition and subtraction are both XOR,
			 * so moving the coefficient across is a "subtract".
			 */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* 255 - log(x) is the log of x's multiplicative inverse. */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
 944 
/*
 * Reconstruct the missing data columns by multiplying the inverse matrix
 * rows against the surviving columns: D = (V|I)'^-1 x (d|p)'.
 *
 *	rm       - raidz map supplying the column buffers
 *	n        - number of data columns
 *	nmissing - number of missing data columns being rebuilt
 *	missing  - indices of the missing columns, relative to the first
 *	           data column
 *	invrows  - the nmissing inverse-matrix rows from
 *	           vdev_raidz_matrix_invert()
 *	used     - the n surviving columns (parity first, then data) that
 *	           feed the multiplication
 *
 * All matrix entries are converted to logs up front so the inner loop can
 * multiply with an addition mod 255 plus one table lookup per byte.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch arena carved into nmissing rows of n log values. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into every target column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Columns may be shorter than the source. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* log of 0 is undefined; 0 * k = 0. */
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column seeds; later ones XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
1020 
/*
 * General-case reconstruction: rebuild the data columns listed in tgts
 * (ntgts entries, sorted) using whichever parity columns are intact.
 * Returns a bitmask of the parity columns used (bit c set means parity
 * column c participated); callers treat this as the reconstruction code.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record this parity column in the returned code. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve one allocation into the rows, invrows, and used arrays.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows used[] entries are the parity columns. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* Fill the rest of used[] with the surviving data columns. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1126 
1127 static int
1128 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1129 {
1130         int tgts[VDEV_RAIDZ_MAXPARITY];
1131         int ntgts;
1132         int i, c;
1133         int code;
1134         int nbadparity, nbaddata;
1135 
1136         /*
1137          * The tgts list must already be sorted.
1138          */
1139         for (i = 1; i < nt; i++) {
1140                 ASSERT(t[i] > t[i - 1]);
1141         }
1142 
1143         nbadparity = rm->rm_firstdatacol;
1144         nbaddata = rm->rm_cols - nbadparity;
1145         ntgts = 0;
1146         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1147                 if (i < nt && c == t[i]) {
1148                         tgts[ntgts++] = c;
1149                         i++;
1150                 } else if (rm->rm_col[c].rc_error != 0) {
1151                         tgts[ntgts++] = c;
1152                 } else if (c >= rm->rm_firstdatacol) {
1153                         nbaddata--;
1154                 } else {
1155                         nbadparity--;
1156                 }
1157         }
1158 
1159         ASSERT(ntgts >= nt);
1160         ASSERT(nbaddata >= 0);
1161         ASSERT(nbaddata + nbadparity == ntgts);
1162 
1163         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1164         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1165         ASSERT(code > 0);
1166         return (code);
1167 }
1168 
/*
 * Build the raidz map describing how a logical I/O of `size` bytes at
 * `offset` is laid out across the dcols child devices, nparity of which
 * hold parity.  Parity column buffers are allocated here; data columns
 * point into the caller-supplied `data` buffer.  The map is released
 * with vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* b: starting sector; s: sector count of the I/O. */
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	/* f: first column touched; o: byte offset on each device. */
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q full rows, r leftover data sectors, bc "big" columns. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors including parity for every (partial) row. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap to the next row once we pass the last device. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/* Big columns get one extra sector from the partial row. */
		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers ... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* ... data columns share the caller's buffer back to back. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1286 
1287 static void
1288 vdev_raidz_map_free(raidz_map_t *rm)
1289 {
1290         int c;
1291 
1292         for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1293                 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1294 
1295         zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1296 }
1297 
1298 static vdev_t *
1299 vdev_child(vdev_t *pvd, uint64_t devidx)
1300 {
1301         vdev_t *cvd;
1302 
1303         STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1304                 if (cvd->v_id == devidx)
1305                         break;
1306         }
1307 
1308         return (cvd);
1309 }
1310 
1311 /*
1312  * We keep track of whether or not there were any injected errors, so that
1313  * any ereports we generate can note it.
1314  */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{
	/*
	 * Verify the reassembled data against the block pointer's checksum;
	 * returns 0 on a match.  The size argument is unused here —
	 * presumably zio_checksum_verify() derives the length from bp;
	 * NOTE(review): confirm against that function's definition.
	 */
	return (zio_checksum_verify(bp, data));
}
1321 
1322 /*
1323  * Generate the parity from the data columns. If we tried and were able to
1324  * read the parity without error, verify that the generated parity matches the
1325  * data we read. If it doesn't, we fire off a checksum error. Return the
1326  * number such failures.
1327  */
1328 static int
1329 raidz_parity_verify(raidz_map_t *rm)
1330 {
1331         void *orig[VDEV_RAIDZ_MAXPARITY];
1332         int c, ret = 0;
1333         raidz_col_t *rc;
1334 
1335         for (c = 0; c < rm->rm_firstdatacol; c++) {
1336                 rc = &rm->rm_col[c];
1337                 if (!rc->rc_tried || rc->rc_error != 0)
1338                         continue;
1339                 orig[c] = zfs_alloc(rc->rc_size);
1340                 bcopy(rc->rc_data, orig[c], rc->rc_size);
1341         }
1342 
1343         vdev_raidz_generate_parity(rm);
1344 
1345         for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1346                 rc = &rm->rm_col[c];
1347                 if (!rc->rc_tried || rc->rc_error != 0)
1348                         continue;
1349                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1350                         rc->rc_error = ECKSUM;
1351                         ret++;
1352                 }
1353                 zfs_free(orig[c], rc->rc_size);
1354         }
1355 
1356         return (ret);
1357 }
1358 
1359 /*
1360  * Iterate over all combinations of bad data and attempt a reconstruction.
1361  * Note that the algorithm below is non-optimal because it doesn't take into
1362  * account how reconstruction is actually performed. For example, with
1363  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1364  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1365  * cases we'd only use parity information in column 0.
1366  */
/*
 * Try every combination of n presumed-bad columns, for n from 1 up to the
 * remaining parity budget, reconstructing each combination and testing the
 * result against the block checksum.  Returns the reconstruction code on
 * success, or 0 if no combination produced a valid block.
 *
 * tgts points one past tstore[0] so that tgts[-1] and tgts[n] can hold
 * sentinels that simplify the combination-advancing logic below.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One more save buffer per additional target. */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/*
			 * Advance to the next combination, odometer-style.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * All combinations failed: n overshot by one in the final loop
	 * iteration, so pull it back before freeing the save buffers.
	 */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1509 
1510 static int
1511 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1512     off_t offset, size_t bytes)
1513 {
1514         vdev_t *tvd = vd->v_top;
1515         vdev_t *cvd;
1516         raidz_map_t *rm;
1517         raidz_col_t *rc;
1518         int c, error;
1519         int unexpected_errors;
1520         int parity_errors;
1521         int parity_untried;
1522         int data_errors;
1523         int total_errors;
1524         int n;
1525         int tgts[VDEV_RAIDZ_MAXPARITY];
1526         int code;
1527 
1528         rc = NULL;      /* gcc */
1529         error = 0;
1530 
1531         rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1532             vd->v_nchildren, vd->v_nparity);
1533 
1534         /*
1535          * Iterate over the columns in reverse order so that we hit the parity
1536          * last -- any errors along the way will force us to read the parity.
1537          */
1538         for (c = rm->rm_cols - 1; c >= 0; c--) {
1539                 rc = &rm->rm_col[c];
1540                 cvd = vdev_child(vd, rc->rc_devidx);
1541                 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1542                         if (c >= rm->rm_firstdatacol)
1543                                 rm->rm_missingdata++;
1544                         else
1545                                 rm->rm_missingparity++;
1546                         rc->rc_error = ENXIO;
1547                         rc->rc_tried = 1;    /* don't even try */
1548                         rc->rc_skipped = 1;
1549                         continue;
1550                 }
1551 #if 0           /* XXX: Too hard for the boot code. */
1552                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1553                         if (c >= rm->rm_firstdatacol)
1554                                 rm->rm_missingdata++;
1555                         else
1556                                 rm->rm_missingparity++;
1557                         rc->rc_error = ESTALE;
1558                         rc->rc_skipped = 1;
1559                         continue;
1560                 }
1561 #endif
1562                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1563                         rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1564                             rc->rc_offset, rc->rc_size);
1565                         rc->rc_tried = 1;
1566                         rc->rc_skipped = 0;
1567                 }
1568         }
1569 
1570 reconstruct:
1571         unexpected_errors = 0;
1572         parity_errors = 0;
1573         parity_untried = 0;
1574         data_errors = 0;
1575         total_errors = 0;
1576 
1577         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1578         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1579 
1580         for (c = 0; c < rm->rm_cols; c++) {
1581                 rc = &rm->rm_col[c];
1582 
1583                 if (rc->rc_error) {
1584                         ASSERT(rc->rc_error != ECKSUM);      /* child has no bp */
1585 
1586                         if (c < rm->rm_firstdatacol)
1587                                 parity_errors++;
1588                         else
1589                                 data_errors++;
1590 
1591                         if (!rc->rc_skipped)
1592                                 unexpected_errors++;
1593 
1594                         total_errors++;
1595                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1596                         parity_untried++;
1597                 }
1598         }
1599 
1600         /*
1601          * There are three potential phases for a read:
1602          *      1. produce valid data from the columns read
1603          *      2. read all disks and try again
1604          *      3. perform combinatorial reconstruction
1605          *
1606          * Each phase is progressively both more expensive and less likely to
1607          * occur. If we encounter more errors than we can repair or all phases
1608          * fail, we have no choice but to return an error.
1609          */
1610 
1611         /*
1612          * If the number of errors we saw was correctable -- less than or equal
1613          * to the number of parity disks read -- attempt to produce data that
1614          * has a valid checksum. Naturally, this case applies in the absence of
1615          * any errors.
1616          */
1617         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1618                 if (data_errors == 0) {
1619                         if (raidz_checksum_verify(bp, data, bytes) == 0) {
1620                                 /*
1621                                  * If we read parity information (unnecessarily
1622                                  * as it happens since no reconstruction was
1623                                  * needed) regenerate and verify the parity.
1624                                  * We also regenerate parity when resilvering
1625                                  * so we can write it out to the failed device
1626                                  * later.
1627                                  */
1628                                 if (parity_errors + parity_untried <
1629                                     rm->rm_firstdatacol) {
1630                                         n = raidz_parity_verify(rm);
1631                                         unexpected_errors += n;
1632                                         ASSERT(parity_errors + n <=
1633                                             rm->rm_firstdatacol);
1634                                 }
1635                                 goto done;
1636                         }
1637                 } else {
1638                         /*
1639                          * We either attempt to read all the parity columns or
1640                          * none of them. If we didn't try to read parity, we
1641                          * wouldn't be here in the correctable case. There must
1642                          * also have been fewer parity errors than parity
1643                          * columns or, again, we wouldn't be in this code path.
1644                          */
1645                         ASSERT(parity_untried == 0);
1646                         ASSERT(parity_errors < rm->rm_firstdatacol);
1647 
1648                         /*
1649                          * Identify the data columns that reported an error.
1650                          */
1651                         n = 0;
1652                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1653                                 rc = &rm->rm_col[c];
1654                                 if (rc->rc_error != 0) {
1655                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1656                                         tgts[n++] = c;
1657                                 }
1658                         }
1659 
1660                         ASSERT(rm->rm_firstdatacol >= n);
1661 
1662                         code = vdev_raidz_reconstruct(rm, tgts, n);
1663 
1664                         if (raidz_checksum_verify(bp, data, bytes) == 0) {
1665                                 /*
1666                                  * If we read more parity disks than were used
1667                                  * for reconstruction, confirm that the other
1668                                  * parity disks produced correct data. This
1669                                  * routine is suboptimal in that it regenerates
1670                                  * the parity that we already used in addition
1671                                  * to the parity that we're attempting to
1672                                  * verify, but this should be a relatively
1673                                  * uncommon case, and can be optimized if it
1674                                  * becomes a problem. Note that we regenerate
1675                                  * parity when resilvering so we can write it
1676                                  * out to failed devices later.
1677                                  */
1678                                 if (parity_errors < rm->rm_firstdatacol - n) {
1679                                         n = raidz_parity_verify(rm);
1680                                         unexpected_errors += n;
1681                                         ASSERT(parity_errors + n <=
1682                                             rm->rm_firstdatacol);
1683                                 }
1684 
1685                                 goto done;
1686                         }
1687                 }
1688         }
1689 
1690         /*
1691          * This isn't a typical situation -- either we got a read
1692          * error or a child silently returned bad data. Read every
1693          * block so we can try again with as much data and parity as
1694          * we can track down. If we've already been through once
1695          * before, all children will be marked as tried so we'll
1696          * proceed to combinatorial reconstruction.
1697          */
1698         unexpected_errors = 1;
1699         rm->rm_missingdata = 0;
1700         rm->rm_missingparity = 0;
1701 
1702         n = 0;
1703         for (c = 0; c < rm->rm_cols; c++) {
1704                 rc = &rm->rm_col[c];
1705 
1706                 if (rc->rc_tried)
1707                         continue;
1708 
1709                 cvd = vdev_child(vd, rc->rc_devidx);
1710                 ASSERT(cvd != NULL);
1711                 rc->rc_error = cvd->v_read(cvd, NULL,
1712                     rc->rc_data, rc->rc_offset, rc->rc_size);
1713                 if (rc->rc_error == 0)
1714                         n++;
1715                 rc->rc_tried = 1;
1716                 rc->rc_skipped = 0;
1717         }
1718         /*
1719          * If we managed to read anything more, retry the
1720          * reconstruction.
1721          */
1722         if (n > 0)
1723                 goto reconstruct;
1724 
1725         /*
1726          * At this point we've attempted to reconstruct the data given the
1727          * errors we detected, and we've attempted to read all columns. There
1728          * must, therefore, be one or more additional problems -- silent errors
1729          * resulting in invalid data rather than explicit I/O errors resulting
1730          * in absent data. We check if there is enough additional data to
1731          * possibly reconstruct the data and then perform combinatorial
1732          * reconstruction over all possible combinations. If that fails,
1733          * we're cooked.
1734          */
1735         if (total_errors > rm->rm_firstdatacol) {
1736                 error = EIO;
1737         } else if (total_errors < rm->rm_firstdatacol &&
1738             (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1739              total_errors, data_errors)) != 0) {
1740                 /*
1741                  * If we didn't use all the available parity for the
1742                  * combinatorial reconstruction, verify that the remaining
1743                  * parity is correct.
1744                  */
1745                 if (code != (1 << rm->rm_firstdatacol) - 1)
1746                         (void) raidz_parity_verify(rm);
1747         } else {
1748                 /*
1749                  * We're here because either:
1750                  *
1751                  *      total_errors == rm_first_datacol, or
1752                  *      vdev_raidz_combrec() failed
1753                  *
1754                  * In either case, there is enough bad data to prevent
1755                  * reconstruction.
1756                  *
1757                  * Start checksum ereports for all children which haven't
1758                  * failed, and the IO wasn't speculative.
1759                  */
1760                 error = ECKSUM;
1761         }
1762 
1763 done:
1764         vdev_raidz_map_free(rm);
1765 
1766         return (error);
1767 }