1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  */
  28 
  29 #include <sys/zfs_context.h>
  30 #include <sys/spa.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/vdev_disk.h>
  33 #include <sys/vdev_file.h>
  34 #include <sys/vdev_raidz.h>
  35 #include <sys/zio.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/abd.h>
  38 #include <sys/fs/zfs.h>
  39 #include <sys/fm/fs/zfs.h>
  40 
  41 /*
  42  * Virtual device vector for RAID-Z.
  43  *
  44  * This vdev supports single, double, and triple parity. For single parity,
  45  * we use a simple XOR of all the data columns. For double or triple parity,
  46  * we use a special case of Reed-Solomon coding. This extends the
  47  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  48  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  49  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  50  * former is also based. The latter is designed to provide higher performance
  51  * for writes.
  52  *
  53  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  54  * amended six years later identifying a critical flaw that invalidates its
  55  * claims. Nevertheless, the technique can be adapted to work for up to
  56  * triple parity. For additional parity, the amendment "Note: Correction to
  57  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  58  * is viable, but the additional complexity means that write performance will
  59  * suffer.
  60  *
  61  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  63  * can be expressed with a single byte. Briefly, the operations on the
  64  * field are defined as follows:
  65  *
  66  *   o addition (+) is represented by a bitwise XOR
  67  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  68  *   o multiplication of A by 2 is defined by the following bitwise expression:
  69  *
  70  *      (A * 2)_7 = A_6
  71  *      (A * 2)_6 = A_5
  72  *      (A * 2)_5 = A_4
  73  *      (A * 2)_4 = A_3 + A_7
  74  *      (A * 2)_3 = A_2 + A_7
  75  *      (A * 2)_2 = A_1 + A_7
  76  *      (A * 2)_1 = A_0
  77  *      (A * 2)_0 = A_7
  78  *
  79  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  80  * As an aside, this multiplication is derived from the error correcting
  81  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  82  *
  83  * Observe that any number in the field (except for 0) can be expressed as a
  84  * power of 2 -- a generator for the field. We store a table of the powers of
  85  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  86  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  87  * than field addition). The inverse of a field element A (A^-1) is therefore
  88  * A ^ (255 - 1) = A^254.
  89  *
  90  * The up-to-three parity columns, P, Q, R over several data columns,
  91  * D_0, ... D_n-1, can be expressed by field operations:
  92  *
  93  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  94  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  95  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  96  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  97  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  98  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 100  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 101  * independent coefficients. (There are no additional coefficients that have
 102  * this property which is why the uncorrected Plank method breaks down.)
 103  *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 105  * or in concert to recover missing data columns.
 106  */
 107 
/*
 * Per-column state for a single raidz I/O: one entry per child vdev touched,
 * with parity columns first (indices below rm_firstdatacol), then data.
 */
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        abd_t *rc_abd;                  /* I/O data */
        void *rc_gdata;                 /* used to store the "good" version */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;
 118 
/*
 * Geometry and state for one logical I/O mapped onto a raidz vdev.
 * The structure is allocated with room for rm_scols trailing columns
 * (see the offsetof()-based kmem_alloc in vdev_raidz_map_alloc()), so
 * rm_col[1] is really a variable-length array.
 */
typedef struct raidz_map {
        uint64_t rm_cols;               /* Regular column count */
        uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        uint64_t rm_nskip;              /* Skipped sectors for padding */
        uint64_t rm_skipstart;          /* Column index of padding start */
        abd_t *rm_abd_copy;             /* rm_asize-buffer of copied data */
        uintptr_t rm_reports;           /* # of referencing checksum reports */
        uint8_t rm_freed;               /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;      /* checksum error was injected */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
} raidz_map_t;
 135 
 136 #define VDEV_RAIDZ_P            0
 137 #define VDEV_RAIDZ_Q            1
 138 #define VDEV_RAIDZ_R            2
 139 
 140 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 141 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 142 
 143 /*
 144  * We provide a mechanism to perform the field multiplication operation on a
 145  * 64-bit value all at once rather than a byte at a time. This works by
 146  * creating a mask from the top bit in each byte and using that to
 147  * conditionally apply the XOR of 0x1d.
 148  */
 149 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 150 { \
 151         (mask) = (x) & 0x8080808080808080ULL; \
 152         (mask) = ((mask) << 1) - ((mask) >> 7); \
 153         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 154             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 155 }
 156 
 157 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 158 { \
 159         VDEV_RAIDZ_64MUL_2((x), mask); \
 160         VDEV_RAIDZ_64MUL_2((x), mask); \
 161 }
 162 
 163 #define VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
 164 
/*
 * Force reconstruction to use the general purpose method.
 * Tunable: nonzero enables. Presumably consulted by the reconstruction
 * code later in this file -- TODO(review): confirm against the full source.
 */
int vdev_raidz_default_to_general;
 169 
/*
 * Powers of 2 in the Galois field defined above:
 * vdev_raidz_pow2[i] == 2^i in GF(2^8).  The multiplicative group has
 * order 255, so the table wraps: pow2[255] == pow2[0] == 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above:
 * vdev_raidz_log2[a] == log base 2 of a in GF(2^8).  Entry 0 is a
 * placeholder -- the log of 0 is undefined, and callers (e.g.
 * vdev_raidz_exp2()) special-case a == 0 before indexing.
 */
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
 240 
/*
 * Forward declaration: vdev_raidz_cksum_finish() below regenerates parity
 * before this function's definition appears.
 */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
 242 
 243 /*
 244  * Multiply a given number by 2 raised to the given power.
 245  */
 246 static uint8_t
 247 vdev_raidz_exp2(uint_t a, int exp)
 248 {
 249         if (a == 0)
 250                 return (0);
 251 
 252         ASSERT(exp >= 0);
 253         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 254 
 255         exp += vdev_raidz_log2[a];
 256         if (exp > 255)
 257                 exp -= 255;
 258 
 259         return (vdev_raidz_pow2[exp]);
 260 }
 261 
 262 static void
 263 vdev_raidz_map_free(raidz_map_t *rm)
 264 {
 265         int c;
 266         size_t size;
 267 
 268         for (c = 0; c < rm->rm_firstdatacol; c++) {
 269                 abd_free(rm->rm_col[c].rc_abd);
 270 
 271                 if (rm->rm_col[c].rc_gdata != NULL)
 272                         zio_buf_free(rm->rm_col[c].rc_gdata,
 273                             rm->rm_col[c].rc_size);
 274         }
 275 
 276         size = 0;
 277         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 278                 abd_put(rm->rm_col[c].rc_abd);
 279                 size += rm->rm_col[c].rc_size;
 280         }
 281 
 282         if (rm->rm_abd_copy != NULL)
 283                 abd_free(rm->rm_abd_copy);
 284 
 285         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 286 }
 287 
 288 static void
 289 vdev_raidz_map_free_vsd(zio_t *zio)
 290 {
 291         raidz_map_t *rm = zio->io_vsd;
 292 
 293         ASSERT0(rm->rm_freed);
 294         rm->rm_freed = 1;
 295 
 296         if (rm->rm_reports == 0)
 297                 vdev_raidz_map_free(rm);
 298 }
 299 
 300 /*ARGSUSED*/
 301 static void
 302 vdev_raidz_cksum_free(void *arg, size_t ignored)
 303 {
 304         raidz_map_t *rm = arg;
 305 
 306         ASSERT3U(rm->rm_reports, >, 0);
 307 
 308         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 309                 vdev_raidz_map_free(rm);
 310 }
 311 
/*
 * Finish a pending checksum ereport for column `c` (zcr_cbinfo) of this
 * raidz map (zcr_cbdata): supply both the "good" (expected) contents of
 * the column and the "bad" data actually read.  For a parity column the
 * good version is regenerated from good_data (done once per map and
 * cached in rc_gdata); for a data column it is simply the matching slice
 * of good_data.  Installed as zcr->zcr_finish by vdev_raidz_cksum_report().
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
        raidz_map_t *rm = zcr->zcr_cbdata;
        size_t c = zcr->zcr_cbinfo;
        size_t x;

        const char *good = NULL;
        char *bad;

        /* No good copy of the logical data: finish without payloads. */
        if (good_data == NULL) {
                zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
                return;
        }

        if (c < rm->rm_firstdatacol) {
                /*
                 * The first time through, calculate the parity blocks for
                 * the good data (this relies on the fact that the good
                 * data never changes for a given logical ZIO)
                 */
                if (rm->rm_col[0].rc_gdata == NULL) {
                        abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
                        char *buf;
                        int offset;

                        /*
                         * Set up the rm_col[]s to generate the parity for
                         * good_data, first saving the parity bufs and
                         * replacing them with buffers to hold the result.
                         */
                        for (x = 0; x < rm->rm_firstdatacol; x++) {
                                bad_parity[x] = rm->rm_col[x].rc_abd;
                                rm->rm_col[x].rc_gdata =
                                    zio_buf_alloc(rm->rm_col[x].rc_size);
                                rm->rm_col[x].rc_abd =
                                    abd_get_from_buf(rm->rm_col[x].rc_gdata,
                                    rm->rm_col[x].rc_size);
                        }

                        /* fill in the data columns from good_data */
                        buf = (char *)good_data;
                        for (; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);
                                rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
                                    rm->rm_col[x].rc_size);
                                buf += rm->rm_col[x].rc_size;
                        }

                        /*
                         * Construct the parity from the good data.
                         */
                        vdev_raidz_generate_parity(rm);

                        /* restore everything back to its original state */
                        for (x = 0; x < rm->rm_firstdatacol; x++) {
                                abd_put(rm->rm_col[x].rc_abd);
                                rm->rm_col[x].rc_abd = bad_parity[x];
                        }

                        /*
                         * Point the data columns back at the copy that
                         * vdev_raidz_cksum_report() stashed in rm_abd_copy.
                         */
                        offset = 0;
                        for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);
                                rm->rm_col[x].rc_abd = abd_get_offset(
                                    rm->rm_abd_copy, offset);
                                offset += rm->rm_col[x].rc_size;
                        }
                }

                ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
                good = rm->rm_col[c].rc_gdata;
        } else {
                /* adjust good_data to point at the start of our column */
                good = good_data;

                for (x = rm->rm_firstdatacol; x < c; x++)
                        good += rm->rm_col[x].rc_size;
        }

        bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
        /* we drop the ereport if it ends up that the data was good */
        zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
        abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
 396 
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 *
 * `arg` is the column index (packed into a pointer by the caller).
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
        size_t c = (size_t)(uintptr_t)arg;
        size_t offset;

        raidz_map_t *rm = zio->io_vsd;
        size_t size;

        /* set up the report and bump the refcount  */
        zcr->zcr_cbdata = rm;
        zcr->zcr_cbinfo = c;
        zcr->zcr_finish = vdev_raidz_cksum_finish;
        zcr->zcr_free = vdev_raidz_cksum_free;

        rm->rm_reports++;
        ASSERT3U(rm->rm_reports, >, 0);

        /* Data was already copied for an earlier report on this map. */
        if (rm->rm_abd_copy != NULL)
                return;

        /*
         * It's the first time we're called for this raidz_map_t, so we need
         * to copy the data aside; there's no guarantee that our zio's buffer
         * won't be re-used for something else.
         *
         * Our parity data is already in separate buffers, so there's no need
         * to copy them.
         */

        size = 0;
        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
                size += rm->rm_col[c].rc_size;

        rm->rm_abd_copy =
            abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

        /* Copy each data column aside and re-point it at the copy. */
        for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                raidz_col_t *col = &rm->rm_col[c];
                abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

                abd_copy(tmp, col->rc_abd, col->rc_size);
                abd_put(col->rc_abd);
                col->rc_abd = tmp;

                offset += col->rc_size;
        }
        ASSERT3U(offset, ==, size);
}
 452 
/*
 * vdev-specific-data callbacks attached to each raidz zio: the first
 * entry frees the raidz_map_t when the zio is done; the second captures
 * data for checksum-error reporting.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
        vdev_raidz_map_free_vsd,
        vdev_raidz_cksum_report
};
 457 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 *   abd        - buffer holding the logical I/O data
 *   size       - logical I/O size in bytes
 *   offset     - byte offset of the I/O within the raidz vdev
 *   unit_shift - log2 of the vdev's minimum sector size
 *   dcols      - total number of columns (child vdevs)
 *   nparity    - number of parity columns (1-3)
 *
 * Returns a raidz_map_t describing every column (device index, device
 * offset, size, buffer) touched by this I/O.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = offset >> unit_shift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = size >> unit_shift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
        uint64_t off = 0;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         */
        q = s / (dcols - nparity);

        /*
         * "Remainder": The number of partial stripe data sectors in this I/O.
         * This will add a sector to some, but not all, child vdevs.
         */
        r = s - q * (dcols - nparity);

        /* The number of "big columns" - those which contain remainder data. */
        bc = (r == 0 ? 0 : r + nparity);

        /*
         * The total number of data and parity sectors associated with
         * this I/O.
         */
        tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /* acols: The columns that will be accessed. */
        /* scols: The columns that will be accessed or skipped. */
        if (q == 0) {
                /* Our I/O request doesn't span all child vdevs. */
                acols = bc;
                scols = MIN(dcols, roundup(bc, nparity + 1));
        } else {
                acols = dcols;
                scols = dcols;
        }

        ASSERT3U(acols, <=, scols);

        /* Allocated with space for scols trailing raidz_col_t entries. */
        rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

        rm->rm_cols = acols;
        rm->rm_scols = scols;
        rm->rm_bigcols = bc;
        rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
        rm->rm_abd_copy = NULL;
        rm->rm_reports = 0;
        rm->rm_freed = 0;
        rm->rm_ecksuminjected = 0;

        asize = 0;

        /* Lay out each column's device, offset and size. */
        for (c = 0; c < scols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        /* Wrapped around: next row on the child vdevs. */
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_abd = NULL;
                rm->rm_col[c].rc_gdata = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;

                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
                        rm->rm_col[c].rc_size = (q + 1) << unit_shift;
                else
                        rm->rm_col[c].rc_size = q << unit_shift;

                asize += rm->rm_col[c].rc_size;
        }

        ASSERT3U(asize, ==, tot << unit_shift);
        rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
        ASSERT3U(rm->rm_nskip, <=, nparity);

        /* Parity columns get their own buffers ... */
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_abd =
                    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

        /* ... while data columns alias consecutive ranges of the zio's abd. */
        rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
        off = rm->rm_col[c].rc_size;

        for (c = c + 1; c < acols; c++) {
                rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
                off += rm->rm_col[c].rc_size;
        }

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;

                if (rm->rm_skipstart == 0)
                        rm->rm_skipstart = 1;
        }

        return (rm);
}
 608 
/*
 * Cursor state for the parity-accumulation callbacks below: p, q and r
 * point at the current 64-bit word of each parity buffer, and are NULL
 * for parity levels not being generated.
 */
struct pqr_struct {
        uint64_t *p;
        uint64_t *q;
        uint64_t *r;
};
 614 
 615 static int
 616 vdev_raidz_p_func(void *buf, size_t size, void *private)
 617 {
 618         struct pqr_struct *pqr = private;
 619         const uint64_t *src = buf;
 620         int i, cnt = size / sizeof (src[0]);
 621 
 622         ASSERT(pqr->p && !pqr->q && !pqr->r);
 623 
 624         for (i = 0; i < cnt; i++, src++, pqr->p++)
 625                 *pqr->p ^= *src;
 626 
 627         return (0);
 628 }
 629 
 630 static int
 631 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 632 {
 633         struct pqr_struct *pqr = private;
 634         const uint64_t *src = buf;
 635         uint64_t mask;
 636         int i, cnt = size / sizeof (src[0]);
 637 
 638         ASSERT(pqr->p && pqr->q && !pqr->r);
 639 
 640         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 641                 *pqr->p ^= *src;
 642                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 643                 *pqr->q ^= *src;
 644         }
 645 
 646         return (0);
 647 }
 648 
 649 static int
 650 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 651 {
 652         struct pqr_struct *pqr = private;
 653         const uint64_t *src = buf;
 654         uint64_t mask;
 655         int i, cnt = size / sizeof (src[0]);
 656 
 657         ASSERT(pqr->p && pqr->q && pqr->r);
 658 
 659         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 660                 *pqr->p ^= *src;
 661                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 662                 *pqr->q ^= *src;
 663                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 664                 *pqr->r ^= *src;
 665         }
 666 
 667         return (0);
 668 }
 669 
 670 static void
 671 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 672 {
 673         uint64_t *p;
 674         int c;
 675         abd_t *src;
 676 
 677         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 678                 src = rm->rm_col[c].rc_abd;
 679                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 680 
 681                 if (c == rm->rm_firstdatacol) {
 682                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 683                 } else {
 684                         struct pqr_struct pqr = { p, NULL, NULL };
 685                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 686                             vdev_raidz_p_func, &pqr);
 687                 }
 688         }
 689 }
 690 
 691 static void
 692 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 693 {
 694         uint64_t *p, *q, pcnt, ccnt, mask, i;
 695         int c;
 696         abd_t *src;
 697 
 698         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 699         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 700             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 701 
 702         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 703                 src = rm->rm_col[c].rc_abd;
 704                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 705                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 706 
 707                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 708 
 709                 if (c == rm->rm_firstdatacol) {
 710                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 711                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 712                 } else {
 713                         struct pqr_struct pqr = { p, q, NULL };
 714                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 715                             vdev_raidz_pq_func, &pqr);
 716                 }
 717 
 718                 if (c == rm->rm_firstdatacol) {
 719                         for (i = ccnt; i < pcnt; i++) {
 720                                 p[i] = 0;
 721                                 q[i] = 0;
 722                         }
 723                 } else {
 724                         /*
 725                          * Treat short columns as though they are full of 0s.
 726                          * Note that there's therefore nothing needed for P.
 727                          */
 728                         for (i = ccnt; i < pcnt; i++) {
 729                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 730                         }
 731                 }
 732         }
 733 }
 734 
 735 static void
 736 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 737 {
 738         uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
 739         int c;
 740         abd_t *src;
 741 
 742         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 743         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 744             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 745         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 746             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 747 
 748         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 749                 src = rm->rm_col[c].rc_abd;
 750                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 751                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 752                 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
 753 
 754                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 755 
 756                 if (c == rm->rm_firstdatacol) {
 757                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 758                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 759                         (void) memcpy(r, p, rm->rm_col[c].rc_size);
 760                 } else {
 761                         struct pqr_struct pqr = { p, q, r };
 762                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 763                             vdev_raidz_pqr_func, &pqr);
 764                 }
 765 
 766                 if (c == rm->rm_firstdatacol) {
 767                         for (i = ccnt; i < pcnt; i++) {
 768                                 p[i] = 0;
 769                                 q[i] = 0;
 770                                 r[i] = 0;
 771                         }
 772                 } else {
 773                         /*
 774                          * Treat short columns as though they are full of 0s.
 775                          * Note that there's therefore nothing needed for P.
 776                          */
 777                         for (i = ccnt; i < pcnt; i++) {
 778                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 779                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
 780                         }
 781                 }
 782         }
 783 }
 784 
 785 /*
 786  * Generate RAID parity in the first virtual columns according to the number of
 787  * parity columns available.
 788  */
 789 static void
 790 vdev_raidz_generate_parity(raidz_map_t *rm)
 791 {
 792         switch (rm->rm_firstdatacol) {
 793         case 1:
 794                 vdev_raidz_generate_parity_p(rm);
 795                 break;
 796         case 2:
 797                 vdev_raidz_generate_parity_pq(rm);
 798                 break;
 799         case 3:
 800                 vdev_raidz_generate_parity_pqr(rm);
 801                 break;
 802         default:
 803                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 804         }
 805 }
 806 
 807 /* ARGSUSED */
 808 static int
 809 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 810 {
 811         uint64_t *dst = dbuf;
 812         uint64_t *src = sbuf;
 813         int cnt = size / sizeof (src[0]);
 814 
 815         for (int i = 0; i < cnt; i++) {
 816                 dst[i] ^= src[i];
 817         }
 818 
 819         return (0);
 820 }
 821 
 822 /* ARGSUSED */
 823 static int
 824 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
 825     void *private)
 826 {
 827         uint64_t *dst = dbuf;
 828         uint64_t *src = sbuf;
 829         uint64_t mask;
 830         int cnt = size / sizeof (dst[0]);
 831 
 832         for (int i = 0; i < cnt; i++, dst++, src++) {
 833                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 834                 *dst ^= *src;
 835         }
 836 
 837         return (0);
 838 }
 839 
 840 /* ARGSUSED */
 841 static int
 842 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 843 {
 844         uint64_t *dst = buf;
 845         uint64_t mask;
 846         int cnt = size / sizeof (dst[0]);
 847 
 848         for (int i = 0; i < cnt; i++, dst++) {
 849                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 850                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 851         }
 852 
 853         return (0);
 854 }
 855 
/*
 * State for vdev_raidz_reconst_q_post_func(); q is a cursor that
 * advances across successive callback invocations.
 */
struct reconst_q_struct {
	uint64_t *q;	/* cursor into the saved Q parity buffer */
	int exp;	/* exponent passed to vdev_raidz_exp2() per byte */
};
 860 
 861 static int
 862 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 863 {
 864         struct reconst_q_struct *rq = private;
 865         uint64_t *dst = buf;
 866         int cnt = size / sizeof (dst[0]);
 867 
 868         for (int i = 0; i < cnt; i++, dst++, rq->q++) {
 869                 *dst ^= *rq->q;
 870 
 871                 int j;
 872                 uint8_t *b;
 873                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 874                         *b = vdev_raidz_exp2(*b, rq->exp);
 875                 }
 876         }
 877 
 878         return (0);
 879 }
 880 
/*
 * State shared across vdev_raidz_reconst_pq_func() calls; the pointer
 * members are cursors that advance as the ABD iteration proceeds.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* original P parity */
	uint8_t *q;	/* original Q parity */
	uint8_t *pxy;	/* P recomputed with the target columns zeroed */
	uint8_t *qxy;	/* Q recomputed with the target columns zeroed */
	int aexp;	/* log of coefficient A, applied to (P ^ Pxy) */
	int bexp;	/* log of coefficient B, applied to (Q ^ Qxy) */
};
 889 
 890 static int
 891 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 892 {
 893         struct reconst_pq_struct *rpq = private;
 894         uint8_t *xd = xbuf;
 895         uint8_t *yd = ybuf;
 896 
 897         for (int i = 0; i < size;
 898             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 899                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 900                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 901                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
 902         }
 903 
 904         return (0);
 905 }
 906 
 907 static int
 908 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 909 {
 910         struct reconst_pq_struct *rpq = private;
 911         uint8_t *xd = xbuf;
 912 
 913         for (int i = 0; i < size;
 914             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 915                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
 916                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 917                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 918         }
 919 
 920         return (0);
 921 }
 922 
 923 static int
 924 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 925 {
 926         int x = tgts[0];
 927         int c;
 928         abd_t *dst, *src;
 929 
 930         ASSERT(ntgts == 1);
 931         ASSERT(x >= rm->rm_firstdatacol);
 932         ASSERT(x < rm->rm_cols);
 933 
 934         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
 935         ASSERT(rm->rm_col[x].rc_size > 0);
 936 
 937         src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
 938         dst = rm->rm_col[x].rc_abd;
 939 
 940         abd_copy(dst, src, rm->rm_col[x].rc_size);
 941 
 942         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 943                 uint64_t size = MIN(rm->rm_col[x].rc_size,
 944                     rm->rm_col[c].rc_size);
 945 
 946                 src = rm->rm_col[c].rc_abd;
 947                 dst = rm->rm_col[x].rc_abd;
 948 
 949                 if (c == x)
 950                         continue;
 951 
 952                 (void) abd_iterate_func2(dst, src, 0, 0, size,
 953                     vdev_raidz_reconst_p_func, NULL);
 954         }
 955 
 956         return (1 << VDEV_RAIDZ_P);
 957 }
 958 
/*
 * Rebuild the single data column x = tgts[0] using only the Q parity
 * column.  Returns the mask of parity columns used (Q only).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Accumulate, directly into column x, the Q contribution of all
	 * other data columns: for each successive column the running
	 * total is multiplied by 2 (inside the pre functions) and the
	 * column is XORed in.  Column x itself contributes a size of 0,
	 * so it receives only the multiply step.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			/* Seed the accumulator with the first column. */
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			/*
			 * A short column acts as zero-filled, so its tail
			 * still requires the multiply on the accumulator.
			 */
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	/*
	 * XOR in the stored Q parity, then scale each byte through
	 * vdev_raidz_exp2(byte, exp) to recover the original data.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
1002 
/*
 * Rebuild the two data columns x = tgts[0] and y = tgts[1] (x < y)
 * using both the P and Q parity columns.  The approach is to regenerate
 * P and Q as though both target columns were zero-filled (Pxy, Qxy) and
 * then solve the resulting pair of linear equations for D_x and D_y;
 * see the derivation in the comment below.  Returns the mask of parity
 * columns used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	/* Convert A and B to log form for the byte-wise multiplies. */
	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	/* Both columns overlap for ysize; only D_x exists past that. */
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
1091 
1092 /* BEGIN CSTYLED */
1093 /*
1094  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
1096  * the contents of the data and parity disks. This can be expressed with
1097  * vectors for the original data (D) and the actual data (d) and parity (p)
1098  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1099  *
1100  *            __   __                     __     __
1101  *            |     |         __     __   |  p_0  |
1102  *            |  V  |         |  D_0  |   | p_m-1 |
1103  *            |     |    x    |   :   | = |  d_0  |
1104  *            |  I  |         | D_n-1 |   |   :   |
1105  *            |     |         ~~     ~~   | d_n-1 |
1106  *            ~~   ~~                     ~~     ~~
1107  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
1110  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1111  * computation as well as linear separability.
1112  *
1113  *      __               __               __     __
1114  *      |   1   ..  1 1 1 |               |  p_0  |
1115  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1116  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1117  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1118  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1119  *      |   :       : : : |   |   :   |   |  d_2  |
1120  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1121  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1122  *      |   0   ..  0 0 1 |               | d_n-1 |
1123  *      ~~               ~~               ~~     ~~
1124  *
1125  * Note that I, V, d, and p are known. To compute D, we must invert the
1126  * matrix and use the known data and parity values to reconstruct the unknown
1127  * data values. We begin by removing the rows in V|I and d|p that correspond
1128  * to failed or missing columns; we then make V|I square (n x n) and d|p
1129  * sized n by removing rows corresponding to unused parity from the bottom up
1130  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1131  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1132  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1133  *           __                               __
1134  *           |  1   1   1   1   1   1   1   1  |
1135  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1136  *           |  19 205 116  29  64  16  4   1  |      / /
1137  *           |  1   0   0   0   0   0   0   0  |     / /
1138  *           |  0   1   0   0   0   0   0   0  | <--' /
1139  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1140  *           |  0   0   0   1   0   0   0   0  |
1141  *           |  0   0   0   0   1   0   0   0  |
1142  *           |  0   0   0   0   0   1   0   0  |
1143  *           |  0   0   0   0   0   0   1   0  |
1144  *           |  0   0   0   0   0   0   0   1  |
1145  *           ~~                               ~~
1146  *           __                               __
1147  *           |  1   1   1   1   1   1   1   1  |
1148  *           |  19 205 116  29  64  16  4   1  |
1149  *           |  1   0   0   0   0   0   0   0  |
1150  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1151  *           |  0   0   0   0   1   0   0   0  |
1152  *           |  0   0   0   0   0   1   0   0  |
1153  *           |  0   0   0   0   0   0   1   0  |
1154  *           |  0   0   0   0   0   0   0   1  |
1155  *           ~~                               ~~
1156  *
1157  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1158  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1159  * matrix is not singular.
1160  * __                                                                 __
1161  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1162  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1163  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1164  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1165  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1166  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1167  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1168  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1169  * ~~                                                                 ~~
1170  * __                                                                 __
1171  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1172  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1173  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1174  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1175  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1176  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1177  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1178  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1179  * ~~                                                                 ~~
1180  * __                                                                 __
1181  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1182  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1183  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1184  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1185  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1186  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1187  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1188  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1189  * ~~                                                                 ~~
1190  * __                                                                 __
1191  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1192  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1193  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1194  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1195  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1196  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1197  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1198  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1199  * ~~                                                                 ~~
1200  * __                                                                 __
1201  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1202  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1203  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1204  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1205  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1206  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1207  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1208  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1209  * ~~                                                                 ~~
1210  * __                                                                 __
1211  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1212  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1213  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1214  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1215  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1216  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1217  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1218  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1219  * ~~                                                                 ~~
1220  *                   __                               __
1221  *                   |  0   0   1   0   0   0   0   0  |
1222  *                   | 167 100  5   41 159 169 217 208 |
1223  *                   | 166 100  4   40 158 168 216 209 |
1224  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1225  *                   |  0   0   0   0   1   0   0   0  |
1226  *                   |  0   0   0   0   0   1   0   0  |
1227  *                   |  0   0   0   0   0   0   1   0  |
1228  *                   |  0   0   0   0   0   0   0   1  |
1229  *                   ~~                               ~~
1230  *
1231  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1232  * of the missing data.
1233  *
1234  * As is apparent from the example above, the only non-trivial rows in the
1235  * inverse matrix correspond to the data disks that we're trying to
1236  * reconstruct. Indeed, those are the only rows we need as the others would
1237  * only be useful for reconstructing data known or assumed to be valid. For
1238  * that reason, we only build the coefficients in the rows that correspond to
1239  * targeted columns.
1240  */
1241 /* END CSTYLED */
1242 
/*
 * Fill in the rows of the reconstruction matrix that correspond to the
 * parity columns in use.  map[i] identifies the parity column (0 = P,
 * 1 = Q, 2 = R), and row i becomes the Vandermonde row for generator
 * 2^map[i]: rows[i][j] = (2^map[i])^(n - 1 - j), with all exponents
 * kept within [0, 255) for indexing vdev_raidz_pow2.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start at map[i] * n, reduced mod 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1272 
/*
 * Invert the reconstruction matrix using Gauss-Jordan elimination.
 * rows[] holds the nmissing non-trivial rows of (V|I)' that correspond
 * to the missing data columns (missing[]); on return invrows[] holds
 * the matching rows of the inverse.  used[] maps matrix columns back to
 * raidz map columns and must list the parity columns first (asserted
 * below).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate the pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1366 
/*
 * Regenerate the missing data columns using the inverted matrix rows.
 * For each surviving column i (used[i]), multiply its bytes by the
 * matrix entries invrows[j][i] and accumulate (XOR) the products into
 * each missing column j.  Multiplication is performed in log space via
 * the vdev_raidz_log2/vdev_raidz_pow2 tables.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch space for the logs of the inverse-matrix entries. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the log of every (nonzero) inverse-matrix entry. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log is only used when *src != 0; 0 has no log. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs, mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first used column initializes each
				 * destination; later columns accumulate.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1442 
1443 static int
1444 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1445 {
1446         int n, i, c, t, tt;
1447         int nmissing_rows;
1448         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1449         int parity_map[VDEV_RAIDZ_MAXPARITY];
1450 
1451         uint8_t *p, *pp;
1452         size_t psize;
1453 
1454         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1455         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1456         uint8_t *used;
1457 
1458         abd_t **bufs = NULL;
1459 
1460         int code = 0;
1461 
1462         /*
1463          * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1464          * temporary linear ABDs.
1465          */
1466         if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1467                 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1468 
1469                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1470                         raidz_col_t *col = &rm->rm_col[c];
1471 
1472                         bufs[c] = col->rc_abd;
1473                         col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1474                         abd_copy(col->rc_abd, bufs[c], col->rc_size);
1475                 }
1476         }
1477 
1478         n = rm->rm_cols - rm->rm_firstdatacol;
1479 
1480         /*
1481          * Figure out which data columns are missing.
1482          */
1483         nmissing_rows = 0;
1484         for (t = 0; t < ntgts; t++) {
1485                 if (tgts[t] >= rm->rm_firstdatacol) {
1486                         missing_rows[nmissing_rows++] =
1487                             tgts[t] - rm->rm_firstdatacol;
1488                 }
1489         }
1490 
1491         /*
1492          * Figure out which parity columns to use to help generate the missing
1493          * data columns.
1494          */
1495         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1496                 ASSERT(tt < ntgts);
1497                 ASSERT(c < rm->rm_firstdatacol);
1498 
1499                 /*
1500                  * Skip any targeted parity columns.
1501                  */
1502                 if (c == tgts[tt]) {
1503                         tt++;
1504                         continue;
1505                 }
1506 
1507                 code |= 1 << c;
1508 
1509                 parity_map[i] = c;
1510                 i++;
1511         }
1512 
1513         ASSERT(code != 0);
1514         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1515 
1516         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1517             nmissing_rows * n + sizeof (used[0]) * n;
1518         p = kmem_alloc(psize, KM_SLEEP);
1519 
1520         for (pp = p, i = 0; i < nmissing_rows; i++) {
1521                 rows[i] = pp;
1522                 pp += n;
1523                 invrows[i] = pp;
1524                 pp += n;
1525         }
1526         used = pp;
1527 
1528         for (i = 0; i < nmissing_rows; i++) {
1529                 used[i] = parity_map[i];
1530         }
1531 
1532         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1533                 if (tt < nmissing_rows &&
1534                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1535                         tt++;
1536                         continue;
1537                 }
1538 
1539                 ASSERT3S(i, <, n);
1540                 used[i] = c;
1541                 i++;
1542         }
1543 
1544         /*
1545          * Initialize the interesting rows of the matrix.
1546          */
1547         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1548 
1549         /*
1550          * Invert the matrix.
1551          */
1552         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1553             invrows, used);
1554 
1555         /*
1556          * Reconstruct the missing data using the generated matrix.
1557          */
1558         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1559             invrows, used);
1560 
1561         kmem_free(p, psize);
1562 
1563         /*
1564          * copy back from temporary linear abds and free them
1565          */
1566         if (bufs) {
1567                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1568                         raidz_col_t *col = &rm->rm_col[c];
1569 
1570                         abd_copy(bufs[c], col->rc_abd, col->rc_size);
1571                         abd_free(col->rc_abd);
1572                         col->rc_abd = bufs[c];
1573                 }
1574                 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1575         }
1576 
1577         return (code);
1578 }
1579 
/*
 * Reconstruct the columns listed in t[0..nt-1] (which must be sorted
 * ascending), plus any additional columns already marked with errors in
 * the map.  Uses the optimized P/Q/PQ routines when the required parity
 * is available, otherwise falls back to the general matrix-based
 * reconstruction.  Returns a bitmask identifying the parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/*
	 * Merge the caller-specified targets with any columns that recorded
	 * I/O errors, building the full sorted target list in tgts[] while
	 * counting down from the worst case to the true number of bad
	 * parity and data columns.
	 */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* tgts[] is sorted, so the data targets follow the parity targets. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	/* General Reed-Solomon reconstruction handles everything else. */
	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1658 
1659 static int
1660 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1661     uint64_t *ashift)
1662 {
1663         vdev_t *cvd;
1664         uint64_t nparity = vd->vdev_nparity;
1665         int c;
1666         int lasterror = 0;
1667         int numerrors = 0;
1668 
1669         ASSERT(nparity > 0);
1670 
1671         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1672             vd->vdev_children < nparity + 1) {
1673                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1674                 return (SET_ERROR(EINVAL));
1675         }
1676 
1677         vdev_open_children(vd);
1678 
1679         for (c = 0; c < vd->vdev_children; c++) {
1680                 cvd = vd->vdev_child[c];
1681 
1682                 if (cvd->vdev_open_error != 0) {
1683                         lasterror = cvd->vdev_open_error;
1684                         numerrors++;
1685                         continue;
1686                 }
1687 
1688                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1689                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1690                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1691         }
1692 
1693         *asize *= vd->vdev_children;
1694         *max_asize *= vd->vdev_children;
1695 
1696         if (numerrors > nparity) {
1697                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1698                 return (lasterror);
1699         }
1700 
1701         return (0);
1702 }
1703 
1704 static void
1705 vdev_raidz_close(vdev_t *vd)
1706 {
1707         int c;
1708 
1709         for (c = 0; c < vd->vdev_children; c++)
1710                 vdev_close(vd->vdev_child[c]);
1711 }
1712 
1713 /*
1714  * Handle a read or write I/O to a RAID-Z dump device.
1715  *
1716  * The dump device is in a unique situation compared to other ZFS datasets:
1717  * writing to this device should be as simple and fast as possible.  In
1718  * addition, durability matters much less since the dump will be extracted
1719  * once the machine reboots.  For that reason, this function eschews parity for
1720  * performance and simplicity.  The dump device uses the checksum setting
1721  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1722  * dataset.
1723  *
1724  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1725  * 128 KB will not fill an entire block; in addition, they may not be properly
1726  * aligned.  In that case, this function uses the preallocated 128 KB block and
1727  * omits reading or writing any "empty" portions of that block, as opposed to
1728  * allocating a fresh appropriately-sized block.
1729  *
1730  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1731  *
1732  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1733  *
1734  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1735  * allocated which spans all five child vdevs.  8 KB of data would be written to
1736  * each of four vdevs, with the fifth containing the parity bits.
1737  *
1738  *       parity    data     data     data     data
1739  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1740  *         ^        ^        ^        ^        ^
1741  *         |        |        |        |        |
1742  *   8 KB parity    ------8 KB data blocks------
1743  *
1744  * However, when writing to the dump device, the behavior is different:
1745  *
1746  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1747  *
1748  * Unlike the normal RAID-Z case in which the block is allocated based on the
1749  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1750  * I/O size is less than 128 KB, only the actual portions of data are written.
1751  * In this example the data is written to the third data vdev since that vdev
1752  * contains the offset [64 KB, 96 KB).
1753  *
1754  *       parity    data     data     data     data
1755  *     |        |        |        |   XX   |        |
1756  *                                    ^
1757  *                                    |
1758  *                             32 KB data block
1759  *
1760  * As a result, an individual I/O may not span all child vdevs; moreover, a
1761  * small I/O may only operate on a single child vdev.
1762  *
1763  * Note that since there are no parity bits calculated or written, this format
1764  * remains the same no matter how many parity bits are used in a normal RAID-Z
1765  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1766  * would look like:
1767  *
1768  *       parity   parity   parity    data     data     data     data
1769  *     |        |        |        |        |        |   XX   |        |
1770  *                                                      ^
1771  *                                                      |
1772  *                                               32 KB data block
1773  */
/*
 * Perform a synchronous read or write of `size` bytes at `offset` to/from
 * the RAID-Z dump device.  `origoffset` is the start of the preallocated
 * 128 KB block containing the requested range (see the block comment
 * above); no parity is read or written.  Returns 0 on success or the
 * first errno returned by vdev_disk_physio().
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk the data columns only (parity is skipped entirely).  The
	 * increment expression uses rc, which is assigned at the top of the
	 * body before the increment ever runs.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}
1861 
1862 static uint64_t
1863 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1864 {
1865         uint64_t asize;
1866         uint64_t ashift = vd->vdev_top->vdev_ashift;
1867         uint64_t cols = vd->vdev_children;
1868         uint64_t nparity = vd->vdev_nparity;
1869 
1870         asize = ((psize - 1) >> ashift) + 1;
1871         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1872         asize = roundup(asize, nparity + 1) << ashift;
1873 
1874         return (asize);
1875 }
1876 
1877 static void
1878 vdev_raidz_child_done(zio_t *zio)
1879 {
1880         raidz_col_t *rc = zio->io_private;
1881 
1882         rc->rc_error = zio->io_error;
1883         rc->rc_tried = 1;
1884         rc->rc_skipped = 0;
1885 }
1886 
1887 /*
1888  * Start an IO operation on a RAIDZ VDev
1889  *
1890  * Outline:
1891  * - For write operations:
1892  *   1. Generate the parity data
1893  *   2. Create child zio write operations to each column's vdev, for both
1894  *      data and parity.
1895  *   3. If the column skips any sectors for padding, create optional dummy
1896  *      write zio children for those areas to improve aggregation continuity.
1897  * - For read operations:
1898  *   1. Create child zio read operations to each data column's vdev to read
1899  *      the range of data required for zio.
1900  *   2. If this is a scrub or resilver operation, or if any of the data
1901  *      vdevs have had errors, then create zio read operations to the parity
1902  *      columns' VDevs as well.
1903  */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay this I/O out across the child vdevs. */
	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, parity and data. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;	/* wrap around to the first column */
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Child can't be read at all; mark column missing. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;    /* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child is missing the data for this txg; skip it. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read data columns; also read parity when data is
		 * already known missing or when scrubbing/resilvering.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1995 
1996 
1997 /*
1998  * Report a checksum error for a child of a RAID-Z device.
1999  */
2000 static void
2001 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2002 {
2003         void *buf;
2004         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2005 
2006         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2007                 zio_bad_cksum_t zbc;
2008                 raidz_map_t *rm = zio->io_vsd;
2009 
2010                 mutex_enter(&vd->vdev_stat_lock);
2011                 vd->vdev_stat.vs_checksum_errors++;
2012                 mutex_exit(&vd->vdev_stat_lock);
2013 
2014                 zbc.zbc_has_cksum = 0;
2015                 zbc.zbc_injected = rm->rm_ecksuminjected;
2016 
2017                 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2018                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2019                     rc->rc_offset, rc->rc_size, buf, bad_data,
2020                     &zbc);
2021                 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2022         }
2023 }
2024 
2025 /*
2026  * We keep track of whether or not there were any injected errors, so that
2027  * any ereports we generate can note it.
2028  */
2029 static int
2030 raidz_checksum_verify(zio_t *zio)
2031 {
2032         zio_bad_cksum_t zbc;
2033         raidz_map_t *rm = zio->io_vsd;
2034 
2035         int ret = zio_checksum_error(zio, &zbc);
2036         if (ret != 0 && zbc.zbc_injected != 0)
2037                 rm->rm_ecksuminjected = 1;
2038 
2039         return (ret);
2040 }
2041 
2042 /*
2043  * Generate the parity from the data columns. If we tried and were able to
2044  * read the parity without error, verify that the generated parity matches the
2045  * data we read. If it doesn't, we fire off a checksum error. Return the
2046  * number such failures.
2047  */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* Dump-device blocks carry no parity, so there is nothing to check. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/*
	 * Save a copy of each parity column that was actually read without
	 * error, so we can compare it against the regenerated parity below.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	/* Regenerate parity from the data columns (overwrites rc_abd). */
	vdev_raidz_generate_parity(rm);

	/*
	 * Compare saved on-disk parity against the freshly generated parity.
	 * This loop visits exactly the columns saved above: parity
	 * generation changes neither rc_tried nor rc_error, and rc_error is
	 * only set here after the condition is evaluated.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
2086 
2087 /*
2088  * Keep statistics on all the ways that we used parity to correct data.
2089  */
2090 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2091 
2092 static int
2093 vdev_raidz_worst_error(raidz_map_t *rm)
2094 {
2095         int error = 0;
2096 
2097         for (int c = 0; c < rm->rm_cols; c++)
2098                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2099 
2100         return (error);
2101 }
2102 
2103 /*
2104  * Iterate over all combinations of bad data and attempt a reconstruction.
2105  * Note that the algorithm below is non-optimal because it doesn't take into
2106  * account how reconstruction is actually performed. For example, with
2107  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2108  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2109  * cases we'd only use parity information in column 0.
2110  */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * Try reconstructing with one presumed-bad column, then two, up to
	 * the number of parity columns not already consumed by known errors.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Only one new buffer is needed per outer iteration. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Enumerate every combination of n target columns.  Each
		 * pass saves the targets' data, attempts reconstruction,
		 * and on checksum failure restores the data and advances
		 * to the next combination.
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Any target column whose read appeared to
				 * succeed must actually have held bad data;
				 * report it and mark the column in error.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* All combinations exhausted: n overshot by one in the for loop. */
	n--;
done:
	/* Free the n scratch buffers accumulated across outer iterations. */
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2260 
2261 /*
2262  * Complete an IO operation on a RAIDZ VDev
2263  *
2264  * Outline:
2265  * - For write operations:
2266  *   1. Check for errors on the child IOs.
2267  *   2. Return, setting an error code if too few child VDevs were written
2268  *      to reconstruct the data later.  Note that partial writes are
2269  *      considered successful if they can be reconstructed at all.
2270  * - For read operations:
2271  *   1. Check for errors on the child IOs.
2272  *   2. If data errors occurred:
2273  *      a. Try to reassemble the data from the parity available.
2274  *      b. If we haven't yet read the parity drives, read them now.
2275  *      c. If all parity drives have been read but the data still doesn't
2276  *         reassemble with a correct checksum, then try combinatorial
2277  *         reconstruction.
2278  *      d. If that doesn't work, return an error.
2279  *   3. If there were unexpected errors or this is a resilver operation,
2280  *      rewrite the vdevs that had errors.
2281  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors on columns not already skipped */
	int parity_errors = 0;		/* failed parity columns */
	int parity_untried = 0;		/* parity columns never issued a read */
	int data_errors = 0;		/* failed data columns */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* data columns needing reconstruction */
	int code;			/* reconstruction method (bitmask) */

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Classify the outcome of every column's child I/O.  Columns
	 * [0, rm_firstdatacol) hold parity; the rest hold data.  A column
	 * that was deliberately skipped (rc_skipped) does not count as an
	 * "unexpected" error.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			/*
			 * All data columns arrived intact; verify the block
			 * checksum directly -- no reconstruction needed.
			 */
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			/*
			 * Rebuild the failed columns from parity, then check
			 * whether the reconstructed block checksums correctly.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Find the first untried column; if one exists, re-arm the zio and
	 * issue reads for it and every remaining untried column, then return
	 * to wait for those children.  The inner do/while advances the same
	 * index 'c' as the outer for loop; the 'continue' inside it skips
	 * only columns already tried.  If every column was tried, the outer
	 * loop falls through to combinatorial reconstruction below.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		/* More failed columns than parity -- unrecoverable. */
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	/*
	 * We have known-good data in hand; asynchronously rewrite any column
	 * that reported an error so the child devices are repaired in place.
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2545 
2546 static void
2547 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2548 {
2549         if (faulted > vd->vdev_nparity)
2550                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2551                     VDEV_AUX_NO_REPLICAS);
2552         else if (degraded + faulted != 0)
2553                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2554         else
2555                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2556 }
2557 
/*
 * Operations vector registered for RAID-Z top-level vdevs.  Entries are
 * positional and must match the member order of vdev_ops_t in vdev_impl.h.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,	/* open: validate children, compute asize */
	vdev_raidz_close,	/* close */
	vdev_raidz_asize,	/* asize: logical -> allocated size */
	vdev_raidz_io_start,	/* io_start */
	vdev_raidz_io_done,	/* io_done: see outline above */
	vdev_raidz_state_change, /* state_change */
	NULL,			/* optional hook unused by raidz -- see vdev_ops_t */
	NULL,			/* optional hook unused by raidz -- see vdev_ops_t */
	NULL,			/* optional hook unused by raidz -- see vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};