1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  28  */
  29 
  30 #include <sys/zfs_context.h>
  31 #include <sys/spa.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/vdev_disk.h>
  35 #include <sys/vdev_file.h>
  36 #include <sys/vdev_raidz.h>
  37 #include <sys/zio.h>
  38 #include <sys/zio_checksum.h>
  39 #include <sys/abd.h>
  40 #include <sys/fs/zfs.h>
  41 #include <sys/fm/fs/zfs.h>
  42 #include <sys/dkioc_free_util.h>
  43 
  44 /*
  45  * Virtual device vector for RAID-Z.
  46  *
  47  * This vdev supports single, double, and triple parity. For single parity,
  48  * we use a simple XOR of all the data columns. For double or triple parity,
  49  * we use a special case of Reed-Solomon coding. This extends the
  50  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  51  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  52  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  53  * former is also based. The latter is designed to provide higher performance
  54  * for writes.
  55  *
  56  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  57  * amended six years later identifying a critical flaw that invalidates its
  58  * claims. Nevertheless, the technique can be adapted to work for up to
  59  * triple parity. For additional parity, the amendment "Note: Correction to
  60  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  61  * is viable, but the additional complexity means that write performance will
  62  * suffer.
  63  *
  64  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  66  * can be expressed with a single byte. Briefly, the operations on the
  67  * field are defined as follows:
  68  *
  69  *   o addition (+) is represented by a bitwise XOR
  70  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  71  *   o multiplication of A by 2 is defined by the following bitwise expression:
  72  *
  73  *      (A * 2)_7 = A_6
  74  *      (A * 2)_6 = A_5
  75  *      (A * 2)_5 = A_4
  76  *      (A * 2)_4 = A_3 + A_7
  77  *      (A * 2)_3 = A_2 + A_7
  78  *      (A * 2)_2 = A_1 + A_7
  79  *      (A * 2)_1 = A_0
  80  *      (A * 2)_0 = A_7
  81  *
  82  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  83  * As an aside, this multiplication is derived from the error correcting
  84  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  85  *
  86  * Observe that any number in the field (except for 0) can be expressed as a
  87  * power of 2 -- a generator for the field. We store a table of the powers of
  88  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  89  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  90  * than field addition). The inverse of a field element A (A^-1) is therefore
  91  * A ^ (255 - 1) = A^254.
  92  *
  93  * The up-to-three parity columns, P, Q, R over several data columns,
  94  * D_0, ... D_n-1, can be expressed by field operations:
  95  *
  96  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  97  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  98  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  99  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 100  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 101  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 103  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 104  * independent coefficients. (There are no additional coefficients that have
 105  * this property which is why the uncorrected Plank method breaks down.)
 106  *
 * See the reconstruction code below for how P, Q and R can be used
 108  * or in concert to recover missing data columns.
 109  */
 110 
/*
 * Per-column state for a single raidz I/O: one entry per child vdev the
 * I/O touches (parity columns first, then data columns).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
 121 
/*
 * Geometry and per-column state for one logical I/O laid out across the
 * raidz children.  Built by vdev_raidz_map_alloc(); freed only when both
 * the owning ZIO (rm_freed) and any checksum reports (rm_reports) have
 * released it.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	/* Pre-C99 flexible-array idiom; actually sized to rm_scols entries */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
 138 
/* Indices of the parity columns within rm_col[]. */
#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1
#define VDEV_RAIDZ_R            2

/*
 * Multiply a single byte by 2 (resp. 4) in the GF(2^8) field described in
 * the block comment at the top of this file.
 */
#define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 145 
 146 /*
 147  * We provide a mechanism to perform the field multiplication operation on a
 148  * 64-bit value all at once rather than a byte at a time. This works by
 149  * creating a mask from the top bit in each byte and using that to
 150  * conditionally apply the XOR of 0x1d.
 151  */
 152 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 153 { \
 154         (mask) = (x) & 0x8080808080808080ULL; \
 155         (mask) = ((mask) << 1) - ((mask) >> 7); \
 156         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 157             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 158 }
 159 
 160 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 161 { \
 162         VDEV_RAIDZ_64MUL_2((x), mask); \
 163         VDEV_RAIDZ_64MUL_2((x), mask); \
 164 }
 165 
 166 #define VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
 167 
 168 /*
 169  * Force reconstruction to use the general purpose method.
 170  */
 171 int vdev_raidz_default_to_general;
 172 
 173 /*
 174  * xor_p hook for external acceleration libraries.
 175  */
 176 int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
 177 
 178 /*
 179  * These two tables represent powers and logs of 2 in the Galois field defined
 180  * above. These values were computed by repeatedly multiplying by 2 as above.
 181  */
 182 static const uint8_t vdev_raidz_pow2[256] = {
 183         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 184         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 185         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 186         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 187         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 188         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 189         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 190         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 191         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 192         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 193         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 194         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 195         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 196         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 197         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 198         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 199         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 200         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 201         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 202         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 203         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 204         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 205         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 206         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 207         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 208         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 209         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 210         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 211         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 212         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 213         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 214         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 215 };
 216 /* Logs of 2 in the Galois field defined above. */
 217 static const uint8_t vdev_raidz_log2[256] = {
 218         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 219         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 220         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 221         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 222         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 223         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 224         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 225         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 226         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 227         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 228         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 229         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 230         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 231         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 232         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 233         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 234         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 235         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 236         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 237         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 238         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 239         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 240         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 241         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 242         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 243         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 244         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 245         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 246         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 247         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 248         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 249         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 250 };
 251 
/* Forward declarations for routines defined later in this file. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
static void vdev_raidz_trim_done(zio_t *zio);
 254 
 255 /*
 256  * Multiply a given number by 2 raised to the given power.
 257  */
 258 static uint8_t
 259 vdev_raidz_exp2(uint_t a, int exp)
 260 {
 261         if (a == 0)
 262                 return (0);
 263 
 264         ASSERT(exp >= 0);
 265         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 266 
 267         exp += vdev_raidz_log2[a];
 268         if (exp > 255)
 269                 exp -= 255;
 270 
 271         return (vdev_raidz_pow2[exp]);
 272 }
 273 
 274 static void
 275 vdev_raidz_map_free(raidz_map_t *rm)
 276 {
 277         int c;
 278         size_t size;
 279 
 280         for (c = 0; c < rm->rm_firstdatacol; c++) {
 281                 /*
 282                  * TRIM doesn't allocate data blocks,
 283                  * so 'rc_abd' is NULL in this case.
 284                  * See vdev_raidz_trim() and vdev_raidz_map_alloc()
 285                  * for more details.
 286                  */
 287                 if (rm->rm_col[c].rc_abd != NULL)
 288                         abd_free(rm->rm_col[c].rc_abd);
 289 
 290                 if (rm->rm_col[c].rc_gdata != NULL)
 291                         zio_buf_free(rm->rm_col[c].rc_gdata,
 292                             rm->rm_col[c].rc_size);
 293         }
 294 
 295         size = 0;
 296         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 297                 /*
 298                  * TRIM doesn't allocate data blocks,
 299                  * so 'rc_abd' is NULL in this case
 300                  * See vdev_raidz_trim() and vdev_raidz_map_alloc()
 301                  * for more details.
 302                  */
 303                 if (rm->rm_col[c].rc_abd != NULL)
 304                         abd_put(rm->rm_col[c].rc_abd);
 305                 size += rm->rm_col[c].rc_size;
 306         }
 307 
 308         if (rm->rm_abd_copy != NULL)
 309                 abd_free(rm->rm_abd_copy);
 310 
 311         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 312 }
 313 
 314 static void
 315 vdev_raidz_map_free_vsd(zio_t *zio)
 316 {
 317         raidz_map_t *rm = zio->io_vsd;
 318 
 319         ASSERT0(rm->rm_freed);
 320         rm->rm_freed = 1;
 321 
 322         if (rm->rm_reports == 0)
 323                 vdev_raidz_map_free(rm);
 324 }
 325 
 326 /*ARGSUSED*/
 327 static void
 328 vdev_raidz_cksum_free(void *arg, size_t ignored)
 329 {
 330         raidz_map_t *rm = arg;
 331 
 332         ASSERT3U(rm->rm_reports, >, 0);
 333 
 334         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 335                 vdev_raidz_map_free(rm);
 336 }
 337 
/*
 * Checksum-report finish callback: produce the "good" and "bad" versions
 * of the column this report refers to and hand them to the ereport code.
 * For a data column the good bytes come straight out of `good_data'; for
 * a parity column they must first be regenerated from the good data.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column this report refers to */
	size_t x;

	const char *good = NULL;
	char *bad;

	/* Without the good data we can only post an incomplete report. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			/*
			 * Re-point the data columns at the aside copy made
			 * in vdev_raidz_cksum_report().
			 */
			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
 422 
 423 /*
 424  * Invoked indirectly by zfs_ereport_start_checksum(), called
 425  * below when our read operation fails completely.  The main point
 426  * is to keep a copy of everything we read from disk, so that at
 427  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 428  */
 429 static void
 430 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 431 {
 432         size_t c = (size_t)(uintptr_t)arg;
 433         size_t offset;
 434 
 435         raidz_map_t *rm = zio->io_vsd;
 436         size_t size;
 437 
 438         /* set up the report and bump the refcount  */
 439         zcr->zcr_cbdata = rm;
 440         zcr->zcr_cbinfo = c;
 441         zcr->zcr_finish = vdev_raidz_cksum_finish;
 442         zcr->zcr_free = vdev_raidz_cksum_free;
 443 
 444         rm->rm_reports++;
 445         ASSERT3U(rm->rm_reports, >, 0);
 446 
 447         if (rm->rm_abd_copy != NULL)
 448                 return;
 449 
 450         /*
 451          * It's the first time we're called for this raidz_map_t, so we need
 452          * to copy the data aside; there's no guarantee that our zio's buffer
 453          * won't be re-used for something else.
 454          *
 455          * Our parity data is already in separate buffers, so there's no need
 456          * to copy them.
 457          */
 458 
 459         size = 0;
 460         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 461                 size += rm->rm_col[c].rc_size;
 462 
 463         rm->rm_abd_copy =
 464             abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
 465 
 466         for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 467                 raidz_col_t *col = &rm->rm_col[c];
 468                 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
 469 
 470                 abd_copy(tmp, col->rc_abd, col->rc_size);
 471                 abd_put(col->rc_abd);
 472                 col->rc_abd = tmp;
 473 
 474                 offset += col->rc_size;
 475         }
 476         ASSERT3U(offset, ==, size);
 477 }
 478 
/* vdev-specific-data callbacks installed on raidz ZIOs. */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
 483 
 484 /*
 485  * Allocates and computes a raidz column map, which directs the raidz column
 486  * handling algorithms where to locate and store data and parity columns for
 487  * a particular DVA. Usually, dcols is the number of children in the target
 488  * vdev.
 489  *
 * The `offset', `size' and `abd' parameters hold the offset, size and data
 * of the zio for which this map is to be computed.
 492  * The `unit_shift' parameter contains the minimum allocation bitshift of
 493  * the storage pool. The `dcols' parameter contains the number of drives in
 494  * this raidz vdev (including parity drives), with `nparity' denoting how
 495  * many those contain the parity (one, two or three).
 496  *
 * The `alloc_data' flag denotes whether you want the constructed raidz
 * map to contain allocated buffers to hold column IO data or not (if
 * you're using this function simply to determine raidz geometry, you'll
 * want to pass B_FALSE here).
 501  */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
    boolean_t alloc_data)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* rm_col[] is sized to scols entries, not the nominal one. */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Lay out the columns: the first `nparity' entries are parity,
	 * the rest data, rotating around the children starting at `f'.
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around; this column is one sector deeper. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (alloc_data) {
		/*
		 * Parity columns get freshly-allocated buffers; data
		 * columns are views carved out of the caller's abd in
		 * column order.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_abd =
			    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
		}

		/* Here c == rm_firstdatacol: the first data column. */
		rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
		off = rm->rm_col[c].rc_size;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
			off += rm->rm_col[c].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
 652 
/*
 * Cursors into the P, Q and R parity buffers, advanced in lock-step with
 * the data by the vdev_raidz_{p,pq,pqr}_func() iterator callbacks below.
 * Pointers for parity levels not in use are NULL (asserted in each func).
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
 658 
 659 static int
 660 vdev_raidz_p_func(void *buf, size_t size, void *private)
 661 {
 662         struct pqr_struct *pqr = private;
 663         const uint64_t *src = buf;
 664         int i, cnt = size / sizeof (src[0]);
 665 
 666         ASSERT(pqr->p && !pqr->q && !pqr->r);
 667 
 668         for (i = 0; i < cnt; i++, src++, pqr->p++)
 669                 *pqr->p ^= *src;
 670 
 671         return (0);
 672 }
 673 
 674 static int
 675 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 676 {
 677         struct pqr_struct *pqr = private;
 678         const uint64_t *src = buf;
 679         uint64_t mask;
 680         int i, cnt = size / sizeof (src[0]);
 681 
 682         ASSERT(pqr->p && pqr->q && !pqr->r);
 683 
 684         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 685                 *pqr->p ^= *src;
 686                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 687                 *pqr->q ^= *src;
 688         }
 689 
 690         return (0);
 691 }
 692 
 693 static int
 694 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 695 {
 696         struct pqr_struct *pqr = private;
 697         const uint64_t *src = buf;
 698         uint64_t mask;
 699         int i, cnt = size / sizeof (src[0]);
 700 
 701         ASSERT(pqr->p && pqr->q && pqr->r);
 702 
 703         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 704                 *pqr->p ^= *src;
 705                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 706                 *pqr->q ^= *src;
 707                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 708                 *pqr->r ^= *src;
 709         }
 710 
 711         return (0);
 712 }
 713 
 714 /*
 715  * software acceleration of XOR calculations, requirements
 716  *
 717  * the (src/dst) vectors needs to be 64 byte aligned
 718  * all the vectors have to be the same size
 719  */
 720 #define RAIDZ_ACCELERATION_ALIGNMENT    64ul
 721 #define UNALIGNED(addr) \
 722         ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
 723 
 724 static void
 725 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 726 {
 727         uint64_t *p;
 728         int c;
 729         abd_t *src;
 730 
 731 #if 0
 732         /* FIXME: needs to be reviewed and changed to support ABD */
 733         int parity_done;
 734         void *va[16];
 735         void **array;
 736         int j, nvects;
 737 
 738         parity_done = 0;
 739         while (0 && zfs_xorp_hook && !parity_done) {
 740                 unsigned long no_accel = 0;
 741                 /* at least two columns (plus one for result) */
 742                 if (rm->rm_cols < 3) {
 743                         DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
 744                         break;
 745                 }
 746                 /* check sizes and alignment */
 747                 no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
 748                 if (no_accel) {
 749                         DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
 750                             no_accel);
 751                         break;
 752                 }
 753                 pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
 754                 nvects = 1; /* for the destination */
 755                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 756                         no_accel = UNALIGNED(rm->rm_col[c].rc_data);
 757                         if (no_accel) {
 758                                 DTRACE_PROBE1(raidz_unaligned_src,
 759                                     unsigned long, no_accel);
 760                                 break;
 761                         }
 762                         if (rm->rm_col[c].rc_size != pcount) {
 763                                 DTRACE_PROBE(raidz_sizes_vary);
 764                                 no_accel = 1;
 765                                 break;
 766                         }
 767                         nvects++;
 768                 }
 769                 if (no_accel)
 770                         break;
 771                 if (nvects > 16) {
 772                         array = kmem_alloc(nvects * sizeof (void *),
 773                             KM_NOSLEEP);
 774                         if (array == NULL) {
 775                                 DTRACE_PROBE(raidz_alloc_failed);
 776                                 break;
 777                         }
 778                 } else {
 779                         array = va;
 780                 }
 781                 for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
 782                     c++, j++) {
 783                         array[j] = rm->rm_col[c].rc_data;
 784                 }
 785                 array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 786                 if (zfs_xorp_hook(nvects,
 787                     rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
 788                         DTRACE_PROBE(raidz_accel_failure);
 789                         break;
 790                 }
 791                 if (array != va) {
 792                         kmem_free(array, nvects * sizeof (void *));
 793                 }
 794                 parity_done = 1;
 795                 DTRACE_PROBE(raidz_accel_success);
 796         }
 797         if (parity_done)
 798                 return;
 799 #endif
 800         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 801                 src = rm->rm_col[c].rc_abd;
 802                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 803 
 804                 if (c == rm->rm_firstdatacol) {
 805                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 806                 } else {
 807                         struct pqr_struct pqr = { p, NULL, NULL };
 808                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 809                             vdev_raidz_p_func, &pqr);
 810                 }
 811         }
 812 }
 813 
 814 static void
 815 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 816 {
 817         uint64_t *p, *q, pcnt, ccnt, mask, i;
 818         int c;
 819         abd_t *src;
 820 
 821         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 822         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 823             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 824 
 825         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 826                 src = rm->rm_col[c].rc_abd;
 827                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 828                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 829 
 830                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 831 
 832                 if (c == rm->rm_firstdatacol) {
 833                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 834                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 835                 } else {
 836                         struct pqr_struct pqr = { p, q, NULL };
 837                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 838                             vdev_raidz_pq_func, &pqr);
 839                 }
 840 
 841                 if (c == rm->rm_firstdatacol) {
 842                         for (i = ccnt; i < pcnt; i++) {
 843                                 p[i] = 0;
 844                                 q[i] = 0;
 845                         }
 846                 } else {
 847                         /*
 848                          * Treat short columns as though they are full of 0s.
 849                          * Note that there's therefore nothing needed for P.
 850                          */
 851                         for (i = ccnt; i < pcnt; i++) {
 852                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 853                         }
 854                 }
 855         }
 856 }
 857 
 858 static void
 859 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 860 {
 861         uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
 862         int c;
 863         abd_t *src;
 864 
 865         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 866         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 867             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 868         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 869             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 870 
 871         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 872                 src = rm->rm_col[c].rc_abd;
 873                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 874                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 875                 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
 876 
 877                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 878 
 879                 if (c == rm->rm_firstdatacol) {
 880                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 881                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 882                         (void) memcpy(r, p, rm->rm_col[c].rc_size);
 883                 } else {
 884                         struct pqr_struct pqr = { p, q, r };
 885                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 886                             vdev_raidz_pqr_func, &pqr);
 887                 }
 888 
 889                 if (c == rm->rm_firstdatacol) {
 890                         for (i = ccnt; i < pcnt; i++) {
 891                                 p[i] = 0;
 892                                 q[i] = 0;
 893                                 r[i] = 0;
 894                         }
 895                 } else {
 896                         /*
 897                          * Treat short columns as though they are full of 0s.
 898                          * Note that there's therefore nothing needed for P.
 899                          */
 900                         for (i = ccnt; i < pcnt; i++) {
 901                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 902                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
 903                         }
 904                 }
 905         }
 906 }
 907 
 908 /*
 909  * Generate RAID parity in the first virtual columns according to the number of
 910  * parity columns available.
 911  */
 912 static void
 913 vdev_raidz_generate_parity(raidz_map_t *rm)
 914 {
 915         switch (rm->rm_firstdatacol) {
 916         case 1:
 917                 vdev_raidz_generate_parity_p(rm);
 918                 break;
 919         case 2:
 920                 vdev_raidz_generate_parity_pq(rm);
 921                 break;
 922         case 3:
 923                 vdev_raidz_generate_parity_pqr(rm);
 924                 break;
 925         default:
 926                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 927         }
 928 }
 929 
 930 /* ARGSUSED */
 931 static int
 932 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 933 {
 934         uint64_t *dst = dbuf;
 935         uint64_t *src = sbuf;
 936         int cnt = size / sizeof (src[0]);
 937 
 938         for (int i = 0; i < cnt; i++) {
 939                 dst[i] ^= src[i];
 940         }
 941 
 942         return (0);
 943 }
 944 
 945 /* ARGSUSED */
 946 static int
 947 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
 948     void *private)
 949 {
 950         uint64_t *dst = dbuf;
 951         uint64_t *src = sbuf;
 952         uint64_t mask;
 953         int cnt = size / sizeof (dst[0]);
 954 
 955         for (int i = 0; i < cnt; i++, dst++, src++) {
 956                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 957                 *dst ^= *src;
 958         }
 959 
 960         return (0);
 961 }
 962 
 963 /* ARGSUSED */
 964 static int
 965 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 966 {
 967         uint64_t *dst = buf;
 968         uint64_t mask;
 969         int cnt = size / sizeof (dst[0]);
 970 
 971         for (int i = 0; i < cnt; i++, dst++) {
 972                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 973                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 974         }
 975 
 976         return (0);
 977 }
 978 
/* Iterator state for vdev_raidz_reconst_q_post_func(). */
struct reconst_q_struct {
	uint64_t *q;	/* cursor into the Q parity buffer */
	int exp;	/* GF(2^8) exponent applied to each result byte */
};
 983 
 984 static int
 985 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 986 {
 987         struct reconst_q_struct *rq = private;
 988         uint64_t *dst = buf;
 989         int cnt = size / sizeof (dst[0]);
 990 
 991         for (int i = 0; i < cnt; i++, dst++, rq->q++) {
 992                 *dst ^= *rq->q;
 993 
 994                 int j;
 995                 uint8_t *b;
 996                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 997                         *b = vdev_raidz_exp2(*b, rq->exp);
 998                 }
 999         }
1000 
1001         return (0);
1002 }
1003 
/* Iterator state for vdev_raidz_reconst_pq_func() and its tail variant. */
struct reconst_pq_struct {
	uint8_t *p;	/* cursor into the saved P parity */
	uint8_t *q;	/* cursor into the saved Q parity */
	uint8_t *pxy;	/* cursor into P computed with x, y zeroed (Pxy) */
	uint8_t *qxy;	/* cursor into Q computed with x, y zeroed (Qxy) */
	int aexp;	/* GF(2^8) log of coefficient A */
	int bexp;	/* GF(2^8) log of coefficient B */
};
1012 
1013 static int
1014 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1015 {
1016         struct reconst_pq_struct *rpq = private;
1017         uint8_t *xd = xbuf;
1018         uint8_t *yd = ybuf;
1019 
1020         for (int i = 0; i < size;
1021             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1022                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1023                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1024                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1025         }
1026 
1027         return (0);
1028 }
1029 
1030 static int
1031 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1032 {
1033         struct reconst_pq_struct *rpq = private;
1034         uint8_t *xd = xbuf;
1035 
1036         for (int i = 0; i < size;
1037             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1038                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1039                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1040                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1041         }
1042 
1043         return (0);
1044 }
1045 
1046 static int
1047 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
1048 {
1049         int x = tgts[0];
1050         int c;
1051         abd_t *dst, *src;
1052 
1053         ASSERT(ntgts == 1);
1054         ASSERT(x >= rm->rm_firstdatacol);
1055         ASSERT(x < rm->rm_cols);
1056 
1057         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
1058         ASSERT(rm->rm_col[x].rc_size > 0);
1059 
1060         src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1061         dst = rm->rm_col[x].rc_abd;
1062 
1063         abd_copy(dst, src, rm->rm_col[x].rc_size);
1064 
1065         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1066                 uint64_t size = MIN(rm->rm_col[x].rc_size,
1067                     rm->rm_col[c].rc_size);
1068 
1069                 src = rm->rm_col[c].rc_abd;
1070                 dst = rm->rm_col[x].rc_abd;
1071 
1072                 if (c == x)
1073                         continue;
1074 
1075                 (void) abd_iterate_func2(dst, src, 0, 0, size,
1076                     vdev_raidz_reconst_p_func, NULL);
1077         }
1078 
1079         return (1 << VDEV_RAIDZ_P);
1080 }
1081 
/*
 * Reconstruct a single missing data column (tgts[0]) from the Q parity.
 * First recompute the Q accumulator over the surviving data columns
 * (treating the missing column as zero), then XOR in the stored Q and
 * divide out the column's coefficient.  Returns the bitmask of parity
 * columns used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		/*
		 * Use size 0 for the missing column so it contributes
		 * nothing (equivalent to a column of zeros).
		 */
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			/*
			 * Seed the accumulator with the first column; if the
			 * first column is short (or is the target, size 0),
			 * zero the remainder of the target buffer.
			 */
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			/*
			 * Multiply the accumulator by 2 and fold in this
			 * column; the tail past a short column is multiplied
			 * only (as if the source were zero there).
			 */
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	/*
	 * XOR in the stored Q and multiply by the inverse of the target
	 * column's coefficient (2^exp in GF(2^8)) to recover the data.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
1125 
/*
 * Reconstruct two missing data columns, x = tgts[0] < y = tgts[1], using
 * both the P and Q parity columns.  Returns the bitmask of parity columns
 * used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	/* Temporary linear buffers receive the Pxy/Qxy parity. */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the real column sizes before reconstructing into them. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Compute the logs of A and B via the GF(2^8) pow2/log2 tables. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	/* Rebuild both columns up to ysize, then just D_x for the tail. */
	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
1214 
1215 /* BEGIN CSTYLED */
1216 /*
1217  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
1219  * the contents of the data and parity disks. This can be expressed with
1220  * vectors for the original data (D) and the actual data (d) and parity (p)
1221  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1222  *
1223  *            __   __                     __     __
1224  *            |     |         __     __   |  p_0  |
1225  *            |  V  |         |  D_0  |   | p_m-1 |
1226  *            |     |    x    |   :   | = |  d_0  |
1227  *            |  I  |         | D_n-1 |   |   :   |
1228  *            |     |         ~~     ~~   | d_n-1 |
1229  *            ~~   ~~                     ~~     ~~
1230  *
1231  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
1233  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1234  * computation as well as linear separability.
1235  *
1236  *      __               __               __     __
1237  *      |   1   ..  1 1 1 |               |  p_0  |
1238  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1239  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1240  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1241  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1242  *      |   :       : : : |   |   :   |   |  d_2  |
1243  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1244  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1245  *      |   0   ..  0 0 1 |               | d_n-1 |
1246  *      ~~               ~~               ~~     ~~
1247  *
1248  * Note that I, V, d, and p are known. To compute D, we must invert the
1249  * matrix and use the known data and parity values to reconstruct the unknown
1250  * data values. We begin by removing the rows in V|I and d|p that correspond
1251  * to failed or missing columns; we then make V|I square (n x n) and d|p
1252  * sized n by removing rows corresponding to unused parity from the bottom up
1253  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1254  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1255  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1256  *           __                               __
1257  *           |  1   1   1   1   1   1   1   1  |
1258  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1259  *           |  19 205 116  29  64  16  4   1  |      / /
1260  *           |  1   0   0   0   0   0   0   0  |     / /
1261  *           |  0   1   0   0   0   0   0   0  | <--' /
1262  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1263  *           |  0   0   0   1   0   0   0   0  |
1264  *           |  0   0   0   0   1   0   0   0  |
1265  *           |  0   0   0   0   0   1   0   0  |
1266  *           |  0   0   0   0   0   0   1   0  |
1267  *           |  0   0   0   0   0   0   0   1  |
1268  *           ~~                               ~~
1269  *           __                               __
1270  *           |  1   1   1   1   1   1   1   1  |
1271  *           |  19 205 116  29  64  16  4   1  |
1272  *           |  1   0   0   0   0   0   0   0  |
1273  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1274  *           |  0   0   0   0   1   0   0   0  |
1275  *           |  0   0   0   0   0   1   0   0  |
1276  *           |  0   0   0   0   0   0   1   0  |
1277  *           |  0   0   0   0   0   0   0   1  |
1278  *           ~~                               ~~
1279  *
1280  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1281  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1282  * matrix is not singular.
1283  * __                                                                 __
1284  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1285  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1286  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1287  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1288  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1289  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1290  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1291  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1292  * ~~                                                                 ~~
1293  * __                                                                 __
1294  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1295  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1296  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1297  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1298  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1299  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1300  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1301  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1302  * ~~                                                                 ~~
1303  * __                                                                 __
1304  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1305  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1306  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1307  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1308  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1309  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1310  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1311  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1312  * ~~                                                                 ~~
1313  * __                                                                 __
1314  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1315  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1316  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1317  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1318  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1319  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1320  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1321  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1322  * ~~                                                                 ~~
1323  * __                                                                 __
1324  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1325  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1326  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1327  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1328  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1329  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1330  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1331  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1332  * ~~                                                                 ~~
1333  * __                                                                 __
1334  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1335  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1336  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1337  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1338  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1339  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1340  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1341  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1342  * ~~                                                                 ~~
1343  *                   __                               __
1344  *                   |  0   0   1   0   0   0   0   0  |
1345  *                   | 167 100  5   41 159 169 217 208 |
1346  *                   | 166 100  4   40 158 168 216 209 |
1347  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1348  *                   |  0   0   0   0   1   0   0   0  |
1349  *                   |  0   0   0   0   0   1   0   0  |
1350  *                   |  0   0   0   0   0   0   1   0  |
1351  *                   |  0   0   0   0   0   0   0   1  |
1352  *                   ~~                               ~~
1353  *
1354  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1355  * of the missing data.
1356  *
1357  * As is apparent from the example above, the only non-trivial rows in the
1358  * inverse matrix correspond to the data disks that we're trying to
1359  * reconstruct. Indeed, those are the only rows we need as the others would
1360  * only be useful for reconstructing data known or assumed to be valid. For
1361  * that reason, we only build the coefficients in the rows that correspond to
1362  * targeted columns.
1363  */
1364 /* END CSTYLED */
1365 
1366 static void
1367 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1368     uint8_t **rows)
1369 {
1370         int i, j;
1371         int pow;
1372 
1373         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1374 
1375         /*
1376          * Fill in the missing rows of interest.
1377          */
1378         for (i = 0; i < nmap; i++) {
1379                 ASSERT3S(0, <=, map[i]);
1380                 ASSERT3S(map[i], <=, 2);
1381 
1382                 pow = map[i] * n;
1383                 if (pow > 255)
1384                         pow -= 255;
1385                 ASSERT(pow <= 255);
1386 
1387                 for (j = 0; j < n; j++) {
1388                         pow -= map[i];
1389                         if (pow < 0)
1390                                 pow += 255;
1391                         rows[i][j] = vdev_raidz_pow2[pow];
1392                 }
1393         }
1394 }
1395 
/*
 * Invert, via Gauss-Jordan elimination over GF(2^8), the rows of (V|I)'
 * that correspond to the missing data columns.  On entry rows[] holds the
 * coefficient rows built by vdev_raidz_matrix_init(); on exit invrows[]
 * holds the matching rows of the inverse.  missing[] gives the column
 * index (relative to rm_firstdatacol) of each missing data column, and
 * used[] lists the surviving columns in the order their data will be
 * consumed (parity first, then data).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Identity matrix seed for the inverse. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/* In GF(2^8), subtraction is XOR (self-inverse). */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* GF multiply is done by adding logs: 255 - log is 1/x. */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1489 
/*
 * Reconstruct the missing data columns using the inverted matrix produced
 * by vdev_raidz_matrix_invert().
 *
 * Each byte of every surviving column listed in used[] (n entries) is
 * multiplied in GF(2^8) by the corresponding inverse-matrix coefficient
 * and accumulated (XOR) into each missing column's buffer.  The GF
 * multiply is done via log/exp tables: a*b = exp((log a + log b) mod 255).
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	/* Destination buffer and byte count for each missing data column. */
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	/* Log-domain copy of invrows, to avoid a table lookup per byte. */
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* One allocation carved into nmissing rows of n log entries each. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of all inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * The first source column must be at least as large as the
		 * first missing column; later source columns may be shorter.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Skip bytes past this column's end. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* Multiplying zero yields zero. */
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; the rest accumulate via XOR.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1565 
/*
 * General Reed-Solomon reconstruction for any combination of failed
 * columns (up to VDEV_RAIDZ_MAXPARITY of them):
 *
 *   1. Choose one surviving parity column per missing data column and
 *      build the corresponding rows of the generator matrix.
 *   2. Invert that submatrix (vdev_raidz_matrix_invert).
 *   3. Multiply the surviving columns by the inverse to regenerate the
 *      missing data (vdev_raidz_matrix_reconstruct).
 *
 * Returns a nonzero bitmask ("code") of the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			/* Stash the original ABD; restored at the end. */
			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	/* n is the number of data columns in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record this parity column in the returned code bitmask. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation carved into the rows, invrows, and used arrays.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first entries of used[] are the chosen parity columns ... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ... followed by all surviving (non-missing) data columns. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
1702 
/*
 * Reconstruct the columns listed in t[] (sorted, nt entries) plus any
 * other columns already marked in error.  Dispatches to the optimized
 * P/Q routines when they suffice, otherwise falls back to the general
 * matrix-based reconstruction.  Returns a nonzero bitmask of the parity
 * columns that were used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Walk every column, merging the caller's targets with columns
	 * that recorded I/O errors, and note which parity columns remain
	 * usable.  nbaddata/nbadparity start at their maxima and are
	 * decremented for each good column found.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Parity targets sort first, so the data targets start here. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			/* P was invalid, so there must be more parity. */
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1781 
1782 static int
1783 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1784     uint64_t *ashift)
1785 {
1786         vdev_t *cvd;
1787         uint64_t nparity = vd->vdev_nparity;
1788         int c;
1789         int lasterror = 0;
1790         int numerrors = 0;
1791 
1792         ASSERT(nparity > 0);
1793 
1794         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1795             vd->vdev_children < nparity + 1) {
1796                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1797                 return (SET_ERROR(EINVAL));
1798         }
1799 
1800         vdev_open_children(vd);
1801 
1802         for (c = 0; c < vd->vdev_children; c++) {
1803                 cvd = vd->vdev_child[c];
1804 
1805                 if (cvd->vdev_open_error != 0) {
1806                         lasterror = cvd->vdev_open_error;
1807                         numerrors++;
1808                         continue;
1809                 }
1810 
1811                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1812                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1813                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1814         }
1815 
1816         *asize *= vd->vdev_children;
1817         *max_asize *= vd->vdev_children;
1818 
1819         if (numerrors > nparity) {
1820                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1821                 return (lasterror);
1822         }
1823 
1824         return (0);
1825 }
1826 
1827 static void
1828 vdev_raidz_close(vdev_t *vd)
1829 {
1830         int c;
1831 
1832         for (c = 0; c < vd->vdev_children; c++)
1833                 vdev_close(vd->vdev_child[c]);
1834 }
1835 
1836 /*
1837  * Handle a read or write I/O to a RAID-Z dump device.
1838  *
1839  * The dump device is in a unique situation compared to other ZFS datasets:
1840  * writing to this device should be as simple and fast as possible.  In
1841  * addition, durability matters much less since the dump will be extracted
1842  * once the machine reboots.  For that reason, this function eschews parity for
1843  * performance and simplicity.  The dump device uses the checksum setting
1844  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1845  * dataset.
1846  *
1847  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1848  * 128 KB will not fill an entire block; in addition, they may not be properly
1849  * aligned.  In that case, this function uses the preallocated 128 KB block and
1850  * omits reading or writing any "empty" portions of that block, as opposed to
1851  * allocating a fresh appropriately-sized block.
1852  *
1853  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1854  *
1855  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1856  *
1857  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1858  * allocated which spans all five child vdevs.  8 KB of data would be written to
1859  * each of four vdevs, with the fifth containing the parity bits.
1860  *
1861  *       parity    data     data     data     data
1862  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1863  *         ^        ^        ^        ^        ^
1864  *         |        |        |        |        |
1865  *   8 KB parity    ------8 KB data blocks------
1866  *
1867  * However, when writing to the dump device, the behavior is different:
1868  *
1869  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1870  *
1871  * Unlike the normal RAID-Z case in which the block is allocated based on the
1872  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1873  * I/O size is less than 128 KB, only the actual portions of data are written.
1874  * In this example the data is written to the third data vdev since that vdev
1875  * contains the offset [64 KB, 96 KB).
1876  *
1877  *       parity    data     data     data     data
1878  *     |        |        |        |   XX   |        |
1879  *                                    ^
1880  *                                    |
1881  *                             32 KB data block
1882  *
1883  * As a result, an individual I/O may not span all child vdevs; moreover, a
1884  * small I/O may only operate on a single child vdev.
1885  *
1886  * Note that since there are no parity bits calculated or written, this format
1887  * remains the same no matter how many parity bits are used in a normal RAID-Z
1888  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1889  * would look like:
1890  *
1891  *       parity   parity   parity    data     data     data     data
1892  *     |        |        |        |        |        |   XX   |        |
1893  *                                                      ^
1894  *                                                      |
1895  *                                               32 KB data block
1896  */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	/* Byte ranges of the request and its overlap with each column. */
	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity, B_TRUE);

	coloffset = origoffset;

	/*
	 * Walk only the data columns; parity is neither generated nor
	 * written here (see the block comment above).  The increment
	 * expression reads rc, which the loop body assigns before the
	 * increment ever runs.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* KERNEL */

	return (err);
}
1984 
1985 static uint64_t
1986 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1987 {
1988         uint64_t asize;
1989         uint64_t ashift = vd->vdev_top->vdev_ashift;
1990         uint64_t cols = vd->vdev_children;
1991         uint64_t nparity = vd->vdev_nparity;
1992 
1993         asize = ((psize - 1) >> ashift) + 1;
1994         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1995         asize = roundup(asize, nparity + 1) << ashift;
1996 
1997         return (asize);
1998 }
1999 
2000 /*
2001  * Converts an allocated size on a raidz vdev back to a logical block
2002  * size. This is used in trimming to figure out the appropriate logical
2003  * size to pass to vdev_raidz_map_alloc when splitting up extents of free
2004  * space obtained from metaslabs. However, a range of free space on a
2005  * raidz vdev might have originally consisted of multiple blocks and
2006  * those, taken together with their skip blocks, might not always align
2007  * neatly to a new vdev_raidz_map_alloc covering the entire unified
2008  * range. So to ensure that the newly allocated raidz map *always* fits
2009  * within the asize passed to this function and never exceeds it (since
2010  * that might trim allocated data past it), we round it down to the
2011  * nearest suitable multiple of the vdev ashift (hence the "_floor" in
2012  * this function's name).
 * This function is in effect an inverse of vdev_raidz_asize. However,
 * since multiple psizes can map to a single asize (due to variable
 * padding), this function instead returns the largest psize that still
 * fits inside the specified asize.
2017  */
2018 static uint64_t
2019 vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
2020 {
2021         uint64_t psize;
2022         uint64_t ashift = vd->vdev_top->vdev_ashift;
2023         uint64_t cols = vd->vdev_children;
2024         uint64_t nparity = vd->vdev_nparity;
2025 
2026         psize = (asize - (nparity << ashift));
2027         psize /= cols;
2028         psize *= cols - nparity;
2029         psize += (1 << ashift) - 1;
2030 
2031         psize = P2ALIGN(psize, 1 << ashift);
2032 
2033         return (psize);
2034 }
2035 
2036 static void
2037 vdev_raidz_child_done(zio_t *zio)
2038 {
2039         raidz_col_t *rc = zio->io_private;
2040 
2041         rc->rc_error = zio->io_error;
2042         rc->rc_tried = 1;
2043         rc->rc_skipped = 0;
2044 }
2045 
2046 /*
2047  * Start an IO operation on a RAIDZ VDev
2048  *
2049  * Outline:
2050  * - For write operations:
2051  *   1. Generate the parity data
2052  *   2. Create child zio write operations to each column's vdev, for both
2053  *      data and parity.
2054  *   3. If the column skips any sectors for padding, create optional dummy
2055  *      write zio children for those areas to improve aggregation continuity.
2056  * - For read operations:
2057  *   1. Create child zio read operations to each data column's vdev to read
2058  *      the range of data required for zio.
2059  *   2. If this is a scrub or resilver operation, or if any of the data
2060  *      vdevs have had errors, then create zio read operations to the parity
2061  *      columns' VDevs as well.
2062  */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay out this I/O across the children; freed via io_vsd_ops. */
	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity, B_TRUE);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The skip sectors wrap around to
		 * column 0 once rm_scols is reached.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Unreadable child: record the miss and move on. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;    /* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child is missing this txg's data (stale). */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read data columns always; read parity too if any data
		 * is missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
2154 
2155 
2156 /*
2157  * Report a checksum error for a child of a RAID-Z device.
2158  */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
	vdev_stat_t *vs = &vd->vdev_stat;
	spa_t *spa = zio->io_spa;

	/* Don't bump counters or post ereports for speculative I/Os. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		/* No expected checksum is available at the column level. */
		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		/* Borrow a linear copy of the column data for the ereport. */
		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}

	/*
	 * Latch the pool-wide flag once a "special" vdev shows any errors.
	 * NOTE(review): reads the vs_* counters without vdev_stat_lock —
	 * presumably acceptable for a one-way advisory flag; confirm.
	 */
	if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
	    vs->vs_read_errors || vs->vs_write_errors) &&
	    !spa->spa_special_has_errors) {
		spa->spa_special_has_errors = B_TRUE;
	}
}
2191 
2192 /*
2193  * We keep track of whether or not there were any injected errors, so that
2194  * any ereports we generate can note it.
2195  */
2196 static int
2197 raidz_checksum_verify(zio_t *zio)
2198 {
2199         zio_bad_cksum_t zbc;
2200         raidz_map_t *rm = zio->io_vsd;
2201 
2202         int ret = zio_checksum_error(zio, &zbc);
2203         if (ret != 0 && zbc.zbc_injected != 0)
2204                 rm->rm_ecksuminjected = 1;
2205 
2206         return (ret);
2207 }
2208 
2209 /*
2210  * Generate the parity from the data columns. If we tried and were able to
2211  * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
2214  */
2215 static int
2216 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2217 {
2218         void *orig[VDEV_RAIDZ_MAXPARITY];
2219         int c, ret = 0;
2220         raidz_col_t *rc;
2221 
2222         blkptr_t *bp = zio->io_bp;
2223         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2224             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2225 
2226         if (checksum == ZIO_CHECKSUM_NOPARITY)
2227                 return (ret);
2228 
2229         for (c = 0; c < rm->rm_firstdatacol; c++) {
2230                 rc = &rm->rm_col[c];
2231                 if (!rc->rc_tried || rc->rc_error != 0)
2232                         continue;
2233                 orig[c] = zio_buf_alloc(rc->rc_size);
2234                 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2235         }
2236 
2237         vdev_raidz_generate_parity(rm);
2238 
2239         for (c = 0; c < rm->rm_firstdatacol; c++) {
2240                 rc = &rm->rm_col[c];
2241                 if (!rc->rc_tried || rc->rc_error != 0)
2242                         continue;
2243                 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2244                         raidz_checksum_error(zio, rc, orig[c]);
2245                         rc->rc_error = SET_ERROR(ECKSUM);
2246                         ret++;
2247                 }
2248                 zio_buf_free(orig[c], rc->rc_size);
2249         }
2250 
2251         return (ret);
2252 }
2253 
2254 /*
2255  * Keep statistics on all the ways that we used parity to correct data.
2256  */
2257 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2258 
2259 static int
2260 vdev_raidz_worst_error(raidz_map_t *rm)
2261 {
2262         int error = 0;
2263 
2264         for (int c = 0; c < rm->rm_cols; c++)
2265                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2266 
2267         return (error);
2268 }
2269 
2270 /*
2271  * Iterate over all combinations of bad data and attempt a reconstruction.
2272  * Note that the algorithm below is non-optimal because it doesn't take into
2273  * account how reconstruction is actually performed. For example, with
2274  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2275  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2276  * cases we'd only use parity information in column 0.
2277  */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	/* Saved copies of the column data we overwrite while reconstructing. */
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore leaves room for the tgts[-1] and tgts[n] sentinels below. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * n is the number of columns we assume are silently bad; try one,
	 * then two, ... up to the parity we have left after the known
	 * (explicit) errors are accounted for.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Only one new save buffer is needed per outer iteration. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/*
		 * Enumerate every combination of n target columns; current
		 * reaching n means all combinations for this n are exhausted.
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Mark the reconstructed columns as bad so
				 * they get repaired later; report checksum
				 * errors only for columns we actually read.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				/* Return the reconstruction code used. */
				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			/*
			 * Advance to the next combination: odometer-style
			 * increment over the error-free columns.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * Natural loop exit leaves n one past the last iteration; n - 1
	 * save buffers (orig[0..n-2]) were allocated, so back n off by one
	 * before falling into the cleanup below.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2427 
2428 /*
2429  * Complete an IO operation on a RAIDZ VDev
2430  *
2431  * Outline:
2432  * - For write operations:
2433  *   1. Check for errors on the child IOs.
2434  *   2. Return, setting an error code if too few child VDevs were written
2435  *      to reconstruct the data later.  Note that partial writes are
2436  *      considered successful if they can be reconstructed at all.
2437  * - For read operations:
2438  *   1. Check for errors on the child IOs.
2439  *   2. If data errors occurred:
2440  *      a. Try to reassemble the data from the parity available.
2441  *      b. If we haven't yet read the parity drives, read them now.
2442  *      c. If all parity drives have been read but the data still doesn't
2443  *         reassemble with a correct checksum, then try combinatorial
2444  *         reconstruction.
2445  *      d. If that doesn't work, return an error.
2446  *   3. If there were unexpected errors or this is a resilver operation,
2447  *      rewrite the vdevs that had errors.
2448  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally error state per column so we know which recovery phases
	 * below are applicable.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Outer loop finds the first untried column (if any); the inner
	 * do/while then reissues reads for every remaining untried column
	 * and returns, deferring completion to the child-done callbacks.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					/*
					 * The column index is smuggled
					 * through the ereport's data pointer.
					 */
					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	/*
	 * Repair pass: rewrite any column that reported an error using the
	 * verified data/parity now in hand.
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2710 
2711 static void
2712 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2713 {
2714         if (faulted > vd->vdev_nparity)
2715                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2716                     VDEV_AUX_NO_REPLICAS);
2717         else if (degraded + faulted != 0)
2718                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2719         else
2720                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2721 }
2722 
2723 static inline void
2724 vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
2725     const raidz_col_t *rc)
2726 {
2727         uint64_t num_exts = *num_extsp;
2728         ASSERT(rc->rc_size != 0);
2729 
2730         if (dfl->dfl_num_exts > 0 &&
2731             dfl->dfl_exts[num_exts - 1].dfle_start +
2732             dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
2733                 dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
2734         } else {
2735                 dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
2736                 dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
2737                 (*num_extsp)++;
2738         }
2739 }
2740 
2741 /*
2742  * Processes a trim for a raidz vdev.
2743  */
2744 static void
2745 vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
2746 {
2747         dkioc_free_list_t *dfl = trim_exts;
2748         dkioc_free_list_t **sub_dfls;
2749         uint64_t *sub_dfls_num_exts;
2750 
2751         sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
2752             KM_SLEEP);
2753         sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
2754             KM_SLEEP);
2755         for (int i = 0; i < vd->vdev_children; i++) {
2756                 /*
2757                  * We might over-allocate here, because the sub-lists can never
2758                  * be longer than the parent list, but they can be shorter.
2759                  * The underlying driver will discard zero-length extents.
2760                  */
2761                 sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
2762                 sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
2763                 sub_dfls[i]->dfl_flags = dfl->dfl_flags;
2764                 sub_dfls[i]->dfl_offset = dfl->dfl_offset;
2765                 /* don't copy the check func, because it isn't raidz-aware */
2766         }
2767 
2768         /*
2769          * Process all extents and redistribute them to the component vdevs
2770          * according to a computed raidz map geometry.
2771          */
2772         for (int i = 0; i < dfl->dfl_num_exts; i++) {
2773                 uint64_t start = dfl->dfl_exts[i].dfle_start;
2774                 uint64_t length = dfl->dfl_exts[i].dfle_length;
2775                 raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
2776                     vdev_raidz_psize_floor(vd, length), start,
2777                     vd->vdev_top->vdev_ashift, vd->vdev_children,
2778                     vd->vdev_nparity, B_FALSE);
2779 
2780                 for (uint64_t j = 0; j < rm->rm_cols; j++) {
2781                         uint64_t devidx = rm->rm_col[j].rc_devidx;
2782                         vdev_raidz_trim_append_rc(sub_dfls[devidx],
2783                             &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
2784                 }
2785                 vdev_raidz_map_free(rm);
2786         }
2787 
2788         /*
2789          * Issue the component ioctls as children of the parent zio.
2790          */
2791         for (int i = 0; i < vd->vdev_children; i++) {
2792                 if (sub_dfls_num_exts[i] != 0) {
2793                         zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
2794                             vd->vdev_child[i], DKIOCFREE,
2795                             vdev_raidz_trim_done, sub_dfls[i],
2796                             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
2797                             ZIO_FLAG_DONT_RETRY));
2798                 } else {
2799                         dfl_free(sub_dfls[i]);
2800                 }
2801         }
2802         kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
2803         kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
2804 }
2805 
2806 /*
2807  * Releases a dkioc_free_list_t from ioctls issued to component devices in
2808  * vdev_raidz_dkioc_free.
2809  */
2810 static void
2811 vdev_raidz_trim_done(zio_t *zio)
2812 {
2813         ASSERT(zio->io_private != NULL);
2814         dfl_free(zio->io_private);
2815 }
2816 
/* Operations vector registered for RAID-Z top-level vdevs. */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	/*
	 * NOTE(review): the two NULL slots appear to be the hold/rele
	 * callbacks used only by leaf vdevs -- confirm against vdev_ops_t
	 * in vdev_impl.h.
	 */
	NULL,
	NULL,
	vdev_raidz_trim,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};