/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/dkioc_free_util.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, GF(2^N), the finite
 * field with 2^N elements. In our case we choose N=8 for GF(2^8) so that
 * all elements can be expressed with a single byte. Briefly, the operations
 * on the field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise
 *	  expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
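 *
 * For example, multiplying 0x80 by 2 shifts the high bit out: truncated
 * to a byte, (0x80 << 1) is 0x00, and since the high bit was set we XOR
 * in 0x1d, so 0x80 * 2 = 0x1d.  This agrees with the tables below, where
 * 2^7 == 0x80 and 2^8 == 0x1d.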
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A^(255 - 1) = A^254.
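 *
 * For example, using the tables below: 0x1d * 0x02 =
 * 2^(log_2(0x1d) + log_2(0x02)) = 2^(8 + 1) = 0x3a.  Likewise,
 * 0x02^-1 = 0x02^254 = 0x8e, and indeed 0x8e * 0x02 = 0x01.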
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the
 * trivial XOR operation, and 2 and 4 can be computed quickly and generate
 * linearly-independent coefficients. (There are no additional coefficients
 * that have this property, which is why the uncorrected Plank method breaks
 * down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
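
/*
 * For illustration, consider a 64-bit word whose low byte is 0x85: the
 * corresponding mask byte becomes 0x80, ((mask) << 1) - ((mask) >> 7)
 * turns it into 0xff, and the byte is updated to
 * ((0x85 << 1) & 0xfe) ^ 0x1d == 0x17, exactly VDEV_RAIDZ_MUL_2(0x85).
 * Byte lanes whose high bit is clear get a zero mask and are simply
 * shifted left.
 */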

#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * xor_p hook for external acceleration libraries.
 */
int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
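
/*
 * Note that vdev_raidz_pow2[0] == vdev_raidz_pow2[255] == 0x01: the
 * multiplicative group of GF(2^8) has order 255, which is why
 * vdev_raidz_exp2() below reduces exponents mod 255 rather than 256.
 */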

static void vdev_raidz_generate_parity(raidz_map_t *rm);
static void vdev_raidz_trim_done(zio_t *zio);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
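
/*
 * For example, vdev_raidz_exp2(0x02, 254) evaluates to
 * vdev_raidz_pow2[1 + 254] == vdev_raidz_pow2[255] == 0x01, confirming
 * that 2^254 is the multiplicative inverse of 2.
 */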

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		/*
		 * TRIM doesn't allocate data blocks,
		 * so 'rc_abd' is NULL in this case.
		 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
		 * for more details.
		 */
		if (rm->rm_col[c].rc_abd != NULL)
			abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		/*
		 * TRIM doesn't allocate data blocks,
		 * so 'rc_abd' is NULL in this case.
		 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
		 * for more details.
		 */
		if (rm->rm_col[c].rc_abd != NULL)
			abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO).
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Allocates and computes a raidz column map, which directs the raidz column
 * handling algorithms where to locate and store data and parity columns for
 * a particular DVA. Usually, dcols is the number of children in the target
 * vdev.
 *
 * The `abd', `size' and `offset' parameters hold the data, size and offset
 * of the zio for which this map is to be computed.
 * The `unit_shift' parameter contains the minimum allocation bitshift of
 * the storage pool. The `dcols' parameter contains the number of drives in
 * this raidz vdev (including parity drives), with `nparity' denoting how
 * many of those contain parity (one, two or three).
 *
 * The `alloc_data' flag denotes whether the constructed raidz map should
 * contain allocated buffers to hold column I/O data (if you're using this
 * function simply to determine raidz geometry, pass B_FALSE here).
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
    boolean_t alloc_data)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);
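
	/*
	 * For illustration (numbers chosen arbitrarily, not taken from any
	 * particular pool): a 16KB write to a 5-wide raidz1 (dcols = 5,
	 * nparity = 1) with 512-byte sectors (unit_shift = 9) gives s = 32,
	 * q = 8, r = 0, bc = 0 and tot = 40; every column carries eight
	 * sectors and no padding is needed.  If instead s = 5, then q = 1,
	 * r = 1, bc = 2, tot = 7 and rm_nskip = 1, i.e. one sector is
	 * skipped to pad the last row to a multiple of nparity + 1.
	 */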

	if (alloc_data) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_abd =
			    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
		}

		rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
		off = rm->rm_col[c].rc_size;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
			off += rm->rm_col[c].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

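/*
 * Note how vdev_raidz_pq_func() and vdev_raidz_pqr_func() above implement
 * the Horner-style evaluation shown in the block comment at the top of
 * this file: the Q (or R) accumulator is multiplied by 2 (or 4) before
 * each new data column is XORed in, which computes
 * Q = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1.
 */
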
/*
 * Software acceleration of XOR calculations.  Requirements:
 *
 *	- the (src/dst) vectors need to be 64-byte aligned
 *	- all of the vectors have to be the same size
 */
#define	RAIDZ_ACCELERATION_ALIGNMENT	64ul
#define	UNALIGNED(addr) \
	((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

#if 0
	/* FIXME: needs to be reviewed and changed to support ABD */
	int parity_done;
	void *va[16];
	void **array;
	uint64_t pcount;
	int j, nvects;

	parity_done = 0;
	while (0 && zfs_xorp_hook && !parity_done) {
		unsigned long no_accel = 0;
		/* at least two columns (plus one for result) */
		if (rm->rm_cols < 3) {
			DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
			break;
		}
		/* check sizes and alignment */
		no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
		if (no_accel) {
			DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
			    no_accel);
			break;
		}
		pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
		nvects = 1;	/* for the destination */
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			no_accel = UNALIGNED(rm->rm_col[c].rc_data);
			if (no_accel) {
				DTRACE_PROBE1(raidz_unaligned_src,
				    unsigned long, no_accel);
				break;
			}
			if (rm->rm_col[c].rc_size != pcount) {
				DTRACE_PROBE(raidz_sizes_vary);
				no_accel = 1;
				break;
			}
			nvects++;
		}
		if (no_accel)
			break;
		if (nvects > 16) {
			array = kmem_alloc(nvects * sizeof (void *),
			    KM_NOSLEEP);
			if (array == NULL) {
				DTRACE_PROBE(raidz_alloc_failed);
				break;
			}
		} else {
			array = va;
		}
		for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
		    c++, j++) {
			array[j] = rm->rm_col[c].rc_data;
		}
		array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		if (zfs_xorp_hook(nvects,
		    rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
			DTRACE_PROBE(raidz_accel_failure);
			break;
		}
		if (array != va) {
			kmem_free(array, nvects * sizeof (void *));
		}
		parity_done = 1;
		DTRACE_PROBE(raidz_accel_success);
	}
	if (parity_done)
		return;
#endif
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}
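
	/*
	 * At this point the target column holds Q + 2^(ndevs - 1 - x) * D_x:
	 * the Horner-style accumulation above took in every data column
	 * except x, whose contribution was zero-sized.  XORing in Q and
	 * multiplying by the inverse coefficient, 2^(255 - (ndevs - 1 - x)),
	 * then recovers D_x.
	 */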

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */
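
	/*
	 * A sketch of the derivation: substituting D_y = (P + Pxy) + D_x
	 * into the Qxy identity and factoring gives
	 *
	 *	(Q + Qxy) + 2^(ndevs - 1 - y) * (P + Pxy) =
	 *	    D_x * 2^(ndevs - 1 - x) * (2^(x - y) + 1)
	 *
	 * from which A and B follow; note that `b' below is accordingly
	 * looked up as vdev_raidz_pow2[255 - (ndevs - 1 - x)], the
	 * multiplicative inverse of 2^(ndevs - 1 - x).
	 */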

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *            __       __                     __     __
 *            | 1   ..  1 1 1 |               |  p_0  |
 *            | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *            | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *            |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *            |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *            |   :       : : : |   |   :   |   |  d_2  |
 *            |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *            |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *            |   0   ..  0 0 1 |               | d_n-1 |
 *            ~~       ~~                     ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *            __                               __
 *            |  1   1   1   1   1   1   1   1  |
 *            | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *            |  19 205 116  29  64  16   4   1  |      / /
 *            |   1   0   0   0   0   0   0   0  |     / /
 *            |   0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  =  |   0   0   1   0   0   0   0   0  | <---'
 *            |   0   0   0   1   0   0   0   0  |
 *            |   0   0   0   0   1   0   0   0  |
 *            |   0   0   0   0   0   1   0   0  |
 *            |   0   0   0   0   0   0   1   0  |
 *            |   0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *            __                               __
 *            |  1   1   1   1   1   1   1   1  |
 *            |  19 205 116  29  64  16   4   1  |
 *            |   1   0   0   0   0   0   0   0  |
 *  (V|I)' =  |   0   0   0   1   0   0   0   0  |
 *            |   0   0   0   0   1   0   0   0  |
 *            |   0   0   0   0   0   1   0   0  |
 *            |   0   0   0   0   0   0   1   0  |
 *            |   0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0  |
 *                   | 167 100   5  41 159 169 217 208  |
 *                   | 166 100   4  40 158 168 216 209  |
 *       (V|I)'^-1 = |   0   0   0   1   0   0   0   0  |
 *                   |   0   0   0   0   1   0   0   0  |
 *                   |   0   0   0   0   0   1   0   0  |
 *                   |   0   0   0   0   0   0   1   0  |
 *                   |   0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
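
/*
 * For example, with n = 8 and map[i] = 1 (the Q row), the loop above
 * produces pow = 7, 6, ..., 0, i.e. the row { 2^7 .. 2^0 } =
 * { 128, 64, 32, 16, 8, 4, 2, 1 }; with map[i] = 2 (the R row) the first
 * entry is vdev_raidz_pow2[14] = 19, matching the rows of V shown in the
 * block comment above.
 */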

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * Copy back from the temporary linear ABDs and free them.
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

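		/*
		 * A note on the arithmetic below (assuming, as vdev_open()
		 * does, that *asize and *max_asize start out as 0): the
		 * "- 1 ... + 1" makes the initial value wrap to UINT64_MAX,
		 * so the first healthy child always seeds the minimum.
		 */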
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
1897 int
1898 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1899 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1900 {
1901 vdev_t *tvd = vd->vdev_top;
1902 vdev_t *cvd;
1903 raidz_map_t *rm;
1904 raidz_col_t *rc;
1905 int c, err = 0;
1906
1907 uint64_t start, end, colstart, colend;
1908 uint64_t coloffset, colsize, colskip;
1909
1910 int flags = doread ? B_READ : B_WRITE;
1911
1912 #ifdef _KERNEL
1913
1914 /*
1915 * Don't write past the end of the block
1916 */
1917 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1918
1919 start = offset;
1920 end = start + size;
1921
1922 /*
1923 * Allocate a RAID-Z map for this block. Note that this block starts
1924 * from the "original" offset, that is, the offset of the extent which
1925 * contains the requisite offset of the data being read or written.
1926 *
1927 * Even if this I/O operation doesn't span the full block size, treat
1928 * the on-disk format as if the only blocks in use were complete
1929 * 128 KB blocks.
1930 */
1931 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1932 SPA_OLD_MAXBLOCKSIZE);
1933 rm = vdev_raidz_map_alloc(abd,
1934 SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1935 vd->vdev_children, vd->vdev_nparity, B_TRUE);
1936
1937 coloffset = origoffset;
1938
1939 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1940 c++, coloffset += rc->rc_size) {
1941 rc = &rm->rm_col[c];
1942 cvd = vd->vdev_child[rc->rc_devidx];
1943
1944 /*
1945 * Find the start and end of this column in the RAID-Z map,
1946 * keeping in mind that the stated size and offset of the
1947 * operation may not fill the entire column for this vdev.
1948 *
1949 * If any portion of the data spans this column, issue the
1950 * appropriate operation to the vdev.
1951 */
1952 if (coloffset + rc->rc_size <= start)
1953 continue;
1954 if (coloffset >= end)
1955 continue;
1956
1957 colstart = MAX(coloffset, start);
1958 colend = MIN(end, coloffset + rc->rc_size);
1959 colsize = colend - colstart;
1960 colskip = colstart - coloffset;
1961
1962 VERIFY3U(colsize, <=, rc->rc_size);
1963 VERIFY3U(colskip, <=, rc->rc_size);
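/*
* Worked example (hypothetical numbers): take origoffset = 0 for the
* 32 KB I/O at offset 64 KB pictured above, with four 32 KB data
* columns. Only the third data column (coloffset = 64 KB) overlaps
* [start, end) = [64 KB, 96 KB), giving colstart = 64 KB,
* colend = 96 KB, colsize = 32 KB and colskip = 0; every other column
* fails one of the two range checks above and is skipped.
*/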
1964
1965 /*
1966 * Note that the child vdev will have a vdev label at the start
1967 * of its range of offsets, hence the need for
1968 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1969 * example of why this calculation is needed.
1970 */
1971 if ((err = vdev_disk_physio(cvd,
1972 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1973 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1974 flags, isdump)) != 0)
1975 break;
1976 }
1977
1978 vdev_raidz_map_free(rm);
1979 abd_put(abd);
1980 #endif /* _KERNEL */
1981
1982 return (err);
1983 }
1984
1985 static uint64_t
1986 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1987 {
1988 uint64_t asize;
1989 uint64_t ashift = vd->vdev_top->vdev_ashift;
1990 uint64_t cols = vd->vdev_children;
1991 uint64_t nparity = vd->vdev_nparity;
1992
1993 asize = ((psize - 1) >> ashift) + 1;
1994 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1995 asize = roundup(asize, nparity + 1) << ashift;
1996
1997 return (asize);
1998 }
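/*
* A worked example of the computation above (hypothetical numbers):
* with ashift = 12 (4 KB sectors), cols = 5 and nparity = 1, a 32 KB
* psize first becomes 8 data sectors; parity adds
* 1 * ((8 + 5 - 1 - 1) / (5 - 1)) = 2 sectors; and rounding 10 up to a
* multiple of nparity + 1 = 2 leaves 10 sectors, so asize = 40 KB --
* matching the 40 KB figure in the dump-device comment above.
*/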
1999
2000 /*
2001 * Converts an allocated size on a raidz vdev back to a logical block
2002 * size. This is used in trimming to figure out the appropriate logical
2003 * size to pass to vdev_raidz_map_alloc when splitting up extents of free
2004 * space obtained from metaslabs. However, a range of free space on a
2005 * raidz vdev might have originally consisted of multiple blocks and
2006 * those, taken together with their skip blocks, might not always align
2007 * neatly to a new vdev_raidz_map_alloc covering the entire unified
2008 * range. So to ensure that the newly allocated raidz map *always* fits
2009 * within the asize passed to this function and never exceeds it (since
2010 * that might trim allocated data past it), we round it down to the
2011 * nearest suitable multiple of the vdev ashift (hence the "_floor" in
2012 * this function's name).
2013 * This function is in effect an inverse of vdev_raidz_asize. However,
2014 * since multiple psizes can map to a single asize (due to variable
2015 * padding), this function instead returns the largest psize that still
2016 * fits inside the specified asize.
2017 */
2018 static uint64_t
2019 vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
2020 {
2021 uint64_t psize;
2022 uint64_t ashift = vd->vdev_top->vdev_ashift;
2023 uint64_t cols = vd->vdev_children;
2024 uint64_t nparity = vd->vdev_nparity;
2025
2026 psize = (asize - (nparity << ashift));
2027 psize /= cols;
2028 psize *= cols - nparity;
2029 psize += (1 << ashift) - 1;
2030
2031 psize = P2ALIGN(psize, 1 << ashift);
2032
2033 return (psize);
2034 }
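/*
* Continuing the worked example from vdev_raidz_asize() (hypothetical
* numbers): with ashift = 12, cols = 5, nparity = 1 and asize = 40 KB,
* (40960 - 4096) / 5 * 4 = 29488; adding 4095 gives 33583; and
* P2ALIGN() rounds that down to 32768, recovering the original 32 KB
* psize.
*/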
2035
2036 static void
2037 vdev_raidz_child_done(zio_t *zio)
2038 {
2039 raidz_col_t *rc = zio->io_private;
2040
2041 rc->rc_error = zio->io_error;
2042 rc->rc_tried = 1;
2043 rc->rc_skipped = 0;
2044 }
2045
2046 /*
2047 * Start an IO operation on a RAIDZ VDev
2048 *
2049 * Outline:
2050 * - For write operations:
2051 * 1. Generate the parity data
2052 * 2. Create child zio write operations to each column's vdev, for both
2053 * data and parity.
2054 * 3. If the column skips any sectors for padding, create optional dummy
2055 * write zio children for those areas to improve aggregation contiguity.
2056 * - For read operations:
2057 * 1. Create child zio read operations to each data column's vdev to read
2058 * the range of data required for zio.
2059 * 2. If this is a scrub or resilver operation, or if any of the data
2060 * vdevs have had errors, then create zio read operations to the parity
2061 * columns' VDevs as well.
2062 */
2063 static void
2064 vdev_raidz_io_start(zio_t *zio)
2065 {
2066 vdev_t *vd = zio->io_vd;
2067 vdev_t *tvd = vd->vdev_top;
2068 vdev_t *cvd;
2069 raidz_map_t *rm;
2070 raidz_col_t *rc;
2071 int c, i;
2072
2073 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
2074 tvd->vdev_ashift, vd->vdev_children,
2075 vd->vdev_nparity, B_TRUE);
2076
2077 zio->io_vsd = rm;
2078 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2079
2080 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
2081
2082 if (zio->io_type == ZIO_TYPE_WRITE) {
2083 vdev_raidz_generate_parity(rm);
2084
2085 for (c = 0; c < rm->rm_cols; c++) {
2086 rc = &rm->rm_col[c];
2087 cvd = vd->vdev_child[rc->rc_devidx];
2088 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2089 rc->rc_offset, rc->rc_abd, rc->rc_size,
2090 zio->io_type, zio->io_priority, 0,
2091 vdev_raidz_child_done, rc));
2092 }
2093
2094 /*
2095 * Generate optional I/Os for any skipped sectors to improve
2096 * aggregation contiguity.
2097 */
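/*
* Each optional child I/O below covers exactly one sector
* (1 << ashift) at the tail of a column, wrapping from rm_skipstart
* back to column 0. Because the children are NODATA | OPTIONAL, the
* queue is free to drop them when they don't help; when they do, they
* let adjacent real writes aggregate across the skip sectors.
*/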
2098 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2099 ASSERT(c <= rm->rm_scols);
2100 if (c == rm->rm_scols)
2101 c = 0;
2102 rc = &rm->rm_col[c];
2103 cvd = vd->vdev_child[rc->rc_devidx];
2104 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2105 rc->rc_offset + rc->rc_size, NULL,
2106 1 << tvd->vdev_ashift,
2107 zio->io_type, zio->io_priority,
2108 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2109 }
2110
2111 zio_execute(zio);
2112 return;
2113 }
2114
2115 ASSERT(zio->io_type == ZIO_TYPE_READ);
2116
2117 /*
2118 * Iterate over the columns in reverse order so that we hit the parity
2119 * last -- any errors along the way will force us to read the parity.
2120 */
2121 for (c = rm->rm_cols - 1; c >= 0; c--) {
2122 rc = &rm->rm_col[c];
2123 cvd = vd->vdev_child[rc->rc_devidx];
2124 if (!vdev_readable(cvd)) {
2125 if (c >= rm->rm_firstdatacol)
2126 rm->rm_missingdata++;
2127 else
2128 rm->rm_missingparity++;
2129 rc->rc_error = SET_ERROR(ENXIO);
2130 rc->rc_tried = 1; /* don't even try */
2131 rc->rc_skipped = 1;
2132 continue;
2133 }
2134 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2135 if (c >= rm->rm_firstdatacol)
2136 rm->rm_missingdata++;
2137 else
2138 rm->rm_missingparity++;
2139 rc->rc_error = SET_ERROR(ESTALE);
2140 rc->rc_skipped = 1;
2141 continue;
2142 }
2143 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2144 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2145 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2146 rc->rc_offset, rc->rc_abd, rc->rc_size,
2147 zio->io_type, zio->io_priority, 0,
2148 vdev_raidz_child_done, rc));
2149 }
2150 }
2151
2152 zio_execute(zio);
2153 }
2154
2156 /*
2157 * Report a checksum error for a child of a RAID-Z device.
2158 */
2159 static void
2160 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2161 {
2162 void *buf;
2163 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2164 vdev_stat_t *vs = &vd->vdev_stat;
2165 spa_t *spa = zio->io_spa;
2166
2167 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2168 zio_bad_cksum_t zbc;
2169 raidz_map_t *rm = zio->io_vsd;
2170
2171 mutex_enter(&vd->vdev_stat_lock);
2172 vd->vdev_stat.vs_checksum_errors++;
2173 mutex_exit(&vd->vdev_stat_lock);
2174
2175 zbc.zbc_has_cksum = 0;
2176 zbc.zbc_injected = rm->rm_ecksuminjected;
2177
2178 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2179 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2180 rc->rc_offset, rc->rc_size, buf, bad_data,
2181 &zbc);
2182 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2183 }
2184
2185 if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
2186 vs->vs_read_errors || vs->vs_write_errors) &&
2187 !spa->spa_special_has_errors) {
2188 spa->spa_special_has_errors = B_TRUE;
2189 }
2190 }
2191
2192 /*
2193 * We keep track of whether or not there were any injected errors, so that
2194 * any ereports we generate can note it.
2195 */
2196 static int
2197 raidz_checksum_verify(zio_t *zio)
2198 {
2199 zio_bad_cksum_t zbc;
2200 raidz_map_t *rm = zio->io_vsd;
2201
2202 int ret = zio_checksum_error(zio, &zbc);
2203 if (ret != 0 && zbc.zbc_injected != 0)
2204 rm->rm_ecksuminjected = 1;
2205
2206 return (ret);
2207 }
2208
2209 /*
2210 * Generate the parity from the data columns. If we tried and were able to
2211 * read the parity without error, verify that the generated parity matches the
2212 * data we read. If it doesn't, we fire off a checksum error. Return the
2213 * number such failures.
2214 */
2215 static int
2216 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2217 {
2218 void *orig[VDEV_RAIDZ_MAXPARITY];
2219 int c, ret = 0;
2220 raidz_col_t *rc;
2221
2222 blkptr_t *bp = zio->io_bp;
2223 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2224 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2225
2226 if (checksum == ZIO_CHECKSUM_NOPARITY)
2227 return (ret);
2228
2229 for (c = 0; c < rm->rm_firstdatacol; c++) {
2230 rc = &rm->rm_col[c];
2231 if (!rc->rc_tried || rc->rc_error != 0)
2232 continue;
2233 orig[c] = zio_buf_alloc(rc->rc_size);
2234 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2235 }
2236
2237 vdev_raidz_generate_parity(rm);
2238
2239 for (c = 0; c < rm->rm_firstdatacol; c++) {
2240 rc = &rm->rm_col[c];
2241 if (!rc->rc_tried || rc->rc_error != 0)
2242 continue;
2243 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2244 raidz_checksum_error(zio, rc, orig[c]);
2245 rc->rc_error = SET_ERROR(ECKSUM);
2246 ret++;
2247 }
2248 zio_buf_free(orig[c], rc->rc_size);
2249 }
2250
2251 return (ret);
2252 }
2253
2254 /*
2255 * Keep statistics on all the ways that we used parity to correct data.
2256 */
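/*
* The index is the parity bitmap returned by vdev_raidz_reconstruct():
* bit 0 for P, bit 1 for Q, bit 2 for R. For example,
* raidz_corrected[3] counts reconstructions that used both P and Q.
*/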
2257 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2258
2259 static int
2260 vdev_raidz_worst_error(raidz_map_t *rm)
2261 {
2262 int error = 0;
2263
2264 for (int c = 0; c < rm->rm_cols; c++)
2265 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2266
2267 return (error);
2268 }
2269
2270 /*
2271 * Iterate over all combinations of bad data and attempt a reconstruction.
2272 * Note that the algorithm below is non-optimal because it doesn't take into
2273 * account how reconstruction is actually performed. For example, with
2274 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2275 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2276 * cases we'd only use parity information in column 0.
2277 */
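/*
* For illustration, with five columns and two reconstruction targets
* the loop below visits the target pairs in the order (0,1), (0,2),
* (1,2), (0,3), (1,3), (2,3), (0,4), ... -- that is, every
* combination, skipping any column that already reported an error.
*/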
2278 static int
2279 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2280 {
2281 raidz_map_t *rm = zio->io_vsd;
2282 raidz_col_t *rc;
2283 void *orig[VDEV_RAIDZ_MAXPARITY];
2284 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2285 int *tgts = &tstore[1];
2286 int current, next, i, c, n;
2287 int code, ret = 0;
2288
2289 ASSERT(total_errors < rm->rm_firstdatacol);
2290
2291 /*
2292 * This simplifies one edge condition.
2293 */
2294 tgts[-1] = -1;
2295
2296 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2297 /*
2298 * Initialize the targets array by finding the first n columns
2299 * that contain no error.
2300 *
2301 * If there were no data errors, we need to ensure that we're
2302 * always explicitly attempting to reconstruct at least one
2303 * data column. To do this, we simply push the highest target
2304 * up into the data columns.
2305 */
2306 for (c = 0, i = 0; i < n; i++) {
2307 if (i == n - 1 && data_errors == 0 &&
2308 c < rm->rm_firstdatacol) {
2309 c = rm->rm_firstdatacol;
2310 }
2311
2312 while (rm->rm_col[c].rc_error != 0) {
2313 c++;
2314 ASSERT3S(c, <, rm->rm_cols);
2315 }
2316
2317 tgts[i] = c++;
2318 }
2319
2320 /*
2321 * Setting tgts[n] simplifies the other edge condition.
2322 */
2323 tgts[n] = rm->rm_cols;
2324
2325 /*
2326 * These buffers were allocated in previous iterations.
2327 */
2328 for (i = 0; i < n - 1; i++) {
2329 ASSERT(orig[i] != NULL);
2330 }
2331
2332 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2333
2334 current = 0;
2335 next = tgts[current];
2336
2337 while (current != n) {
2338 tgts[current] = next;
2339 current = 0;
2340
2341 /*
2342 * Save off the original data that we're going to
2343 * attempt to reconstruct.
2344 */
2345 for (i = 0; i < n; i++) {
2346 ASSERT(orig[i] != NULL);
2347 c = tgts[i];
2348 ASSERT3S(c, >=, 0);
2349 ASSERT3S(c, <, rm->rm_cols);
2350 rc = &rm->rm_col[c];
2351 abd_copy_to_buf(orig[i], rc->rc_abd,
2352 rc->rc_size);
2353 }
2354
2355 /*
2356 * Attempt a reconstruction and exit the outer loop on
2357 * success.
2358 */
2359 code = vdev_raidz_reconstruct(rm, tgts, n);
2360 if (raidz_checksum_verify(zio) == 0) {
2361 atomic_inc_64(&raidz_corrected[code]);
2362
2363 for (i = 0; i < n; i++) {
2364 c = tgts[i];
2365 rc = &rm->rm_col[c];
2366 ASSERT(rc->rc_error == 0);
2367 if (rc->rc_tried)
2368 raidz_checksum_error(zio, rc,
2369 orig[i]);
2370 rc->rc_error = SET_ERROR(ECKSUM);
2371 }
2372
2373 ret = code;
2374 goto done;
2375 }
2376
2377 /*
2378 * Restore the original data.
2379 */
2380 for (i = 0; i < n; i++) {
2381 c = tgts[i];
2382 rc = &rm->rm_col[c];
2383 abd_copy_from_buf(rc->rc_abd, orig[i],
2384 rc->rc_size);
2385 }
2386
2387 do {
2388 /*
2389 * Find the next valid column after the current
2390 * position.
2391 */
2392 for (next = tgts[current] + 1;
2393 next < rm->rm_cols &&
2394 rm->rm_col[next].rc_error != 0; next++)
2395 continue;
2396
2397 ASSERT(next <= tgts[current + 1]);
2398
2399 /*
2400 * If that spot is available, we're done here.
2401 */
2402 if (next != tgts[current + 1])
2403 break;
2404
2405 /*
2406 * Otherwise, find the next valid column after
2407 * the previous position.
2408 */
2409 for (c = tgts[current - 1] + 1;
2410 rm->rm_col[c].rc_error != 0; c++)
2411 continue;
2412
2413 tgts[current] = c;
2414 current++;
2415
2416 } while (current != n);
2417 }
2418 }
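/*
* We fell out of the loop without a successful reconstruction; n was
* incremented one past the number of buffers allocated above, so back
* it off before freeing them. (The success path jumps straight to
* "done" with n unmodified.)
*/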
2419 n--;
2420 done:
2421 for (i = 0; i < n; i++) {
2422 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2423 }
2424
2425 return (ret);
2426 }
2427
2428 /*
2429 * Complete an IO operation on a RAIDZ VDev
2430 *
2431 * Outline:
2432 * - For write operations:
2433 * 1. Check for errors on the child IOs.
2434 * 2. Return, setting an error code if too few child VDevs were written
2435 * to reconstruct the data later. Note that partial writes are
2436 * considered successful if they can be reconstructed at all.
2437 * - For read operations:
2438 * 1. Check for errors on the child IOs.
2439 * 2. If data errors occurred:
2440 * a. Try to reassemble the data from the parity available.
2441 * b. If we haven't yet read the parity drives, read them now.
2442 * c. If all parity drives have been read but the data still doesn't
2443 * reassemble with a correct checksum, then try combinatorial
2444 * reconstruction.
2445 * d. If that doesn't work, return an error.
2446 * 3. If there were unexpected errors or this is a resilver operation,
2447 * rewrite the vdevs that had errors.
2448 */
2449 static void
2450 vdev_raidz_io_done(zio_t *zio)
2451 {
2452 vdev_t *vd = zio->io_vd;
2453 vdev_t *cvd;
2454 raidz_map_t *rm = zio->io_vsd;
2455 raidz_col_t *rc;
2456 int unexpected_errors = 0;
2457 int parity_errors = 0;
2458 int parity_untried = 0;
2459 int data_errors = 0;
2460 int total_errors = 0;
2461 int n, c;
2462 int tgts[VDEV_RAIDZ_MAXPARITY];
2463 int code;
2464
2465 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2466 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2467
2468 for (c = 0; c < rm->rm_cols; c++) {
2469 rc = &rm->rm_col[c];
2470
2471 if (rc->rc_error) {
2472 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2473
2474 if (c < rm->rm_firstdatacol)
2475 parity_errors++;
2476 else
2477 data_errors++;
2478
2479 if (!rc->rc_skipped)
2480 unexpected_errors++;
2481
2482 total_errors++;
2483 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2484 parity_untried++;
2485 }
2486 }
2487
2488 if (zio->io_type == ZIO_TYPE_WRITE) {
2489 /*
2490 * XXX -- for now, treat partial writes as a success.
2491 * (If we couldn't write enough columns to reconstruct
2492 * the data, the I/O failed. Otherwise, good enough.)
2493 *
2494 * Now that we support write reallocation, it would be better
2495 * to treat partial failure as real failure unless there are
2496 * no non-degraded top-level vdevs left, and not update DTLs
2497 * if we intend to reallocate.
2498 */
2499 /* XXPOLICY */
2500 if (total_errors > rm->rm_firstdatacol)
2501 zio->io_error = vdev_raidz_worst_error(rm);
2502
2503 return;
2504 }
2505
2506 ASSERT(zio->io_type == ZIO_TYPE_READ);
2507 /*
2508 * There are three potential phases for a read:
2509 * 1. produce valid data from the columns read
2510 * 2. read all disks and try again
2511 * 3. perform combinatorial reconstruction
2512 *
2513 * Each phase is progressively both more expensive and less likely to
2514 * occur. If we encounter more errors than we can repair or all phases
2515 * fail, we have no choice but to return an error.
2516 */
2517
2518 /*
2519 * If the number of errors we saw was correctable -- less than or equal
2520 * to the number of parity disks read -- attempt to produce data that
2521 * has a valid checksum. Naturally, this case applies in the absence of
2522 * any errors.
2523 */
2524 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2525 if (data_errors == 0) {
2526 if (raidz_checksum_verify(zio) == 0) {
2527 /*
2528 * If we read parity information (unnecessarily
2529 * as it happens since no reconstruction was
2530 * needed), regenerate and verify the parity.
2531 * We also regenerate parity when resilvering
2532 * so we can write it out to the failed device
2533 * later.
2534 */
2535 if (parity_errors + parity_untried <
2536 rm->rm_firstdatacol ||
2537 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2538 n = raidz_parity_verify(zio, rm);
2539 unexpected_errors += n;
2540 ASSERT(parity_errors + n <=
2541 rm->rm_firstdatacol);
2542 }
2543 goto done;
2544 }
2545 } else {
2546 /*
2547 * We either attempt to read all the parity columns or
2548 * none of them. If we didn't try to read parity, we
2549 * wouldn't be here in the correctable case. There must
2550 * also have been fewer parity errors than parity
2551 * columns or, again, we wouldn't be in this code path.
2552 */
2553 ASSERT(parity_untried == 0);
2554 ASSERT(parity_errors < rm->rm_firstdatacol);
2555
2556 /*
2557 * Identify the data columns that reported an error.
2558 */
2559 n = 0;
2560 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2561 rc = &rm->rm_col[c];
2562 if (rc->rc_error != 0) {
2563 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2564 tgts[n++] = c;
2565 }
2566 }
2567
2568 ASSERT(rm->rm_firstdatacol >= n);
2569
2570 code = vdev_raidz_reconstruct(rm, tgts, n);
2571
2572 if (raidz_checksum_verify(zio) == 0) {
2573 atomic_inc_64(&raidz_corrected[code]);
2574
2575 /*
2576 * If we read more parity disks than were used
2577 * for reconstruction, confirm that the other
2578 * parity disks produced correct data. This
2579 * routine is suboptimal in that it regenerates
2580 * the parity that we already used in addition
2581 * to the parity that we're attempting to
2582 * verify, but this should be a relatively
2583 * uncommon case, and can be optimized if it
2584 * becomes a problem. Note that we regenerate
2585 * parity when resilvering so we can write it
2586 * out to failed devices later.
2587 */
2588 if (parity_errors < rm->rm_firstdatacol - n ||
2589 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2590 n = raidz_parity_verify(zio, rm);
2591 unexpected_errors += n;
2592 ASSERT(parity_errors + n <=
2593 rm->rm_firstdatacol);
2594 }
2595
2596 goto done;
2597 }
2598 }
2599 }
2600
2601 /*
2602 * This isn't a typical situation -- either we got a read error or
2603 * a child silently returned bad data. Read every block so we can
2604 * try again with as much data and parity as we can track down. If
2605 * we've already been through once before, all children will be marked
2606 * as tried so we'll proceed to combinatorial reconstruction.
2607 */
2608 unexpected_errors = 1;
2609 rm->rm_missingdata = 0;
2610 rm->rm_missingparity = 0;
2611
2612 for (c = 0; c < rm->rm_cols; c++) {
2613 if (rm->rm_col[c].rc_tried)
2614 continue;
2615
2616 zio_vdev_io_redone(zio);
2617 do {
2618 rc = &rm->rm_col[c];
2619 if (rc->rc_tried)
2620 continue;
2621 zio_nowait(zio_vdev_child_io(zio, NULL,
2622 vd->vdev_child[rc->rc_devidx],
2623 rc->rc_offset, rc->rc_abd, rc->rc_size,
2624 zio->io_type, zio->io_priority, 0,
2625 vdev_raidz_child_done, rc));
2626 } while (++c < rm->rm_cols);
2627
2628 return;
2629 }
2630
2631 /*
2632 * At this point we've attempted to reconstruct the data given the
2633 * errors we detected, and we've attempted to read all columns. There
2634 * must, therefore, be one or more additional problems -- silent errors
2635 * resulting in invalid data rather than explicit I/O errors resulting
2636 * in absent data. We check if there is enough additional data to
2637 * possibly reconstruct the data and then perform combinatorial
2638 * reconstruction over all possible combinations. If that fails,
2639 * we're cooked.
2640 */
2641 if (total_errors > rm->rm_firstdatacol) {
2642 zio->io_error = vdev_raidz_worst_error(rm);
2643
2644 } else if (total_errors < rm->rm_firstdatacol &&
2645 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2646 /*
2647 * If we didn't use all the available parity for the
2648 * combinatorial reconstruction, verify that the remaining
2649 * parity is correct.
2650 */
2651 if (code != (1 << rm->rm_firstdatacol) - 1)
2652 (void) raidz_parity_verify(zio, rm);
2653 } else {
2654 /*
2655 * We're here because either:
2656 *
2657 * total_errors == rm->rm_firstdatacol, or
2658 * vdev_raidz_combrec() failed
2659 *
2660 * In either case, there is enough bad data to prevent
2661 * reconstruction.
2662 *
2663 * Start checksum ereports for all children which haven't
2664 * failed, and the IO wasn't speculative.
2665 */
2666 zio->io_error = SET_ERROR(ECKSUM);
2667
2668 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2669 for (c = 0; c < rm->rm_cols; c++) {
2670 rc = &rm->rm_col[c];
2671 if (rc->rc_error == 0) {
2672 zio_bad_cksum_t zbc;
2673 zbc.zbc_has_cksum = 0;
2674 zbc.zbc_injected =
2675 rm->rm_ecksuminjected;
2676
2677 zfs_ereport_start_checksum(
2678 zio->io_spa,
2679 vd->vdev_child[rc->rc_devidx],
2680 zio, rc->rc_offset, rc->rc_size,
2681 (void *)(uintptr_t)c, &zbc);
2682 }
2683 }
2684 }
2685 }
2686
2687 done:
2688 zio_checksum_verified(zio);
2689
2690 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2691 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2692 /*
2693 * Use the good data we have in hand to repair damaged children.
2694 */
2695 for (c = 0; c < rm->rm_cols; c++) {
2696 rc = &rm->rm_col[c];
2697 cvd = vd->vdev_child[rc->rc_devidx];
2698
2699 if (rc->rc_error == 0)
2700 continue;
2701
2702 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2703 rc->rc_offset, rc->rc_abd, rc->rc_size,
2704 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2705 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2706 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2707 }
2708 }
2709 }
2710
2711 static void
2712 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2713 {
2714 if (faulted > vd->vdev_nparity)
2715 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2716 VDEV_AUX_NO_REPLICAS);
2717 else if (degraded + faulted != 0)
2718 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2719 else
2720 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2721 }
2722
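/*
* Append one column's extent to a child vdev's free list, merging it
* with the previous extent when the two are physically contiguous so
* that each child sees the fewest, largest possible extents.
*/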
2723 static inline void
2724 vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
2725 const raidz_col_t *rc)
2726 {
2727 uint64_t num_exts = *num_extsp;
2728 ASSERT(rc->rc_size != 0);
2729
2730 if (num_exts > 0 &&
2731 dfl->dfl_exts[num_exts - 1].dfle_start +
2732 dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
2733 dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
2734 } else {
2735 dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
2736 dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
2737 (*num_extsp)++;
2738 }
2739 }
2740
2741 /*
2742 * Processes a trim for a raidz vdev.
2743 */
2744 static void
2745 vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
2746 {
2747 dkioc_free_list_t *dfl = trim_exts;
2748 dkioc_free_list_t **sub_dfls;
2749 uint64_t *sub_dfls_num_exts;
2750
2751 sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
2752 KM_SLEEP);
2753 sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
2754 KM_SLEEP);
2755 for (int i = 0; i < vd->vdev_children; i++) {
2756 /*
2757 * We might over-allocate here, because the sub-lists can never
2758 * be longer than the parent list, but they can be shorter.
2759 * The underlying driver will discard zero-length extents.
2760 */
2761 sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
2762 sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
2763 sub_dfls[i]->dfl_flags = dfl->dfl_flags;
2764 sub_dfls[i]->dfl_offset = dfl->dfl_offset;
2765 /* don't copy the check func, because it isn't raidz-aware */
2766 }
2767
2768 /*
2769 * Process all extents and redistribute them to the component vdevs
2770 * according to a computed raidz map geometry.
2771 */
2772 for (int i = 0; i < dfl->dfl_num_exts; i++) {
2773 uint64_t start = dfl->dfl_exts[i].dfle_start;
2774 uint64_t length = dfl->dfl_exts[i].dfle_length;
2775 raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
2776 vdev_raidz_psize_floor(vd, length), start,
2777 vd->vdev_top->vdev_ashift, vd->vdev_children,
2778 vd->vdev_nparity, B_FALSE);
2779
2780 for (uint64_t j = 0; j < rm->rm_cols; j++) {
2781 uint64_t devidx = rm->rm_col[j].rc_devidx;
2782 vdev_raidz_trim_append_rc(sub_dfls[devidx],
2783 &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
2784 }
2785 vdev_raidz_map_free(rm);
2786 }
2787
2788 /*
2789 * Issue the component ioctls as children of the parent zio.
2790 */
2791 for (int i = 0; i < vd->vdev_children; i++) {
2792 if (sub_dfls_num_exts[i] != 0) {
2793 zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
2794 vd->vdev_child[i], DKIOCFREE,
2795 vdev_raidz_trim_done, sub_dfls[i],
2796 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
2797 ZIO_FLAG_DONT_RETRY));
2798 } else {
2799 dfl_free(sub_dfls[i]);
2800 }
2801 }
2802 kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
2803 kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
2804 }
2805
2806 /*
2807 * Releases a dkioc_free_list_t from ioctls issued to component devices in
2808 * vdev_raidz_trim().
2809 */
2810 static void
2811 vdev_raidz_trim_done(zio_t *zio)
2812 {
2813 ASSERT(zio->io_private != NULL);
2814 dfl_free(zio->io_private);
2815 }
2816
2817 vdev_ops_t vdev_raidz_ops = {
2818 vdev_raidz_open,
2819 vdev_raidz_close,
2820 vdev_raidz_asize,
2821 vdev_raidz_io_start,
2822 vdev_raidz_io_done,
2823 vdev_raidz_state_change,
2824 NULL,
2825 NULL,
2826 vdev_raidz_trim,
2827 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2828 B_FALSE /* not a leaf vdev */
2829 };