NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
usr/src/common/zfs/zpool_prop.c
usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
usr/src/uts/common/io/scsi/targets/sd.c
usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint - courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 26 * Copyright (c) 2014 Integros [integros.com]
27 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
27 28 */
28 29
29 30 #include <sys/zfs_context.h>
30 31 #include <sys/spa.h>
32 +#include <sys/spa_impl.h>
31 33 #include <sys/vdev_impl.h>
32 34 #include <sys/vdev_disk.h>
33 35 #include <sys/vdev_file.h>
34 36 #include <sys/vdev_raidz.h>
35 37 #include <sys/zio.h>
36 38 #include <sys/zio_checksum.h>
37 39 #include <sys/abd.h>
38 40 #include <sys/fs/zfs.h>
39 41 #include <sys/fm/fs/zfs.h>
42 +#include <sys/dkioc_free_util.h>
40 43
41 44 /*
42 45 * Virtual device vector for RAID-Z.
43 46 *
44 47 * This vdev supports single, double, and triple parity. For single parity,
45 48 * we use a simple XOR of all the data columns. For double or triple parity,
46 49 * we use a special case of Reed-Solomon coding. This extends the
47 50 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
48 51 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
49 52 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
50 53 * former is also based. The latter is designed to provide higher performance
51 54 * for writes.
52 55 *
53 56 * Note that the Plank paper claimed to support arbitrary N+M, but was then
54 57 * amended six years later identifying a critical flaw that invalidates its
55 58 * claims. Nevertheless, the technique can be adapted to work for up to
56 59 * triple parity. For additional parity, the amendment "Note: Correction to
57 60 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
58 61 * is viable, but the additional complexity means that write performance will
59 62 * suffer.
60 63 *
61 64 * All of the methods above operate on a Galois field, defined over the
62 65 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
63 66 * can be expressed with a single byte. Briefly, the operations on the
64 67 * field are defined as follows:
65 68 *
66 69 * o addition (+) is represented by a bitwise XOR
67 70 * o subtraction (-) is therefore identical to addition: A + B = A - B
68 71 * o multiplication of A by 2 is defined by the following bitwise expression:
69 72 *
70 73 * (A * 2)_7 = A_6
71 74 * (A * 2)_6 = A_5
72 75 * (A * 2)_5 = A_4
73 76 * (A * 2)_4 = A_3 + A_7
74 77 * (A * 2)_3 = A_2 + A_7
75 78 * (A * 2)_2 = A_1 + A_7
76 79 * (A * 2)_1 = A_0
77 80 * (A * 2)_0 = A_7
78 81 *
79 82 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
80 83 * As an aside, this multiplication is derived from the error correcting
81 84 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
82 85 *
83 86 * Observe that any number in the field (except for 0) can be expressed as a
84 87 * power of 2 -- a generator for the field. We store a table of the powers of
85 88 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
86 89 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
87 90 * than field addition). The inverse of a field element A (A^-1) is therefore
88 91 * A ^ (255 - 1) = A^254.
89 92 *
90 93 * The up-to-three parity columns, P, Q, R over several data columns,
91 94 * D_0, ... D_n-1, can be expressed by field operations:
92 95 *
93 96 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
94 97 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
95 98 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
96 99 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
97 100 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
98 101 *
99 102 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
100 103 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
101 104 * independent coefficients. (There are no additional coefficients that have
102 105 * this property, which is why the uncorrected Plank method breaks down.)
103 106 *
104 107 * See the reconstruction code below for how P, Q and R can be used individually
105 108 * or in concert to recover missing data columns.
106 109 */
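
To make the field arithmetic above concrete, the following standalone sketch (illustrative only, not part of vdev_raidz.c; gf_mul2() and gf_parity_pq() are names invented for the example) implements multiplication by 2 in GF(2^8) and the Horner-form P and Q recurrences from the comment:

	#include <stdint.h>
	#include <stddef.h>

	/* Multiply by 2 in GF(2^8), primitive polynomial 0x11d. */
	static uint8_t
	gf_mul2(uint8_t a)
	{
		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
	}

	/*
	 * Compute P (plain XOR) and Q (the weighted sum, in Horner form)
	 * for one byte position across n data columns.
	 */
	static void
	gf_parity_pq(const uint8_t *d, size_t n, uint8_t *p, uint8_t *q)
	{
		*p = 0;
		*q = 0;
		for (size_t i = 0; i < n; i++) {
			*p ^= d[i];
			*q = gf_mul2(*q) ^ d[i];
		}
	}
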
107 110
108 111 typedef struct raidz_col {
109 112 uint64_t rc_devidx; /* child device index for I/O */
110 113 uint64_t rc_offset; /* device offset */
111 114 uint64_t rc_size; /* I/O size */
112 115 abd_t *rc_abd; /* I/O data */
113 116 void *rc_gdata; /* used to store the "good" version */
114 117 int rc_error; /* I/O error for this device */
115 118 uint8_t rc_tried; /* Did we attempt this I/O column? */
116 119 uint8_t rc_skipped; /* Did we skip this I/O column? */
117 120 } raidz_col_t;
118 121
119 122 typedef struct raidz_map {
120 123 uint64_t rm_cols; /* Regular column count */
121 124 uint64_t rm_scols; /* Count including skipped columns */
122 125 uint64_t rm_bigcols; /* Number of oversized columns */
123 126 uint64_t rm_asize; /* Actual total I/O size */
124 127 uint64_t rm_missingdata; /* Count of missing data devices */
125 128 uint64_t rm_missingparity; /* Count of missing parity devices */
126 129 uint64_t rm_firstdatacol; /* First data column/parity count */
127 130 uint64_t rm_nskip; /* Skipped sectors for padding */
128 131 uint64_t rm_skipstart; /* Column index of padding start */
129 132 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
130 133 uintptr_t rm_reports; /* # of referencing checksum reports */
131 134 uint8_t rm_freed; /* map no longer has referencing ZIO */
132 135 uint8_t rm_ecksuminjected; /* checksum error was injected */
133 136 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
134 137 } raidz_map_t;
135 138
136 139 #define VDEV_RAIDZ_P 0
137 140 #define VDEV_RAIDZ_Q 1
138 141 #define VDEV_RAIDZ_R 2
139 142
140 143 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
141 144 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
142 145
143 146 /*
144 147 * We provide a mechanism to perform the field multiplication operation on a
145 148 * 64-bit value all at once rather than a byte at a time. This works by
146 149 * creating a mask from the top bit in each byte and using that to
147 150 * conditionally apply the XOR of 0x1d.
148 151 */
149 152 #define VDEV_RAIDZ_64MUL_2(x, mask) \
150 153 { \
151 154 (mask) = (x) & 0x8080808080808080ULL; \
152 155 (mask) = ((mask) << 1) - ((mask) >> 7); \
153 156 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
154 157 ((mask) & 0x1d1d1d1d1d1d1d1d); \
155 158 }
156 159
157 160 #define VDEV_RAIDZ_64MUL_4(x, mask) \
158 161 { \
159 162 VDEV_RAIDZ_64MUL_2((x), mask); \
160 163 VDEV_RAIDZ_64MUL_2((x), mask); \
161 164 }
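
As a sanity check on the 64-bit trick, the sketch below (illustrative only; check_64mul2() is not part of this file) confirms that VDEV_RAIDZ_64MUL_2 multiplies each of the eight packed bytes by 2 independently. The key step is that ((mask) << 1) - ((mask) >> 7) expands each 0x80 top bit into a full 0xff byte, which then selects the 0x1d reduction for exactly the bytes that overflowed:

	static int
	check_64mul2(uint64_t x)
	{
		uint64_t packed = x, mask;

		VDEV_RAIDZ_64MUL_2(packed, mask);

		for (int i = 0; i < 8; i++) {
			uint8_t b = (x >> (i * 8)) & 0xff;
			uint8_t expect = (b << 1) ^ ((b & 0x80) ? 0x1d : 0);

			if (((packed >> (i * 8)) & 0xff) != expect)
				return (0);
		}
		return (1);
	}
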
162 165
163 166 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
164 167
165 168 /*
166 169 * Force reconstruction to use the general purpose method.
167 170 */
168 171 int vdev_raidz_default_to_general;
169 172
170 -/* Powers of 2 in the Galois field defined above. */
173 +/*
174 + * xor_p hook for external acceleration libraries.
175 + */
176 +int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
177 +
178 +/*
179 + * These two tables represent powers and logs of 2 in the Galois field defined
180 + * above. These values were computed by repeatedly multiplying by 2 as above.
181 + */
171 182 static const uint8_t vdev_raidz_pow2[256] = {
172 183 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
173 184 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
174 185 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
175 186 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
176 187 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
177 188 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
178 189 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
179 190 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
180 191 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
181 192 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
182 193 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
183 194 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
184 195 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
185 196 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
186 197 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
187 198 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
188 199 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
189 200 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
190 201 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
191 202 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
192 203 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
193 204 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
194 205 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
195 206 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
196 207 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
197 208 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
198 209 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
199 210 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
200 211 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
201 212 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
202 213 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
203 214 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
204 215 };
205 216 /* Logs of 2 in the Galois field defined above. */
206 217 static const uint8_t vdev_raidz_log2[256] = {
207 218 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
208 219 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
209 220 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
210 221 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
211 222 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
212 223 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
213 224 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
214 225 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
215 226 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
216 227 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
217 228 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
218 229 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
219 230 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
220 231 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
221 232 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
222 233 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
223 234 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
224 235 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
225 236 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
226 237 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
227 238 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
228 239 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
229 240 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
230 241 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
231 242 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
232 243 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
233 244 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
234 245 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
235 246 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
236 247 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
237 248 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
238 249 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
239 250 };
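
These tables are precomputed, but for reference they can be regenerated by walking the powers of the generator 2, as in this sketch (gen_gf256_tables() is hypothetical, not part of this file):

	static void
	gen_gf256_tables(uint8_t pow2[256], uint8_t log2[256])
	{
		uint8_t v = 1;

		for (int i = 0; i < 255; i++) {
			pow2[i] = v;
			log2[v] = i;
			/* multiply by the generator 2 in GF(2^8) */
			v = (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
		}
		pow2[255] = pow2[0];	/* 2^255 = 2^0 = 1, as the table shows */
		log2[0] = 0;		/* log of 0 is undefined; left as 0 */
	}
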
240 251
241 252 static void vdev_raidz_generate_parity(raidz_map_t *rm);
253 +static void vdev_raidz_trim_done(zio_t *zio);
242 254
243 255 /*
244 256 * Multiply a given number by 2 raised to the given power.
245 257 */
246 258 static uint8_t
247 259 vdev_raidz_exp2(uint_t a, int exp)
248 260 {
249 261 if (a == 0)
250 262 return (0);
251 263
252 264 ASSERT(exp >= 0);
253 265 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
254 266
255 267 exp += vdev_raidz_log2[a];
256 268 if (exp > 255)
257 269 exp -= 255;
258 270
258 270
259 271 return (vdev_raidz_pow2[exp]);
260 272 }
261 273
262 274 static void
263 275 vdev_raidz_map_free(raidz_map_t *rm)
264 276 {
265 277 int c;
266 278 size_t size;
267 279
268 280 for (c = 0; c < rm->rm_firstdatacol; c++) {
269 - abd_free(rm->rm_col[c].rc_abd);
281 + /*
282 + * TRIM doesn't allocate data blocks,
283 + * so 'rc_abd' is NULL in this case.
284 + * See vdev_raidz_trim() and vdev_raidz_map_alloc()
285 + * for more details.
286 + */
287 + if (rm->rm_col[c].rc_abd != NULL)
288 + abd_free(rm->rm_col[c].rc_abd);
270 289
271 290 if (rm->rm_col[c].rc_gdata != NULL)
272 291 zio_buf_free(rm->rm_col[c].rc_gdata,
273 292 rm->rm_col[c].rc_size);
274 293 }
275 294
276 295 size = 0;
277 296 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
278 - abd_put(rm->rm_col[c].rc_abd);
297 + /*
298 + * TRIM doesn't allocate data blocks,
299 + * so 'rc_abd' is NULL in this case.
300 + * See vdev_raidz_trim() and vdev_raidz_map_alloc()
301 + * for more details.
302 + */
303 + if (rm->rm_col[c].rc_abd != NULL)
304 + abd_put(rm->rm_col[c].rc_abd);
279 305 size += rm->rm_col[c].rc_size;
280 306 }
281 307
282 308 if (rm->rm_abd_copy != NULL)
283 309 abd_free(rm->rm_abd_copy);
284 310
285 311 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
286 312 }
287 313
288 314 static void
289 315 vdev_raidz_map_free_vsd(zio_t *zio)
290 316 {
291 317 raidz_map_t *rm = zio->io_vsd;
292 318
293 319 ASSERT0(rm->rm_freed);
294 320 rm->rm_freed = 1;
295 321
296 322 if (rm->rm_reports == 0)
297 323 vdev_raidz_map_free(rm);
298 324 }
299 325
300 326 /*ARGSUSED*/
301 327 static void
302 328 vdev_raidz_cksum_free(void *arg, size_t ignored)
303 329 {
304 330 raidz_map_t *rm = arg;
305 331
306 332 ASSERT3U(rm->rm_reports, >, 0);
307 333
308 334 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
309 335 vdev_raidz_map_free(rm);
310 336 }
311 337
312 338 static void
313 339 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
314 340 {
315 341 raidz_map_t *rm = zcr->zcr_cbdata;
316 342 size_t c = zcr->zcr_cbinfo;
317 343 size_t x;
318 344
319 345 const char *good = NULL;
320 346 char *bad;
321 347
322 348 if (good_data == NULL) {
323 349 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
324 350 return;
325 351 }
326 352
327 353 if (c < rm->rm_firstdatacol) {
328 354 /*
329 355 * The first time through, calculate the parity blocks for
330 356 * the good data (this relies on the fact that the good
331 357 * data never changes for a given logical ZIO)
332 358 */
333 359 if (rm->rm_col[0].rc_gdata == NULL) {
334 360 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
335 361 char *buf;
336 362 int offset;
337 363
338 364 /*
339 365 * Set up the rm_col[]s to generate the parity for
340 366 * good_data, first saving the parity bufs and
341 367 * replacing them with buffers to hold the result.
342 368 */
343 369 for (x = 0; x < rm->rm_firstdatacol; x++) {
344 370 bad_parity[x] = rm->rm_col[x].rc_abd;
345 371 rm->rm_col[x].rc_gdata =
346 372 zio_buf_alloc(rm->rm_col[x].rc_size);
347 373 rm->rm_col[x].rc_abd =
348 374 abd_get_from_buf(rm->rm_col[x].rc_gdata,
349 375 rm->rm_col[x].rc_size);
350 376 }
351 377
352 378 /* fill in the data columns from good_data */
353 379 buf = (char *)good_data;
354 380 for (; x < rm->rm_cols; x++) {
355 381 abd_put(rm->rm_col[x].rc_abd);
356 382 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
357 383 rm->rm_col[x].rc_size);
358 384 buf += rm->rm_col[x].rc_size;
359 385 }
360 386
361 387 /*
362 388 * Construct the parity from the good data.
363 389 */
364 390 vdev_raidz_generate_parity(rm);
365 391
366 392 /* restore everything back to its original state */
367 393 for (x = 0; x < rm->rm_firstdatacol; x++) {
368 394 abd_put(rm->rm_col[x].rc_abd);
369 395 rm->rm_col[x].rc_abd = bad_parity[x];
370 396 }
371 397
372 398 offset = 0;
373 399 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
374 400 abd_put(rm->rm_col[x].rc_abd);
375 401 rm->rm_col[x].rc_abd = abd_get_offset(
376 402 rm->rm_abd_copy, offset);
377 403 offset += rm->rm_col[x].rc_size;
378 404 }
379 405 }
380 406
381 407 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
382 408 good = rm->rm_col[c].rc_gdata;
383 409 } else {
384 410 /* adjust good_data to point at the start of our column */
385 411 good = good_data;
386 412
387 413 for (x = rm->rm_firstdatacol; x < c; x++)
388 414 good += rm->rm_col[x].rc_size;
389 415 }
390 416
391 417 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
392 418 /* we drop the ereport if it ends up that the data was good */
393 419 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
394 420 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
395 421 }
396 422
397 423 /*
398 424 * Invoked indirectly by zfs_ereport_start_checksum(), called
399 425 * below when our read operation fails completely. The main point
400 426 * is to keep a copy of everything we read from disk, so that at
401 427 * vdev_raidz_cksum_finish() time we can compare it with the good data.
402 428 */
403 429 static void
404 430 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
405 431 {
406 432 size_t c = (size_t)(uintptr_t)arg;
407 433 size_t offset;
408 434
409 435 raidz_map_t *rm = zio->io_vsd;
410 436 size_t size;
411 437
412 438 /* set up the report and bump the refcount */
413 439 zcr->zcr_cbdata = rm;
414 440 zcr->zcr_cbinfo = c;
415 441 zcr->zcr_finish = vdev_raidz_cksum_finish;
416 442 zcr->zcr_free = vdev_raidz_cksum_free;
417 443
418 444 rm->rm_reports++;
419 445 ASSERT3U(rm->rm_reports, >, 0);
420 446
421 447 if (rm->rm_abd_copy != NULL)
422 448 return;
423 449
424 450 /*
425 451 * It's the first time we're called for this raidz_map_t, so we need
426 452 * to copy the data aside; there's no guarantee that our zio's buffer
427 453 * won't be re-used for something else.
428 454 *
429 455 * Our parity data is already in separate buffers, so there's no need
430 456 * to copy them.
431 457 */
432 458
433 459 size = 0;
434 460 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
435 461 size += rm->rm_col[c].rc_size;
436 462
437 463 rm->rm_abd_copy =
438 464 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
439 465
440 466 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
441 467 raidz_col_t *col = &rm->rm_col[c];
442 468 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
443 469
444 470 abd_copy(tmp, col->rc_abd, col->rc_size);
445 471 abd_put(col->rc_abd);
446 472 col->rc_abd = tmp;
447 473
448 474 offset += col->rc_size;
449 475 }
450 476 ASSERT3U(offset, ==, size);
451 477 }
452 478
453 479 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
454 480 vdev_raidz_map_free_vsd,
455 481 vdev_raidz_cksum_report
456 482 };
457 483
458 484 /*
459 - * Divides the IO evenly across all child vdevs; usually, dcols is
460 - * the number of children in the target vdev.
485 + * Allocates and computes a raidz column map, which directs the raidz column
486 + * handling algorithms where to locate and store data and parity columns for
487 + * a particular DVA. Usually, dcols is the number of children in the target
488 + * vdev.
489 + *
490 + * The `abd', `size' and `offset' parameters hold the data, size and
491 + * offset of the zio for which this map is to be computed.
492 + * The `unit_shift' parameter contains the minimum allocation bitshift of
493 + * the storage pool. The `dcols' parameter contains the number of drives in
494 + * this raidz vdev (including parity drives), with `nparity' denoting how
495 + * many of those hold parity (one, two or three).
496 + *
497 + * The `alloc_data' flag denotes whether the constructed raidz map should
498 + * contain allocated buffers to hold column I/O data (if you're using
499 + * this function simply to determine raidz geometry, you'll want to pass
500 + * B_FALSE here).
461 501 */
462 502 static raidz_map_t *
463 503 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
464 - uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
504 + uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
505 + boolean_t alloc_data)
465 506 {
466 507 raidz_map_t *rm;
467 508 /* The starting RAIDZ (parent) vdev sector of the block. */
468 509 uint64_t b = offset >> unit_shift;
469 510 /* The zio's size in units of the vdev's minimum sector size. */
470 511 uint64_t s = size >> unit_shift;
471 512 /* The first column for this stripe. */
472 513 uint64_t f = b % dcols;
473 514 /* The starting byte offset on each child vdev. */
474 515 uint64_t o = (b / dcols) << unit_shift;
475 516 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
476 517 uint64_t off = 0;
477 518
478 519 /*
479 520 * "Quotient": The number of data sectors for this stripe on all but
480 521 * the "big column" child vdevs that also contain "remainder" data.
481 522 */
482 523 q = s / (dcols - nparity);
483 524
484 525 /*
485 526 * "Remainder": The number of partial stripe data sectors in this I/O.
486 527 * This will add a sector to some, but not all, child vdevs.
487 528 */
488 529 r = s - q * (dcols - nparity);
489 530
490 531 /* The number of "big columns" - those which contain remainder data. */
491 532 bc = (r == 0 ? 0 : r + nparity);
492 533
493 534 /*
494 535 * The total number of data and parity sectors associated with
495 536 * this I/O.
496 537 */
497 538 tot = s + nparity * (q + (r == 0 ? 0 : 1));
498 539
499 540 /* acols: The columns that will be accessed. */
500 541 /* scols: The columns that will be accessed or skipped. */
501 542 if (q == 0) {
502 543 /* Our I/O request doesn't span all child vdevs. */
503 544 acols = bc;
504 545 scols = MIN(dcols, roundup(bc, nparity + 1));
505 546 } else {
506 547 acols = dcols;
507 548 scols = dcols;
508 549 }
509 550
510 551 ASSERT3U(acols, <=, scols);
511 552
512 553 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
513 554
514 555 rm->rm_cols = acols;
515 556 rm->rm_scols = scols;
516 557 rm->rm_bigcols = bc;
517 558 rm->rm_skipstart = bc;
518 559 rm->rm_missingdata = 0;
519 560 rm->rm_missingparity = 0;
520 561 rm->rm_firstdatacol = nparity;
521 562 rm->rm_abd_copy = NULL;
522 563 rm->rm_reports = 0;
523 564 rm->rm_freed = 0;
524 565 rm->rm_ecksuminjected = 0;
525 566
526 567 asize = 0;
527 568
528 569 for (c = 0; c < scols; c++) {
529 570 col = f + c;
530 571 coff = o;
531 572 if (col >= dcols) {
532 573 col -= dcols;
533 574 coff += 1ULL << unit_shift;
534 575 }
535 576 rm->rm_col[c].rc_devidx = col;
536 577 rm->rm_col[c].rc_offset = coff;
537 578 rm->rm_col[c].rc_abd = NULL;
538 579 rm->rm_col[c].rc_gdata = NULL;
539 580 rm->rm_col[c].rc_error = 0;
540 581 rm->rm_col[c].rc_tried = 0;
541 582 rm->rm_col[c].rc_skipped = 0;
542 583
543 584 if (c >= acols)
544 585 rm->rm_col[c].rc_size = 0;
545 586 else if (c < bc)
546 587 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
547 588 else
548 589 rm->rm_col[c].rc_size = q << unit_shift;
549 590
550 591 asize += rm->rm_col[c].rc_size;
551 592 }
552 593
553 594 ASSERT3U(asize, ==, tot << unit_shift);
554 595 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
555 596 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
556 597 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
557 598 ASSERT3U(rm->rm_nskip, <=, nparity);
558 599
559 - for (c = 0; c < rm->rm_firstdatacol; c++)
560 - rm->rm_col[c].rc_abd =
561 - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
600 + if (alloc_data) {
601 + for (c = 0; c < rm->rm_firstdatacol; c++) {
602 + rm->rm_col[c].rc_abd =
603 + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
604 + }
562 605
563 - rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
564 - off = rm->rm_col[c].rc_size;
606 + rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
607 + off = rm->rm_col[c].rc_size;
565 608
566 - for (c = c + 1; c < acols; c++) {
567 - rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
568 - off += rm->rm_col[c].rc_size;
609 + for (c = c + 1; c < acols; c++) {
610 + rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
611 + off += rm->rm_col[c].rc_size;
612 + }
569 613 }
570 614
571 615 /*
572 616 * If all data stored spans all columns, there's a danger that parity
573 617 * will always be on the same device and, since parity isn't read
574 618 * during normal operation, that that device's I/O bandwidth won't be
575 619 * used effectively. We therefore switch the parity every 1MB.
576 620 *
577 621 * ... at least that was, ostensibly, the theory. As a practical
578 622 * matter unless we juggle the parity between all devices evenly, we
579 623 * won't see any benefit. Further, occasional writes that aren't a
580 624 * multiple of the LCM of the number of children and the minimum
581 625 * stripe width are sufficient to avoid pessimal behavior.
582 626 * Unfortunately, this decision created an implicit on-disk format
583 627 * requirement that we need to support for all eternity, but only
584 628 * for single-parity RAID-Z.
585 629 *
586 630 * If we intend to skip a sector in the zeroth column for padding
587 631 * we must make sure to note this swap. We will never intend to
588 632 * skip the first column since at least one data and one parity
589 633 * column must appear in each row.
590 634 */
591 635 ASSERT(rm->rm_cols >= 2);
592 636 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
593 637
594 638 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
595 639 devidx = rm->rm_col[0].rc_devidx;
596 640 o = rm->rm_col[0].rc_offset;
597 641 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
598 642 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
599 643 rm->rm_col[1].rc_devidx = devidx;
600 644 rm->rm_col[1].rc_offset = o;
601 645
602 646 if (rm->rm_skipstart == 0)
603 647 rm->rm_skipstart = 1;
604 648 }
605 649
606 650 return (rm);
607 651 }
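
As a worked example of the geometry above (hypothetical numbers, not taken from a real pool): a 17 KB write, i.e. s = 34 sectors at unit_shift = 9, to a 5-wide raidz1 (dcols = 5, nparity = 1) gives

	q   = 34 / (5 - 1)             = 8	full rows
	r   = 34 - 8 * (5 - 1)         = 2	remainder sectors
	bc  = r + nparity              = 3	"big" columns
	tot = 34 + 1 * (8 + 1)         = 43	data plus parity sectors
	rm_nskip = roundup(43, 2) - 43 = 1	padding sector

so the map has five columns of 9, 9, 9, 8 and 8 sectors (the first column holding parity), plus one skip sector so that rm_asize stays a multiple of (nparity + 1) sectors.
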
608 652
609 653 struct pqr_struct {
610 654 uint64_t *p;
611 655 uint64_t *q;
612 656 uint64_t *r;
613 657 };
614 658
615 659 static int
616 660 vdev_raidz_p_func(void *buf, size_t size, void *private)
617 661 {
618 662 struct pqr_struct *pqr = private;
619 663 const uint64_t *src = buf;
620 664 int i, cnt = size / sizeof (src[0]);
621 665
622 666 ASSERT(pqr->p && !pqr->q && !pqr->r);
623 667
624 668 for (i = 0; i < cnt; i++, src++, pqr->p++)
625 669 *pqr->p ^= *src;
626 670
627 671 return (0);
628 672 }
629 673
630 674 static int
631 675 vdev_raidz_pq_func(void *buf, size_t size, void *private)
632 676 {
633 677 struct pqr_struct *pqr = private;
634 678 const uint64_t *src = buf;
635 679 uint64_t mask;
636 680 int i, cnt = size / sizeof (src[0]);
637 681
638 682 ASSERT(pqr->p && pqr->q && !pqr->r);
639 683
640 684 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
641 685 *pqr->p ^= *src;
642 686 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
643 687 *pqr->q ^= *src;
644 688 }
645 689
646 690 return (0);
647 691 }
648 692
649 693 static int
650 694 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
651 695 {
652 696 struct pqr_struct *pqr = private;
653 697 const uint64_t *src = buf;
654 698 uint64_t mask;
655 699 int i, cnt = size / sizeof (src[0]);
656 700
657 701 ASSERT(pqr->p && pqr->q && pqr->r);
658 702
659 703 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
660 704 *pqr->p ^= *src;
661 705 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
662 706 *pqr->q ^= *src;
663 707 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
664 708 *pqr->r ^= *src;
665 709 }
666 710
667 711 return (0);
668 712 }
669 713
714 +/*
715 + * Software acceleration of XOR calculations. Requirements:
716 + *
717 + * - the source and destination vectors must be 64-byte aligned
718 + * - all vectors must be the same size
719 + */
720 +#define RAIDZ_ACCELERATION_ALIGNMENT 64ul
721 +#define UNALIGNED(addr) \
722 + ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
723 +
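The hook's exact contract isn't spelled out here, but judging by the disabled caller below, a conforming implementation would compute the XOR of the first vects - 1 vectors into the last entry of 'array' and return zero on success. A reference sketch under those assumptions (example_xorp() is invented for illustration; a real library would use SIMD):

	static int
	example_xorp(int vects, int len, void **array)
	{
		uint64_t *dst = array[vects - 1];

		for (int i = 0; i < len / 8; i++) {
			uint64_t acc = 0;

			for (int v = 0; v < vects - 1; v++)
				acc ^= ((uint64_t *)array[v])[i];
			dst[i] = acc;
		}
		return (0);
	}
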
670 724 static void
671 725 vdev_raidz_generate_parity_p(raidz_map_t *rm)
672 726 {
673 727 uint64_t *p;
674 728 int c;
675 729 abd_t *src;
676 730
731 +#if 0
732 +	/* FIXME: review and convert to ABD; 'rc_data' is gone, 'pcount' undeclared */
733 + int parity_done;
734 + void *va[16];
735 + void **array;
736 + int j, nvects;
737 +
738 + parity_done = 0;
739 + while (0 && zfs_xorp_hook && !parity_done) {
740 + unsigned long no_accel = 0;
741 + /* at least two columns (plus one for result) */
742 + if (rm->rm_cols < 3) {
743 + DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
744 + break;
745 + }
746 + /* check sizes and alignment */
747 + no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
748 + if (no_accel) {
749 + DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
750 + no_accel);
751 + break;
752 + }
753 + pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
754 + nvects = 1; /* for the destination */
755 + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
756 + no_accel = UNALIGNED(rm->rm_col[c].rc_data);
757 + if (no_accel) {
758 + DTRACE_PROBE1(raidz_unaligned_src,
759 + unsigned long, no_accel);
760 + break;
761 + }
762 + if (rm->rm_col[c].rc_size != pcount) {
763 + DTRACE_PROBE(raidz_sizes_vary);
764 + no_accel = 1;
765 + break;
766 + }
767 + nvects++;
768 + }
769 + if (no_accel)
770 + break;
771 + if (nvects > 16) {
772 + array = kmem_alloc(nvects * sizeof (void *),
773 + KM_NOSLEEP);
774 + if (array == NULL) {
775 + DTRACE_PROBE(raidz_alloc_failed);
776 + break;
777 + }
778 + } else {
779 + array = va;
780 + }
781 + for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
782 + c++, j++) {
783 + array[j] = rm->rm_col[c].rc_data;
784 + }
785 + array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
786 + if (zfs_xorp_hook(nvects,
787 + rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
788 + DTRACE_PROBE(raidz_accel_failure);
789 + break;
790 + }
791 + if (array != va) {
792 + kmem_free(array, nvects * sizeof (void *));
793 + }
794 + parity_done = 1;
795 + DTRACE_PROBE(raidz_accel_success);
796 + }
797 + if (parity_done)
798 + return;
799 +#endif
677 800 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678 801 src = rm->rm_col[c].rc_abd;
679 802 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
680 803
681 804 if (c == rm->rm_firstdatacol) {
682 805 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
683 806 } else {
684 807 struct pqr_struct pqr = { p, NULL, NULL };
685 808 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
686 809 vdev_raidz_p_func, &pqr);
687 810 }
688 811 }
689 812 }
690 813
691 814 static void
692 815 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
693 816 {
694 817 uint64_t *p, *q, pcnt, ccnt, mask, i;
695 818 int c;
696 819 abd_t *src;
697 820
698 821 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
699 822 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
700 823 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
701 824
702 825 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
703 826 src = rm->rm_col[c].rc_abd;
704 827 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
705 828 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
706 829
707 830 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
708 831
709 832 if (c == rm->rm_firstdatacol) {
710 833 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
711 834 (void) memcpy(q, p, rm->rm_col[c].rc_size);
712 835 } else {
713 836 struct pqr_struct pqr = { p, q, NULL };
714 837 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
715 838 vdev_raidz_pq_func, &pqr);
716 839 }
717 840
718 841 if (c == rm->rm_firstdatacol) {
719 842 for (i = ccnt; i < pcnt; i++) {
720 843 p[i] = 0;
721 844 q[i] = 0;
722 845 }
723 846 } else {
724 847 /*
725 848 * Treat short columns as though they are full of 0s.
726 849 * Note that there's therefore nothing needed for P.
727 850 */
728 851 for (i = ccnt; i < pcnt; i++) {
729 852 VDEV_RAIDZ_64MUL_2(q[i], mask);
730 853 }
731 854 }
732 855 }
733 856 }
734 857
735 858 static void
736 859 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
737 860 {
738 861 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
739 862 int c;
740 863 abd_t *src;
741 864
742 865 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
743 866 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
744 867 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
745 868 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
746 869 rm->rm_col[VDEV_RAIDZ_R].rc_size);
747 870
748 871 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
749 872 src = rm->rm_col[c].rc_abd;
750 873 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
751 874 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
752 875 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
753 876
754 877 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
755 878
756 879 if (c == rm->rm_firstdatacol) {
757 880 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
758 881 (void) memcpy(q, p, rm->rm_col[c].rc_size);
759 882 (void) memcpy(r, p, rm->rm_col[c].rc_size);
760 883 } else {
761 884 struct pqr_struct pqr = { p, q, r };
762 885 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
763 886 vdev_raidz_pqr_func, &pqr);
764 887 }
765 888
766 889 if (c == rm->rm_firstdatacol) {
767 890 for (i = ccnt; i < pcnt; i++) {
768 891 p[i] = 0;
769 892 q[i] = 0;
770 893 r[i] = 0;
771 894 }
772 895 } else {
773 896 /*
774 897 * Treat short columns as though they are full of 0s.
775 898 * Note that there's therefore nothing needed for P.
776 899 */
777 900 for (i = ccnt; i < pcnt; i++) {
778 901 VDEV_RAIDZ_64MUL_2(q[i], mask);
779 902 VDEV_RAIDZ_64MUL_4(r[i], mask);
780 903 }
781 904 }
782 905 }
783 906 }
784 907
785 908 /*
786 909 * Generate RAID parity in the first virtual columns according to the number of
787 910 * parity columns available.
788 911 */
789 912 static void
790 913 vdev_raidz_generate_parity(raidz_map_t *rm)
791 914 {
792 915 switch (rm->rm_firstdatacol) {
793 916 case 1:
794 917 vdev_raidz_generate_parity_p(rm);
795 918 break;
796 919 case 2:
797 920 vdev_raidz_generate_parity_pq(rm);
798 921 break;
799 922 case 3:
800 923 vdev_raidz_generate_parity_pqr(rm);
801 924 break;
802 925 default:
803 926 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
804 927 }
805 928 }
806 929
807 930 /* ARGSUSED */
808 931 static int
809 932 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
810 933 {
811 934 uint64_t *dst = dbuf;
812 935 uint64_t *src = sbuf;
813 936 int cnt = size / sizeof (src[0]);
814 937
815 938 for (int i = 0; i < cnt; i++) {
816 939 dst[i] ^= src[i];
817 940 }
818 941
819 942 return (0);
820 943 }
821 944
822 945 /* ARGSUSED */
823 946 static int
824 947 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
825 948 void *private)
826 949 {
827 950 uint64_t *dst = dbuf;
828 951 uint64_t *src = sbuf;
829 952 uint64_t mask;
830 953 int cnt = size / sizeof (dst[0]);
831 954
832 955 for (int i = 0; i < cnt; i++, dst++, src++) {
833 956 VDEV_RAIDZ_64MUL_2(*dst, mask);
834 957 *dst ^= *src;
835 958 }
836 959
837 960 return (0);
838 961 }
839 962
840 963 /* ARGSUSED */
841 964 static int
842 965 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
843 966 {
844 967 uint64_t *dst = buf;
845 968 uint64_t mask;
846 969 int cnt = size / sizeof (dst[0]);
847 970
848 971 for (int i = 0; i < cnt; i++, dst++) {
849 972 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
850 973 VDEV_RAIDZ_64MUL_2(*dst, mask);
851 974 }
852 975
853 976 return (0);
854 977 }
855 978
856 979 struct reconst_q_struct {
857 980 uint64_t *q;
858 981 int exp;
859 982 };
860 983
861 984 static int
862 985 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
863 986 {
864 987 struct reconst_q_struct *rq = private;
865 988 uint64_t *dst = buf;
866 989 int cnt = size / sizeof (dst[0]);
867 990
868 991 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
869 992 *dst ^= *rq->q;
870 993
871 994 int j;
872 995 uint8_t *b;
873 996 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
874 997 *b = vdev_raidz_exp2(*b, rq->exp);
875 998 }
876 999 }
877 1000
878 1001 return (0);
879 1002 }
880 1003
881 1004 struct reconst_pq_struct {
882 1005 uint8_t *p;
883 1006 uint8_t *q;
884 1007 uint8_t *pxy;
885 1008 uint8_t *qxy;
886 1009 int aexp;
887 1010 int bexp;
888 1011 };
889 1012
890 1013 static int
891 1014 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
892 1015 {
893 1016 struct reconst_pq_struct *rpq = private;
894 1017 uint8_t *xd = xbuf;
895 1018 uint8_t *yd = ybuf;
896 1019
897 1020 for (int i = 0; i < size;
898 1021 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
899 1022 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
900 1023 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
901 1024 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
902 1025 }
903 1026
904 1027 return (0);
905 1028 }
906 1029
907 1030 static int
908 1031 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
909 1032 {
910 1033 struct reconst_pq_struct *rpq = private;
911 1034 uint8_t *xd = xbuf;
912 1035
913 1036 for (int i = 0; i < size;
914 1037 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
915 1038 /* same operation as vdev_raidz_reconst_pq_func() on xd */
916 1039 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
917 1040 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
918 1041 }
919 1042
920 1043 return (0);
921 1044 }
922 1045
923 1046 static int
924 1047 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
925 1048 {
926 1049 int x = tgts[0];
927 1050 int c;
928 1051 abd_t *dst, *src;
929 1052
930 1053 ASSERT(ntgts == 1);
931 1054 ASSERT(x >= rm->rm_firstdatacol);
932 1055 ASSERT(x < rm->rm_cols);
933 1056
934 1057 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
935 1058 ASSERT(rm->rm_col[x].rc_size > 0);
936 1059
937 1060 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
938 1061 dst = rm->rm_col[x].rc_abd;
939 1062
940 1063 abd_copy(dst, src, rm->rm_col[x].rc_size);
941 1064
942 1065 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
943 1066 uint64_t size = MIN(rm->rm_col[x].rc_size,
944 1067 rm->rm_col[c].rc_size);
945 1068
946 1069 src = rm->rm_col[c].rc_abd;
947 1070 dst = rm->rm_col[x].rc_abd;
948 1071
949 1072 if (c == x)
950 1073 continue;
951 1074
952 1075 (void) abd_iterate_func2(dst, src, 0, 0, size,
953 1076 vdev_raidz_reconst_p_func, NULL);
954 1077 }
955 1078
956 1079 return (1 << VDEV_RAIDZ_P);
957 1080 }
958 1081
959 1082 static int
960 1083 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
961 1084 {
962 1085 int x = tgts[0];
963 1086 int c, exp;
964 1087 abd_t *dst, *src;
965 1088
966 1089 ASSERT(ntgts == 1);
967 1090
968 1091 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
969 1092
970 1093 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
971 1094 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
972 1095 rm->rm_col[c].rc_size);
973 1096
974 1097 src = rm->rm_col[c].rc_abd;
975 1098 dst = rm->rm_col[x].rc_abd;
976 1099
977 1100 if (c == rm->rm_firstdatacol) {
978 1101 abd_copy(dst, src, size);
979 1102 if (rm->rm_col[x].rc_size > size)
980 1103 abd_zero_off(dst, size,
981 1104 rm->rm_col[x].rc_size - size);
982 1105 } else {
983 1106 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
984 1107 (void) abd_iterate_func2(dst, src, 0, 0, size,
985 1108 vdev_raidz_reconst_q_pre_func, NULL);
986 1109 (void) abd_iterate_func(dst,
987 1110 size, rm->rm_col[x].rc_size - size,
988 1111 vdev_raidz_reconst_q_pre_tail_func, NULL);
989 1112 }
990 1113 }
991 1114
992 1115 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
993 1116 dst = rm->rm_col[x].rc_abd;
994 1117 exp = 255 - (rm->rm_cols - 1 - x);
995 1118
996 1119 struct reconst_q_struct rq = { abd_to_buf(src), exp };
997 1120 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
998 1121 vdev_raidz_reconst_q_post_func, &rq);
999 1122
1000 1123 return (1 << VDEV_RAIDZ_Q);
1001 1124 }
1002 1125
1003 1126 static int
1004 1127 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1005 1128 {
1006 1129 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1007 1130 abd_t *pdata, *qdata;
1008 1131 uint64_t xsize, ysize;
1009 1132 int x = tgts[0];
1010 1133 int y = tgts[1];
1011 1134 abd_t *xd, *yd;
1012 1135
1013 1136 ASSERT(ntgts == 2);
1014 1137 ASSERT(x < y);
1015 1138 ASSERT(x >= rm->rm_firstdatacol);
1016 1139 ASSERT(y < rm->rm_cols);
1017 1140
1018 1141 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1019 1142
1020 1143 /*
1021 1144 * Move the parity data aside -- we're going to compute parity as
1022 1145 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1023 1146 * reuse the parity generation mechanism without trashing the actual
1024 1147 * parity so we make those columns appear to be full of zeros by
1025 1148 * setting their lengths to zero.
1026 1149 */
1027 1150 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1028 1151 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1029 1152 xsize = rm->rm_col[x].rc_size;
1030 1153 ysize = rm->rm_col[y].rc_size;
1031 1154
1032 1155 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1033 1156 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1034 1157 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1035 1158 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1036 1159 rm->rm_col[x].rc_size = 0;
1037 1160 rm->rm_col[y].rc_size = 0;
1038 1161
1039 1162 vdev_raidz_generate_parity_pq(rm);
1040 1163
1041 1164 rm->rm_col[x].rc_size = xsize;
1042 1165 rm->rm_col[y].rc_size = ysize;
1043 1166
1044 1167 p = abd_to_buf(pdata);
1045 1168 q = abd_to_buf(qdata);
1046 1169 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1047 1170 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1048 1171 xd = rm->rm_col[x].rc_abd;
1049 1172 yd = rm->rm_col[y].rc_abd;
1050 1173
1051 1174 /*
1052 1175 * We now have:
1053 1176 * Pxy = P + D_x + D_y
1054 1177 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1055 1178 *
1056 1179 * We can then solve for D_x:
1057 1180 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1058 1181 * where
1059 1182 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1060 1183 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1061 1184 *
1062 1185 * With D_x in hand, we can easily solve for D_y:
1063 1186 * D_y = P + Pxy + D_x
1064 1187 */
1065 1188
1066 1189 a = vdev_raidz_pow2[255 + x - y];
1067 1190 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1068 1191 tmp = 255 - vdev_raidz_log2[a ^ 1];
1069 1192
1070 1193 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1071 1194 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1072 1195
1073 1196 ASSERT3U(xsize, >=, ysize);
1074 1197 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1075 1198 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1076 1199 vdev_raidz_reconst_pq_func, &rpq);
1077 1200 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1078 1201 vdev_raidz_reconst_pq_tail_func, &rpq);
1079 1202
1080 1203 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1081 1204 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1082 1205
1083 1206 /*
1084 1207 * Restore the saved parity data.
1085 1208 */
1086 1209 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1087 1210 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1088 1211
1089 1212 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1090 1213 }
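
The ABD iterators above apply this algebra a byte at a time; the same solve in scalar form looks like the sketch below (illustrative only; gf_mul(), gf_inv() and solve_pq_byte() are invented for the example and lean on the pow2/log2 tables and vdev_raidz_exp2() defined earlier):

	static uint8_t
	gf_mul(uint8_t a, uint8_t b)
	{
		if (a == 0 || b == 0)
			return (0);
		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
	}

	static uint8_t
	gf_inv(uint8_t a)		/* a must be nonzero */
	{
		return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
	}

	static void
	solve_pq_byte(uint8_t p, uint8_t q, uint8_t pxy, uint8_t qxy,
	    int ncols, int x, int y, uint8_t *dx, uint8_t *dy)
	{
		/* a = 2^(x - y), b = 2^(ncols - 1 - x), as above */
		uint8_t a = vdev_raidz_pow2[255 + x - y];
		uint8_t b = vdev_raidz_pow2[255 - (ncols - 1 - x)];
		uint8_t inv = gf_inv(a ^ 1);	/* 1 / (2^(x - y) + 1) */

		*dx = gf_mul(gf_mul(a, inv), p ^ pxy) ^
		    gf_mul(gf_mul(b, inv), q ^ qxy);
		*dy = p ^ pxy ^ *dx;
	}
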
1091 1214
1092 1215 /* BEGIN CSTYLED */
1093 1216 /*
1094 1217 * In the general case of reconstruction, we must solve the system of linear
1095 1218 * equations defined by the coefficients used to generate parity as well as
1096 1219 * the contents of the data and parity disks. This can be expressed with
1097 1220 * vectors for the original data (D) and the actual data (d) and parity (p)
1098 1221 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1099 1222 *
1100 1223 * __ __ __ __
1101 1224 * | | __ __ | p_0 |
1102 1225 * | V | | D_0 | | p_m-1 |
1103 1226 * | | x | : | = | d_0 |
1104 1227 * | I | | D_n-1 | | : |
1105 1228 * | | ~~ ~~ | d_n-1 |
1106 1229 * ~~ ~~ ~~ ~~
1107 1230 *
1108 1231 * I is simply a square identity matrix of size n, and V is a Vandermonde
1109 1232 * matrix defined by the coefficients we chose for the various parity columns
1110 1233 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
1111 1234 * computation, as well as for linear separability.
1112 1235 *
1113 1236 * __ __ __ __
1114 1237 * | 1 .. 1 1 1 | | p_0 |
1115 1238 * | 2^n-1 .. 4 2 1 | __ __ | : |
1116 1239 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1117 1240 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1118 1241 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1119 1242 * | : : : : | | : | | d_2 |
1120 1243 * | 0 .. 1 0 0 | | D_n-1 | | : |
1121 1244 * | 0 .. 0 1 0 | ~~ ~~ | : |
1122 1245 * | 0 .. 0 0 1 | | d_n-1 |
1123 1246 * ~~ ~~ ~~ ~~
1124 1247 *
1125 1248 * Note that I, V, d, and p are known. To compute D, we must invert the
1126 1249 * matrix and use the known data and parity values to reconstruct the unknown
1127 1250 * data values. We begin by removing the rows in V|I and d|p that correspond
1128 1251 * to failed or missing columns; we then make V|I square (n x n) and d|p
1129 1252 * sized n by removing rows corresponding to unused parity from the bottom up
1130 1253 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1131 1254 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1132 1255 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1133 1256 * __ __
1134 1257 * | 1 1 1 1 1 1 1 1 |
1135 1258 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1136 1259 * | 19 205 116 29 64 16 4 1 | / /
1137 1260 * | 1 0 0 0 0 0 0 0 | / /
1138 1261 * | 0 1 0 0 0 0 0 0 | <--' /
1139 1262 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1140 1263 * | 0 0 0 1 0 0 0 0 |
1141 1264 * | 0 0 0 0 1 0 0 0 |
1142 1265 * | 0 0 0 0 0 1 0 0 |
1143 1266 * | 0 0 0 0 0 0 1 0 |
1144 1267 * | 0 0 0 0 0 0 0 1 |
1145 1268 * ~~ ~~
1146 1269 * __ __
1147 1270 * | 1 1 1 1 1 1 1 1 |
1148 1271 * | 19 205 116 29 64 16 4 1 |
1149 1272 * | 1 0 0 0 0 0 0 0 |
1150 1273 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1151 1274 * | 0 0 0 0 1 0 0 0 |
1152 1275 * | 0 0 0 0 0 1 0 0 |
1153 1276 * | 0 0 0 0 0 0 1 0 |
1154 1277 * | 0 0 0 0 0 0 0 1 |
1155 1278 * ~~ ~~
1156 1279 *
1157 1280 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1158 1281 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1159 1282 * matrix is not singular.
1160 1283 * __ __
1161 1284 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1162 1285 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1163 1286 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1164 1287 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1165 1288 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1166 1289 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1167 1290 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1168 1291 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1169 1292 * ~~ ~~
1170 1293 * __ __
1171 1294 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1172 1295 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1173 1296 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1174 1297 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1175 1298 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1176 1299 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1177 1300 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1178 1301 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1179 1302 * ~~ ~~
1180 1303 * __ __
1181 1304 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1182 1305 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1183 1306 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1184 1307 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1185 1308 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1186 1309 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1187 1310 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1188 1311 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1189 1312 * ~~ ~~
1190 1313 * __ __
1191 1314 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1192 1315 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1193 1316 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1194 1317 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1195 1318 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1196 1319 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1197 1320 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1198 1321 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1199 1322 * ~~ ~~
1200 1323 * __ __
1201 1324 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1202 1325 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1203 1326 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1204 1327 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1205 1328 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1206 1329 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1207 1330 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1208 1331 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1209 1332 * ~~ ~~
1210 1333 * __ __
1211 1334 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1212 1335 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1213 1336 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1214 1337 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1215 1338 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1216 1339 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1217 1340 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1218 1341 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1219 1342 * ~~ ~~
1220 1343 * __ __
1221 1344 * | 0 0 1 0 0 0 0 0 |
1222 1345 * | 167 100 5 41 159 169 217 208 |
1223 1346 * | 166 100 4 40 158 168 216 209 |
1224 1347 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1225 1348 * | 0 0 0 0 1 0 0 0 |
1226 1349 * | 0 0 0 0 0 1 0 0 |
1227 1350 * | 0 0 0 0 0 0 1 0 |
1228 1351 * | 0 0 0 0 0 0 0 1 |
1229 1352 * ~~ ~~
1230 1353 *
1231 1354 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1232 1355 * of the missing data.
1233 1356 *
1234 1357 * As is apparent from the example above, the only non-trivial rows in the
1235 1358 * inverse matrix correspond to the data disks that we're trying to
1236 1359 * reconstruct. Indeed, those are the only rows we need as the others would
1237 1360 * only be useful for reconstructing data known or assumed to be valid. For
1238 1361 * that reason, we only build the coefficients in the rows that correspond to
1239 1362 * targeted columns.
1240 1363 */
1241 1364 /* END CSTYLED */
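
For a concrete instance of the dispersal matrix (hypothetical numbers): with n = 4 data columns, vdev_raidz_matrix_init() below builds, for each parity type in the map, the matching Vandermonde row

	P (map = 0):  (  1,  1, 1, 1 )	generator 2^0 = 1
	Q (map = 1):  (  8,  4, 2, 1 )	generator 2^1 = 2
	R (map = 2):  ( 64, 16, 4, 1 )	generator 2^2 = 4

that is, rows[i][j] = 2^(map[i] * (n - 1 - j)) evaluated in GF(2^8) -- exactly the top rows of (V|I) above, restricted to the parity columns actually used.
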
1242 1365
1243 1366 static void
1244 1367 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1245 1368 uint8_t **rows)
1246 1369 {
1247 1370 int i, j;
1248 1371 int pow;
1249 1372
1250 1373 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1251 1374
1252 1375 /*
1253 1376 * Fill in the missing rows of interest.
1254 1377 */
1255 1378 for (i = 0; i < nmap; i++) {
1256 1379 ASSERT3S(0, <=, map[i]);
1257 1380 ASSERT3S(map[i], <=, 2);
1258 1381
1259 1382 pow = map[i] * n;
1260 1383 if (pow > 255)
1261 1384 pow -= 255;
1262 1385 ASSERT(pow <= 255);
1263 1386
1264 1387 for (j = 0; j < n; j++) {
1265 1388 pow -= map[i];
1266 1389 if (pow < 0)
1267 1390 pow += 255;
1268 1391 rows[i][j] = vdev_raidz_pow2[pow];
1269 1392 }
1270 1393 }
1271 1394 }
1272 1395
1273 1396 static void
1274 1397 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1275 1398 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1276 1399 {
1277 1400 int i, j, ii, jj;
1278 1401 uint8_t log;
1279 1402
1280 1403 /*
1281 1404 * Assert that the first nmissing entries from the array of used
1282 1405 * columns correspond to parity columns and that subsequent entries
1283 1406 * correspond to data columns.
1284 1407 */
1285 1408 for (i = 0; i < nmissing; i++) {
1286 1409 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1287 1410 }
1288 1411 for (; i < n; i++) {
1289 1412 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1290 1413 }
1291 1414
1292 1415 /*
1293 1416 * First initialize the storage where we'll compute the inverse rows.
1294 1417 */
1295 1418 for (i = 0; i < nmissing; i++) {
1296 1419 for (j = 0; j < n; j++) {
1297 1420 invrows[i][j] = (i == j) ? 1 : 0;
1298 1421 }
1299 1422 }
1300 1423
1301 1424 /*
1302 1425 * Subtract all trivial rows from the rows of consequence.
1303 1426 */
1304 1427 for (i = 0; i < nmissing; i++) {
1305 1428 for (j = nmissing; j < n; j++) {
1306 1429 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1307 1430 jj = used[j] - rm->rm_firstdatacol;
1308 1431 ASSERT3S(jj, <, n);
1309 1432 invrows[i][j] = rows[i][jj];
1310 1433 rows[i][jj] = 0;
1311 1434 }
1312 1435 }
1313 1436
1314 1437 /*
1315 1438 * For each of the rows of interest, we must normalize it and subtract
1316 1439 * a multiple of it from the other rows.
1317 1440 */
1318 1441 for (i = 0; i < nmissing; i++) {
1319 1442 for (j = 0; j < missing[i]; j++) {
1320 1443 ASSERT0(rows[i][j]);
1321 1444 }
1322 1445 ASSERT3U(rows[i][missing[i]], !=, 0);
1323 1446
1324 1447 /*
1325 1448 * Compute the inverse of the first element and multiply each
1326 1449 * element in the row by that value.
1327 1450 */
1328 1451 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1329 1452
1330 1453 for (j = 0; j < n; j++) {
1331 1454 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1332 1455 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1333 1456 }
1334 1457
1335 1458 for (ii = 0; ii < nmissing; ii++) {
1336 1459 if (i == ii)
1337 1460 continue;
1338 1461
1339 1462 ASSERT3U(rows[ii][missing[i]], !=, 0);
1340 1463
1341 1464 log = vdev_raidz_log2[rows[ii][missing[i]]];
1342 1465
1343 1466 for (j = 0; j < n; j++) {
1344 1467 rows[ii][j] ^=
1345 1468 vdev_raidz_exp2(rows[i][j], log);
1346 1469 invrows[ii][j] ^=
1347 1470 vdev_raidz_exp2(invrows[i][j], log);
1348 1471 }
1349 1472 }
1350 1473 }
1351 1474
1352 1475 /*
1353 1476 * Verify that the data that is left in the rows is properly part of
1354 1477 * an identity matrix.
1355 1478 */
1356 1479 for (i = 0; i < nmissing; i++) {
1357 1480 for (j = 0; j < n; j++) {
1358 1481 if (j == missing[i]) {
1359 1482 ASSERT3U(rows[i][j], ==, 1);
1360 1483 } else {
1361 1484 ASSERT0(rows[i][j]);
1362 1485 }
1363 1486 }
1364 1487 }
1365 1488 }
1366 1489
1367 1490 static void
1368 1491 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1369 1492 int *missing, uint8_t **invrows, const uint8_t *used)
1370 1493 {
1371 1494 int i, j, x, cc, c;
1372 1495 uint8_t *src;
1373 1496 uint64_t ccount;
1374 1497 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1375 1498 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1376 1499 uint8_t log = 0;
1377 1500 uint8_t val;
1378 1501 int ll;
1379 1502 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1380 1503 uint8_t *p, *pp;
1381 1504 size_t psize;
1382 1505
1383 1506 psize = sizeof (invlog[0][0]) * n * nmissing;
1384 1507 p = kmem_alloc(psize, KM_SLEEP);
1385 1508
1386 1509 for (pp = p, i = 0; i < nmissing; i++) {
1387 1510 invlog[i] = pp;
1388 1511 pp += n;
1389 1512 }
1390 1513
1391 1514 for (i = 0; i < nmissing; i++) {
1392 1515 for (j = 0; j < n; j++) {
1393 1516 ASSERT3U(invrows[i][j], !=, 0);
1394 1517 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1395 1518 }
1396 1519 }
1397 1520
1398 1521 for (i = 0; i < n; i++) {
1399 1522 c = used[i];
1400 1523 ASSERT3U(c, <, rm->rm_cols);
1401 1524
1402 1525 src = abd_to_buf(rm->rm_col[c].rc_abd);
1403 1526 ccount = rm->rm_col[c].rc_size;
1404 1527 for (j = 0; j < nmissing; j++) {
1405 1528 cc = missing[j] + rm->rm_firstdatacol;
1406 1529 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1407 1530 ASSERT3U(cc, <, rm->rm_cols);
1408 1531 ASSERT3U(cc, !=, c);
1409 1532
1410 1533 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1411 1534 dcount[j] = rm->rm_col[cc].rc_size;
1412 1535 }
1413 1536
1414 1537 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1415 1538
1416 1539 for (x = 0; x < ccount; x++, src++) {
1417 1540 if (*src != 0)
1418 1541 log = vdev_raidz_log2[*src];
1419 1542
1420 1543 for (cc = 0; cc < nmissing; cc++) {
1421 1544 if (x >= dcount[cc])
1422 1545 continue;
1423 1546
1424 1547 if (*src == 0) {
1425 1548 val = 0;
1426 1549 } else {
1427 1550 if ((ll = log + invlog[cc][i]) >= 255)
1428 1551 ll -= 255;
1429 1552 val = vdev_raidz_pow2[ll];
1430 1553 }
1431 1554
1432 1555 if (i == 0)
1433 1556 dst[cc][x] = val;
1434 1557 else
1435 1558 dst[cc][x] ^= val;
1436 1559 }
1437 1560 }
1438 1561 }
1439 1562
1440 1563 kmem_free(p, psize);
1441 1564 }
1442 1565
1443 1566 static int
1444 1567 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1445 1568 {
1446 1569 int n, i, c, t, tt;
1447 1570 int nmissing_rows;
1448 1571 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1449 1572 int parity_map[VDEV_RAIDZ_MAXPARITY];
1450 1573
1451 1574 uint8_t *p, *pp;
1452 1575 size_t psize;
1453 1576
1454 1577 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1455 1578 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1456 1579 uint8_t *used;
1457 1580
1458 1581 abd_t **bufs = NULL;
1459 1582
1460 1583 int code = 0;
1461 1584
1462 1585 /*
1463 1586 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1464 1587 * temporary linear ABDs.
1465 1588 */
1466 1589 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1467 1590 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1468 1591
1469 1592 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1470 1593 raidz_col_t *col = &rm->rm_col[c];
1471 1594
1472 1595 bufs[c] = col->rc_abd;
1473 1596 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1474 1597 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1475 1598 }
1476 1599 }
1477 1600
1478 1601 n = rm->rm_cols - rm->rm_firstdatacol;
1479 1602
1480 1603 /*
1481 1604 * Figure out which data columns are missing.
1482 1605 */
1483 1606 nmissing_rows = 0;
1484 1607 for (t = 0; t < ntgts; t++) {
1485 1608 if (tgts[t] >= rm->rm_firstdatacol) {
1486 1609 missing_rows[nmissing_rows++] =
1487 1610 tgts[t] - rm->rm_firstdatacol;
1488 1611 }
1489 1612 }
1490 1613
1491 1614 /*
1492 1615 * Figure out which parity columns to use to help generate the missing
1493 1616 * data columns.
1494 1617 */
1495 1618 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1496 1619 ASSERT(tt < ntgts);
1497 1620 ASSERT(c < rm->rm_firstdatacol);
1498 1621
1499 1622 /*
1500 1623 * Skip any targeted parity columns.
1501 1624 */
1502 1625 if (c == tgts[tt]) {
1503 1626 tt++;
1504 1627 continue;
1505 1628 }
1506 1629
1507 1630 code |= 1 << c;
1508 1631
1509 1632 parity_map[i] = c;
1510 1633 i++;
1511 1634 }
1512 1635
1513 1636 ASSERT(code != 0);
1514 1637 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1515 1638
1516 1639 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1517 1640 nmissing_rows * n + sizeof (used[0]) * n;
1518 1641 p = kmem_alloc(psize, KM_SLEEP);
1519 1642
1520 1643 for (pp = p, i = 0; i < nmissing_rows; i++) {
1521 1644 rows[i] = pp;
1522 1645 pp += n;
1523 1646 invrows[i] = pp;
1524 1647 pp += n;
1525 1648 }
1526 1649 used = pp;
1527 1650
1528 1651 for (i = 0; i < nmissing_rows; i++) {
1529 1652 used[i] = parity_map[i];
1530 1653 }
1531 1654
1532 1655 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1533 1656 if (tt < nmissing_rows &&
1534 1657 c == missing_rows[tt] + rm->rm_firstdatacol) {
1535 1658 tt++;
1536 1659 continue;
1537 1660 }
1538 1661
1539 1662 ASSERT3S(i, <, n);
1540 1663 used[i] = c;
1541 1664 i++;
1542 1665 }
1543 1666
1544 1667 /*
1545 1668 * Initialize the interesting rows of the matrix.
1546 1669 */
1547 1670 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1548 1671
1549 1672 /*
1550 1673 * Invert the matrix.
1551 1674 */
1552 1675 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1553 1676 invrows, used);
1554 1677
1555 1678 /*
1556 1679 * Reconstruct the missing data using the generated matrix.
1557 1680 */
1558 1681 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1559 1682 invrows, used);
1560 1683
1561 1684 kmem_free(p, psize);
1562 1685
1563 1686 /*
1564 1687 * copy back from temporary linear abds and free them
1565 1688 */
1566 1689 if (bufs) {
1567 1690 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1568 1691 raidz_col_t *col = &rm->rm_col[c];
1569 1692
1570 1693 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1571 1694 abd_free(col->rc_abd);
1572 1695 col->rc_abd = bufs[c];
1573 1696 }
1574 1697 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1575 1698 }
1576 1699
1577 1700 return (code);
1578 1701 }
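
A note on the return value: the loop above ORs in 1 << c for every parity
column pressed into service, so the result is a bitmask (bit 0 = P, bit 1 = Q,
bit 2 = R) that later indexes the raidz_corrected[] counters. Below is a
stand-alone sketch of decoding it; the helper and its output format are ours,
not the driver's.

#include <stdio.h>

#define	VDEV_RAIDZ_MAXPARITY	3	/* mirrors the driver's definition */

static void
print_reconstruction_code(int code)
{
	static const char *names[VDEV_RAIDZ_MAXPARITY] = { "P", "Q", "R" };

	for (int c = 0; c < VDEV_RAIDZ_MAXPARITY; c++) {
		if (code & (1 << c))
			printf("used parity column %s\n", names[c]);
	}
}

int
main(void)
{
	/* 0x5 = P and R were used, e.g. when Q itself was a target */
	print_reconstruction_code(0x5);
	return (0);
}
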
1579 1702
1580 1703 static int
1581 1704 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1582 1705 {
1583 1706 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1584 1707 int ntgts;
1585 1708 int i, c;
1586 1709 int code;
1587 1710 int nbadparity, nbaddata;
1588 1711 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1589 1712
1590 1713 /*
1591 1714 * The tgts list must already be sorted.
1592 1715 */
1593 1716 for (i = 1; i < nt; i++) {
1594 1717 ASSERT(t[i] > t[i - 1]);
1595 1718 }
1596 1719
1597 1720 nbadparity = rm->rm_firstdatacol;
1598 1721 nbaddata = rm->rm_cols - nbadparity;
1599 1722 ntgts = 0;
1600 1723 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1601 1724 if (c < rm->rm_firstdatacol)
1602 1725 parity_valid[c] = B_FALSE;
1603 1726
1604 1727 if (i < nt && c == t[i]) {
1605 1728 tgts[ntgts++] = c;
1606 1729 i++;
1607 1730 } else if (rm->rm_col[c].rc_error != 0) {
1608 1731 tgts[ntgts++] = c;
1609 1732 } else if (c >= rm->rm_firstdatacol) {
1610 1733 nbaddata--;
1611 1734 } else {
1612 1735 parity_valid[c] = B_TRUE;
1613 1736 nbadparity--;
1614 1737 }
1615 1738 }
1616 1739
1617 1740 ASSERT(ntgts >= nt);
1618 1741 ASSERT(nbaddata >= 0);
1619 1742 ASSERT(nbaddata + nbadparity == ntgts);
1620 1743
1621 1744 dt = &tgts[nbadparity];
1622 1745
1623 1746 /*
1624 1747 * See if we can use any of our optimized reconstruction routines.
1625 1748 */
1626 1749 if (!vdev_raidz_default_to_general) {
1627 1750 switch (nbaddata) {
1628 1751 case 1:
1629 1752 if (parity_valid[VDEV_RAIDZ_P])
1630 1753 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1631 1754
1632 1755 ASSERT(rm->rm_firstdatacol > 1);
1633 1756
1634 1757 if (parity_valid[VDEV_RAIDZ_Q])
1635 1758 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1636 1759
1637 1760 ASSERT(rm->rm_firstdatacol > 2);
1638 1761 break;
1639 1762
1640 1763 case 2:
1641 1764 ASSERT(rm->rm_firstdatacol > 1);
1642 1765
1643 1766 if (parity_valid[VDEV_RAIDZ_P] &&
1644 1767 parity_valid[VDEV_RAIDZ_Q])
1645 1768 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1646 1769
1647 1770 ASSERT(rm->rm_firstdatacol > 2);
1648 1771
1649 1772 break;
1650 1773 }
1651 1774 }
1652 1775
1653 1776 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1654 1777 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1655 1778 ASSERT(code > 0);
1656 1779 return (code);
1657 1780 }
1658 1781
1659 1782 static int
1660 1783 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1661 1784 uint64_t *ashift)
1662 1785 {
1663 1786 vdev_t *cvd;
1664 1787 uint64_t nparity = vd->vdev_nparity;
1665 1788 int c;
1666 1789 int lasterror = 0;
1667 1790 int numerrors = 0;
1668 1791
1669 1792 ASSERT(nparity > 0);
1670 1793
1671 1794 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1672 1795 vd->vdev_children < nparity + 1) {
1673 1796 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1674 1797 return (SET_ERROR(EINVAL));
1675 1798 }
1676 1799
1677 1800 vdev_open_children(vd);
1678 1801
1679 1802 for (c = 0; c < vd->vdev_children; c++) {
1680 1803 cvd = vd->vdev_child[c];
1681 1804
1682 1805 if (cvd->vdev_open_error != 0) {
1683 1806 lasterror = cvd->vdev_open_error;
1684 1807 numerrors++;
1685 1808 continue;
1686 1809 }
1687 1810
1688 1811 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1689 1812 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1690 1813 *ashift = MAX(*ashift, cvd->vdev_ashift);
1691 1814 }
1692 1815
1693 1816 *asize *= vd->vdev_children;
1694 1817 *max_asize *= vd->vdev_children;
1695 1818
1696 1819 if (numerrors > nparity) {
1697 1820 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1698 1821 return (lasterror);
1699 1822 }
1700 1823
1701 1824 return (0);
1702 1825 }
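
One subtlety in the loop above: MIN(*asize - 1, cvd->vdev_asize - 1) + 1
relies on unsigned wraparound. Assuming *asize arrives zero-initialized from
the caller, 0 - 1 wraps to UINT64_MAX, so the first child's size always wins
and no first-iteration special case is needed; after that the expression
simply keeps the minimum child size. A stand-alone illustration, with names
and sizes of our own choosing:

#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t asize = 0;	/* zero-initialized accumulator */
	uint64_t child_sizes[] = { 1000, 900, 950 };

	for (int c = 0; c < 3; c++)
		asize = MIN(asize - 1, child_sizes[c] - 1) + 1;

	/* first pass: 0 - 1 wraps, so MIN() picks the child; prints 900 */
	printf("%llu\n", (unsigned long long)asize);
	return (0);
}
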
1703 1826
1704 1827 static void
1705 1828 vdev_raidz_close(vdev_t *vd)
1706 1829 {
1707 1830 int c;
1708 1831
1709 1832 for (c = 0; c < vd->vdev_children; c++)
1710 1833 vdev_close(vd->vdev_child[c]);
1711 1834 }
1712 1835
1713 1836 /*
1714 1837 * Handle a read or write I/O to a RAID-Z dump device.
1715 1838 *
1716 1839 * The dump device is in a unique situation compared to other ZFS datasets:
1717 1840 * writing to this device should be as simple and fast as possible. In
1718 1841 * addition, durability matters much less since the dump will be extracted
1719 1842 * once the machine reboots. For that reason, this function eschews parity for
1720 1843 * performance and simplicity. The dump device uses the checksum setting
1721 1844 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1722 1845 * dataset.
1723 1846 *
1724 1847 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1725 1848 * 128 KB will not fill an entire block; in addition, they may not be properly
1726 1849 * aligned. In that case, this function uses the preallocated 128 KB block and
1727 1850 * omits reading or writing any "empty" portions of that block, as opposed to
1728 1851 * allocating a fresh appropriately-sized block.
1729 1852 *
1730 1853 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1731 1854 *
1732 1855 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1733 1856 *
1734 1857 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1735 1858 * allocated which spans all five child vdevs. 8 KB of data would be written to
1736 1859 * each of four vdevs, with the fifth containing the parity bits.
1737 1860 *
1738 1861 * parity data data data data
1739 1862 * | PP | XX | XX | XX | XX |
1740 1863 * ^ ^ ^ ^ ^
1741 1864 * | | | | |
1742 1865 * 8 KB parity ------8 KB data blocks------
1743 1866 *
1744 1867 * However, when writing to the dump device, the behavior is different:
1745 1868 *
1746 1869 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1747 1870 *
1748 1871 * Unlike the normal RAID-Z case in which the block is allocated based on the
1749 1872 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1750 1873 * I/O size is less than 128 KB, only the actual portions of data are written.
1751 1874 * In this example the data is written to the third data vdev since that vdev
1752 1875 * contains the offset [64 KB, 96 KB).
1753 1876 *
1754 1877 * parity data data data data
1755 1878 * | | | | XX | |
1756 1879 * ^
1757 1880 * |
1758 1881 * 32 KB data block
1759 1882 *
1760 1883 * As a result, an individual I/O may not span all child vdevs; moreover, a
1761 1884 * small I/O may only operate on a single child vdev.
1762 1885 *
1763 1886 * Note that since there are no parity bits calculated or written, this format
1764 1887 * remains the same no matter how many parity bits are used in a normal RAID-Z
1765 1888 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1766 1889 * would look like:
1767 1890 *
1768 1891 * parity parity parity data data data data
1769 1892 * | | | | | | XX | |
1770 1893 * ^
1771 1894 * |
1772 1895 * 32 KB data block
1773 1896 */
1774 1897 int
1775 1898 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1776 1899 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1777 1900 {
1778 1901 vdev_t *tvd = vd->vdev_top;
1779 1902 vdev_t *cvd;
1780 1903 raidz_map_t *rm;
1781 1904 raidz_col_t *rc;
1782 1905 int c, err = 0;
1783 1906
1784 1907 uint64_t start, end, colstart, colend;
1785 1908 uint64_t coloffset, colsize, colskip;
1786 1909
1787 1910 int flags = doread ? B_READ : B_WRITE;
1788 1911
1789 1912 #ifdef _KERNEL
1790 1913
1791 1914 /*
1792 1915 * Don't write past the end of the block
1793 1916 */
1794 1917 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1795 1918
1796 1919 start = offset;
1797 1920 end = start + size;
1798 1921
1799 1922 /*
1800 1923 * Allocate a RAID-Z map for this block. Note that this block starts
 1801 1924 	 * from the "original" offset, that is, the offset of the extent which
1802 1925 * contains the requisite offset of the data being read or written.
1803 1926 *
1804 1927 * Even if this I/O operation doesn't span the full block size, let's
 1805 1928 	 * treat the on-disk format as if it consists only of complete 128 KB
 1806 1929 	 * blocks.
1807 1930 */
1808 1931 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1809 1932 SPA_OLD_MAXBLOCKSIZE);
1810 1933 rm = vdev_raidz_map_alloc(abd,
1811 1934 SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1812 - vd->vdev_children, vd->vdev_nparity);
1935 + vd->vdev_children, vd->vdev_nparity, B_TRUE);
1813 1936
1814 1937 coloffset = origoffset;
1815 1938
1816 1939 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1817 1940 c++, coloffset += rc->rc_size) {
1818 1941 rc = &rm->rm_col[c];
1819 1942 cvd = vd->vdev_child[rc->rc_devidx];
1820 1943
1821 1944 /*
1822 1945 * Find the start and end of this column in the RAID-Z map,
1823 1946 * keeping in mind that the stated size and offset of the
1824 1947 * operation may not fill the entire column for this vdev.
1825 1948 *
1826 1949 * If any portion of the data spans this column, issue the
1827 1950 * appropriate operation to the vdev.
1828 1951 */
1829 1952 if (coloffset + rc->rc_size <= start)
1830 1953 continue;
1831 1954 if (coloffset >= end)
1832 1955 continue;
1833 1956
1834 1957 colstart = MAX(coloffset, start);
1835 1958 colend = MIN(end, coloffset + rc->rc_size);
1836 1959 colsize = colend - colstart;
1837 1960 colskip = colstart - coloffset;
1838 1961
1839 1962 VERIFY3U(colsize, <=, rc->rc_size);
1840 1963 VERIFY3U(colskip, <=, rc->rc_size);
1841 1964
1842 1965 /*
1843 1966 * Note that the child vdev will have a vdev label at the start
1844 1967 * of its range of offsets, hence the need for
1845 1968 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1846 1969 * example of why this calculation is needed.
1847 1970 */
1848 1971 if ((err = vdev_disk_physio(cvd,
1849 1972 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1850 1973 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1851 1974 flags, isdump)) != 0)
1852 1975 break;
1853 1976 }
1854 1977
1855 1978 vdev_raidz_map_free(rm);
1856 1979 abd_put(abd);
 1857 1980 #endif /* _KERNEL */
1858 1981
1859 1982 return (err);
1860 1983 }
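
The colstart/colend/colskip math above is ordinary interval intersection
between the requested [start, end) byte range and each column's
[coloffset, coloffset + rc_size) range. Here is a stand-alone sketch
reproducing the 32 KB-at-64 KB example from the block comment, assuming four
32 KB data columns; all names and constants are ours.

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t start = 64 * 1024;		/* requested range ...   */
	uint64_t end = start + 32 * 1024;	/* ... [64 KB, 96 KB)    */
	uint64_t colsize = 32 * 1024;		/* per-data-column size  */
	uint64_t coloffset = 0;

	for (int c = 0; c < 4; c++, coloffset += colsize) {
		if (coloffset + colsize <= start || coloffset >= end)
			continue;	/* no overlap with this column */

		uint64_t colstart = MAX(coloffset, start);
		uint64_t colend = MIN(end, coloffset + colsize);

		/* only data column 2 overlaps: skip 0, issue 32768 bytes */
		printf("data column %d: skip %llu, issue %llu bytes\n", c,
		    (unsigned long long)(colstart - coloffset),
		    (unsigned long long)(colend - colstart));
	}
	return (0);
}
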
1861 1984
1862 1985 static uint64_t
1863 1986 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1864 1987 {
1865 1988 uint64_t asize;
1866 1989 uint64_t ashift = vd->vdev_top->vdev_ashift;
1867 1990 uint64_t cols = vd->vdev_children;
1868 1991 uint64_t nparity = vd->vdev_nparity;
1869 1992
1870 1993 asize = ((psize - 1) >> ashift) + 1;
1871 1994 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1872 1995 asize = roundup(asize, nparity + 1) << ashift;
1873 1996
1874 1997 return (asize);
1875 1998 }
1876 1999
2000 +/*
2001 + * Converts an allocated size on a raidz vdev back to a logical block
2002 + * size. This is used in trimming to figure out the appropriate logical
2003 + * size to pass to vdev_raidz_map_alloc when splitting up extents of free
2004 + * space obtained from metaslabs. However, a range of free space on a
2005 + * raidz vdev might have originally consisted of multiple blocks and
2006 + * those, taken together with their skip blocks, might not always align
2007 + * neatly to a new vdev_raidz_map_alloc covering the entire unified
2008 + * range. So to ensure that the newly allocated raidz map *always* fits
2009 + * within the asize passed to this function and never exceeds it (since
2010 + * that might trim allocated data past it), we round it down to the
2011 + * nearest suitable multiple of the vdev ashift (hence the "_floor" in
2012 + * this function's name).
2013 + * This function is in effect an inverse of vdev_raidz_asize. However,
 2015 + * since multiple psizes can map to a single asize (due to variable
 2016 + * padding), this function instead returns the largest chunk that still
 2017 + * fits inside the specified asize.
2017 + */
2018 +static uint64_t
2019 +vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
2020 +{
2021 + uint64_t psize;
2022 + uint64_t ashift = vd->vdev_top->vdev_ashift;
2023 + uint64_t cols = vd->vdev_children;
2024 + uint64_t nparity = vd->vdev_nparity;
2025 +
2026 + psize = (asize - (nparity << ashift));
2027 + psize /= cols;
2028 + psize *= cols - nparity;
2029 + psize += (1 << ashift) - 1;
2030 +
2031 + psize = P2ALIGN(psize, 1 << ashift);
2032 +
2033 + return (psize);
2034 +}
2035 +
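To see that the two functions agree, consider a hypothetical raidz1 vdev with
5 children and ashift 9 (512-byte sectors): a 4 KB block needs 8 data sectors
plus ceil(8/4) = 2 parity sectors, and rounding up to a multiple of
nparity + 1 leaves 10 sectors, i.e. asize = 5120 bytes; feeding 5120 back
through the floor computation recovers exactly 4096. A stand-alone sketch of
that round trip, with macros approximating the kernel's P2ALIGN and roundup:

#include <stdint.h>
#include <stdio.h>

#define	P2ALIGN(x, a)	((x) & -(uint64_t)(a))
#define	ROUNDUP(x, m)	((((x) + (m) - 1) / (m)) * (m))

int
main(void)
{
	uint64_t ashift = 9, cols = 5, nparity = 1;
	uint64_t psize = 4096;

	/* forward: logical 4 KB -> allocated size, as vdev_raidz_asize() */
	uint64_t asize = ((psize - 1) >> ashift) + 1;	/* 8 data sectors */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = ROUNDUP(asize, nparity + 1) << ashift;
	printf("asize = %llu\n", (unsigned long long)asize);	/* 5120 */

	/* inverse: largest psize that fits, as vdev_raidz_psize_floor() */
	uint64_t pfloor = asize - (nparity << ashift);
	pfloor /= cols;
	pfloor *= cols - nparity;
	pfloor += (1 << ashift) - 1;
	pfloor = P2ALIGN(pfloor, 1 << ashift);
	printf("psize_floor = %llu\n", (unsigned long long)pfloor); /* 4096 */
	return (0);
}
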
1877 2036 static void
1878 2037 vdev_raidz_child_done(zio_t *zio)
1879 2038 {
1880 2039 raidz_col_t *rc = zio->io_private;
1881 2040
1882 2041 rc->rc_error = zio->io_error;
1883 2042 rc->rc_tried = 1;
1884 2043 rc->rc_skipped = 0;
1885 2044 }
1886 2045
1887 2046 /*
1888 2047 * Start an IO operation on a RAIDZ VDev
1889 2048 *
1890 2049 * Outline:
1891 2050 * - For write operations:
1892 2051 * 1. Generate the parity data
1893 2052 * 2. Create child zio write operations to each column's vdev, for both
1894 2053 * data and parity.
1895 2054 * 3. If the column skips any sectors for padding, create optional dummy
1896 2055 * write zio children for those areas to improve aggregation continuity.
1897 2056 * - For read operations:
1898 2057 * 1. Create child zio read operations to each data column's vdev to read
1899 2058 * the range of data required for zio.
1900 2059 * 2. If this is a scrub or resilver operation, or if any of the data
1901 2060 * vdevs have had errors, then create zio read operations to the parity
1902 2061 * columns' VDevs as well.
1903 2062 */
1904 2063 static void
1905 2064 vdev_raidz_io_start(zio_t *zio)
1906 2065 {
1907 2066 vdev_t *vd = zio->io_vd;
1908 2067 vdev_t *tvd = vd->vdev_top;
1909 2068 vdev_t *cvd;
1910 2069 raidz_map_t *rm;
1911 2070 raidz_col_t *rc;
1912 2071 int c, i;
1913 2072
1914 2073 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1915 2074 tvd->vdev_ashift, vd->vdev_children,
1916 - vd->vdev_nparity);
2075 + vd->vdev_nparity, B_TRUE);
1917 2076
1918 2077 zio->io_vsd = rm;
1919 2078 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1920 2079
1921 2080 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1922 2081
1923 2082 if (zio->io_type == ZIO_TYPE_WRITE) {
1924 2083 vdev_raidz_generate_parity(rm);
1925 2084
1926 2085 for (c = 0; c < rm->rm_cols; c++) {
1927 2086 rc = &rm->rm_col[c];
1928 2087 cvd = vd->vdev_child[rc->rc_devidx];
1929 2088 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1930 2089 rc->rc_offset, rc->rc_abd, rc->rc_size,
1931 2090 zio->io_type, zio->io_priority, 0,
1932 2091 vdev_raidz_child_done, rc));
1933 2092 }
1934 2093
1935 2094 /*
1936 2095 * Generate optional I/Os for any skipped sectors to improve
1937 2096 * aggregation contiguity.
1938 2097 */
1939 2098 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1940 2099 ASSERT(c <= rm->rm_scols);
1941 2100 if (c == rm->rm_scols)
1942 2101 c = 0;
1943 2102 rc = &rm->rm_col[c];
1944 2103 cvd = vd->vdev_child[rc->rc_devidx];
1945 2104 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1946 2105 rc->rc_offset + rc->rc_size, NULL,
1947 2106 1 << tvd->vdev_ashift,
1948 2107 zio->io_type, zio->io_priority,
1949 2108 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1950 2109 }
1951 2110
1952 2111 zio_execute(zio);
1953 2112 return;
1954 2113 }
1955 2114
1956 2115 ASSERT(zio->io_type == ZIO_TYPE_READ);
1957 2116
1958 2117 /*
1959 2118 * Iterate over the columns in reverse order so that we hit the parity
1960 2119 * last -- any errors along the way will force us to read the parity.
1961 2120 */
1962 2121 for (c = rm->rm_cols - 1; c >= 0; c--) {
1963 2122 rc = &rm->rm_col[c];
1964 2123 cvd = vd->vdev_child[rc->rc_devidx];
1965 2124 if (!vdev_readable(cvd)) {
1966 2125 if (c >= rm->rm_firstdatacol)
1967 2126 rm->rm_missingdata++;
1968 2127 else
1969 2128 rm->rm_missingparity++;
1970 2129 rc->rc_error = SET_ERROR(ENXIO);
1971 2130 rc->rc_tried = 1; /* don't even try */
1972 2131 rc->rc_skipped = 1;
1973 2132 continue;
1974 2133 }
1975 2134 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1976 2135 if (c >= rm->rm_firstdatacol)
1977 2136 rm->rm_missingdata++;
1978 2137 else
1979 2138 rm->rm_missingparity++;
1980 2139 rc->rc_error = SET_ERROR(ESTALE);
1981 2140 rc->rc_skipped = 1;
1982 2141 continue;
1983 2142 }
1984 2143 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1985 2144 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1986 2145 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1987 2146 rc->rc_offset, rc->rc_abd, rc->rc_size,
1988 2147 zio->io_type, zio->io_priority, 0,
1989 2148 vdev_raidz_child_done, rc));
1990 2149 }
1991 2150 }
1992 2151
1993 2152 zio_execute(zio);
1994 2153 }
1995 2154
1996 2155
1997 2156 /*
1998 2157 * Report a checksum error for a child of a RAID-Z device.
1999 2158 */
2000 2159 static void
2001 2160 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2002 2161 {
2003 2162 void *buf;
2004 2163 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2164 + vdev_stat_t *vs = &vd->vdev_stat;
2165 + spa_t *spa = zio->io_spa;
2005 2166
2006 2167 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2007 2168 zio_bad_cksum_t zbc;
2008 2169 raidz_map_t *rm = zio->io_vsd;
2009 2170
2010 2171 mutex_enter(&vd->vdev_stat_lock);
2011 2172 vd->vdev_stat.vs_checksum_errors++;
2012 2173 mutex_exit(&vd->vdev_stat_lock);
2013 2174
2014 2175 zbc.zbc_has_cksum = 0;
2015 2176 zbc.zbc_injected = rm->rm_ecksuminjected;
2016 2177
2017 2178 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2018 2179 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2019 2180 rc->rc_offset, rc->rc_size, buf, bad_data,
2020 2181 &zbc);
2021 2182 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2022 2183 }
2184 +
2185 + if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
2186 + vs->vs_read_errors || vs->vs_write_errors) &&
2187 + !spa->spa_special_has_errors) {
2188 + spa->spa_special_has_errors = B_TRUE;
2189 + }
2023 2190 }
2024 2191
2025 2192 /*
2026 2193 * We keep track of whether or not there were any injected errors, so that
2027 2194 * any ereports we generate can note it.
2028 2195 */
2029 2196 static int
2030 2197 raidz_checksum_verify(zio_t *zio)
2031 2198 {
2032 2199 zio_bad_cksum_t zbc;
2033 2200 raidz_map_t *rm = zio->io_vsd;
2034 2201
2035 2202 int ret = zio_checksum_error(zio, &zbc);
2036 2203 if (ret != 0 && zbc.zbc_injected != 0)
2037 2204 rm->rm_ecksuminjected = 1;
2038 2205
2039 2206 return (ret);
2040 2207 }
2041 2208
2042 2209 /*
2043 2210 * Generate the parity from the data columns. If we tried and were able to
2044 2211 * read the parity without error, verify that the generated parity matches the
2045 2212 * data we read. If it doesn't, we fire off a checksum error. Return the
 2046 2213 	 * number of such failures.
2047 2214 */
2048 2215 static int
2049 2216 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2050 2217 {
2051 2218 void *orig[VDEV_RAIDZ_MAXPARITY];
2052 2219 int c, ret = 0;
2053 2220 raidz_col_t *rc;
2054 2221
2055 2222 blkptr_t *bp = zio->io_bp;
2056 2223 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2057 2224 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2058 2225
2059 2226 if (checksum == ZIO_CHECKSUM_NOPARITY)
2060 2227 return (ret);
2061 2228
2062 2229 for (c = 0; c < rm->rm_firstdatacol; c++) {
2063 2230 rc = &rm->rm_col[c];
2064 2231 if (!rc->rc_tried || rc->rc_error != 0)
2065 2232 continue;
2066 2233 orig[c] = zio_buf_alloc(rc->rc_size);
2067 2234 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2068 2235 }
2069 2236
2070 2237 vdev_raidz_generate_parity(rm);
2071 2238
2072 2239 for (c = 0; c < rm->rm_firstdatacol; c++) {
2073 2240 rc = &rm->rm_col[c];
2074 2241 if (!rc->rc_tried || rc->rc_error != 0)
2075 2242 continue;
2076 2243 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2077 2244 raidz_checksum_error(zio, rc, orig[c]);
2078 2245 rc->rc_error = SET_ERROR(ECKSUM);
2079 2246 ret++;
2080 2247 }
2081 2248 zio_buf_free(orig[c], rc->rc_size);
2082 2249 }
2083 2250
2084 2251 return (ret);
2085 2252 }
2086 2253
2087 2254 /*
2088 2255 * Keep statistics on all the ways that we used parity to correct data.
2089 2256 */
2090 2257 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2091 2258
2092 2259 static int
2093 2260 vdev_raidz_worst_error(raidz_map_t *rm)
2094 2261 {
2095 2262 int error = 0;
2096 2263
2097 2264 for (int c = 0; c < rm->rm_cols; c++)
2098 2265 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2099 2266
2100 2267 return (error);
2101 2268 }
2102 2269
2103 2270 /*
2104 2271 * Iterate over all combinations of bad data and attempt a reconstruction.
2105 2272 * Note that the algorithm below is non-optimal because it doesn't take into
2106 2273 * account how reconstruction is actually performed. For example, with
2107 2274 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2108 2275 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2109 2276 * cases we'd only use parity information in column 0.
2110 2277 */
2111 2278 static int
2112 2279 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2113 2280 {
2114 2281 raidz_map_t *rm = zio->io_vsd;
2115 2282 raidz_col_t *rc;
2116 2283 void *orig[VDEV_RAIDZ_MAXPARITY];
2117 2284 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2118 2285 int *tgts = &tstore[1];
2119 2286 int current, next, i, c, n;
2120 2287 int code, ret = 0;
2121 2288
2122 2289 ASSERT(total_errors < rm->rm_firstdatacol);
2123 2290
2124 2291 /*
2125 2292 * This simplifies one edge condition.
2126 2293 */
2127 2294 tgts[-1] = -1;
2128 2295
2129 2296 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2130 2297 /*
2131 2298 * Initialize the targets array by finding the first n columns
2132 2299 * that contain no error.
2133 2300 *
2134 2301 * If there were no data errors, we need to ensure that we're
2135 2302 * always explicitly attempting to reconstruct at least one
2136 2303 * data column. To do this, we simply push the highest target
2137 2304 * up into the data columns.
2138 2305 */
2139 2306 for (c = 0, i = 0; i < n; i++) {
2140 2307 if (i == n - 1 && data_errors == 0 &&
2141 2308 c < rm->rm_firstdatacol) {
2142 2309 c = rm->rm_firstdatacol;
2143 2310 }
2144 2311
2145 2312 while (rm->rm_col[c].rc_error != 0) {
2146 2313 c++;
2147 2314 ASSERT3S(c, <, rm->rm_cols);
2148 2315 }
2149 2316
2150 2317 tgts[i] = c++;
2151 2318 }
2152 2319
2153 2320 /*
2154 2321 * Setting tgts[n] simplifies the other edge condition.
2155 2322 */
2156 2323 tgts[n] = rm->rm_cols;
2157 2324
2158 2325 /*
2159 2326 * These buffers were allocated in previous iterations.
2160 2327 */
2161 2328 for (i = 0; i < n - 1; i++) {
2162 2329 ASSERT(orig[i] != NULL);
2163 2330 }
2164 2331
2165 2332 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2166 2333
2167 2334 current = 0;
2168 2335 next = tgts[current];
2169 2336
2170 2337 while (current != n) {
2171 2338 tgts[current] = next;
2172 2339 current = 0;
2173 2340
2174 2341 /*
2175 2342 * Save off the original data that we're going to
2176 2343 * attempt to reconstruct.
2177 2344 */
2178 2345 for (i = 0; i < n; i++) {
2179 2346 ASSERT(orig[i] != NULL);
2180 2347 c = tgts[i];
2181 2348 ASSERT3S(c, >=, 0);
2182 2349 ASSERT3S(c, <, rm->rm_cols);
2183 2350 rc = &rm->rm_col[c];
2184 2351 abd_copy_to_buf(orig[i], rc->rc_abd,
2185 2352 rc->rc_size);
2186 2353 }
2187 2354
2188 2355 /*
2189 2356 * Attempt a reconstruction and exit the outer loop on
2190 2357 * success.
2191 2358 */
2192 2359 code = vdev_raidz_reconstruct(rm, tgts, n);
2193 2360 if (raidz_checksum_verify(zio) == 0) {
2194 2361 atomic_inc_64(&raidz_corrected[code]);
2195 2362
2196 2363 for (i = 0; i < n; i++) {
2197 2364 c = tgts[i];
2198 2365 rc = &rm->rm_col[c];
2199 2366 ASSERT(rc->rc_error == 0);
2200 2367 if (rc->rc_tried)
2201 2368 raidz_checksum_error(zio, rc,
2202 2369 orig[i]);
2203 2370 rc->rc_error = SET_ERROR(ECKSUM);
2204 2371 }
2205 2372
2206 2373 ret = code;
2207 2374 goto done;
2208 2375 }
2209 2376
2210 2377 /*
2211 2378 * Restore the original data.
2212 2379 */
2213 2380 for (i = 0; i < n; i++) {
2214 2381 c = tgts[i];
2215 2382 rc = &rm->rm_col[c];
2216 2383 abd_copy_from_buf(rc->rc_abd, orig[i],
2217 2384 rc->rc_size);
2218 2385 }
2219 2386
2220 2387 do {
2221 2388 /*
2222 2389 * Find the next valid column after the current
 2223 2390 			 * position.
2224 2391 */
2225 2392 for (next = tgts[current] + 1;
2226 2393 next < rm->rm_cols &&
2227 2394 rm->rm_col[next].rc_error != 0; next++)
2228 2395 continue;
2229 2396
2230 2397 ASSERT(next <= tgts[current + 1]);
2231 2398
2232 2399 /*
2233 2400 * If that spot is available, we're done here.
2234 2401 */
2235 2402 if (next != tgts[current + 1])
2236 2403 break;
2237 2404
2238 2405 /*
2239 2406 * Otherwise, find the next valid column after
2240 2407 * the previous position.
2241 2408 */
2242 2409 for (c = tgts[current - 1] + 1;
2243 2410 rm->rm_col[c].rc_error != 0; c++)
2244 2411 continue;
2245 2412
2246 2413 tgts[current] = c;
2247 2414 current++;
2248 2415
2249 2416 } while (current != n);
2250 2417 }
2251 2418 }
2252 2419 n--;
2253 2420 done:
2254 2421 for (i = 0; i < n; i++) {
2255 2422 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2256 2423 }
2257 2424
2258 2425 return (ret);
2259 2426 }
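
The while/do-while pair above is an odometer walk over every n-element subset
of the non-errored columns: advance the lowest target, and when it collides
with its right neighbor, reset it and carry upward, using tgts[-1] and tgts[n]
as sentinels. A stand-alone sketch of that traversal with the error skipping
stripped out; all names are ours.

#include <stdio.h>

int
main(void)
{
	int cols = 5, n = 2;
	int tgts[3];			/* n targets plus one sentinel */

	for (int i = 0; i < n; i++)	/* first subset: 0, 1, ..., n-1 */
		tgts[i] = i;
	tgts[n] = cols;			/* sentinel, as in the driver */

	for (;;) {
		for (int i = 0; i < n; i++)
			printf("%d ", tgts[i]);
		printf("\n");

		int cur = 0;
		while (cur < n && tgts[cur] + 1 == tgts[cur + 1]) {
			tgts[cur] = cur;	/* reset and carry upward */
			cur++;
		}
		if (cur == n)
			break;			/* all C(cols, n) visited */
		tgts[cur]++;
	}
	return (0);
}
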
2260 2427
2261 2428 /*
2262 2429 * Complete an IO operation on a RAIDZ VDev
2263 2430 *
2264 2431 * Outline:
2265 2432 * - For write operations:
2266 2433 * 1. Check for errors on the child IOs.
2267 2434 * 2. Return, setting an error code if too few child VDevs were written
2268 2435 * to reconstruct the data later. Note that partial writes are
2269 2436 * considered successful if they can be reconstructed at all.
2270 2437 * - For read operations:
2271 2438 * 1. Check for errors on the child IOs.
2272 2439 * 2. If data errors occurred:
2273 2440 * a. Try to reassemble the data from the parity available.
2274 2441 * b. If we haven't yet read the parity drives, read them now.
2275 2442 * c. If all parity drives have been read but the data still doesn't
2276 2443 * reassemble with a correct checksum, then try combinatorial
2277 2444 * reconstruction.
2278 2445 * d. If that doesn't work, return an error.
2279 2446 * 3. If there were unexpected errors or this is a resilver operation,
2280 2447 * rewrite the vdevs that had errors.
2281 2448 */
2282 2449 static void
2283 2450 vdev_raidz_io_done(zio_t *zio)
2284 2451 {
2285 2452 vdev_t *vd = zio->io_vd;
2286 2453 vdev_t *cvd;
2287 2454 raidz_map_t *rm = zio->io_vsd;
|
↓ open down ↓ |
255 lines elided |
↑ open up ↑ |
2288 2455 raidz_col_t *rc;
2289 2456 int unexpected_errors = 0;
2290 2457 int parity_errors = 0;
2291 2458 int parity_untried = 0;
2292 2459 int data_errors = 0;
2293 2460 int total_errors = 0;
2294 2461 int n, c;
2295 2462 int tgts[VDEV_RAIDZ_MAXPARITY];
2296 2463 int code;
2297 2464
2298 - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2299 -
2300 2465 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2301 2466 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2302 2467
2303 2468 for (c = 0; c < rm->rm_cols; c++) {
2304 2469 rc = &rm->rm_col[c];
2305 2470
2306 2471 if (rc->rc_error) {
2307 2472 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2308 2473
2309 2474 if (c < rm->rm_firstdatacol)
2310 2475 parity_errors++;
2311 2476 else
2312 2477 data_errors++;
2313 2478
2314 2479 if (!rc->rc_skipped)
2315 2480 unexpected_errors++;
2316 2481
2317 2482 total_errors++;
2318 2483 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2319 2484 parity_untried++;
2320 2485 }
2321 2486 }
2322 2487
2323 2488 if (zio->io_type == ZIO_TYPE_WRITE) {
2324 2489 /*
2325 2490 * XXX -- for now, treat partial writes as a success.
2326 2491 * (If we couldn't write enough columns to reconstruct
2327 2492 * the data, the I/O failed. Otherwise, good enough.)
2328 2493 *
2329 2494 * Now that we support write reallocation, it would be better
2330 2495 * to treat partial failure as real failure unless there are
2331 2496 * no non-degraded top-level vdevs left, and not update DTLs
2332 2497 * if we intend to reallocate.
2333 2498 */
2334 2499 /* XXPOLICY */
2335 2500 if (total_errors > rm->rm_firstdatacol)
2336 2501 zio->io_error = vdev_raidz_worst_error(rm);
2337 2502
2338 2503 return;
2339 2504 }
2340 2505
2341 2506 ASSERT(zio->io_type == ZIO_TYPE_READ);
2342 2507 /*
2343 2508 * There are three potential phases for a read:
2344 2509 * 1. produce valid data from the columns read
2345 2510 * 2. read all disks and try again
2346 2511 * 3. perform combinatorial reconstruction
2347 2512 *
2348 2513 * Each phase is progressively both more expensive and less likely to
2349 2514 * occur. If we encounter more errors than we can repair or all phases
2350 2515 * fail, we have no choice but to return an error.
2351 2516 */
2352 2517
2353 2518 /*
2354 2519 * If the number of errors we saw was correctable -- less than or equal
2355 2520 * to the number of parity disks read -- attempt to produce data that
2356 2521 * has a valid checksum. Naturally, this case applies in the absence of
2357 2522 * any errors.
2358 2523 */
2359 2524 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2360 2525 if (data_errors == 0) {
2361 2526 if (raidz_checksum_verify(zio) == 0) {
2362 2527 /*
2363 2528 * If we read parity information (unnecessarily
2364 2529 * as it happens since no reconstruction was
2365 2530 * needed) regenerate and verify the parity.
2366 2531 * We also regenerate parity when resilvering
2367 2532 * so we can write it out to the failed device
2368 2533 * later.
2369 2534 */
2370 2535 if (parity_errors + parity_untried <
2371 2536 rm->rm_firstdatacol ||
2372 2537 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2373 2538 n = raidz_parity_verify(zio, rm);
2374 2539 unexpected_errors += n;
2375 2540 ASSERT(parity_errors + n <=
2376 2541 rm->rm_firstdatacol);
2377 2542 }
2378 2543 goto done;
2379 2544 }
2380 2545 } else {
2381 2546 /*
2382 2547 * We either attempt to read all the parity columns or
2383 2548 * none of them. If we didn't try to read parity, we
2384 2549 * wouldn't be here in the correctable case. There must
2385 2550 * also have been fewer parity errors than parity
2386 2551 * columns or, again, we wouldn't be in this code path.
2387 2552 */
2388 2553 ASSERT(parity_untried == 0);
2389 2554 ASSERT(parity_errors < rm->rm_firstdatacol);
2390 2555
2391 2556 /*
2392 2557 * Identify the data columns that reported an error.
2393 2558 */
2394 2559 n = 0;
2395 2560 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2396 2561 rc = &rm->rm_col[c];
2397 2562 if (rc->rc_error != 0) {
2398 2563 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2399 2564 tgts[n++] = c;
2400 2565 }
2401 2566 }
2402 2567
2403 2568 ASSERT(rm->rm_firstdatacol >= n);
2404 2569
2405 2570 code = vdev_raidz_reconstruct(rm, tgts, n);
2406 2571
2407 2572 if (raidz_checksum_verify(zio) == 0) {
2408 2573 atomic_inc_64(&raidz_corrected[code]);
2409 2574
2410 2575 /*
2411 2576 * If we read more parity disks than were used
2412 2577 * for reconstruction, confirm that the other
2413 2578 * parity disks produced correct data. This
2414 2579 * routine is suboptimal in that it regenerates
2415 2580 * the parity that we already used in addition
2416 2581 * to the parity that we're attempting to
2417 2582 * verify, but this should be a relatively
2418 2583 * uncommon case, and can be optimized if it
2419 2584 * becomes a problem. Note that we regenerate
2420 2585 * parity when resilvering so we can write it
2421 2586 * out to failed devices later.
2422 2587 */
2423 2588 if (parity_errors < rm->rm_firstdatacol - n ||
2424 2589 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2425 2590 n = raidz_parity_verify(zio, rm);
2426 2591 unexpected_errors += n;
2427 2592 ASSERT(parity_errors + n <=
2428 2593 rm->rm_firstdatacol);
2429 2594 }
2430 2595
2431 2596 goto done;
2432 2597 }
2433 2598 }
2434 2599 }
2435 2600
2436 2601 /*
2437 2602 * This isn't a typical situation -- either we got a read error or
2438 2603 * a child silently returned bad data. Read every block so we can
2439 2604 * try again with as much data and parity as we can track down. If
2440 2605 * we've already been through once before, all children will be marked
2441 2606 * as tried so we'll proceed to combinatorial reconstruction.
2442 2607 */
2443 2608 unexpected_errors = 1;
2444 2609 rm->rm_missingdata = 0;
2445 2610 rm->rm_missingparity = 0;
2446 2611
2447 2612 for (c = 0; c < rm->rm_cols; c++) {
2448 2613 if (rm->rm_col[c].rc_tried)
2449 2614 continue;
2450 2615
2451 2616 zio_vdev_io_redone(zio);
2452 2617 do {
2453 2618 rc = &rm->rm_col[c];
2454 2619 if (rc->rc_tried)
2455 2620 continue;
2456 2621 zio_nowait(zio_vdev_child_io(zio, NULL,
2457 2622 vd->vdev_child[rc->rc_devidx],
2458 2623 rc->rc_offset, rc->rc_abd, rc->rc_size,
2459 2624 zio->io_type, zio->io_priority, 0,
2460 2625 vdev_raidz_child_done, rc));
2461 2626 } while (++c < rm->rm_cols);
2462 2627
2463 2628 return;
2464 2629 }
2465 2630
2466 2631 /*
2467 2632 * At this point we've attempted to reconstruct the data given the
2468 2633 * errors we detected, and we've attempted to read all columns. There
2469 2634 * must, therefore, be one or more additional problems -- silent errors
2470 2635 * resulting in invalid data rather than explicit I/O errors resulting
2471 2636 * in absent data. We check if there is enough additional data to
2472 2637 * possibly reconstruct the data and then perform combinatorial
2473 2638 * reconstruction over all possible combinations. If that fails,
2474 2639 * we're cooked.
2475 2640 */
2476 2641 if (total_errors > rm->rm_firstdatacol) {
2477 2642 zio->io_error = vdev_raidz_worst_error(rm);
2478 2643
2479 2644 } else if (total_errors < rm->rm_firstdatacol &&
2480 2645 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2481 2646 /*
2482 2647 * If we didn't use all the available parity for the
2483 2648 * combinatorial reconstruction, verify that the remaining
2484 2649 * parity is correct.
2485 2650 */
2486 2651 if (code != (1 << rm->rm_firstdatacol) - 1)
2487 2652 (void) raidz_parity_verify(zio, rm);
2488 2653 } else {
2489 2654 /*
2490 2655 * We're here because either:
2491 2656 *
 2492 2657 		 * total_errors == rm_firstdatacol, or
2493 2658 * vdev_raidz_combrec() failed
2494 2659 *
2495 2660 * In either case, there is enough bad data to prevent
2496 2661 * reconstruction.
2497 2662 *
2498 2663 * Start checksum ereports for all children which haven't
2499 2664 * failed, and the IO wasn't speculative.
2500 2665 */
2501 2666 zio->io_error = SET_ERROR(ECKSUM);
2502 2667
2503 2668 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2504 2669 for (c = 0; c < rm->rm_cols; c++) {
2505 2670 rc = &rm->rm_col[c];
2506 2671 if (rc->rc_error == 0) {
2507 2672 zio_bad_cksum_t zbc;
2508 2673 zbc.zbc_has_cksum = 0;
2509 2674 zbc.zbc_injected =
2510 2675 rm->rm_ecksuminjected;
2511 2676
2512 2677 zfs_ereport_start_checksum(
2513 2678 zio->io_spa,
2514 2679 vd->vdev_child[rc->rc_devidx],
2515 2680 zio, rc->rc_offset, rc->rc_size,
2516 2681 (void *)(uintptr_t)c, &zbc);
2517 2682 }
2518 2683 }
2519 2684 }
2520 2685 }
2521 2686
2522 2687 done:
2523 2688 zio_checksum_verified(zio);
2524 2689
2525 2690 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2526 2691 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2527 2692 /*
2528 2693 * Use the good data we have in hand to repair damaged children.
2529 2694 */
2530 2695 for (c = 0; c < rm->rm_cols; c++) {
2531 2696 rc = &rm->rm_col[c];
2532 2697 cvd = vd->vdev_child[rc->rc_devidx];
2533 2698
2534 2699 if (rc->rc_error == 0)
2535 2700 continue;
2536 2701
2537 2702 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2538 2703 rc->rc_offset, rc->rc_abd, rc->rc_size,
2539 2704 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2540 2705 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2541 2706 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2542 2707 }
2543 2708 }
2544 2709 }
2545 2710
2546 2711 static void
2547 2712 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2548 2713 {
2549 2714 if (faulted > vd->vdev_nparity)
2550 2715 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2551 2716 VDEV_AUX_NO_REPLICAS);
2552 2717 else if (degraded + faulted != 0)
2553 2718 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2554 2719 else
2555 2720 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2556 2721 }
2557 2722
2723 +static inline void
2724 +vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
2725 + const raidz_col_t *rc)
2726 +{
2727 + uint64_t num_exts = *num_extsp;
2728 + ASSERT(rc->rc_size != 0);
2729 +
 2730 +	if (num_exts > 0 &&
2731 + dfl->dfl_exts[num_exts - 1].dfle_start +
2732 + dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
2733 + dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
2734 + } else {
2735 + dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
2736 + dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
2737 + (*num_extsp)++;
2738 + }
2739 +}
2740 +
2741 +/*
2742 + * Processes a trim for a raidz vdev.
2743 + */
2744 +static void
2745 +vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
2746 +{
2747 + dkioc_free_list_t *dfl = trim_exts;
2748 + dkioc_free_list_t **sub_dfls;
2749 + uint64_t *sub_dfls_num_exts;
2750 +
2751 + sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
2752 + KM_SLEEP);
2753 + sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
2754 + KM_SLEEP);
2755 + for (int i = 0; i < vd->vdev_children; i++) {
2756 + /*
2757 + * We might over-allocate here, because the sub-lists can never
2758 + * be longer than the parent list, but they can be shorter.
2759 + * The underlying driver will discard zero-length extents.
2760 + */
2761 + sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
2762 + sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
2763 + sub_dfls[i]->dfl_flags = dfl->dfl_flags;
2764 + sub_dfls[i]->dfl_offset = dfl->dfl_offset;
2765 + /* don't copy the check func, because it isn't raidz-aware */
2766 + }
2767 +
2768 + /*
2769 + * Process all extents and redistribute them to the component vdevs
2770 + * according to a computed raidz map geometry.
2771 + */
2772 + for (int i = 0; i < dfl->dfl_num_exts; i++) {
2773 + uint64_t start = dfl->dfl_exts[i].dfle_start;
2774 + uint64_t length = dfl->dfl_exts[i].dfle_length;
2775 + raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
2776 + vdev_raidz_psize_floor(vd, length), start,
2777 + vd->vdev_top->vdev_ashift, vd->vdev_children,
2778 + vd->vdev_nparity, B_FALSE);
2779 +
2780 + for (uint64_t j = 0; j < rm->rm_cols; j++) {
2781 + uint64_t devidx = rm->rm_col[j].rc_devidx;
2782 + vdev_raidz_trim_append_rc(sub_dfls[devidx],
2783 + &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
2784 + }
2785 + vdev_raidz_map_free(rm);
2786 + }
2787 +
2788 + /*
2789 + * Issue the component ioctls as children of the parent zio.
2790 + */
2791 + for (int i = 0; i < vd->vdev_children; i++) {
2792 + if (sub_dfls_num_exts[i] != 0) {
2793 + zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
2794 + vd->vdev_child[i], DKIOCFREE,
2795 + vdev_raidz_trim_done, sub_dfls[i],
2796 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
2797 + ZIO_FLAG_DONT_RETRY));
2798 + } else {
2799 + dfl_free(sub_dfls[i]);
2800 + }
2801 + }
2802 + kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
2803 + kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
2804 +}
2805 +
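Each per-child list built above stays sorted as long as the incoming free
extents are processed in ascending offset order and the raidz map hands back
columns at non-decreasing child offsets, which is why the coalescing in
vdev_raidz_trim_append_rc() only ever examines the last entry. Below is a
stand-alone sketch of that append/coalesce step; the ext_t type is ours and
stands in for the dkioc_free_list_t extent array. It also shows why the guard
must test the running fill count rather than the list's capacity.

#include <stdint.h>
#include <stdio.h>

typedef struct ext {
	uint64_t start;
	uint64_t length;
} ext_t;

static void
append_ext(ext_t *list, uint64_t *nump, uint64_t start, uint64_t length)
{
	uint64_t num = *nump;

	if (num > 0 && list[num - 1].start + list[num - 1].length == start) {
		list[num - 1].length += length;	/* contiguous: coalesce */
	} else {
		list[num].start = start;	/* gap: start a new extent */
		list[num].length = length;
		(*nump)++;
	}
}

int
main(void)
{
	ext_t list[4];
	uint64_t num = 0;

	append_ext(list, &num, 0, 4096);
	append_ext(list, &num, 4096, 4096);	/* coalesces with the first */
	append_ext(list, &num, 16384, 4096);	/* gap: second entry */

	/* prints [0, +8192) and [16384, +4096) */
	for (uint64_t i = 0; i < num; i++)
		printf("[%llu, +%llu)\n", (unsigned long long)list[i].start,
		    (unsigned long long)list[i].length);
	return (0);
}
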
2806 +/*
2807 + * Releases a dkioc_free_list_t from ioctls issued to component devices in
2808 + * vdev_raidz_dkioc_free.
 2809 + * vdev_raidz_trim.
2810 +static void
2811 +vdev_raidz_trim_done(zio_t *zio)
2812 +{
2813 + ASSERT(zio->io_private != NULL);
2814 + dfl_free(zio->io_private);
2815 +}
2816 +
2558 2817 vdev_ops_t vdev_raidz_ops = {
2559 2818 vdev_raidz_open,
2560 2819 vdev_raidz_close,
2561 2820 vdev_raidz_asize,
2562 2821 vdev_raidz_io_start,
2563 2822 vdev_raidz_io_done,
2564 2823 vdev_raidz_state_change,
2565 2824 NULL,
2566 2825 NULL,
2567 - NULL,
2826 + vdev_raidz_trim,
2568 2827 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2569 2828 B_FALSE /* not a leaf vdev */
2570 2829 };