Print this page
NEX-7298 powertop dumps core when -c, -d or -t flags are used
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-6725 zpool offlining/onlining first disk in a mirror causes checksum error
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4683 WRC: Special block pointer must know that it is special
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-4245 WRC: Code cleanup and refactoring to simplify merge with upstream
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
NEX-4091 WRC: Scrub sometimes reports checksum error
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
usr/src/uts/common/io/scsi/targets/sd.c
usr/src/uts/common/sys/scsi/targets/sddef.h
NEX-1007 added checks for NULL vdev in mirror_map
NEX-801 If a block pointer is corrupt read or write may crash
If block pointer is corrupt in such a way that vdev id of one of the
ditto blocks is wrong (out of range), zio_vdev_io_start or zio_vdev_io_done
may trip over it and crash.
This changeset takes care of this by claiming that an invalid vdev is
neither readable, nor writeable.
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ new/usr/src/uts/common/fs/zfs/vdev_mirror.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
|
↓ open down ↓ |
16 lines elided |
↑ open up ↑ |
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
27 28 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
28 29 */
29 30
30 31 #include <sys/zfs_context.h>
31 32 #include <sys/spa.h>
32 33 #include <sys/spa_impl.h>
33 34 #include <sys/dsl_pool.h>
34 35 #include <sys/dsl_scan.h>
35 36 #include <sys/vdev_impl.h>
36 37 #include <sys/zio.h>
37 38 #include <sys/abd.h>
39 +#include <sys/wbc.h>
38 40 #include <sys/fs/zfs.h>
39 41
40 42 /*
41 43 * Virtual device vector for mirroring.
42 44 */
43 45
/*
 * Per-child I/O state: one entry per mirror child vdev, or per DVA for
 * the vd == NULL "root mirror" case (reads of ditto copies).
 */
44 46 typedef struct mirror_child {
45 47 vdev_t *mc_vd; /* child vdev; may be NULL if blkptr vdev id is invalid */
46 48 uint64_t mc_offset; /* offset of this copy on mc_vd */
47 49 int mc_error; /* result of the last I/O to this child */
48 50 uint8_t mc_tried; /* I/O was issued (or deliberately ruled out) */
49 51 uint8_t mc_skipped; /* child skipped (unreadable or DTL-stale) */
50 52 uint8_t mc_speculative; /* skip was speculative (ESTALE, not ENXIO) */
53 + int mc_index; /* index in mirror_map_t */
54 + avl_node_t mc_node; /* used for sorting based on weight */
55 + int64_t mc_weight; /* thread-local copy of vdev_weight */
51 56 } mirror_child_t;
52 57
/*
 * Per-zio mirror map, hung off zio->io_vsd.  Allocated with
 * offsetof(mirror_map_t, mm_child[mm_children]), so mm_child[1] is
 * really a variable-length trailing array.
 */
53 58 typedef struct mirror_map {
54 59 int mm_children; /* number of entries in mm_child[] */
55 60 int mm_resilvering; /* parent is a replacing/spare vdev mid-resilver */
56 61 int mm_preferred; /* historically-preferred child for reads */
57 62 int mm_root; /* B_TRUE for the vd == NULL (ditto-block) case */
58 63 mirror_child_t mm_child[1];
59 64 } mirror_map_t;
60 65
/* log2 of the offset granularity used to rotate mm_preferred across children */
61 66 int vdev_mirror_shift = 21;
62 67
/*
 * zio_vsd_ops free callback: release the mirror map allocated by
 * vdev_mirror_map_alloc() (sized by mm_children, matching the alloc).
 */
63 68 static void
64 69 vdev_mirror_map_free(zio_t *zio)
65 70 {
66 71 mirror_map_t *mm = zio->io_vsd;
67 72
68 73 kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
69 74 }
70 75
/* vsd ops installed on every mirror zio; default cksum reporting. */
71 76 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
72 77 vdev_mirror_map_free,
73 78 zio_vsd_default_cksum_report
74 79 };
75 80
/*
 * Build the mirror map for this zio and hang it off io_vsd.
 *
 * Two cases:
 *  - vd == NULL: the zio targets a block pointer's ditto copies; one
 *    child per DVA, with mc_vd looked up via vdev_lookup_top().  An
 *    invalid vdev id in the blkptr leaves mc_vd == NULL (NEX-1007 /
 *    NEX-801); consumers must check for it.
 *  - otherwise: one child per vdev child of the mirror.
 *
 * NOTE(review): the "-" lines show upstream's trust-config DVA
 * filtering being dropped in this change, so this version never
 * returns NULL — confirm callers no longer rely on the NULL return.
 */
76 81 static mirror_map_t *
|
↓ open down ↓ |
16 lines elided |
↑ open up ↑ |
77 82 vdev_mirror_map_alloc(zio_t *zio)
78 83 {
79 84 mirror_map_t *mm = NULL;
80 85 mirror_child_t *mc;
81 86 vdev_t *vd = zio->io_vd;
82 87 int c, d;
83 88
84 89 if (vd == NULL) {
85 90 dva_t *dva = zio->io_bp->blk_dva;
86 91 spa_t *spa = zio->io_spa;
87 - dva_t dva_copy[SPA_DVAS_PER_BP];
88 92
89 93 c = BP_GET_NDVAS(zio->io_bp);
90 94
91 - /*
92 - * If we do not trust the pool config, some DVAs might be
93 - * invalid or point to vdevs that do not exist. We skip them.
94 - */
95 - if (!spa_trust_config(spa)) {
96 - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
97 - int j = 0;
98 - for (int i = 0; i < c; i++) {
99 - if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
100 - dva_copy[j++] = dva[i];
101 - }
102 - if (j == 0) {
103 - zio->io_vsd = NULL;
104 - zio->io_error = ENXIO;
105 - return (NULL);
106 - }
107 - if (j < c) {
108 - dva = dva_copy;
109 - c = j;
110 - }
111 - }
112 -
113 95 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
114 96 mm->mm_children = c;
115 97 mm->mm_resilvering = B_FALSE;
116 98 mm->mm_preferred = spa_get_random(c);
117 99 mm->mm_root = B_TRUE;
118 100
119 101 /*
120 102 * Check the other, lower-index DVAs to see if they're on
121 103 * the same vdev as the child we picked. If they are, use
122 104 * them since they are likely to have been allocated from
123 105 * the primary metaslab in use at the time, and hence are
124 106 * more likely to have locality with single-copy data.
125 107 */
|
↓ open down ↓ |
3 lines elided |
↑ open up ↑ |
126 108 for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
127 109 if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
128 110 mm->mm_preferred = d;
129 111 }
130 112
131 113 for (c = 0; c < mm->mm_children; c++) {
132 114 mc = &mm->mm_child[c];
133 115
134 116 mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
135 117 mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
118 + mc->mc_index = c;
119 + mc->mc_weight = (mc->mc_vd != NULL ?
120 + mc->mc_vd->vdev_weight : 0); /* 0 weight for missing vdevs */
136 121 }
137 122 } else {
138 123 int replacing;
139 124
140 125 c = vd->vdev_children;
141 126
142 127 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
143 128 mm->mm_children = c;
144 129 /*
145 130 * If we are resilvering, then we should handle scrub reads
146 131 * differently; we shouldn't issue them to the resilvering
147 132 * device because it might not have those blocks.
148 133 *
149 134 * We are resilvering iff:
150 135 * 1) We are a replacing vdev (ie our name is "replacing-1" or
151 136 * "spare-1" or something like that), and
152 137 * 2) The pool is currently being resilvered.
153 138 *
154 139 * We cannot simply check vd->vdev_resilver_txg, because it's
155 140 * not set in this path.
156 141 *
157 142 * Nor can we just check our vdev_ops; there are cases (such as
158 143 * when a user types "zpool replace pool odev spare_dev" and
159 144 * spare_dev is in the spare list, or when a spare device is
160 145 * automatically used to replace a DEGRADED device) when
161 146 * resilvering is complete but both the original vdev and the
162 147 * spare vdev remain in the pool. That behavior is intentional.
163 148 * It helps implement the policy that a spare should be
164 149 * automatically removed from the pool after the user replaces
165 150 * the device that originally failed.
166 151 */
167 152 replacing = (vd->vdev_ops == &vdev_replacing_ops ||
168 153 vd->vdev_ops == &vdev_spare_ops);
169 154 /*
170 155 * If a spa load is in progress, then spa_dsl_pool may be
171 156 * uninitialized. But we shouldn't be resilvering during a spa
172 157 * load anyway.
173 158 */
174 159 if (replacing &&
175 160 (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
176 161 dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
177 162 mm->mm_resilvering = B_TRUE;
178 163 } else {
179 164 mm->mm_resilvering = B_FALSE;
|
↓ open down ↓ |
34 lines elided |
↑ open up ↑ |
180 165 }
181 166
182 167 mm->mm_preferred = mm->mm_resilvering ? 0 :
183 168 (zio->io_offset >> vdev_mirror_shift) % c;
184 169 mm->mm_root = B_FALSE;
185 170
186 171 for (c = 0; c < mm->mm_children; c++) {
187 172 mc = &mm->mm_child[c];
188 173 mc->mc_vd = vd->vdev_child[c];
189 174 mc->mc_offset = zio->io_offset;
175 + mc->mc_index = c;
176 + mc->mc_weight = (mc->mc_vd != NULL ?
177 + mc->mc_vd->vdev_weight : 0);
190 178 }
191 179 }
192 180
193 181 zio->io_vsd = mm;
194 182 zio->io_vsd_ops = &vdev_mirror_vsd_ops;
195 183 return (mm);
196 184 }
197 185
/*
 * vdev_ops open entry point.  Opens all children, then derives the
 * mirror's asize/max_asize as the minimum over children (rounded via
 * the -1/+1 trick) and its ashift as the maximum.  Fails only if every
 * child failed to open.
 *
 * NOTE(review): the "-" lines show the upstream distinction between
 * all-children-offline (VDEV_AUX_CHILDREN_OFFLINE) and no-replicas
 * being collapsed to VDEV_AUX_NO_REPLICAS in this change; related to
 * NEX-6725 — confirm this is intended.
 */
198 186 static int
199 187 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
200 188 uint64_t *ashift)
201 189 {
202 190 int numerrors = 0;
203 191 int lasterror = 0;
204 192
205 193 if (vd->vdev_children == 0) {
206 194 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
207 195 return (SET_ERROR(EINVAL));
208 196 }
209 197
210 198 vdev_open_children(vd);
211 199
212 200 for (int c = 0; c < vd->vdev_children; c++) {
213 201 vdev_t *cvd = vd->vdev_child[c];
214 202
215 203 if (cvd->vdev_open_error) {
216 204 lasterror = cvd->vdev_open_error;
|
↓ open down ↓ |
17 lines elided |
↑ open up ↑ |
217 205 numerrors++;
218 206 continue;
219 207 }
220 208
221 209 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
222 210 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
223 211 *ashift = MAX(*ashift, cvd->vdev_ashift);
224 212 }
225 213
226 214 if (numerrors == vd->vdev_children) {
227 - if (vdev_children_are_offline(vd))
228 - vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
229 - else
230 - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
215 + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
231 216 return (lasterror);
232 217 }
233 218
234 219 return (0);
235 220 }
236 221
/* vdev_ops close entry point: close every child vdev. */
237 222 static void
238 223 vdev_mirror_close(vdev_t *vd)
239 224 {
240 225 for (int c = 0; c < vd->vdev_children; c++)
241 226 vdev_close(vd->vdev_child[c]);
242 227 }
243 228
/*
 * Child-I/O completion for normal reads/writes: record the result in
 * the mirror_child_t passed via io_private.
 */
244 229 static void
245 230 vdev_mirror_child_done(zio_t *zio)
246 231 {
247 232 mirror_child_t *mc = zio->io_private;
248 233
249 234 mc->mc_error = zio->io_error;
250 235 mc->mc_tried = 1;
251 236 mc->mc_skipped = 0;
252 237 }
253 238
/*
 * Child-I/O completion for scrub reads.  Each scrub child read has its
 * own buffer; on success, copy that data up into every parent zio's
 * buffer, then free the child's abd and record the result.
 */
254 239 static void
255 240 vdev_mirror_scrub_done(zio_t *zio)
256 241 {
257 242 mirror_child_t *mc = zio->io_private;
258 243
259 244 if (zio->io_error == 0) {
260 245 zio_t *pio;
261 246 zio_link_t *zl = NULL;
262 247
263 248 mutex_enter(&zio->io_lock);
264 249 while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
265 250 mutex_enter(&pio->io_lock);
266 251 ASSERT3U(zio->io_size, >=, pio->io_size);
267 252 abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
268 253 mutex_exit(&pio->io_lock);
|
↓ open down ↓ |
28 lines elided |
↑ open up ↑ |
269 254 }
270 255 mutex_exit(&zio->io_lock);
271 256 }
272 257 abd_free(zio->io_abd);
273 258
274 259 mc->mc_error = zio->io_error;
275 260 mc->mc_tried = 1;
276 261 mc->mc_skipped = 0;
278 263
/*
 * AVL comparator ordering mirror children by descending mc_weight,
 * with mc_index as the tie-breaker; returns -1/0/1 as avl(9F) requires.
 */
264 +static int
265 +vdev_weight_compar(const void *mc_a, const void *mc_b)
266 +{
267 + const mirror_child_t *a = mc_a, *b = mc_b;
268 +
269 + /*
270 + * 1) if a's weight is less than b's, a goes right in the tree
271 + * 2) if a's weight is greater than b's, a goes left
272 + * 3) if a's and b's weights are equal, lower map index goes left
273 + * 4) if weight and map index are equal, it's the same object
274 + */
275 + if (a->mc_weight < b->mc_weight)
276 + return (1);
277 + if (a->mc_weight > b->mc_weight)
278 + return (-1);
279 + if (a->mc_index > b->mc_index)
280 + return (1);
281 + if (a->mc_index < b->mc_index)
282 + return (-1);
283 + ASSERT3P(a->mc_vd, ==, b->mc_vd);
284 + return (0);
285 +}
286 +
/*
 * Decide whether a single mirror child is usable for the read at txg.
 * Returns B_TRUE (and decrements its weight, publishing the new value
 * to mc_vd->vdev_weight) if the child is readable and its DTL does not
 * contain txg; otherwise marks it skipped (ENXIO if unreadable, ESTALE
 * + speculative if DTL-stale) and returns B_FALSE.
 *
 * NOTE(review): mc->mc_vd is passed to vdev_readable() and then
 * dereferenced without a NULL check; callers can have mc_vd == NULL
 * children (invalid blkptr vdev id) — confirm they cannot reach here.
 *
 * NOTE(review): vdev_weight is read/written with no visible locking —
 * presumably a benign advisory race; confirm.
 */
287 +static boolean_t
288 +child_select_mc(mirror_child_t *mc, uint64_t txg)
289 +{
290 + if (mc->mc_tried || mc->mc_skipped)
291 + return (B_FALSE);
292 + if (!vdev_readable(mc->mc_vd)) {
293 + mc->mc_error = SET_ERROR(ENXIO);
294 + mc->mc_tried = 1; /* don't even try */
295 + mc->mc_skipped = 1;
296 + return (B_FALSE);
297 + }
298 + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
299 + mc->mc_weight--;
300 + mc->mc_vd->vdev_weight = mc->mc_weight;
301 + return (B_TRUE);
302 + }
303 + mc->mc_error = SET_ERROR(ESTALE);
304 + mc->mc_skipped = 1;
305 + mc->mc_speculative = 1;
306 + return (B_FALSE);
307 +}
308 +
/*
 * Tear down the temporary weight-ordered AVL tree: remove every child
 * node, then destroy the (stack-allocated) tree itself.
 */
309 +static void
310 +child_select_cleanup(mirror_map_t *mm, avl_tree_t *vdevs_by_weight)
311 +{
312 + for (int i = 0; i < mm->mm_children; i++)
313 + avl_remove(vdevs_by_weight, &mm->mm_child[i]);
314 + avl_destroy(vdevs_by_weight);
315 +}
316 +
279 317 /*
280 318 * Try to find a child whose DTL doesn't contain the block we want to read.
281 319 * If we can't, try the read on any vdev we haven't already tried.
282 320 */
/*
 * Weighted selection (NEX/OS-80): children are visited in descending
 * mc_weight order via a per-call AVL tree; weights are reseeded from
 * the VDEV_PROP_PREFERRED_READ vdev property (+1) when they all reach
 * zero.  Returns the selected child's mm_child[] index, or -1.
 *
 * NOTE(review): children with mc_vd == NULL are skipped in the
 * max-weight and reseed loops, but are still avl_add()ed below and so
 * can be handed to child_select_mc() — see the note there; confirm.
 */
283 321 static int
284 322 vdev_mirror_child_select(zio_t *zio)
285 323 {
286 324 mirror_map_t *mm = zio->io_vsd;
287 - mirror_child_t *mc;
288 325 uint64_t txg = zio->io_txg;
289 - int i, c;
326 + /*
327 + * Look at the weights of the vdevs in the mirror; the weights help
328 + * decide which vdev to read from; the highest-weight suitable child
329 + * index is returned, and its weight is decremented in order to avoid
330 + * creating "hot" devices; once all the vdevs' weights are zero, the
331 + * weights are set back to the ones configured in vdev props
332 + */
333 + int64_t max_weight = 0;
290 334
291 335 ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
292 336
337 + for (int c = 0; c < mm->mm_children; c++) {
338 + mirror_child_t *mc = &mm->mm_child[c];
339 + if (mc->mc_vd == NULL)
340 + continue;
341 + max_weight = MAX(max_weight, mc->mc_weight);
342 + }
343 +
293 344 /*
294 - * Try to find a child whose DTL doesn't contain the block to read.
295 - * If a child is known to be completely inaccessible (indicated by
296 - * vdev_readable() returning B_FALSE), don't even try.
345 + * Recalculate weights
297 346 */
298 - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
299 - if (c >= mm->mm_children)
300 - c = 0;
301 - mc = &mm->mm_child[c];
302 - if (mc->mc_tried || mc->mc_skipped)
303 - continue;
304 - if (!vdev_readable(mc->mc_vd)) {
305 - mc->mc_error = SET_ERROR(ENXIO);
306 - mc->mc_tried = 1; /* don't even try */
307 - mc->mc_skipped = 1;
308 - continue;
347 + if (max_weight == 0) {
348 + for (int c = 0; c < mm->mm_children; c++) {
349 + mirror_child_t *mc = &mm->mm_child[c];
350 + if (mc->mc_vd == NULL)
351 + continue;
352 + mc->mc_weight =
353 + vdev_queue_get_prop_uint64(&mc->mc_vd->vdev_queue,
354 + VDEV_PROP_PREFERRED_READ) + 1;
355 + mc->mc_vd->vdev_weight = mc->mc_weight;
309 356 }
310 - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
311 - return (c);
312 - mc->mc_error = SET_ERROR(ESTALE);
313 - mc->mc_skipped = 1;
314 - mc->mc_speculative = 1;
315 357 }
316 358
359 + if (mm->mm_children > 1) {
360 + avl_tree_t vdevs_by_weight;
361 +
362 + avl_create(&vdevs_by_weight, vdev_weight_compar,
363 + sizeof (mirror_child_t), offsetof(mirror_child_t, mc_node));
364 +
365 + /*
366 + * Sort the weighted list
367 + */
368 + for (int i = 0; i < mm->mm_children; i++)
369 + avl_add(&vdevs_by_weight, &mm->mm_child[i]);
370 +
371 + /*
372 + * Try to find a child whose DTL doesn't contain the block to
373 + * read. If a child is known to be completely inaccessible
374 + * (vdev_readable() returning B_FALSE), don't even try.
375 + */
376 + for (mirror_child_t *mc = avl_first(&vdevs_by_weight);
377 + mc != NULL; mc = AVL_NEXT(&vdevs_by_weight, mc)) {
378 + if (child_select_mc(mc, txg)) {
379 + child_select_cleanup(mm, &vdevs_by_weight);
380 + return (mc->mc_index);
381 + }
382 + }
383 + child_select_cleanup(mm, &vdevs_by_weight);
384 + } else {
385 + if (child_select_mc(&mm->mm_child[0], txg))
386 + return (0);
387 + }
388 +
317 389 /*
318 390 * Every device is either missing or has this txg in its DTL.
319 391 * Look for any child we haven't already tried before giving up.
320 392 */
321 - for (c = 0; c < mm->mm_children; c++)
322 - if (!mm->mm_child[c].mc_tried)
393 + for (int c = 0; c < mm->mm_children; c++)
394 + if (!mm->mm_child[c].mc_tried && mm->mm_child[c].mc_vd != NULL)
323 395 return (c);
324 396
325 397 /*
326 398 * Every child failed. There's no place left to look.
327 399 */
328 400 return (-1);
329 401 }
330 402
/*
 * vdev_ops I/O start entry point.  Builds the mirror map, then:
 *  - scrub reads: issue a read with a private buffer to every child
 *    (or, for WBC "special" BPs, only the child chosen by
 *    wbc_select_dva());
 *  - normal reads: pick one child (WBC DVA for special BPs, otherwise
 *    weighted vdev_mirror_child_select());
 *  - writes: issue to all children, or only to the special vdev (and
 *    stop after it) for special-BP writes.
 *
 * NOTE(review): for special-BP reads, wbc_select_dva()'s result is
 * used as a child index without the readability/DTL checks that
 * vdev_mirror_child_select() performs — presumably WBC guarantees
 * validity; confirm it cannot return an out-of-range index.
 */
331 403 static void
332 404 vdev_mirror_io_start(zio_t *zio)
333 405 {
334 406 mirror_map_t *mm;
335 407 mirror_child_t *mc;
336 408 int c, children;
409 + boolean_t spec_case = B_FALSE;
410 + spa_t *spa = zio->io_spa;
337 411
338 412 mm = vdev_mirror_map_alloc(zio);
339 413
340 - if (mm == NULL) {
341 - ASSERT(!spa_trust_config(zio->io_spa));
342 - ASSERT(zio->io_type == ZIO_TYPE_READ);
343 - zio_execute(zio);
344 - return;
345 - }
414 + if (zio->io_child_type != ZIO_CHILD_VDEV &&
415 + BP_IS_SPECIAL(zio->io_bp))
416 + spec_case = B_TRUE;
346 417
347 418 if (zio->io_type == ZIO_TYPE_READ) {
348 419 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
420 + int target = 0;
421 +
349 422 /*
423 + * Scrub of special BPs should take into
424 + * account the state of WBC-Window
425 + */
426 + if (spec_case) {
427 + target = wbc_select_dva(
428 + spa_get_wbc_data(spa), zio);
429 + }
430 +
431 + /*
350 432 * For scrubbing reads we need to allocate a read
351 433 * buffer for each child and issue reads to all
352 434 * children. If any child succeeds, it will copy its
353 435 * data into zio->io_data in vdev_mirror_scrub_done.
354 436 */
355 437 for (c = 0; c < mm->mm_children; c++) {
356 438 mc = &mm->mm_child[c];
439 + if (mc->mc_vd == NULL) {
440 + /*
441 + * Invalid vdev id in blkptr caused
442 + * mc_vd to be NULL here.
443 + * Just skip this vdev.
444 + */
445 + continue;
446 + }
447 +
448 + if (spec_case && c != target)
449 + continue;
450 +
357 451 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
358 452 mc->mc_vd, mc->mc_offset,
359 453 abd_alloc_sametype(zio->io_abd,
360 454 zio->io_size), zio->io_size,
361 455 zio->io_type, zio->io_priority, 0,
362 456 vdev_mirror_scrub_done, mc));
363 457 }
364 458 zio_execute(zio);
365 459 return;
366 460 }
367 461 /*
368 462 * For normal reads just pick one child.
369 463 */
370 - c = vdev_mirror_child_select(zio);
464 +
465 + if (spec_case)
466 + c = wbc_select_dva(spa_get_wbc_data(spa), zio);
467 + else
468 + c = vdev_mirror_child_select(zio);
469 +
371 470 children = (c >= 0)
372 471 } else {
373 472 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
374 473
375 474 /*
376 475 * Writes go to all children.
377 476 */
378 477 c = 0;
379 478 children = mm->mm_children;
380 479 }
381 480
382 - while (children--) {
481 + for (; children--; c++) {
383 482 mc = &mm->mm_child[c];
483 + if (mc->mc_vd == NULL) {
484 + /*
485 + * Invalid vdev in blkptr caused mc_vd to be NULL here.
486 + * Just skip this vdev.
487 + */
488 + continue;
489 + }
490 +
491 + if (spec_case) {
492 + if (zio->io_type == ZIO_TYPE_WRITE &&
493 + !vdev_is_special(mc->mc_vd))
494 + continue;
495 + }
496 +
384 497 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
385 498 mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
386 499 zio->io_type, zio->io_priority, 0,
387 500 vdev_mirror_child_done, mc));
388 - c++;
501 +
502 + if (spec_case)
503 + break;
389 504 }
390 505
391 506 zio_execute(zio);
392 507 }
393 508
/*
 * Pick the error to report for the whole mirror: the worst
 * non-speculative child error if any, otherwise the worst speculative
 * one (mc_speculative indexes the two-slot error[] array).
 */
394 509 static int
395 510 vdev_mirror_worst_error(mirror_map_t *mm)
396 511 {
397 512 int error[2] = { 0, 0 };
398 513
399 514 for (int c = 0; c < mm->mm_children; c++) {
400 515 mirror_child_t *mc = &mm->mm_child[c];
401 516 int s = mc->mc_speculative;
402 517 error[s] = zio_worst_error(error[s], mc->mc_error);
403 518 }
404 519
405 520 return (error[0] ? error[0] : error[1]);
406 521 }
|
↓ open down ↓ |
8 lines elided |
↑ open up ↑ |
407 522
/*
 * vdev_ops I/O done entry point.  Tallies good copies and unexpected
 * errors; for writes, applies the at-least-one-good-copy policy (all
 * copies for ditto blocks); for reads, retries remaining children if
 * no good copy yet, else sets the worst error, and self-heals damaged
 * or stale children from a good copy when the pool is writeable.
 *
 * NOTE(review): the "-" lines show the mm == NULL early return being
 * dropped — consistent with vdev_mirror_map_alloc() in this change
 * never returning NULL; confirm no other path leaves io_vsd NULL.
 */
408 523 static void
409 524 vdev_mirror_io_done(zio_t *zio)
410 525 {
411 526 mirror_map_t *mm = zio->io_vsd;
412 527 mirror_child_t *mc;
413 528 int c;
414 529 int good_copies = 0;
415 530 int unexpected_errors = 0;
416 531
417 - if (mm == NULL)
418 - return;
419 -
420 532 for (c = 0; c < mm->mm_children; c++) {
421 533 mc = &mm->mm_child[c];
422 534
423 535 if (mc->mc_error) {
424 536 if (!mc->mc_skipped)
425 537 unexpected_errors++;
426 538 } else if (mc->mc_tried) {
427 539 good_copies++;
428 540 }
429 541 }
430 542
431 543 if (zio->io_type == ZIO_TYPE_WRITE) {
432 544 /*
433 545 * XXX -- for now, treat partial writes as success.
434 546 *
435 547 * Now that we support write reallocation, it would be better
436 548 * to treat partial failure as real failure unless there are
437 549 * no non-degraded top-level vdevs left, and not update DTLs
438 550 * if we intend to reallocate.
439 551 */
440 552 /* XXPOLICY */
441 553 if (good_copies != mm->mm_children) {
442 554 /*
443 555 * Always require at least one good copy.
444 556 *
445 557 * For ditto blocks (io_vd == NULL), require
446 558 * all copies to be good.
447 559 *
448 560 * XXX -- for replacing vdevs, there's no great answer.
449 561 * If the old device is really dead, we may not even
450 562 * be able to access it -- so we only want to
451 563 * require good writes to the new device. But if
452 564 * the new device turns out to be flaky, we want
453 565 * to be able to detach it -- which requires all
454 566 * writes to the old device to have succeeded.
455 567 */
456 568 if (good_copies == 0 || zio->io_vd == NULL)
457 569 zio->io_error = vdev_mirror_worst_error(mm);
458 570 }
459 571 return;
460 572 }
461 573
462 574 ASSERT(zio->io_type == ZIO_TYPE_READ);
463 575
464 576 /*
465 577 * If we don't have a good copy yet, keep trying other children.
466 578 */
467 579 /* XXPOLICY */
468 580 if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
469 581 ASSERT(c >= 0 && c < mm->mm_children);
470 582 mc = &mm->mm_child[c];
471 583 zio_vdev_io_redone(zio);
472 584 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
473 585 mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
474 586 ZIO_TYPE_READ, zio->io_priority, 0,
475 587 vdev_mirror_child_done, mc));
476 588 return;
477 589 }
478 590
479 591 /* XXPOLICY */
480 592 if (good_copies == 0) {
481 593 zio->io_error = vdev_mirror_worst_error(mm);
482 594 ASSERT(zio->io_error != 0);
483 595 }
484 596
485 597 if (good_copies && spa_writeable(zio->io_spa) &&
486 598 (unexpected_errors ||
487 599 (zio->io_flags & ZIO_FLAG_RESILVER) ||
488 600 ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
489 601 /*
490 602 * Use the good data we have in hand to repair damaged children.
491 603 */
492 604 for (c = 0; c < mm->mm_children; c++) {
493 605 /*
494 606 * Don't rewrite known good children.
495 607 * Not only is it unnecessary, it could
496 608 * actually be harmful: if the system lost
497 609 * power while rewriting the only good copy,
498 610 * there would be no good copies left!
499 611 */
500 612 mc = &mm->mm_child[c];
501 613
502 614 if (mc->mc_error == 0) {
503 615 if (mc->mc_tried)
504 616 continue;
505 617 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
506 618 !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
507 619 zio->io_txg, 1))
508 620 continue;
509 621 mc->mc_error = SET_ERROR(ESTALE);
510 622 }
511 623
512 624 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
513 625 mc->mc_vd, mc->mc_offset,
514 626 zio->io_abd, zio->io_size,
|
↓ open down ↓ |
85 lines elided |
↑ open up ↑ |
515 627 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
516 628 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
517 629 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
518 630 }
519 631 }
520 632 }
521 633
/*
 * vdev_ops state-change entry point: the mirror is CANT_OPEN when all
 * children are faulted, DEGRADED when some are faulted/degraded, and
 * HEALTHY otherwise.
 *
 * NOTE(review): the "-" lines show upstream's all-children-offline
 * case (VDEV_STATE_OFFLINE / VDEV_AUX_CHILDREN_OFFLINE) being dropped
 * here, mirroring the change in vdev_mirror_open() — confirm intended.
 */
522 634 static void
523 635 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
524 636 {
525 - if (faulted == vd->vdev_children) {
526 - if (vdev_children_are_offline(vd)) {
527 - vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
528 - VDEV_AUX_CHILDREN_OFFLINE);
529 - } else {
530 - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
531 - VDEV_AUX_NO_REPLICAS);
532 - }
533 - } else if (degraded + faulted != 0) {
637 + if (faulted == vd->vdev_children)
638 + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
639 + VDEV_AUX_NO_REPLICAS);
640 + else if (degraded + faulted != 0)
534 641 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
535 - } else {
642 + else
536 643 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
537 - }
538 644 }
539 645
/*
 * The mirror, replacing, and hot-spare vdev types all share the same
 * implementation; only the type name differs.
 */
540 646 vdev_ops_t vdev_mirror_ops = {
541 647 vdev_mirror_open,
542 648 vdev_mirror_close,
543 649 vdev_default_asize,
544 650 vdev_mirror_io_start,
545 651 vdev_mirror_io_done,
546 652 vdev_mirror_state_change,
547 653 NULL,
548 654 NULL,
549 655 NULL,
550 656 VDEV_TYPE_MIRROR, /* name of this vdev type */
551 657 B_FALSE /* not a leaf vdev */
552 658 };
553 659
554 660 vdev_ops_t vdev_replacing_ops = {
555 661 vdev_mirror_open,
556 662 vdev_mirror_close,
557 663 vdev_default_asize,
558 664 vdev_mirror_io_start,
559 665 vdev_mirror_io_done,
560 666 vdev_mirror_state_change,
561 667 NULL,
562 668 NULL,
563 669 NULL,
564 670 VDEV_TYPE_REPLACING, /* name of this vdev type */
565 671 B_FALSE /* not a leaf vdev */
566 672 };
567 673
568 674 vdev_ops_t vdev_spare_ops = {
569 675 vdev_mirror_open,
570 676 vdev_mirror_close,
571 677 vdev_default_asize,
572 678 vdev_mirror_io_start,
573 679 vdev_mirror_io_done,
574 680 vdev_mirror_state_change,
575 681 NULL,
576 682 NULL,
577 683 NULL,
578 684 VDEV_TYPE_SPARE, /* name of this vdev type */
579 685 B_FALSE /* not a leaf vdev */
580 686 };
|
↓ open down ↓ |
33 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX