 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/dkioc_free_util.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 */
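
/*
 * Illustrative sketch (not part of the original source): scalar P and Q
 * parity generation over one byte from each data column, under the GF(2^8)
 * generator polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d) assumed here.
 * P is the XOR of the data bytes; Q is a Horner-style accumulation that
 * multiplies the running sum by 2 in GF(2^8) before XOR-ing in each byte,
 * which is what the VDEV_RAIDZ_64MUL_2() macro below does eight bytes at
 * a time. The example_* names are hypothetical.
 */
static uint8_t
example_gf_mul2(uint8_t a)
{
        /* Multiply by 2 in GF(2^8): shift left, reduce by 0x1d on carry. */
        return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}

static void
example_pq_parity(const uint8_t *data, int ndata, uint8_t *p, uint8_t *q)
{
        *p = *q = 0;
        for (int c = 0; c < ndata; c++) {
                *p ^= data[c];
                *q = example_gf_mul2(*q) ^ data[c];
        }
}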

#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
        (mask) = (x) & 0x8080808080808080ULL; \
        (mask) = ((mask) << 1) - ((mask) >> 7); \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
        VDEV_RAIDZ_64MUL_2((x), mask); \
        VDEV_RAIDZ_64MUL_2((x), mask); \
}
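
/*
 * Illustrative sketch (not part of the original source): the macro above
 * multiplies eight packed GF(2^8) bytes by 2 in one 64-bit operation. The
 * mask isolates each byte's high bit and smears it into 0x00 or 0xff, so
 * the 0x1d reduction is applied only to bytes that overflowed. The result
 * is byte-for-byte equivalent to the scalar example_gf_mul2() shown above.
 */
static uint64_t
example_64mul_2(uint64_t x)
{
        uint64_t mask;

        VDEV_RAIDZ_64MUL_2(x, mask);
        return (x);     /* equals example_gf_mul2() applied to each byte */
}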

#define VDEV_LABEL_OFFSET(x)    ((x) + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * xor_p hook for external acceleration libraries.
 */
int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        /* ... */
};

static const uint8_t vdev_raidz_log2[256] = {
        /* ... */
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);
static void vdev_raidz_trim_done(zio_t *zio);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
        if (a == 0)
                return (0);

        ASSERT(exp >= 0);
        ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}
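
/*
 * Illustrative sketch (not part of the original source): general GF(2^8)
 * multiplication via the log/exp tables above, the same trick used by
 * vdev_raidz_exp2(). log2(a*b) = log2(a) + log2(b), and the exponent wraps
 * modulo 255 because the multiplicative group of GF(2^8) has 255 elements.
 * The example_gf_mul name is hypothetical.
 */
static uint8_t
example_gf_mul(uint8_t a, uint8_t b)
{
        if (a == 0 || b == 0)
                return (0);
        return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
            vdev_raidz_log2[b]) % 255]);
}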

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
        int c;
        size_t size;

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                /*
                 * TRIM doesn't allocate data blocks, so 'rc_abd' is NULL
                 * in that case. See vdev_raidz_trim() and
                 * vdev_raidz_map_alloc() for more details.
                 */
                if (rm->rm_col[c].rc_abd != NULL)
                        abd_free(rm->rm_col[c].rc_abd);

                if (rm->rm_col[c].rc_gdata != NULL)
                        zio_buf_free(rm->rm_col[c].rc_gdata,
                            rm->rm_col[c].rc_size);
        }

        size = 0;
        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                /* As above, 'rc_abd' is NULL for TRIM zios. */
                if (rm->rm_col[c].rc_abd != NULL)
                        abd_put(rm->rm_col[c].rc_abd);
                size += rm->rm_col[c].rc_size;
        }

        if (rm->rm_abd_copy != NULL)
                abd_free(rm->rm_abd_copy);

        kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
        raidz_map_t *rm = zio->io_vsd;

        ASSERT0(rm->rm_freed);
        rm->rm_freed = 1;

        if (rm->rm_reports == 0)
                vdev_raidz_map_free(rm);
}

/* ... */

        for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                raidz_col_t *col = &rm->rm_col[c];
                abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

                abd_copy(tmp, col->rc_abd, col->rc_size);
                abd_put(col->rc_abd);
                col->rc_abd = tmp;

                offset += col->rc_size;
        }
        ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
        vdev_raidz_map_free_vsd,
        vdev_raidz_cksum_report
};

/*
 * Allocates and computes a raidz column map, which directs the raidz column
 * handling algorithms where to locate and store data and parity columns for
 * a particular DVA. Usually, dcols is the number of children in the target
 * vdev.
 *
 * The `offset', `size' and `abd' arguments hold the offset, size and data
 * of the zio for which this map is to be computed.
 * The `unit_shift' parameter contains the minimum allocation bitshift of
 * the storage pool. The `dcols' parameter contains the number of drives in
 * this raidz vdev (including parity drives), with `nparity' denoting how
 * many of those hold parity (one, two or three).
 *
 * The `alloc_data' flag denotes whether the constructed raidz map should
 * contain allocated buffers to hold column IO data; if you're using this
 * function simply to determine raidz geometry, pass B_FALSE here.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
    boolean_t alloc_data)
{
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = offset >> unit_shift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = size >> unit_shift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
        uint64_t off = 0;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         */
        q = s / (dcols - nparity);

        /* ... */
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;

                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
                        rm->rm_col[c].rc_size = (q + 1) << unit_shift;
                else
                        rm->rm_col[c].rc_size = q << unit_shift;

                asize += rm->rm_col[c].rc_size;
        }

        ASSERT3U(asize, ==, tot << unit_shift);
        rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
        ASSERT3U(rm->rm_nskip, <=, nparity);
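
        /*
         * Worked example (illustrative; the inputs and the big-column count
         * are assumptions, since the remainder computation is elided from
         * this excerpt): a 4+1 raidz1 vdev (dcols = 5, nparity = 1) with
         * unit_shift = 9 and a 4608-byte zio gives s = 9 sectors and
         * q = 9 / 4 = 2 full rows. The remaining r = 1 data sector puts one
         * extra sector in each of the bc = nparity + r = 2 "big columns"
         * (3 sectors each), while the other three columns hold q = 2. So
         * tot = 9 + 1 * (2 + 1) = 12 sectors, asize = 12 << 9 = 6144 bytes,
         * rm_asize = roundup(12, 2) << 9 = 6144 and rm_nskip = 0,
         * consistent with the asserts above.
         */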

        if (alloc_data) {
                for (c = 0; c < rm->rm_firstdatacol; c++) {
                        rm->rm_col[c].rc_abd =
                            abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
                }

                rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
                off = rm->rm_col[c].rc_size;

                for (c = c + 1; c < acols; c++) {
                        rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
                        off += rm->rm_col[c].rc_size;
                }
        }

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
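
        /*
         * Illustration (an assumed detail; the swap itself happens in code
         * elided from this excerpt): for single-parity maps the rotation
         * boils down to exchanging the device index and offset of columns
         * 0 and 1 whenever the block sits in an odd-numbered 1MB region,
         * so consecutive 1MB regions alternate which child holds parity.
         */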

/* ... */

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
        struct pqr_struct *pqr = private;
        const uint64_t *src = buf;
        uint64_t mask;
        int i, cnt = size / sizeof (src[0]);

        ASSERT(pqr->p && pqr->q && pqr->r);

        for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
                *pqr->p ^= *src;
                VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
                *pqr->q ^= *src;
                VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
                *pqr->r ^= *src;
        }

        return (0);
}

/*
 * Software acceleration of XOR calculations: requirements.
 *
 * The (src/dst) vectors need to be 64-byte aligned, and all the vectors
 * have to be the same size.
 */
#define RAIDZ_ACCELERATION_ALIGNMENT    64ul
#define UNALIGNED(addr) \
        ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT - 1))

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
        uint64_t *p;
        int c;
        abd_t *src;

#if 0
        /* FIXME: needs to be reviewed and changed to support ABD */
        int parity_done;
        void *va[16];
        void **array;
        int j, nvects;
        uint64_t pcount;

        parity_done = 0;
        while (0 && zfs_xorp_hook && !parity_done) {
                unsigned long no_accel = 0;

                /* at least two columns (plus one for result) */
                if (rm->rm_cols < 3) {
                        DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
                        break;
                }
                /* check sizes and alignment */
                no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
                if (no_accel) {
                        DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
                            no_accel);
                        break;
                }
                pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
                nvects = 1;     /* for the destination */
                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                        no_accel = UNALIGNED(rm->rm_col[c].rc_data);
                        if (no_accel) {
                                DTRACE_PROBE1(raidz_unaligned_src,
                                    unsigned long, no_accel);
                                break;
                        }
                        if (rm->rm_col[c].rc_size != pcount) {
                                DTRACE_PROBE(raidz_sizes_vary);
                                no_accel = 1;
                                break;
                        }
                        nvects++;
                }
                if (no_accel)
                        break;
                if (nvects > 16) {
                        array = kmem_alloc(nvects * sizeof (void *),
                            KM_NOSLEEP);
                        if (array == NULL) {
                                DTRACE_PROBE(raidz_alloc_failed);
                                break;
                        }
                } else {
                        array = va;
                }
                for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
                    c++, j++) {
                        array[j] = rm->rm_col[c].rc_data;
                }
                array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                if (zfs_xorp_hook(nvects,
                    rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
                        DTRACE_PROBE(raidz_accel_failure);
                        break;
                }
                if (array != va) {
                        kmem_free(array, nvects * sizeof (void *));
                }
                parity_done = 1;
                DTRACE_PROBE(raidz_accel_success);
        }
        if (parity_done)
                return;
#endif

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_abd;
                p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

                if (c == rm->rm_firstdatacol) {
                        abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
                } else {
                        struct pqr_struct pqr = { p, NULL, NULL };
                        (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
                            vdev_raidz_p_func, &pqr);
                }
        }
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
        uint64_t *p, *q, pcnt, ccnt, mask, i;
        int c;
        abd_t *src;

/* ... */

        /*
         * Don't write past the end of the block.
         */
        VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

        start = offset;
        end = start + size;

        /*
         * Allocate a RAID-Z map for this block. Note that this block starts
         * from the "original" offset, that is, the offset of the extent
         * which contains the requisite offset of the data being read or
         * written.
         *
         * Even if this I/O operation doesn't span the full block size, let's
         * treat the on-disk format as if the only blocks are complete 128KB
         * blocks.
         */
        abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
            SPA_OLD_MAXBLOCKSIZE);
        rm = vdev_raidz_map_alloc(abd,
            SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
            vd->vdev_children, vd->vdev_nparity, B_TRUE);

        coloffset = origoffset;

        for (c = rm->rm_firstdatacol; c < rm->rm_cols;
            c++, coloffset += rc->rc_size) {
                rc = &rm->rm_col[c];
                cvd = vd->vdev_child[rc->rc_devidx];

                /*
                 * Find the start and end of this column in the RAID-Z map,
                 * keeping in mind that the stated size and offset of the
                 * operation may not fill the entire column for this vdev.
                 *
                 * If any portion of the data spans this column, issue the
                 * appropriate operation to the vdev.
                 */
                if (coloffset + rc->rc_size <= start)
                        continue;
                if (coloffset >= end)
                        continue;

        /* ... */

#endif  /* KERNEL */

        return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
        uint64_t asize;
        uint64_t ashift = vd->vdev_top->vdev_ashift;
        uint64_t cols = vd->vdev_children;
        uint64_t nparity = vd->vdev_nparity;

        asize = ((psize - 1) >> ashift) + 1;
        asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
        asize = roundup(asize, nparity + 1) << ashift;

        return (asize);
}
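
/*
 * Worked example (illustrative, with assumed inputs): on a 4+1 raidz1 vdev
 * (cols = 5, nparity = 1) with ashift = 9, psize = 4608 bytes is 9 data
 * sectors. Parity adds nparity * ceil(9 / 4) = 3 sectors for a total of 12,
 * already a multiple of nparity + 1 = 2, so asize = 12 << 9 = 6144 bytes.
 */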

/*
 * Converts an allocated size on a raidz vdev back to a logical block
 * size. This is used in trimming to figure out the appropriate logical
 * size to pass to vdev_raidz_map_alloc when splitting up extents of free
 * space obtained from metaslabs. However, a range of free space on a
 * raidz vdev might have originally consisted of multiple blocks and
 * those, taken together with their skip blocks, might not always align
 * neatly to a new vdev_raidz_map_alloc covering the entire unified
 * range. So to ensure that the newly allocated raidz map *always* fits
 * within the asize passed to this function and never exceeds it (since
 * that might trim allocated data past it), we round it down to the
 * nearest suitable multiple of the vdev ashift (hence the "_floor" in
 * this function's name).
 *
 * This function is in effect an inverse of vdev_raidz_asize. However,
 * since multiple psizes can map to a single asize (due to variable
 * padding), this function instead returns the largest chunk that still
 * fits inside the specified asize.
 */
static uint64_t
vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
{
        uint64_t psize;
        uint64_t ashift = vd->vdev_top->vdev_ashift;
        uint64_t cols = vd->vdev_children;
        uint64_t nparity = vd->vdev_nparity;

        psize = (asize - (nparity << ashift));
        psize /= cols;
        psize *= cols - nparity;
        psize += (1 << ashift) - 1;

        psize = P2ALIGN(psize, 1 << ashift);

        return (psize);
}
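
/*
 * Worked example (illustrative, with assumed inputs): continuing the case
 * above (cols = 5, nparity = 1, ashift = 9), asize = 6144 gives
 * psize = (6144 - 512) / 5 * 4 + 511 = 5015, aligned down to 4608. Feeding
 * that back through vdev_raidz_asize() yields 6144 again, so the recomputed
 * map never extends past the original extent.
 */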

static void
vdev_raidz_child_done(zio_t *zio)
{
        raidz_col_t *rc = zio->io_private;

        rc->rc_error = zio->io_error;
        rc->rc_tried = 1;
        rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c, i;

        rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
            tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity, B_TRUE);

        zio->io_vsd = rm;
        zio->io_vsd_ops = &vdev_raidz_vsd_ops;

        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

        if (zio->io_type == ZIO_TYPE_WRITE) {
                vdev_raidz_generate_parity(rm);

                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }

                /*
                 * Generate optional I/Os for any skipped sectors to improve
                 * aggregation continuity.
                 */
                /* ... */

                    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                }
        }

        zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
        void *buf;
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
        vdev_stat_t *vs = &vd->vdev_stat;
        spa_t *spa = zio->io_spa;

        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                zio_bad_cksum_t zbc;
                raidz_map_t *rm = zio->io_vsd;

                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);

                zbc.zbc_has_cksum = 0;
                zbc.zbc_injected = rm->rm_ecksuminjected;

                buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
                zfs_ereport_post_checksum(zio->io_spa, vd, zio,
                    rc->rc_offset, rc->rc_size, buf, bad_data,
                    &zbc);
                abd_return_buf(rc->rc_abd, buf, rc->rc_size);
        }

        if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
            vs->vs_read_errors || vs->vs_write_errors) &&
            !spa->spa_special_has_errors) {
                spa->spa_special_has_errors = B_TRUE;
        }
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
        zio_bad_cksum_t zbc;
        raidz_map_t *rm = zio->io_vsd;

        int ret = zio_checksum_error(zio, &zbc);
        if (ret != 0 && zbc.zbc_injected != 0)
                rm->rm_ecksuminjected = 1;

        return (ret);
}

/*
 * ...
 *      d. If that doesn't work, return an error.
 * 3. If there were unexpected errors or this is a resilver operation,
 *    rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
        int n, c;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;

                /* ... */

                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
        if (faulted > vd->vdev_nparity)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_NO_REPLICAS);
        else if (degraded + faulted != 0)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
        else
                vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
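
/*
 * Illustrative example (not part of the original source): on a raidz2 vdev
 * (vdev_nparity == 2), two faulted children still leave every block
 * reconstructable, so the vdev is merely DEGRADED; a third fault exceeds
 * the parity count and the vdev transitions to CANT_OPEN with
 * VDEV_AUX_NO_REPLICAS.
 */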

static inline void
vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
    const raidz_col_t *rc)
{
        uint64_t num_exts = *num_extsp;

        ASSERT(rc->rc_size != 0);

        /*
         * If this column extent directly follows the previous extent on the
         * child vdev, coalesce the two rather than appending a new one.
         * Note that the guard must test the local count of extents appended
         * so far, not dfl->dfl_num_exts: the caller preloads dfl_num_exts
         * with the parent list's count, so testing it would index
         * dfl_exts[-1] on the first append.
         */
        if (num_exts > 0 &&
            dfl->dfl_exts[num_exts - 1].dfle_start +
            dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
                dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
        } else {
                dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
                dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
                (*num_extsp)++;
        }
}
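
/*
 * Illustrative example (assumed numbers): if two consecutive raidz rows
 * place columns on the same child at offsets 0x1000 (length 0x200) and
 * 0x1200 (length 0x200), the second call coalesces them into a single
 * 0x400-byte extent, so each child sees at most one discard request per
 * contiguous run.
 */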

/*
 * Processes a trim for a raidz vdev.
 */
static void
vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
{
        dkioc_free_list_t *dfl = trim_exts;
        dkioc_free_list_t **sub_dfls;
        uint64_t *sub_dfls_num_exts;

        sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
            KM_SLEEP);
        sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
            KM_SLEEP);
        for (int i = 0; i < vd->vdev_children; i++) {
                /*
                 * We might over-allocate here, because the sub-lists can never
                 * be longer than the parent list, but they can be shorter.
                 * The underlying driver will discard zero-length extents.
                 */
                sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
                sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
                sub_dfls[i]->dfl_flags = dfl->dfl_flags;
                sub_dfls[i]->dfl_offset = dfl->dfl_offset;
                /* don't copy the check func, because it isn't raidz-aware */
        }

        /*
         * Process all extents and redistribute them to the component vdevs
         * according to a computed raidz map geometry.
         */
        for (int i = 0; i < dfl->dfl_num_exts; i++) {
                uint64_t start = dfl->dfl_exts[i].dfle_start;
                uint64_t length = dfl->dfl_exts[i].dfle_length;
                raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
                    vdev_raidz_psize_floor(vd, length), start,
                    vd->vdev_top->vdev_ashift, vd->vdev_children,
                    vd->vdev_nparity, B_FALSE);

                for (uint64_t j = 0; j < rm->rm_cols; j++) {
                        uint64_t devidx = rm->rm_col[j].rc_devidx;
                        vdev_raidz_trim_append_rc(sub_dfls[devidx],
                            &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
                }
                vdev_raidz_map_free(rm);
        }

        /*
         * Issue the component ioctls as children of the parent zio.
         */
        for (int i = 0; i < vd->vdev_children; i++) {
                if (sub_dfls_num_exts[i] != 0) {
                        zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
                            vd->vdev_child[i], DKIOCFREE,
                            vdev_raidz_trim_done, sub_dfls[i],
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
                            ZIO_FLAG_DONT_RETRY));
                } else {
                        dfl_free(sub_dfls[i]);
                }
        }
        kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
        kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
}
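
/*
 * Illustrative walk-through (assumed numbers): trimming one 6144-byte
 * extent on the 4+1 raidz1 vdev from the earlier examples. The extent is
 * floored to psize = 4608 and mapped, producing 12 sectors spread over all
 * five children (two children with 3 sectors, three with 2). Each column
 * becomes (or is coalesced into) an extent in that child's sub-list, and
 * one DKIOCFREE ioctl zio is then issued per child that received extents.
 */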

/*
 * Releases the dkioc_free_list_t attached to the ioctls issued to
 * component devices in vdev_raidz_trim().
 */
static void
vdev_raidz_trim_done(zio_t *zio)
{
        ASSERT(zio->io_private != NULL);
        dfl_free(zio->io_private);
}

vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
        vdev_raidz_asize,
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
        NULL,
        NULL,
        vdev_raidz_trim,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
};