NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint - courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code

*** 22,44 ****
--- 22,47 ----
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
   * Copyright (c) 2014 Integros [integros.com]
+  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
   */
  
  #include <sys/zfs_context.h>
  #include <sys/spa.h>
+ #include <sys/spa_impl.h>
  #include <sys/vdev_impl.h>
  #include <sys/vdev_disk.h>
  #include <sys/vdev_file.h>
  #include <sys/vdev_raidz.h>
  #include <sys/zio.h>
  #include <sys/zio_checksum.h>
  #include <sys/abd.h>
  #include <sys/fs/zfs.h>
  #include <sys/fm/fs/zfs.h>
+ #include <sys/dkioc_free_util.h>
  
  /*
   * Virtual device vector for RAID-Z.
   *
   * This vdev supports single, double, and triple parity. For single parity,
*** 165,175 ****
  /*
   * Force reconstruction to use the general purpose method.
   */
  int vdev_raidz_default_to_general;
  
! /* Powers of 2 in the Galois field defined above. */
  static const uint8_t vdev_raidz_pow2[256] = {
  	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
  	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
  	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
  	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
--- 168,186 ----
  /*
   * Force reconstruction to use the general purpose method.
   */
  int vdev_raidz_default_to_general;
  
! /*
!  * xor_p hook for external acceleration libraries.
!  */
! int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
! 
! /*
!  * These two tables represent powers and logs of 2 in the Galois field defined
!  * above. These values were computed by repeatedly multiplying by 2 as above.
!  */
  static const uint8_t vdev_raidz_pow2[256] = {
  	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
  	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
  	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
  	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
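For reference, the vdev_raidz_pow2 table can be regenerated with a short loop: multiplying by 2 in GF(2^8) is a left shift, XORing in 0x1d (the low byte of the generator polynomial 0x11d) whenever the shifted-out high bit was set. A minimal userland sketch, illustrative only and not code from this change:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint8_t pow2[256];
		uint8_t x = 1;

		for (int i = 0; i < 256; i++) {
			pow2[i] = x;
			/* multiply by 2 in GF(2^8) mod x^8 + x^4 + x^3 + x^2 + 1 */
			x = (x << 1) ^ ((x & 0x80) ? 0x1d : 0);
		}
		/* first rows match the table: 01 02 04 08 10 20 40 80 1d 3a ... */
		for (int i = 0; i < 16; i++)
			printf("0x%02x, ", pow2[i]);
		printf("\n");
		return (0);
	}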
*** 237,246 ****
--- 248,258 ----
  	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
  	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
  };
  
  static void vdev_raidz_generate_parity(raidz_map_t *rm);
+ static void vdev_raidz_trim_done(zio_t *zio);
  
  /*
   * Multiply a given number by 2 raised to the given power.
   */
  static uint8_t
*** 264,282 ****
--- 276,308 ----
  {
  	int c;
  	size_t size;
  
  	for (c = 0; c < rm->rm_firstdatacol; c++) {
+ 		/*
+ 		 * TRIM doesn't allocate data blocks,
+ 		 * so 'rc_abd' is NULL in this case.
+ 		 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+ 		 * for more details.
+ 		 */
+ 		if (rm->rm_col[c].rc_abd != NULL)
  			abd_free(rm->rm_col[c].rc_abd);
  
  		if (rm->rm_col[c].rc_gdata != NULL)
  			zio_buf_free(rm->rm_col[c].rc_gdata,
  			    rm->rm_col[c].rc_size);
  	}
  
  	size = 0;
  	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ 		/*
+ 		 * TRIM doesn't allocate data blocks,
+ 		 * so 'rc_abd' is NULL in this case.
+ 		 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+ 		 * for more details.
+ 		 */
+ 		if (rm->rm_col[c].rc_abd != NULL)
  			abd_put(rm->rm_col[c].rc_abd);
  		size += rm->rm_col[c].rc_size;
  	}
  
  	if (rm->rm_abd_copy != NULL)
*** 454,469 ****
  	vdev_raidz_map_free_vsd,
  	vdev_raidz_cksum_report
  };
  
  /*
! * Divides the IO evenly across all child vdevs; usually, dcols is
! * the number of children in the target vdev.
  */
  static raidz_map_t *
  vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
!     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
  {
  	raidz_map_t *rm;
  	/* The starting RAIDZ (parent) vdev sector of the block. */
  	uint64_t b = offset >> unit_shift;
  	/* The zio's size in units of the vdev's minimum sector size. */
--- 480,510 ----
  	vdev_raidz_map_free_vsd,
  	vdev_raidz_cksum_report
  };
  
  /*
! * Allocates and computes a raidz column map, which directs the raidz column
! * handling algorithms where to locate and store data and parity columns for
! * a particular DVA. Usually, dcols is the number of children in the target
! * vdev.
! *
! * The `offset', `size' and `abd' parameters hold the offset, size and data
! * of the zio for which this map is to be computed.
! * The `unit_shift' parameter contains the minimum allocation bitshift of
! * the storage pool. The `dcols' parameter contains the number of drives in
! * this raidz vdev (including parity drives), with `nparity' denoting how
! * many of those hold parity (one, two or three).
! *
! * The `alloc_data' flag denotes whether you want the constructed raidz
! * map to contain allocated buffers to hold column IO data or not (if
! * you're using this function simply to determine raidz geometry, you'll
! * want to pass B_FALSE here).
  */
  static raidz_map_t *
  vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
!     uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
!     boolean_t alloc_data)
  {
  	raidz_map_t *rm;
  	/* The starting RAIDZ (parent) vdev sector of the block. */
  	uint64_t b = offset >> unit_shift;
  	/* The zio's size in units of the vdev's minimum sector size. */
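Both call forms appear later in this change; as a condensed usage sketch of the two modes (argument names abbreviated from the actual call sites), an I/O zio attaches its data buffers, while a geometry-only caller such as the TRIM path passes no abd at all:

	/* I/O path: columns get buffers backed by the zio's abd */
	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity, B_TRUE);

	/* TRIM path: geometry only, no column buffers are attached */
	rm = vdev_raidz_map_alloc(NULL, psize, offset,
	    tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity, B_FALSE);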
*** 554,574 ****
  	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
  	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
  	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
  	ASSERT3U(rm->rm_nskip, <=, nparity);
  
! 	for (c = 0; c < rm->rm_firstdatacol; c++)
  		rm->rm_col[c].rc_abd =
  		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
  
  	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
  	off = rm->rm_col[c].rc_size;
  
  	for (c = c + 1; c < acols; c++) {
  		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
  		off += rm->rm_col[c].rc_size;
  	}
  
  	/*
  	 * If all data stored spans all columns, there's a danger that parity
  	 * will always be on the same device and, since parity isn't read
  	 * during normal operation, that that device's I/O bandwidth won't be
--- 595,618 ----
  	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
  	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
  	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
  	ASSERT3U(rm->rm_nskip, <=, nparity);
  
! 	if (alloc_data) {
! 		for (c = 0; c < rm->rm_firstdatacol; c++) {
  		rm->rm_col[c].rc_abd =
  		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
+ 		}
  
  	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
  	off = rm->rm_col[c].rc_size;
  
  	for (c = c + 1; c < acols; c++) {
  		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
  		off += rm->rm_col[c].rc_size;
  	}
+ 	}
  
  	/*
  	 * If all data stored spans all columns, there's a danger that parity
  	 * will always be on the same device and, since parity isn't read
  	 * during normal operation, that that device's I/O bandwidth won't be
*** 665,682 ****
  	}
  
  	return (0);
  }
  
  static void
  vdev_raidz_generate_parity_p(raidz_map_t *rm)
  {
  	uint64_t *p;
  	int c;
  	abd_t *src;
  
  	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
  		src = rm->rm_col[c].rc_abd;
  		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
  
  		if (c == rm->rm_firstdatacol) {
  			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
--- 709,805 ----
  	}
  
  	return (0);
  }
  
+ /*
+  * Software acceleration of XOR calculations, requirements:
+  *
+  * the (src/dst) vectors need to be 64-byte aligned
+  * all the vectors have to be the same size
+  */
+ #define	RAIDZ_ACCELERATION_ALIGNMENT	64ul
+ #define	UNALIGNED(addr)	\
+ 	((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
+ 
  static void
  vdev_raidz_generate_parity_p(raidz_map_t *rm)
  {
  	uint64_t *p;
  	int c;
  	abd_t *src;
+ #if 0
+ 	/* FIXME: needs to be reviewed and changed to support ABD */
+ 	int parity_done;
+ 	void *va[16];
+ 	void **array;
+ 	int j, nvects;
+ 
+ 	parity_done = 0;
+ 	while (0 && zfs_xorp_hook && !parity_done) {
+ 		unsigned long no_accel = 0;
+ 		/* at least two columns (plus one for result) */
+ 		if (rm->rm_cols < 3) {
+ 			DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
+ 			break;
+ 		}
+ 		/* check sizes and alignment */
+ 		no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
+ 		if (no_accel) {
+ 			DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
+ 			    no_accel);
+ 			break;
+ 		}
+ 		pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
+ 		nvects = 1; /* for the destination */
  	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ 			no_accel = UNALIGNED(rm->rm_col[c].rc_data);
+ 			if (no_accel) {
+ 				DTRACE_PROBE1(raidz_unaligned_src,
+ 				    unsigned long, no_accel);
+ 				break;
+ 			}
+ 			if (rm->rm_col[c].rc_size != pcount) {
+ 				DTRACE_PROBE(raidz_sizes_vary);
+ 				no_accel = 1;
+ 				break;
+ 			}
+ 			nvects++;
+ 		}
+ 		if (no_accel)
+ 			break;
+ 		if (nvects > 16) {
+ 			array = kmem_alloc(nvects * sizeof (void *),
+ 			    KM_NOSLEEP);
+ 			if (array == NULL) {
+ 				DTRACE_PROBE(raidz_alloc_failed);
+ 				break;
+ 			}
+ 		} else {
+ 			array = va;
+ 		}
+ 		for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
+ 		    c++, j++) {
+ 			array[j] = rm->rm_col[c].rc_data;
+ 		}
+ 		array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ 		if (zfs_xorp_hook(nvects,
+ 		    rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
+ 			DTRACE_PROBE(raidz_accel_failure);
+ 			break;
+ 		}
+ 		if (array != va) {
+ 			kmem_free(array, nvects * sizeof (void *));
+ 		}
+ 		parity_done = 1;
+ 		DTRACE_PROBE(raidz_accel_success);
+ 	}
+ 	if (parity_done)
+ 		return;
+ #endif
+ 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
  		src = rm->rm_col[c].rc_abd;
  		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
  
  		if (c == rm->rm_firstdatacol) {
  			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
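The array-fill loop above stores the data columns first and the P (destination) column last, so an implementation of the zfs_xorp_hook contract would presumably XOR the first vects - 1 vectors into the final one. A hedged sketch of such a hook; the destination-last convention is inferred from the disabled code above, not documented elsewhere:

	/*
	 * Possible zfs_xorp_hook implementation: XOR vects - 1 equally sized,
	 * 64-byte-aligned source vectors into array[vects - 1]. Returns 0 on
	 * success, matching the call site's error check.
	 */
	static int
	example_xorp(int vects, int len, void **array)
	{
		uint64_t *dst = array[vects - 1];

		for (int v = 0; v < vects - 1; v++) {
			const uint64_t *src = array[v];

			for (int i = 0; i < len / 8; i++) {
				if (v == 0)
					dst[i] = src[i];	/* first source: copy */
				else
					dst[i] ^= src[i];	/* the rest: XOR in */
			}
		}
		return (0);
	}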
*** 1807,1817 ****
  	 */
  	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
  	    SPA_OLD_MAXBLOCKSIZE);
  	rm = vdev_raidz_map_alloc(abd,
  	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
! 	    vd->vdev_children, vd->vdev_nparity);
  
  	coloffset = origoffset;
  
  	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
  	    c++, coloffset += rc->rc_size) {
--- 1930,1940 ----
  	 */
  	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
  	    SPA_OLD_MAXBLOCKSIZE);
  	rm = vdev_raidz_map_alloc(abd,
  	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
! 	    vd->vdev_children, vd->vdev_nparity, B_TRUE);
  
  	coloffset = origoffset;
  
  	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
  	    c++, coloffset += rc->rc_size) {
*** 1872,1881 ****
--- 1995,2040 ----
  	asize = roundup(asize, nparity + 1) << ashift;
  
  	return (asize);
  }
  
+ /*
+  * Converts an allocated size on a raidz vdev back to a logical block
+  * size. This is used in trimming to figure out the appropriate logical
+  * size to pass to vdev_raidz_map_alloc when splitting up extents of free
+  * space obtained from metaslabs. However, a range of free space on a
+  * raidz vdev might have originally consisted of multiple blocks and
+  * those, taken together with their skip blocks, might not always align
+  * neatly to a new vdev_raidz_map_alloc covering the entire unified
+  * range. So to ensure that the newly allocated raidz map *always* fits
+  * within the asize passed to this function and never exceeds it (since
+  * that might trim allocated data past it), we round it down to the
+  * nearest suitable multiple of the vdev ashift (hence the "_floor" in
+  * this function's name).
+  * This function is in effect an inverse of vdev_raidz_asize. However,
+  * since multiple psizes can map to a single asize (due to variable
+  * padding), this function instead returns the largest chunk that still
+  * fits inside the specified asize.
+  */
+ static uint64_t
+ vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
+ {
+ 	uint64_t psize;
+ 	uint64_t ashift = vd->vdev_top->vdev_ashift;
+ 	uint64_t cols = vd->vdev_children;
+ 	uint64_t nparity = vd->vdev_nparity;
+ 
+ 	psize = (asize - (nparity << ashift));
+ 	psize /= cols;
+ 	psize *= cols - nparity;
+ 	psize += (1 << ashift) - 1;
+ 
+ 	psize = P2ALIGN(psize, 1 << ashift);
+ 
+ 	return (psize);
+ }
+ 
  static void
  vdev_raidz_child_done(zio_t *zio)
  {
  	raidz_col_t *rc = zio->io_private;
  
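To make the arithmetic concrete, a hedged standalone sketch with illustrative parameters not taken from the patch: on a raidz1 vdev with 5 children and ashift=9, a 12-sector allocation (9 data plus 3 parity sectors) floors back to a 9-sector psize, and feeding that psize through vdev_raidz_asize yields the original 12 sectors again.

	#include <assert.h>
	#include <stdint.h>

	#define	P2ALIGN(x, align)	((x) & -(align))

	/* mirrors the arithmetic of vdev_raidz_psize_floor above */
	static uint64_t
	psize_floor(uint64_t asize, uint64_t ashift, uint64_t cols,
	    uint64_t nparity)
	{
		uint64_t psize = asize - (nparity << ashift);

		psize /= cols;
		psize *= cols - nparity;
		psize += (1 << ashift) - 1;
		return (P2ALIGN(psize, (uint64_t)1 << ashift));
	}

	int
	main(void)
	{
		/* raidz1, 5 disks, ashift=9: 12 sectors -> 9 data sectors */
		assert(psize_floor(12 << 9, 9, 5, 1) == 9 << 9);
		return (0);
	}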
*** 1911,1921 ****
  	raidz_col_t *rc;
  	int c, i;
  
  	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
  	    tvd->vdev_ashift, vd->vdev_children,
! 	    vd->vdev_nparity);
  
  	zio->io_vsd = rm;
  	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
  
  	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
--- 2070,2080 ----
  	raidz_col_t *rc;
  	int c, i;
  
  	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
  	    tvd->vdev_ashift, vd->vdev_children,
! 	    vd->vdev_nparity, B_TRUE);
  
  	zio->io_vsd = rm;
  	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
  
  	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
*** 2000,2009 ****
--- 2159,2170 ----
  static void
  raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
  {
  	void *buf;
  	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+ 	vdev_stat_t *vs = &vd->vdev_stat;
+ 	spa_t *spa = zio->io_spa;
  
  	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
  		zio_bad_cksum_t zbc;
  		raidz_map_t *rm = zio->io_vsd;
  
*** 2018,2027 ****
--- 2179,2194 ----
  		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
  		    rc->rc_offset, rc->rc_size, buf, bad_data, &zbc);
  		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
  	}
+ 
+ 	if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
+ 	    vs->vs_read_errors || vs->vs_write_errors) &&
+ 	    !spa->spa_special_has_errors) {
+ 		spa->spa_special_has_errors = B_TRUE;
+ 	}
  }
  
  /*
   * We keep track of whether or not there were any injected errors, so that
   * any ereports we generate can note it.
   */
*** 2293,2304 ****
  	int total_errors = 0;
  	int n, c;
  	int tgts[VDEV_RAIDZ_MAXPARITY];
  	int code;
  
- 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
- 
  	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
  	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
  
  	for (c = 0; c < rm->rm_cols; c++) {
  		rc = &rm->rm_col[c];
--- 2460,2469 ----
*** 2553,2570 ****
  		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
  	else
  		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
  }
  
  vdev_ops_t vdev_raidz_ops = {
  	vdev_raidz_open,
  	vdev_raidz_close,
  	vdev_raidz_asize,
  	vdev_raidz_io_start,
  	vdev_raidz_io_done,
  	vdev_raidz_state_change,
  	NULL,
  	NULL,
! 	NULL,
  	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
  	B_FALSE			/* not a leaf vdev */
  };
--- 2718,2829 ----
  		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
  	else
  		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
  }
  
+ static inline void
+ vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
+     const raidz_col_t *rc)
+ {
+ 	uint64_t num_exts = *num_extsp;
+ 
+ 	ASSERT(rc->rc_size != 0);
+ 
+ 	if (num_exts > 0 &&
+ 	    dfl->dfl_exts[num_exts - 1].dfle_start +
+ 	    dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
+ 		dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
+ 	} else {
+ 		dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
+ 		dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
+ 		(*num_extsp)++;
+ 	}
+ }
+ 
+ /*
+  * Processes a trim for a raidz vdev.
+  */
+ static void
+ vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
+ {
+ 	dkioc_free_list_t *dfl = trim_exts;
+ 	dkioc_free_list_t **sub_dfls;
+ 	uint64_t *sub_dfls_num_exts;
+ 
+ 	sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
+ 	    KM_SLEEP);
+ 	sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
+ 	    KM_SLEEP);
+ 	for (int i = 0; i < vd->vdev_children; i++) {
+ 		/*
+ 		 * We might over-allocate here, because the sub-lists can never
+ 		 * be longer than the parent list, but they can be shorter.
+ 		 * The underlying driver will discard zero-length extents.
+ 		 */
+ 		sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
+ 		sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
+ 		sub_dfls[i]->dfl_flags = dfl->dfl_flags;
+ 		sub_dfls[i]->dfl_offset = dfl->dfl_offset;
+ 		/* don't copy the check func, because it isn't raidz-aware */
+ 	}
+ 
+ 	/*
+ 	 * Process all extents and redistribute them to the component vdevs
+ 	 * according to a computed raidz map geometry.
+ 	 */
+ 	for (int i = 0; i < dfl->dfl_num_exts; i++) {
+ 		uint64_t start = dfl->dfl_exts[i].dfle_start;
+ 		uint64_t length = dfl->dfl_exts[i].dfle_length;
+ 		raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
+ 		    vdev_raidz_psize_floor(vd, length), start,
+ 		    vd->vdev_top->vdev_ashift, vd->vdev_children,
+ 		    vd->vdev_nparity, B_FALSE);
+ 
+ 		for (uint64_t j = 0; j < rm->rm_cols; j++) {
+ 			uint64_t devidx = rm->rm_col[j].rc_devidx;
+ 			vdev_raidz_trim_append_rc(sub_dfls[devidx],
+ 			    &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
+ 		}
+ 		vdev_raidz_map_free(rm);
+ 	}
+ 
+ 	/*
+ 	 * Issue the component ioctls as children of the parent zio.
+ 	 */
+ 	for (int i = 0; i < vd->vdev_children; i++) {
+ 		if (sub_dfls_num_exts[i] != 0) {
+ 			zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
+ 			    vd->vdev_child[i], DKIOCFREE,
+ 			    vdev_raidz_trim_done, sub_dfls[i],
+ 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ 			    ZIO_FLAG_DONT_RETRY));
+ 		} else {
+ 			dfl_free(sub_dfls[i]);
+ 		}
+ 	}
+ 	kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
+ 	kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
+ }
+ 
+ /*
+  * Releases a dkioc_free_list_t from ioctls issued to component devices in
+  * vdev_raidz_trim.
+  */
+ static void
+ vdev_raidz_trim_done(zio_t *zio)
+ {
+ 	ASSERT(zio->io_private != NULL);
+ 	dfl_free(zio->io_private);
+ }
+ 
  vdev_ops_t vdev_raidz_ops = {
  	vdev_raidz_open,
  	vdev_raidz_close,
  	vdev_raidz_asize,
  	vdev_raidz_io_start,
  	vdev_raidz_io_done,
  	vdev_raidz_state_change,
  	NULL,
  	NULL,
! 	vdev_raidz_trim,
  	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
  	B_FALSE			/* not a leaf vdev */
  };
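To illustrate the merge rule in vdev_raidz_trim_append_rc: column extents that land back-to-back on the same child device coalesce into a single extent, while a gap starts a new entry. A hedged userland sketch; ext_t and append_ext are pared-down stand-ins for the dkioc_free_list_t machinery, not actual kernel code:

	#include <assert.h>
	#include <stdint.h>

	/* pared-down stand-in for dkioc_free_list_ext_t */
	typedef struct { uint64_t dfle_start, dfle_length; } ext_t;

	/*
	 * Append an extent, merging it with the previous one when it starts
	 * exactly where that one ends -- the same rule the kernel helper uses.
	 */
	static void
	append_ext(ext_t *exts, uint64_t *num, uint64_t start, uint64_t length)
	{
		if (*num > 0 &&
		    exts[*num - 1].dfle_start + exts[*num - 1].dfle_length ==
		    start)
			exts[*num - 1].dfle_length += length;
		else
			exts[(*num)++] = (ext_t){ start, length };
	}

	int
	main(void)
	{
		ext_t exts[4];
		uint64_t num = 0;

		append_ext(exts, &num, 0, 4096);	/* new extent */
		append_ext(exts, &num, 4096, 4096);	/* adjacent: merged */
		append_ext(exts, &num, 16384, 4096);	/* gap: new extent */

		assert(num == 2);
		assert(exts[0].dfle_length == 8192);
		return (0);
	}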