Print this page
NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
usr/src/common/zfs/zpool_prop.c
usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
usr/src/uts/common/io/scsi/targets/sd.c
usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint -courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
@@ -22,23 +22,26 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/dkioc_free_util.h>
/*
* Virtual device vector for RAID-Z.
*
* This vdev supports single, double, and triple parity. For single parity,
@@ -165,11 +168,19 @@
/*
* Force reconstruction to use the general purpose method.
*/
int vdev_raidz_default_to_general;
-/* Powers of 2 in the Galois field defined above. */
+/*
+ * xor_p hook for external acceleration libraries.
+ */
+int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
+
+/*
+ * These two tables represent powers and logs of 2 in the Galois field defined
+ * above. These values were computed by repeatedly multiplying by 2 as above.
+ */
static const uint8_t vdev_raidz_pow2[256] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
@@ -237,10 +248,11 @@
0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
static void vdev_raidz_generate_parity(raidz_map_t *rm);
+static void vdev_raidz_trim_done(zio_t *zio);
/*
* Multiply a given number by 2 raised to the given power.
*/
static uint8_t
@@ -264,19 +276,33 @@
{
int c;
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
+ /*
+ * TRIM doesn't allocate data blocks,
+ * so 'rc_abd' is NULL in this case.
+ * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+ * for more details.
+ */
+ if (rm->rm_col[c].rc_abd != NULL)
abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
rm->rm_col[c].rc_size);
}
size = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ /*
+ * TRIM doesn't allocate data blocks,
+ * so 'rc_abd' is NULL in this case
+ * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+ * for more details.
+ */
+ if (rm->rm_col[c].rc_abd != NULL)
abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size;
}
if (rm->rm_abd_copy != NULL)
@@ -454,16 +480,31 @@
vdev_raidz_map_free_vsd,
vdev_raidz_cksum_report
};
/*
- * Divides the IO evenly across all child vdevs; usually, dcols is
- * the number of children in the target vdev.
+ * Allocates and computes a raidz column map, which directs the raidz column
+ * handling algorithms where to locate and store data and parity columns for
+ * a particular DVA. Usually, dcols is the number of children in the target
+ * vdev.
+ *
+ * The `io_offset', `io_size' and `io_data' hold the offset, size and data
+ * of the zio for which this map is to be computed.
+ * The `unit_shift' parameter contains the minimum allocation bitshift of
+ * the storage pool. The `dcols' parameter contains the number of drives in
+ * this raidz vdev (including parity drives), with `nparity' denoting how
+ * many those contain the parity (one, two or three).
+ *
+ * The `alloc_io_bufs' flag denotes whether you want the constructed raidz
+ * map to contain allocated buffers to hold column IO data or not (if
+ * you're using this function simply to determine raidz geometry, you'll
+ * want to pass B_FALSE here).
*/
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
- uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
+ boolean_t alloc_data)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
@@ -554,21 +595,24 @@
rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
- for (c = 0; c < rm->rm_firstdatacol; c++)
+ if (alloc_data) {
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
rm->rm_col[c].rc_abd =
abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
+ }
rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
off = rm->rm_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
off += rm->rm_col[c].rc_size;
}
+ }
/*
* If all data stored spans all columns, there's a danger that parity
* will always be on the same device and, since parity isn't read
* during normal operation, that that device's I/O bandwidth won't be
@@ -665,18 +709,97 @@
}
return (0);
}
+/*
+ * software acceleration of XOR calculations, requirements
+ *
+ * the (src/dst) vectors needs to be 64 byte aligned
+ * all the vectors have to be the same size
+ */
+#define RAIDZ_ACCELERATION_ALIGNMENT 64ul
+#define UNALIGNED(addr) \
+ ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
+
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
uint64_t *p;
int c;
abd_t *src;
+#if 0
+ /* FIXME: needs to be reviewed and changed to support ABD */
+ int parity_done;
+ void *va[16];
+ void **array;
+ int j, nvects;
+
+ parity_done = 0;
+ while (0 && zfs_xorp_hook && !parity_done) {
+ unsigned long no_accel = 0;
+ /* at least two columns (plus one for result) */
+ if (rm->rm_cols < 3) {
+ DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
+ break;
+ }
+ /* check sizes and alignment */
+ no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
+ if (no_accel) {
+ DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
+ no_accel);
+ break;
+ }
+ pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
+ nvects = 1; /* for the destination */
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ no_accel = UNALIGNED(rm->rm_col[c].rc_data);
+ if (no_accel) {
+ DTRACE_PROBE1(raidz_unaligned_src,
+ unsigned long, no_accel);
+ break;
+ }
+ if (rm->rm_col[c].rc_size != pcount) {
+ DTRACE_PROBE(raidz_sizes_vary);
+ no_accel = 1;
+ break;
+ }
+ nvects++;
+ }
+ if (no_accel)
+ break;
+ if (nvects > 16) {
+ array = kmem_alloc(nvects * sizeof (void *),
+ KM_NOSLEEP);
+ if (array == NULL) {
+ DTRACE_PROBE(raidz_alloc_failed);
+ break;
+ }
+ } else {
+ array = va;
+ }
+ for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, j++) {
+ array[j] = rm->rm_col[c].rc_data;
+ }
+ array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ if (zfs_xorp_hook(nvects,
+ rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
+ DTRACE_PROBE(raidz_accel_failure);
+ break;
+ }
+ if (array != va) {
+ kmem_free(array, nvects * sizeof (void *));
+ }
+ parity_done = 1;
+ DTRACE_PROBE(raidz_accel_success);
+ }
+ if (parity_done)
+ return;
+#endif
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_abd;
p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
@@ -1807,11 +1930,11 @@
*/
abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
SPA_OLD_MAXBLOCKSIZE);
rm = vdev_raidz_map_alloc(abd,
SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
- vd->vdev_children, vd->vdev_nparity);
+ vd->vdev_children, vd->vdev_nparity, B_TRUE);
coloffset = origoffset;
for (c = rm->rm_firstdatacol; c < rm->rm_cols;
c++, coloffset += rc->rc_size) {
@@ -1872,10 +1995,46 @@
asize = roundup(asize, nparity + 1) << ashift;
return (asize);
}
+/*
+ * Converts an allocated size on a raidz vdev back to a logical block
+ * size. This is used in trimming to figure out the appropriate logical
+ * size to pass to vdev_raidz_map_alloc when splitting up extents of free
+ * space obtained from metaslabs. However, a range of free space on a
+ * raidz vdev might have originally consisted of multiple blocks and
+ * those, taken together with their skip blocks, might not always align
+ * neatly to a new vdev_raidz_map_alloc covering the entire unified
+ * range. So to ensure that the newly allocated raidz map *always* fits
+ * within the asize passed to this function and never exceeds it (since
+ * that might trim allocated data past it), we round it down to the
+ * nearest suitable multiple of the vdev ashift (hence the "_floor" in
+ * this function's name).
+ * This function is in effect an inverse of vdev_raidz_asize. However,
+ * since multiple psizes can map to a single asize (due to variable
+ * padding), this function instead returns the largest chunk that still
+ * fits inside the specified asize.
+ */
+static uint64_t
+vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
+{
+ uint64_t psize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+
+ /* strip one sector of parity per parity column, then scale the */
+ /* remainder by the data fraction of a full stripe (truncating) */
+ psize = (asize - (nparity << ashift));
+ psize /= cols;
+ psize *= cols - nparity;
+ /*
+ * NOTE(review): adding (1 << ashift) - 1 before P2ALIGN rounds the
+ * truncated quotient *up* to the next ashift multiple, compensating
+ * for the truncating divide above; despite the "_floor" name the
+ * net effect is a round-up of the scaled value -- confirm against
+ * vdev_raidz_asize that the result can never exceed asize.
+ */
+ psize += (1 << ashift) - 1;
+
+ psize = P2ALIGN(psize, 1 << ashift);
+
+ return (psize);
+}
+
static void
vdev_raidz_child_done(zio_t *zio)
{
raidz_col_t *rc = zio->io_private;
@@ -1911,11 +2070,11 @@
raidz_col_t *rc;
int c, i;
rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ vd->vdev_nparity, B_TRUE);
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
@@ -2000,10 +2159,12 @@
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+ vdev_stat_t *vs = &vd->vdev_stat;
+ spa_t *spa = zio->io_spa;
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
@@ -2018,10 +2179,16 @@
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc);
abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}
+
+ if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
+ vs->vs_read_errors || vs->vs_write_errors) &&
+ !spa->spa_special_has_errors) {
+ spa->spa_special_has_errors = B_TRUE;
+ }
}
/*
* We keep track of whether or not there were any injected errors, so that
* any ereports we generate can note it.
@@ -2293,12 +2460,10 @@
int total_errors = 0;
int n, c;
int tgts[VDEV_RAIDZ_MAXPARITY];
int code;
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
-
ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
@@ -2553,18 +2718,112 @@
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
+/*
+ * Appends one raidz column's physical extent (rc_offset/rc_size on the
+ * child vdev rc belongs to) to the per-child free list `dfl'.
+ * `*num_extsp' is the running count of extents already stored in `dfl'
+ * and is advanced when a new extent slot is consumed.
+ */
+static inline void
+vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
+ const raidz_col_t *rc)
+{
+ uint64_t num_exts = *num_extsp;
+ ASSERT(rc->rc_size != 0);
+
+ /*
+ * Coalesce with the previous extent when the new column is
+ * physically contiguous with it; otherwise start a new extent.
+ * The guard must test the running count `num_exts', not
+ * dfl->dfl_num_exts: the caller (vdev_raidz_trim) presets
+ * dfl_num_exts to the parent list's length, so the old test was
+ * true even when this list was still empty and dereferenced
+ * dfl_exts[-1] -- an out-of-bounds read.
+ */
+ if (num_exts > 0 &&
+ dfl->dfl_exts[num_exts - 1].dfle_start +
+ dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
+ dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
+ } else {
+ dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
+ dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
+ (*num_extsp)++;
+ }
+}
+
+/*
+ * Processes a trim for a raidz vdev.
+ *
+ * The incoming extent list `trim_exts' describes free space in the
+ * raidz vdev's logical address space.  Each extent is run through
+ * vdev_raidz_map_alloc() with alloc_data == B_FALSE (geometry only,
+ * no column buffers) to compute which physical ranges on which child
+ * vdevs it covers, and those column extents -- parity included -- are
+ * gathered into one dkioc_free_list_t per child.  Each non-empty
+ * per-child list is then issued as a DKIOCFREE ioctl zio under `pio';
+ * ownership of such a list passes to the zio and it is released by
+ * vdev_raidz_trim_done().  Empty lists are freed here.
+ */
+static void
+vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
+{
+ dkioc_free_list_t *dfl = trim_exts;
+ dkioc_free_list_t **sub_dfls;
+ uint64_t *sub_dfls_num_exts;
+
+ /* one free list and one running extent count per child vdev */
+ sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
+ KM_SLEEP);
+ sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
+ KM_SLEEP);
+ for (int i = 0; i < vd->vdev_children; i++) {
+ /*
+ * We might over-allocate here, because the sub-lists can never
+ * be longer than the parent list, but they can be shorter.
+ * The underlying driver will discard zero-length extents.
+ */
+ sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
+ sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
+ sub_dfls[i]->dfl_flags = dfl->dfl_flags;
+ sub_dfls[i]->dfl_offset = dfl->dfl_offset;
+ /* don't copy the check func, because it isn't raidz-aware */
+ }
+
+ /*
+ * Process all extents and redistribute them to the component vdevs
+ * according to a computed raidz map geometry.
+ */
+ for (int i = 0; i < dfl->dfl_num_exts; i++) {
+ uint64_t start = dfl->dfl_exts[i].dfle_start;
+ uint64_t length = dfl->dfl_exts[i].dfle_length;
+ /*
+ * The length is floored via vdev_raidz_psize_floor so the
+ * computed map never extends past the free extent (which
+ * could otherwise trim live data).
+ */
+ raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
+ vdev_raidz_psize_floor(vd, length), start,
+ vd->vdev_top->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity, B_FALSE);
+
+ /* route every column, parity included, to its child's list */
+ for (uint64_t j = 0; j < rm->rm_cols; j++) {
+ uint64_t devidx = rm->rm_col[j].rc_devidx;
+ vdev_raidz_trim_append_rc(sub_dfls[devidx],
+ &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
+ }
+ vdev_raidz_map_free(rm);
+ }
+
+ /*
+ * Issue the component ioctls as children of the parent zio.
+ * The issued sub-lists are freed by vdev_raidz_trim_done();
+ * unused (empty) ones are freed right here.
+ */
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (sub_dfls_num_exts[i] != 0) {
+ zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
+ vd->vdev_child[i], DKIOCFREE,
+ vdev_raidz_trim_done, sub_dfls[i],
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY));
+ } else {
+ dfl_free(sub_dfls[i]);
+ }
+ }
+ kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
+ kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
+}
+
+/*
+ * Completion callback for the DKIOCFREE ioctl zios issued to component
+ * devices by vdev_raidz_trim(); releases the per-child
+ * dkioc_free_list_t that was allocated there and attached as
+ * zio->io_private.
+ */
+static void
+vdev_raidz_trim_done(zio_t *zio)
+{
+ ASSERT(zio->io_private != NULL);
+ dfl_free(zio->io_private);
+}
+
/*
 * Operation vector for raidz vdevs.  This change replaces the
 * previously-NULL trim slot with vdev_raidz_trim, wiring TRIM support
 * into raidz.  NOTE(review): the meaning of the two remaining NULL
 * slots is not visible here -- confirm against the vdev_ops_t
 * definition in sys/vdev_impl.h.
 */
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
vdev_raidz_asize,
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
NULL,
NULL,
- NULL,
+ vdev_raidz_trim,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};