Print this page
NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint -courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code

@@ -22,23 +22,26 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
+#include <sys/dkioc_free_util.h>
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,

@@ -165,11 +168,19 @@
 /*
  * Force reconstruction to use the general purpose method.
  */
 int vdev_raidz_default_to_general;
 
-/* Powers of 2 in the Galois field defined above. */
+/*
+ * xor_p hook for external acceleration libraries.
+ */
+int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
+
+/*
+ * These two tables represent powers and logs of 2 in the Galois field defined
+ * above. These values were computed by repeatedly multiplying by 2 as above.
+ */
 static const uint8_t vdev_raidz_pow2[256] = {
         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,

@@ -237,10 +248,11 @@
         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 };
 
 static void vdev_raidz_generate_parity(raidz_map_t *rm);
+static void vdev_raidz_trim_done(zio_t *zio);
 
 /*
  * Multiply a given number by 2 raised to the given power.
  */
 static uint8_t

@@ -264,19 +276,33 @@
 {
         int c;
         size_t size;
 
         for (c = 0; c < rm->rm_firstdatacol; c++) {
+                /*
+                 * TRIM doesn't allocate data blocks,
+                 * so 'rc_abd' is NULL in this case.
+                 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+                 * for more details.
+                 */
+                if (rm->rm_col[c].rc_abd != NULL)
                 abd_free(rm->rm_col[c].rc_abd);
 
                 if (rm->rm_col[c].rc_gdata != NULL)
                         zio_buf_free(rm->rm_col[c].rc_gdata,
                             rm->rm_col[c].rc_size);
         }
 
         size = 0;
         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+                /*
+                 * TRIM doesn't allocate data blocks,
+                 * so 'rc_abd' is NULL in this case
+                 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
+                 * for more details.
+                 */
+                if (rm->rm_col[c].rc_abd != NULL)
                 abd_put(rm->rm_col[c].rc_abd);
                 size += rm->rm_col[c].rc_size;
         }
 
         if (rm->rm_abd_copy != NULL)

@@ -454,16 +480,31 @@
         vdev_raidz_map_free_vsd,
         vdev_raidz_cksum_report
 };
 
 /*
- * Divides the IO evenly across all child vdevs; usually, dcols is
- * the number of children in the target vdev.
+ * Allocates and computes a raidz column map, which directs the raidz column
+ * handling algorithms where to locate and store data and parity columns for
+ * a particular DVA. Usually, dcols is the number of children in the target
+ * vdev.
+ *
+ * The `io_offset', `io_size' and `io_data' hold the offset, size and data
+ * of the zio for which this map is to be computed.
+ * The `unit_shift' parameter contains the minimum allocation bitshift of
+ * the storage pool. The `dcols' parameter contains the number of drives in
+ * this raidz vdev (including parity drives), with `nparity' denoting how
+ * many those contain the parity (one, two or three).
+ *
+ * The `alloc_io_bufs' flag denotes whether you want the constructed raidz
+ * map to contain allocated buffers to hold column IO data or not (if
+ * you're using this function simply to determine raidz geometry, you'll
+ * want to pass B_FALSE here).
  */
 static raidz_map_t *
 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
-    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
+    uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
+    boolean_t alloc_data)
 {
         raidz_map_t *rm;
         /* The starting RAIDZ (parent) vdev sector of the block. */
         uint64_t b = offset >> unit_shift;
         /* The zio's size in units of the vdev's minimum sector size. */

@@ -554,21 +595,24 @@
         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
         ASSERT3U(rm->rm_nskip, <=, nparity);
 
-        for (c = 0; c < rm->rm_firstdatacol; c++)
+        if (alloc_data) {
+                for (c = 0; c < rm->rm_firstdatacol; c++) {
                 rm->rm_col[c].rc_abd =
                     abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
+                }
 
         rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
         off = rm->rm_col[c].rc_size;
 
         for (c = c + 1; c < acols; c++) {
                 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
                 off += rm->rm_col[c].rc_size;
         }
+        }
 
         /*
          * If all data stored spans all columns, there's a danger that parity
          * will always be on the same device and, since parity isn't read
          * during normal operation, that that device's I/O bandwidth won't be

@@ -665,18 +709,97 @@
         }
 
         return (0);
 }
 
+/*
+ * software acceleration of XOR calculations, requirements
+ *
+ * the (src/dst) vectors needs to be 64 byte aligned
+ * all the vectors have to be the same size
+ */
+#define RAIDZ_ACCELERATION_ALIGNMENT    64ul
+#define UNALIGNED(addr) \
+        ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
+
 static void
 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 {
         uint64_t *p;
         int c;
         abd_t *src;
 
+#if 0
+        /* FIXME: needs to be reviewed and changed to support ABD */
+        int parity_done;
+        void *va[16];
+        void **array;
+        int j, nvects;
+
+        parity_done = 0;
+        while (0 && zfs_xorp_hook && !parity_done) {
+                unsigned long no_accel = 0;
+                /* at least two columns (plus one for result) */
+                if (rm->rm_cols < 3) {
+                        DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
+                        break;
+                }
+                /* check sizes and alignment */
+                no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
+                if (no_accel) {
+                        DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
+                            no_accel);
+                        break;
+                }
+                pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
+                nvects = 1; /* for the destination */
         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+                        no_accel = UNALIGNED(rm->rm_col[c].rc_data);
+                        if (no_accel) {
+                                DTRACE_PROBE1(raidz_unaligned_src,
+                                    unsigned long, no_accel);
+                                break;
+                        }
+                        if (rm->rm_col[c].rc_size != pcount) {
+                                DTRACE_PROBE(raidz_sizes_vary);
+                                no_accel = 1;
+                                break;
+                        }
+                        nvects++;
+                }
+                if (no_accel)
+                        break;
+                if (nvects > 16) {
+                        array = kmem_alloc(nvects * sizeof (void *),
+                            KM_NOSLEEP);
+                        if (array == NULL) {
+                                DTRACE_PROBE(raidz_alloc_failed);
+                                break;
+                        }
+                } else {
+                        array = va;
+                }
+                for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
+                    c++, j++) {
+                        array[j] = rm->rm_col[c].rc_data;
+                }
+                array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+                if (zfs_xorp_hook(nvects,
+                    rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
+                        DTRACE_PROBE(raidz_accel_failure);
+                        break;
+                }
+                if (array != va) {
+                        kmem_free(array, nvects * sizeof (void *));
+                }
+                parity_done = 1;
+                DTRACE_PROBE(raidz_accel_success);
+        }
+        if (parity_done)
+                return;
+#endif
+        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                 src = rm->rm_col[c].rc_abd;
                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 
                 if (c == rm->rm_firstdatacol) {
                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);

@@ -1807,11 +1930,11 @@
          */
         abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
             SPA_OLD_MAXBLOCKSIZE);
         rm = vdev_raidz_map_alloc(abd,
             SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
-            vd->vdev_children, vd->vdev_nparity);
+            vd->vdev_children, vd->vdev_nparity, B_TRUE);
 
         coloffset = origoffset;
 
         for (c = rm->rm_firstdatacol; c < rm->rm_cols;
             c++, coloffset += rc->rc_size) {

@@ -1872,10 +1995,46 @@
         asize = roundup(asize, nparity + 1) << ashift;
 
         return (asize);
 }
 
+/*
+ * Converts an allocated size on a raidz vdev back to a logical block
+ * size. This is used in trimming to figure out the appropriate logical
+ * size to pass to vdev_raidz_map_alloc when splitting up extents of free
+ * space obtained from metaslabs. However, a range of free space on a
+ * raidz vdev might have originally consisted of multiple blocks and
+ * those, taken together with their skip blocks, might not always align
+ * neatly to a new vdev_raidz_map_alloc covering the entire unified
+ * range. So to ensure that the newly allocated raidz map *always* fits
+ * within the asize passed to this function and never exceeds it (since
+ * that might trim allocated data past it), we round it down to the
+ * nearest suitable multiple of the vdev ashift (hence the "_floor" in
+ * this function's name).
+ * This function is in effect an inverse of vdev_raidz_asize. However,
+ * since multiple psizes can map to a single asize (due to variable
+ * padding), this function instead returns the largest chunk that still
+ * fits inside the specified asize.
+ */
+static uint64_t
+vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
+{
+        uint64_t psize;
+        uint64_t ashift = vd->vdev_top->vdev_ashift;
+        uint64_t cols = vd->vdev_children;
+        uint64_t nparity = vd->vdev_nparity;
+
+        /* Strip the parity sectors, then scale by the data/total ratio. */
+        psize = (asize - (nparity << ashift));
+        psize /= cols;
+        psize *= cols - nparity;
+        /*
+         * Use 64-bit shifts here: `1 << ashift' is plain int arithmetic
+         * and would be undefined for ashift >= 31; every other ashift
+         * computation in this function is carried out in uint64_t.
+         */
+        psize += (1ULL << ashift) - 1;
+
+        psize = P2ALIGN(psize, 1ULL << ashift);
+
+        return (psize);
+}
+
 static void
 vdev_raidz_child_done(zio_t *zio)
 {
         raidz_col_t *rc = zio->io_private;
 

@@ -1911,11 +2070,11 @@
         raidz_col_t *rc;
         int c, i;
 
         rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
             tvd->vdev_ashift, vd->vdev_children,
-            vd->vdev_nparity);
+            vd->vdev_nparity, B_TRUE);
 
         zio->io_vsd = rm;
         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 
         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

@@ -2000,10 +2159,12 @@
 static void
 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
         void *buf;
         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+        vdev_stat_t *vs = &vd->vdev_stat;
+        spa_t *spa = zio->io_spa;
 
         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                 zio_bad_cksum_t zbc;
                 raidz_map_t *rm = zio->io_vsd;
 

@@ -2018,10 +2179,16 @@
                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
                     rc->rc_offset, rc->rc_size, buf, bad_data,
                     &zbc);
                 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
         }
+
+        if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
+            vs->vs_read_errors || vs->vs_write_errors) &&
+            !spa->spa_special_has_errors) {
+                spa->spa_special_has_errors = B_TRUE;
+        }
 }
 
 /*
  * We keep track of whether or not there were any injected errors, so that
  * any ereports we generate can note it.

@@ -2293,12 +2460,10 @@
         int total_errors = 0;
         int n, c;
         int tgts[VDEV_RAIDZ_MAXPARITY];
         int code;
 
-        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
-
         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
 
         for (c = 0; c < rm->rm_cols; c++) {
                 rc = &rm->rm_col[c];

@@ -2553,18 +2718,112 @@
                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
         else
                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
+/*
+ * Appends the extent covered by a single raidz column `rc' to the
+ * per-child free list `dfl', merging it with the previously appended
+ * extent when the two are physically contiguous on the child vdev.
+ * `num_extsp' points to the running count of extents appended to `dfl'
+ * so far and is incremented when a new (non-merged) extent is added.
+ */
+static inline void
+vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
+    const raidz_col_t *rc)
+{
+        uint64_t num_exts = *num_extsp;
+        ASSERT(rc->rc_size != 0);
+
+        /*
+         * The merge check must be gated on the running count (num_exts),
+         * not on dfl->dfl_num_exts: the caller pre-sizes dfl_num_exts to
+         * the parent list's length before any extent has been appended,
+         * so testing dfl_num_exts here would dereference dfl_exts[-1]
+         * on the very first append.
+         */
+        if (num_exts > 0 &&
+            dfl->dfl_exts[num_exts - 1].dfle_start +
+            dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
+                dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
+        } else {
+                dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
+                dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
+                (*num_extsp)++;
+        }
+}
+
+/*
+ * Processes a trim for a raidz vdev.
+ * Splits each extent of the incoming free list into per-child-vdev
+ * extent lists according to the computed raidz map geometry, then
+ * issues one DKIOCFREE ioctl zio per child that received any extents.
+ */
+static void
+vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
+{
+        dkioc_free_list_t *dfl = trim_exts;
+        dkioc_free_list_t **sub_dfls;
+        uint64_t *sub_dfls_num_exts;
+
+        /* One free list (and running extent count) per child vdev. */
+        sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
+            KM_SLEEP);
+        sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
+            KM_SLEEP);
+        for (int i = 0; i < vd->vdev_children; i++) {
+                /*
+                 * We might over-allocate here, because the sub-lists can never
+                 * be longer than the parent list, but they can be shorter.
+                 * The underlying driver will discard zero-length extents.
+                 */
+                sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
+                sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
+                sub_dfls[i]->dfl_flags = dfl->dfl_flags;
+                sub_dfls[i]->dfl_offset = dfl->dfl_offset;
+                /* don't copy the check func, because it isn't raidz-aware */
+        }
+
+        /*
+         * Process all extents and redistribute them to the component vdevs
+         * according to a computed raidz map geometry.
+         */
+        for (int i = 0; i < dfl->dfl_num_exts; i++) {
+                uint64_t start = dfl->dfl_exts[i].dfle_start;
+                uint64_t length = dfl->dfl_exts[i].dfle_length;
+                /*
+                 * Geometry-only map (alloc_data == B_FALSE): no column
+                 * data buffers are allocated, so abd may be NULL. The
+                 * length is floored to a psize that is guaranteed not to
+                 * extend past the freed range (see vdev_raidz_psize_floor).
+                 */
+                raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
+                    vdev_raidz_psize_floor(vd, length), start,
+                    vd->vdev_top->vdev_ashift, vd->vdev_children,
+                    vd->vdev_nparity, B_FALSE);
+
+                /*
+                 * All columns, including parity columns, are trimmed —
+                 * parity regions of freed blocks are free space too.
+                 */
+                for (uint64_t j = 0; j < rm->rm_cols; j++) {
+                        uint64_t devidx = rm->rm_col[j].rc_devidx;
+                        vdev_raidz_trim_append_rc(sub_dfls[devidx],
+                            &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
+                }
+                vdev_raidz_map_free(rm);
+        }
+
+        /*
+         * Issue the component ioctls as children of the parent zio.
+         * Ownership of each non-empty sub-list passes to the child zio
+         * (freed in its done callback); empty lists are freed here.
+         */
+        for (int i = 0; i < vd->vdev_children; i++) {
+                if (sub_dfls_num_exts[i] != 0) {
+                        zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
+                            vd->vdev_child[i], DKIOCFREE,
+                            vdev_raidz_trim_done, sub_dfls[i],
+                            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+                            ZIO_FLAG_DONT_RETRY));
+                } else {
+                        dfl_free(sub_dfls[i]);
+                }
+        }
+        kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
+        kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
+}
+
+/*
+ * Done callback for the DKIOCFREE ioctls issued to component devices in
+ * vdev_raidz_trim. Releases the per-child dkioc_free_list_t whose
+ * ownership was transferred to this zio via io_private.
+ */
+static void
+vdev_raidz_trim_done(zio_t *zio)
+{
+        ASSERT(zio->io_private != NULL);
+        dfl_free(zio->io_private);
+}
+
vdev_ops_t vdev_raidz_ops = {
         vdev_raidz_open,
         vdev_raidz_close,
         vdev_raidz_asize,
         vdev_raidz_io_start,
         vdev_raidz_io_done,
         vdev_raidz_state_change,
         NULL,
         NULL,
+        /* trim op: fan a DKIOCFREE out to the component vdevs */
-        NULL,
+        vdev_raidz_trim,
         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
         B_FALSE                 /* not a leaf vdev */
};