Print this page
NEX-6353 The "DKIOCSOLIDSTATE failed, assuming non-SSD media" messages don't provide any useful information
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5736 implement autoreplace matching based on FRU slot number
NEX-6200 hot spares are not reactivated after reinserting into enclosure
NEX-9403 need to update FRU for spare and l2cache devices
NEX-9404 remove lofi autoreplace support from syseventd
NEX-9409 hotsparing doesn't work for vdevs without FRU
NEX-9424 zfs`vdev_online() needs better notification about state changes
Portions contributed by: Alek Pinchuk <alek@nexenta.com>
Portions contributed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8065 ZFS doesn't notice when disk vdevs have no write cache
Reviewed by: Dan Fields <dan.fields@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
NEX-2846 Enable Automatic/Intelligent Hot Sparing capability
Reviewed by: Jeffry Molanus <jeffry.molanus@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
6494 ASSERT supported zio_types for file and disk vdevs
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Albert Lee <trisk@omniti.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
NEX-2933 tip of nza-kernel hangs during zpool offline
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
NEX-1142 move rwlock to vdev to protect vdev_tsd
not just ldi handle.
This way we serialize open/close, yet allow parallel I/O.
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-1065 Added serialization to avoid race (fix lint)
NEX-1065 Added serialization to avoid race
between ldi notification and I/O path.
Also fixes OS-124, NEX-1051, NEX-1062.

@@ -16,15 +16,16 @@
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/refcount.h>

@@ -41,60 +42,70 @@
  * Virtual device vector for disks.
  */
 
 extern ldi_ident_t zfs_li;
 
-static void vdev_disk_close(vdev_t *);
+static void vdev_disk_close_impl(vdev_t *, boolean_t);
 
 typedef struct vdev_disk_ldi_cb {
         list_node_t             lcb_next;
         ldi_callback_id_t       lcb_id;
 } vdev_disk_ldi_cb_t;
 
-static void
-vdev_disk_alloc(vdev_t *vd)
+static vdev_disk_t *
+vdev_disk_alloc(void)
 {
         vdev_disk_t *dvd;
 
-        dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+        dvd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
         /*
          * Create the LDI event callback list.
          */
         list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
             offsetof(vdev_disk_ldi_cb_t, lcb_next));
+        return (dvd);
 }
 
 static void
-vdev_disk_free(vdev_t *vd)
+vdev_disk_free_locked(vdev_t *vd)
 {
-        vdev_disk_t *dvd = vd->vdev_tsd;
         vdev_disk_ldi_cb_t *lcb;
+        vdev_disk_t *dvd = vd->vdev_tsd;
 
+        ASSERT(rw_lock_held(&vd->vdev_tsd_lock));
+
         if (dvd == NULL)
                 return;
 
         /*
          * We have already closed the LDI handle. Clean up the LDI event
          * callbacks and free vd->vdev_tsd.
          */
+        vd->vdev_tsd = NULL;
         while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
                 list_remove(&dvd->vd_ldi_cbs, lcb);
                 (void) ldi_ev_remove_callbacks(lcb->lcb_id);
                 kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
         }
         list_destroy(&dvd->vd_ldi_cbs);
         kmem_free(dvd, sizeof (vdev_disk_t));
-        vd->vdev_tsd = NULL;
 }
 
+static void
+vdev_disk_free(vdev_t *vd)
+{
+        rw_enter(&vd->vdev_tsd_lock, RW_WRITER);
+        vdev_disk_free_locked(vd);
+        rw_exit(&vd->vdev_tsd_lock);
+}
+
 /* ARGSUSED */
 static int
 vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
     void *ev_data)
 {
         vdev_t *vd = (vdev_t *)arg;
-        vdev_disk_t *dvd = vd->vdev_tsd;
 
         /*
          * Ignore events other than offline.
          */
         if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)

@@ -106,12 +117,11 @@
          *
          * We inform vdev_disk_close that it is being called from offline
          * notify context so it will defer cleanup of LDI event callbacks and
          * freeing of vd->vdev_tsd to the offline finalize or a reopen.
          */
-        dvd->vd_ldi_offline = B_TRUE;
-        vdev_disk_close(vd);
+        vdev_disk_close_impl(vd, B_TRUE);
 
         /*
          * Now that the device is closed, request that the spa_async_thread
          * mark the device as REMOVED and notify FMA of the removal.
          */

@@ -240,26 +250,16 @@
                     dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
                 vd->vdev_devid_vp = NULL;
         }
 }
 
-/*
- * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
- * even a fallback to DKIOCGMEDIAINFO fails.
- */
-#ifdef DEBUG
-#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
-#else
-#define VDEV_DEBUG(...) /* Nothing... */
-#endif
-
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
 {
         spa_t *spa = vd->vdev_spa;
-        vdev_disk_t *dvd = vd->vdev_tsd;
+        vdev_disk_t *dvd;
         ldi_ev_cookie_t ecookie;
         vdev_disk_ldi_cb_t *lcb;
         union {
                 struct dk_minfo_ext ude;
                 struct dk_minfo ud;

@@ -266,11 +266,11 @@
         } dks;
         struct dk_minfo_ext *dkmext = &dks.ude;
         struct dk_minfo *dkm = &dks.ud;
         int error;
         dev_t dev;
-        int otyp;
+        int otyp, vdev_ssd;
         boolean_t validate_devid = B_FALSE;
         ddi_devid_t devid;
         uint64_t capacity = 0, blksz = 0, pbsize;
 
         /*

@@ -278,34 +278,34 @@
          */
         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
                 return (SET_ERROR(EINVAL));
         }
-
+        rw_enter(&vd->vdev_tsd_lock, RW_WRITER);
+        dvd = vd->vdev_tsd;
         /*
          * Reopen the device if it's not currently open. Otherwise,
          * just update the physical size of the device.
          */
         if (dvd != NULL) {
-                if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
+                ASSERT(vd->vdev_reopening);
                         /*
-                         * If we are opening a device in its offline notify
-                         * context, the LDI handle was just closed. Clean
-                         * up the LDI event callbacks and free vd->vdev_tsd.
+                 * Here vd_lh is protected by vdev_tsd_lock
                          */
-                        vdev_disk_free(vd);
-                } else {
-                        ASSERT(vd->vdev_reopening);
+                ASSERT(dvd->vd_lh != NULL);
+                /* This should not happen, but let's be safe */
+                if (dvd->vd_lh == NULL) {
+                        /* tsd exists but the handle is closed: fail */
+                        rw_exit(&vd->vdev_tsd_lock);
+                        return (SET_ERROR(ENXIO));
+                }
                         goto skip_open;
                 }
-        }
-
         /*
-         * Create vd->vdev_tsd.
+         * Create dvd to be used as vd->vdev_tsd.
          */
-        vdev_disk_alloc(vd);
-        dvd = vd->vdev_tsd;
+        vd->vdev_tsd = dvd = vdev_disk_alloc();
 
         /*
          * When opening a disk device, we want to preserve the user's original
          * intent.  We always want to open the device by the path the user gave
          * us, even if it is one of multiple paths to the same device.  But we

@@ -323,12 +323,12 @@
          */
         if (vd->vdev_devid != NULL) {
                 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
                     &dvd->vd_minor) != 0) {
                         vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-                        vdev_dbgmsg(vd, "vdev_disk_open: invalid "
-                            "vdev_devid '%s'", vd->vdev_devid);
+                        vdev_disk_free_locked(vd);
+                        rw_exit(&vd->vdev_tsd_lock);
                         return (SET_ERROR(EINVAL));
                 }
         }
 
         error = EINVAL;         /* presume failure */

@@ -417,12 +417,12 @@
                             kcred, &dvd->vd_lh, zfs_li);
         }
 
         if (error) {
                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-                vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
-                    error);
+                vdev_disk_free_locked(vd);
+                rw_exit(&vd->vdev_tsd_lock);
                 return (error);
         }
 
         /*
          * Now that the device has been successfully opened, update the devid

@@ -432,12 +432,12 @@
             ldi_get_devid(dvd->vd_lh, &devid) == 0) {
                 if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
                         char *vd_devid;
 
                         vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
-                        vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
-                            "'%s' to '%s'", vd->vdev_devid, vd_devid);
+                        zfs_dbgmsg("vdev %s: update devid from %s, "
+                            "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
                         spa_strfree(vd->vdev_devid);
                         vd->vdev_devid = spa_strdup(vd_devid);
                         ddi_devid_str_free(vd_devid);
                 }
                 ddi_devid_free(devid);

@@ -487,17 +487,23 @@
                 lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
                 list_insert_tail(&dvd->vd_ldi_cbs, lcb);
                 (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
                     &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
         }
+
+        /* Reset TRIM flag, as underlying device support may have changed */
+        vd->vdev_notrim = B_FALSE;
+
 skip_open:
+        ASSERT(dvd != NULL);
         /*
          * Determine the actual size of the device.
          */
         if (ldi_get_size(dvd->vd_lh, psize) != 0) {
                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-                vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
+                vdev_disk_free_locked(vd);
+                rw_exit(&vd->vdev_tsd_lock);
                 return (SET_ERROR(EINVAL));
         }
 
         *max_psize = *psize;
 

@@ -510,20 +516,14 @@
                 capacity = dkmext->dki_capacity - 1;
                 blksz = dkmext->dki_lbsize;
                 pbsize = dkmext->dki_pbsize;
         } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
             (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
-                VDEV_DEBUG(
-                    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
-                    vd->vdev_path);
                 capacity = dkm->dki_capacity - 1;
                 blksz = dkm->dki_lbsize;
                 pbsize = blksz;
         } else {
-                VDEV_DEBUG("vdev_disk_open(\"%s\"): "
-                    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
-                    vd->vdev_path, error);
                 pbsize = DEV_BSIZE;
         }
 
         *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
 

@@ -546,26 +546,50 @@
                  */
                 (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
                     FKIOCTL, kcred, NULL);
         }
 
+        if (ldi_ioctl(dvd->vd_lh, DKIOCSOLIDSTATE, (intptr_t)&vdev_ssd,
+            FKIOCTL, kcred, NULL) != 0)
+                vd->vdev_is_ssd = B_FALSE;
+        else
+                vd->vdev_is_ssd = vdev_ssd ? B_TRUE : B_FALSE;
+
         /*
+         * We are done with vd_lh and vdev_tsd, release the vdev_tsd_lock
+         */
+        rw_exit(&vd->vdev_tsd_lock);
+
+        /*
          * Clear the nowritecache bit, so that on a vdev_reopen() we will
          * try again.
          */
         vd->vdev_nowritecache = B_FALSE;
 
+        /*
+         * The open succeeded. If the vdev was marked removed, or its last
+         * fault was due to a failed open, treat that condition as transient
+         * and reset the fault flags.
+         */
+        if (vd->vdev_removed || (vd->vdev_faulted &&
+            vd->vdev_label_aux == VDEV_AUX_OPEN_FAILED)) {
+                vd->vdev_faulted = vd->vdev_removed = 0ULL;
+                vd->vdev_label_aux = VDEV_AUX_NONE;
+        }
+
         return (0);
 }
 
 static void
-vdev_disk_close(vdev_t *vd)
+vdev_disk_close_impl(vdev_t *vd, boolean_t ldi_offline)
 {
-        vdev_disk_t *dvd = vd->vdev_tsd;
+        vdev_disk_t *dvd;
 
+        rw_enter(&vd->vdev_tsd_lock, RW_WRITER);
+        dvd = vd->vdev_tsd;
+
         if (vd->vdev_reopening || dvd == NULL)
-                return;
+                goto out;
 
         if (dvd->vd_minor != NULL) {
                 ddi_devid_str_free(dvd->vd_minor);
                 dvd->vd_minor = NULL;
         }

@@ -584,42 +608,54 @@
         /*
          * If we closed the LDI handle due to an offline notify from LDI,
          * don't free vd->vdev_tsd or unregister the callbacks here;
          * the offline finalize callback or a reopen will take care of it.
          */
-        if (dvd->vd_ldi_offline)
-                return;
+        if (!ldi_offline)
+                vdev_disk_free_locked(vd);
+out:
+        rw_exit(&vd->vdev_tsd_lock);
+}
 
-        vdev_disk_free(vd);
+static void
+vdev_disk_close(vdev_t *vd)
+{
+        vdev_disk_close_impl(vd, B_FALSE);
 }
 
 int
 vdev_disk_physio(vdev_t *vd, caddr_t data,
     size_t size, uint64_t offset, int flags, boolean_t isdump)
 {
-        vdev_disk_t *dvd = vd->vdev_tsd;
+        int rc = EIO;
+        vdev_disk_t *dvd;
 
+        rw_enter(&vd->vdev_tsd_lock, RW_READER);
+        dvd = vd->vdev_tsd;
         /*
          * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
          * Nothing to be done here but return failure.
          */
-        if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
-                return (EIO);
+        if (dvd == NULL || dvd->vd_lh == NULL)
+                goto out;
 
         ASSERT(vd->vdev_ops == &vdev_disk_ops);
 
         /*
          * If in the context of an active crash dump, use the ldi_dump(9F)
          * call instead of ldi_strategy(9F) as usual.
          */
         if (isdump) {
                 ASSERT3P(dvd, !=, NULL);
-                return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
-                    lbtodb(size)));
+                rc = ldi_dump(dvd->vd_lh, data, lbtodb(offset), lbtodb(size));
+                goto out;
         }
 
-        return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+        rc = vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags);
+out:
+        rw_exit(&vd->vdev_tsd_lock);
+        return (rc);
 }
 
 int
 vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
     size_t size, uint64_t offset, int flags)

@@ -698,30 +734,34 @@
 
 static void
 vdev_disk_io_start(zio_t *zio)
 {
         vdev_t *vd = zio->io_vd;
-        vdev_disk_t *dvd = vd->vdev_tsd;
+        vdev_disk_t *dvd;
         vdev_buf_t *vb;
         struct dk_callback *dkc;
         buf_t *bp;
         int error;
 
+        rw_enter(&vd->vdev_tsd_lock, RW_READER);
+        dvd = vd->vdev_tsd;
         /*
          * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
          * Nothing to be done here but return failure.
          */
-        if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+        if (dvd == NULL || dvd->vd_lh == NULL) {
                 zio->io_error = ENXIO;
+                rw_exit(&vd->vdev_tsd_lock);
                 zio_interrupt(zio);
                 return;
         }
 
         if (zio->io_type == ZIO_TYPE_IOCTL) {
                 /* XXPOLICY */
                 if (!vdev_readable(vd)) {
                         zio->io_error = SET_ERROR(ENXIO);
+                        rw_exit(&vd->vdev_tsd_lock);
                         zio_interrupt(zio);
                         return;
                 }
 
                 switch (zio->io_cmd) {

@@ -750,21 +790,52 @@
                                 /*
                                  * The ioctl will be done asychronously,
                                  * and will call vdev_disk_ioctl_done()
                                  * upon completion.
                                  */
+                                rw_exit(&vd->vdev_tsd_lock);
                                 return;
                         }
 
                         zio->io_error = error;
 
                         break;
 
+                case DKIOCFREE:
+                        /*
+                         * We perform device support checks here instead of
+                         * in zio_trim(), as zio_trim() might be invoked on
+                         * top of a top-level vdev, whereas vdev_disk_io_start
+                         * is guaranteed to be operating a leaf vdev.
+                         */
+                        if (vd->vdev_notrim &&
+                            spa_get_force_trim(vd->vdev_spa) !=
+                            SPA_FORCE_TRIM_ON) {
+                                zio->io_error = SET_ERROR(ENOTSUP);
+                                break;
+                        }
+
+                        /*
+                         * zio->io_private contains a dkioc_free_list_t
+                         * specifying which offsets are to be freed
+                         */
+                        ASSERT(zio->io_private != NULL);
+                        error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+                            (uintptr_t)zio->io_private, FKIOCTL, kcred, NULL);
+
+                        if (error == ENOTSUP || error == ENOTTY)
+                                vd->vdev_notrim = B_TRUE;
+
+                        zio->io_error = error;
+
+                        break;
+
                 default:
                         zio->io_error = SET_ERROR(ENOTSUP);
                 }
 
+                rw_exit(&vd->vdev_tsd_lock);
                 zio_execute(zio);
                 return;
         }
 
         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

@@ -794,10 +865,11 @@
         bp->b_bufsize = zio->io_size;
         bp->b_iodone = (int (*)())vdev_disk_io_intr;
 
         /* ldi_strategy() will return non-zero only on programming errors */
         VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
+        rw_exit(&vd->vdev_tsd_lock);
 }
 
 static void
 vdev_disk_io_done(zio_t *zio)
 {

@@ -808,15 +880,20 @@
          * the device has been removed.  If this is the case, then we trigger an
          * asynchronous removal of the device. Otherwise, probe the device and
          * make sure it's still accessible.
          */
         if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
-                vdev_disk_t *dvd = vd->vdev_tsd;
-                int state = DKIO_NONE;
+                vdev_disk_t *dvd;
+                int rc = EIO, state = DKIO_NONE;
 
-                if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
-                    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+                rw_enter(&vd->vdev_tsd_lock, RW_READER);
+                dvd = vd->vdev_tsd;
+                if (dvd != NULL && dvd->vd_lh != NULL)
+                        rc = ldi_ioctl(dvd->vd_lh, DKIOCSTATE,
+                            (intptr_t)&state, FKIOCTL, kcred, NULL);
+                rw_exit(&vd->vdev_tsd_lock);
+                if (rc == 0 && state != DKIO_INSERTED) {
                         /*
                          * We post the resource as soon as possible, instead of
                          * when the async removal actually happens, because the
                          * DE is using this information to discard previous I/O
                          * errors.