NEX-18069 Unable to get/set VDEV_PROP_RESILVER_MAXACTIVE/VDEV_PROP_RESILVER_MINACTIVE props
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9552 zfs_scan_idle throttling harms performance and needs to be removed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-13937 Improve kstat performance
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
NEX-3558 KRRP Integration
OS-103 handle CoS descriptor persistent references across vdev operations
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
re #12643 rb4064 ZFS meta refactoring - vdev utilization tracking, auto-dedup
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

@@ -19,19 +19,21 @@
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  */
 
 /*
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
+#include <sys/cos.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab_impl.h>

@@ -144,14 +146,14 @@
 uint32_t zfs_vdev_sync_write_max_active = 10;
 uint32_t zfs_vdev_async_read_min_active = 1;
 uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 1;
 uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_resilver_min_active = 1;
+uint32_t zfs_vdev_resilver_max_active = 3;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
-uint32_t zfs_vdev_removal_min_active = 1;
-uint32_t zfs_vdev_removal_max_active = 2;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
  * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
  * zfs_vdev_async_write_active_max_dirty_percent, use

@@ -293,79 +295,106 @@
 
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
         spa_t *spa = zio->io_spa;
+        hrtime_t t = gethrtime_unscaled();
 
         ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
         avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
         avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
+        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
         mutex_enter(&spa->spa_iokstat_lock);
-        spa->spa_queue_stats[zio->io_priority].spa_queued++;
         if (spa->spa_iokstat != NULL)
-                kstat_waitq_enter(spa->spa_iokstat->ks_data);
+                kstat_waitq_enter_time(spa->spa_iokstat->ks_data, t);
+        if (vq->vq_vdev->vdev_iokstat != NULL)
+                kstat_waitq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
         mutex_exit(&spa->spa_iokstat_lock);
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
         spa_t *spa = zio->io_spa;
+        hrtime_t t = gethrtime_unscaled();
 
         ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
         avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
         avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
-        mutex_enter(&spa->spa_iokstat_lock);
         ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
-        spa->spa_queue_stats[zio->io_priority].spa_queued--;
+        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
+
+        mutex_enter(&spa->spa_iokstat_lock);
         if (spa->spa_iokstat != NULL)
-                kstat_waitq_exit(spa->spa_iokstat->ks_data);
+                kstat_waitq_exit_time(spa->spa_iokstat->ks_data, t);
+        if (vq->vq_vdev->vdev_iokstat != NULL)
+                kstat_waitq_exit_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
         mutex_exit(&spa->spa_iokstat_lock);
 }
 
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
         spa_t *spa = zio->io_spa;
+        hrtime_t t = gethrtime_unscaled();
+
         ASSERT(MUTEX_HELD(&vq->vq_lock));
         ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
         vq->vq_class[zio->io_priority].vqc_active++;
         avl_add(&vq->vq_active_tree, zio);
 
+        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
         mutex_enter(&spa->spa_iokstat_lock);
-        spa->spa_queue_stats[zio->io_priority].spa_active++;
         if (spa->spa_iokstat != NULL)
-                kstat_runq_enter(spa->spa_iokstat->ks_data);
+                kstat_runq_enter_time(spa->spa_iokstat->ks_data, t);
+        if (vq->vq_vdev->vdev_iokstat != NULL)
+                kstat_runq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
         mutex_exit(&spa->spa_iokstat_lock);
 }
 
 static void
 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
         spa_t *spa = zio->io_spa;
+        hrtime_t t = gethrtime_unscaled();
+
         ASSERT(MUTEX_HELD(&vq->vq_lock));
         ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
         vq->vq_class[zio->io_priority].vqc_active--;
         avl_remove(&vq->vq_active_tree, zio);
 
-        mutex_enter(&spa->spa_iokstat_lock);
         ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
-        spa->spa_queue_stats[zio->io_priority].spa_active--;
+        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
+
+        mutex_enter(&spa->spa_iokstat_lock);
         if (spa->spa_iokstat != NULL) {
                 kstat_io_t *ksio = spa->spa_iokstat->ks_data;
 
-                kstat_runq_exit(spa->spa_iokstat->ks_data);
+                kstat_runq_exit_time(spa->spa_iokstat->ks_data, t);
                 if (zio->io_type == ZIO_TYPE_READ) {
                         ksio->reads++;
                         ksio->nread += zio->io_size;
                 } else if (zio->io_type == ZIO_TYPE_WRITE) {
                         ksio->writes++;
                         ksio->nwritten += zio->io_size;
                 }
         }
+
+        if (vq->vq_vdev->vdev_iokstat != NULL) {
+                kstat_io_t *ksio = vq->vq_vdev->vdev_iokstat->ks_data;
+
+                kstat_runq_exit_time(ksio, t);
+                if (zio->io_type == ZIO_TYPE_READ) {
+                        ksio->reads++;
+                        ksio->nread += zio->io_size;
+                } else if (zio->io_type == ZIO_TYPE_WRITE) {
+                        ksio->writes++;
+                        ksio->nwritten += zio->io_size;
+                }
+        }
         mutex_exit(&spa->spa_iokstat_lock);
 }
 
 static void
 vdev_queue_agg_io_done(zio_t *aio)

@@ -380,91 +409,184 @@
         }
 
         abd_free(aio->io_abd);
 }
 
+static uint64_t
+scan_prio2active(uint64_t prio, boolean_t max_active)
+{
+        uint64_t act, act_max;
+
+        if (max_active) {
+                act_max = MAX(MAX(zfs_vdev_sync_read_max_active,
+                    zfs_vdev_sync_write_max_active),
+                    MAX(zfs_vdev_async_read_max_active,
+                    zfs_vdev_async_write_max_active));
+                act = ((prio * (zfs_vdev_sync_read_max_active +
+                    zfs_vdev_sync_write_max_active +
+                    zfs_vdev_async_read_max_active +
+                    zfs_vdev_async_write_max_active)) / 100);
+        } else {
+                act_max = MAX(MAX(zfs_vdev_sync_read_min_active,
+                    zfs_vdev_sync_write_min_active),
+                    MAX(zfs_vdev_async_read_min_active,
+                    zfs_vdev_async_write_min_active));
+                act = ((prio * (zfs_vdev_sync_read_min_active +
+                    zfs_vdev_sync_write_min_active +
+                    zfs_vdev_async_read_min_active +
+                    zfs_vdev_async_write_min_active)) / 100);
+        }
+        act = MAX(MIN(act, act_max), 1);
+
+        return (act);
+}
+
 static int
-vdev_queue_class_min_active(zio_priority_t p)
+vdev_queue_class_min_active(zio_priority_t p, vdev_queue_t *vq)
 {
+        int zfs_min_active = 0;
+        int vqc_min_active;
+        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MIN(p);
+
+        ASSERT(VDEV_PROP_MIN_VALID(prop));
+        vqc_min_active = vdev_queue_get_prop_uint64(vq, prop);
+
         switch (p) {
         case ZIO_PRIORITY_SYNC_READ:
-                return (zfs_vdev_sync_read_min_active);
+                zfs_min_active = zfs_vdev_sync_read_min_active;
+                break;
         case ZIO_PRIORITY_SYNC_WRITE:
-                return (zfs_vdev_sync_write_min_active);
+                zfs_min_active = zfs_vdev_sync_write_min_active;
+                break;
         case ZIO_PRIORITY_ASYNC_READ:
-                return (zfs_vdev_async_read_min_active);
+                zfs_min_active = zfs_vdev_async_read_min_active;
+                break;
         case ZIO_PRIORITY_ASYNC_WRITE:
-                return (zfs_vdev_async_write_min_active);
-        case ZIO_PRIORITY_SCRUB:
-                return (zfs_vdev_scrub_min_active);
-        case ZIO_PRIORITY_REMOVAL:
-                return (zfs_vdev_removal_min_active);
+                zfs_min_active = zfs_vdev_async_write_min_active;
+                break;
+        case ZIO_PRIORITY_RESILVER: {
+                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
+                if (prio > 0)
+                        zfs_min_active = scan_prio2active(prio, B_FALSE);
+                else
+                        zfs_min_active = zfs_vdev_resilver_min_active;
+                break;
+        }
+        case ZIO_PRIORITY_SCRUB: {
+                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
+                if (prio > 0)
+                        zfs_min_active = scan_prio2active(prio, B_FALSE);
+                else
+                        zfs_min_active = zfs_vdev_scrub_min_active;
+                break;
+        }
         default:
                 panic("invalid priority %u", p);
                 return (0);
         }
+
+        /* zero vdev-specific setting means "use zfs global setting" */
+        return ((vqc_min_active) ? vqc_min_active : zfs_min_active);
 }
 
 static int
-vdev_queue_max_async_writes(spa_t *spa)
+vdev_queue_max_async_writes(spa_t *spa, vdev_queue_t *vq)
 {
         int writes;
         uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
         uint64_t min_bytes = zfs_dirty_data_max *
             zfs_vdev_async_write_active_min_dirty_percent / 100;
         uint64_t max_bytes = zfs_dirty_data_max *
             zfs_vdev_async_write_active_max_dirty_percent / 100;
 
         /*
+         * vdev-specific properties override global tunables
+         * zero vdev-specific settings indicate fallback on the globals
+         */
+        int vqc_min_active =
+            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MINACTIVE);
+        int min_active =
+            (vqc_min_active) ? vqc_min_active : zfs_vdev_async_write_min_active;
+        int vqc_max_active =
+            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MAXACTIVE);
+        int max_active =
+            (vqc_max_active) ? vqc_max_active : zfs_vdev_async_write_max_active;
+
+        /*
          * Sync tasks correspond to interactive user actions. To reduce the
          * execution time of those actions we push data out as fast as possible.
          */
         if (spa_has_pending_synctask(spa)) {
                 return (zfs_vdev_async_write_max_active);
         }
 
         if (dirty < min_bytes)
-                return (zfs_vdev_async_write_min_active);
+                return (min_active);
         if (dirty > max_bytes)
-                return (zfs_vdev_async_write_max_active);
+                return (max_active);
 
         /*
          * linear interpolation:
          * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
          * move right by min_bytes
          * move up by min_writes
          */
-        writes = (dirty - min_bytes) *
-            (zfs_vdev_async_write_max_active -
-            zfs_vdev_async_write_min_active) /
-            (max_bytes - min_bytes) +
-            zfs_vdev_async_write_min_active;
-        ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
-        ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+        writes = (dirty - min_bytes) * (max_active - min_active) /
+            (max_bytes - min_bytes) + min_active;
+        ASSERT3U(writes, >=, min_active);
+        ASSERT3U(writes, <=, max_active);
         return (writes);
 }
 
 static int
-vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p, vdev_queue_t *vq)
 {
+        int zfs_max_active = 0;
+        int vqc_max_active;
+        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MAX(p);
+
+        ASSERT(VDEV_PROP_MAX_VALID(prop));
+        vqc_max_active = vdev_queue_get_prop_uint64(vq, prop);
+
         switch (p) {
         case ZIO_PRIORITY_SYNC_READ:
-                return (zfs_vdev_sync_read_max_active);
+                zfs_max_active = zfs_vdev_sync_read_max_active;
+                break;
         case ZIO_PRIORITY_SYNC_WRITE:
-                return (zfs_vdev_sync_write_max_active);
+                zfs_max_active = zfs_vdev_sync_write_max_active;
+                break;
         case ZIO_PRIORITY_ASYNC_READ:
-                return (zfs_vdev_async_read_max_active);
+                zfs_max_active = zfs_vdev_async_read_max_active;
+                break;
         case ZIO_PRIORITY_ASYNC_WRITE:
-                return (vdev_queue_max_async_writes(spa));
-        case ZIO_PRIORITY_SCRUB:
-                return (zfs_vdev_scrub_max_active);
-        case ZIO_PRIORITY_REMOVAL:
-                return (zfs_vdev_removal_max_active);
+                /* takes into account vdev-specific props internally */
+                vqc_max_active = vdev_queue_max_async_writes(spa, vq);
+                ASSERT(vqc_max_active);
+                break;
+        case ZIO_PRIORITY_RESILVER: {
+                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
+                if (prio > 0)
+                        zfs_max_active = scan_prio2active(prio, B_TRUE);
+                else
+                        zfs_max_active = zfs_vdev_resilver_max_active;
+                break;
+        }
+        case ZIO_PRIORITY_SCRUB: {
+                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
+                if (prio > 0)
+                        zfs_max_active = scan_prio2active(prio, B_TRUE);
+                else
+                        zfs_max_active = zfs_vdev_scrub_max_active;
+                break;
+        }
         default:
                 panic("invalid priority %u", p);
                 return (0);
         }
+
+        /* zero vdev-specific setting means "use zfs global setting" */
+        return ((vqc_max_active) ? vqc_max_active : zfs_max_active);
 }
 
 /*
  * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
  * there is no eligible class.

@@ -480,11 +602,11 @@
 
         /* find a queue that has not reached its minimum # outstanding i/os */
         for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
                 if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                     vq->vq_class[p].vqc_active <
-                    vdev_queue_class_min_active(p))
+                    vdev_queue_class_min_active(p, vq))
                         return (p);
         }
 
         /*
          * If we haven't found a queue, look for one that hasn't reached its

@@ -491,11 +613,11 @@
          * maximum # outstanding i/os.
          */
         for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
                 if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                     vq->vq_class[p].vqc_active <
-                    vdev_queue_class_max_active(spa, p))
+                    vdev_queue_class_max_active(spa, p, vq))
                         return (p);
         }
 
         /* No eligible queued i/os */
         return (ZIO_PRIORITY_NUM_QUEUEABLE);

@@ -548,12 +670,11 @@
          * recording the last non-optional I/O.
          */
         while ((dio = AVL_PREV(t, first)) != NULL &&
             (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
             IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
-            IO_GAP(dio, first) <= maxgap &&
-            dio->io_type == zio->io_type) {
+            IO_GAP(dio, first) <= maxgap) {
                 first = dio;
                 if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
                         mandatory = first;
         }
 

@@ -573,12 +694,11 @@
          */
         while ((dio = AVL_NEXT(t, last)) != NULL &&
             (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
             (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
             (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
-            IO_GAP(last, dio) <= maxgap &&
-            dio->io_type == zio->io_type) {
+            IO_GAP(last, dio) <= maxgap) {
                 last = dio;
                 if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
                         mandatory = last;
         }
 

@@ -733,18 +853,16 @@
          * not match the child's i/o type.  Fix it up here.
          */
         if (zio->io_type == ZIO_TYPE_READ) {
                 if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
                     zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
-                    zio->io_priority != ZIO_PRIORITY_SCRUB &&
-                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+                    zio->io_priority != ZIO_PRIORITY_SCRUB)
                         zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
         } else {
                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
                 if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
-                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
-                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
                         zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
         }
 
         zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 

@@ -787,6 +905,63 @@
                 }
                 mutex_enter(&vq->vq_lock);
         }
 
         mutex_exit(&vq->vq_lock);
+}
+
+uint64_t
+vdev_queue_get_prop_uint64(vdev_queue_t *vq, vdev_prop_t p)
+{
+        uint64_t val = 0;
+        int zprio = 0;
+        cos_t *cos = vq->vq_cos;
+
+        switch (p) {
+        case VDEV_PROP_READ_MINACTIVE:
+        case VDEV_PROP_AREAD_MINACTIVE:
+        case VDEV_PROP_WRITE_MINACTIVE:
+        case VDEV_PROP_AWRITE_MINACTIVE:
+        case VDEV_PROP_SCRUB_MINACTIVE:
+        case VDEV_PROP_RESILVER_MINACTIVE:
+                zprio = VDEV_PROP_TO_ZIO_PRIO_MIN(p);
+                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
+                if (vq->vq_cos != NULL) {
+                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MIN(zprio);
+                        ASSERT(COS_PROP_MIN_VALID(p));
+                        val = cos_get_prop_uint64(vq->vq_cos, p);
+                }
+                if (val == 0)
+                        val = vq->vq_class[zprio].vqc_min_active;
+                break;
+        case VDEV_PROP_READ_MAXACTIVE:
+        case VDEV_PROP_AREAD_MAXACTIVE:
+        case VDEV_PROP_WRITE_MAXACTIVE:
+        case VDEV_PROP_AWRITE_MAXACTIVE:
+        case VDEV_PROP_SCRUB_MAXACTIVE:
+        case VDEV_PROP_RESILVER_MAXACTIVE:
+                zprio = VDEV_PROP_TO_ZIO_PRIO_MAX(p);
+                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
+                if (vq->vq_cos != NULL) {
+                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MAX(zprio);
+                        ASSERT(COS_PROP_MAX_VALID(p));
+                        val = cos_get_prop_uint64(vq->vq_cos, p);
+                }
+                if (val == 0)
+                        val = vq->vq_class[zprio].vqc_max_active;
+                break;
+        case VDEV_PROP_PREFERRED_READ:
+                if (vq->vq_cos != NULL)
+                        val = cos_get_prop_uint64(vq->vq_cos,
+                            COS_PROP_PREFERRED_READ);
+                if (val == 0)
+                        val = vq->vq_preferred_read;
+                break;
+        default:
+                panic("Non-numeric property requested\n");
+                return (0);
+        }
+
+        VERIFY(cos == vq->vq_cos);
+
+        return (val);
 }