big-one Wdiff usr/src/uts/common/fs/zfs/vdev_queue.c

Print this page

NEX-18069 Unable to get/set VDEV_PROP_RESILVER_MAXACTIVE/VDEV_PROP_RESILVER_MINACTIVE props
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9552 zfs_scan_idle throttling harms performance and needs to be removed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-13937 Improve kstat performance
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
NEX-3558 KRRP Integration
OS-103 handle CoS descriptor persistent references across vdev operations
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
re #12643 rb4064 ZFS meta refactoring - vdev utilization tracking, auto-dedup
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/vdev_queue.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_queue.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  /*
  27   28   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28   29   * Copyright (c) 2014 Integros [integros.com]
  29   30   */
  30   31  
  31   32  #include <sys/zfs_context.h>
  32   33  #include <sys/vdev_impl.h>
       34 +#include <sys/cos.h>
  33   35  #include <sys/spa_impl.h>
  34   36  #include <sys/zio.h>
  35   37  #include <sys/avl.h>
  36   38  #include <sys/dsl_pool.h>
  37   39  #include <sys/metaslab_impl.h>
  38   40  #include <sys/abd.h>
  39   41  
  40   42  /*
  41   43   * ZFS I/O Scheduler
  42   44   * ---------------

  43   45   *
  44   46   * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
  45   47   * I/O scheduler determines when and in what order those operations are
  46   48   * issued.  The I/O scheduler divides operations into five I/O classes
  47   49   * prioritized in the following order: sync read, sync write, async read,
  48   50   * async write, and scrub/resilver.  Each queue defines the minimum and
  49   51   * maximum number of concurrent operations that may be issued to the device.
  50   52   * In addition, the device has an aggregate maximum. Note that the sum of the
  51   53   * per-queue minimums must not exceed the aggregate maximum, and if the
  52   54   * aggregate maximum is equal to or greater than the sum of the per-queue
  53   55   * maximums, the per-queue minimum has no effect.
  54   56   *
  55   57   * For many physical devices, throughput increases with the number of
  56   58   * concurrent operations, but latency typically suffers. Further, physical
  57   59   * devices typically have a limit at which more concurrent operations have no
  58   60   * effect on throughput or can actually cause it to decrease.
  59   61   *
  60   62   * The scheduler selects the next operation to issue by first looking for an
  61   63   * I/O class whose minimum has not been satisfied. Once all are satisfied and
  62   64   * the aggregate maximum has not been hit, the scheduler looks for classes
  63   65   * whose maximum has not been satisfied. Iteration through the I/O classes is
  64   66   * done in the order specified above. No further operations are issued if the
  65   67   * aggregate maximum number of concurrent operations has been hit or if there
  66   68   * are no operations queued for an I/O class that has not hit its maximum.
  67   69   * Every time an i/o is queued or an operation completes, the I/O scheduler
  68   70   * looks for new operations to issue.
  69   71   *
  70   72   * All I/O classes have a fixed maximum number of outstanding operations
  71   73   * except for the async write class. Asynchronous writes represent the data
  72   74   * that is committed to stable storage during the syncing stage for
  73   75   * transaction groups (see txg.c). Transaction groups enter the syncing state
  74   76   * periodically so the number of queued async writes will quickly burst up and
  75   77   * then bleed down to zero. Rather than servicing them as quickly as possible,
  76   78   * the I/O scheduler changes the maximum number of active async write i/os
  77   79   * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  78   80   * both throughput and latency typically increase with the number of
  79   81   * concurrent operations issued to physical devices, reducing the burstiness
  80   82   * in the number of concurrent operations also stabilizes the response time of
  81   83   * operations from other -- and in particular synchronous -- queues. In broad
  82   84   * strokes, the I/O scheduler will issue more concurrent operations from the
  83   85   * async write queue as there's more dirty data in the pool.
  84   86   *
  85   87   * Async Writes
  86   88   *
  87   89   * The number of concurrent operations issued for the async write I/O class
  88   90   * follows a piece-wise linear function defined by a few adjustable points.
  89   91   *
  90   92   *        |                   o---------| <-- zfs_vdev_async_write_max_active
  91   93   *   ^    |                  /^         |
  92   94   *   |    |                 / |         |
  93   95   * active |                /  |         |
  94   96   *  I/O   |               /   |         |
  95   97   * count  |              /    |         |
  96   98   *        |             /     |         |
  97   99   *        |------------o      |         | <-- zfs_vdev_async_write_min_active
  98  100   *       0|____________^______|_________|
  99  101   *        0%           |      |       100% of zfs_dirty_data_max
 100  102   *                     |      |
 101  103   *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
 102  104   *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
 103  105   *
 104  106   * Until the amount of dirty data exceeds a minimum percentage of the dirty
 105  107   * data allowed in the pool, the I/O scheduler will limit the number of
 106  108   * concurrent operations to the minimum. As that threshold is crossed, the
 107  109   * number of concurrent operations issued increases linearly to the maximum at
 108  110   * the specified maximum percentage of the dirty data allowed in the pool.
 109  111   *
 110  112   * Ideally, the amount of dirty data on a busy pool will stay in the sloped
 111  113   * part of the function between zfs_vdev_async_write_active_min_dirty_percent
 112  114   * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
 113  115   * maximum percentage, this indicates that the rate of incoming data is
 114  116   * greater than the rate that the backend storage can handle. In this case, we
 115  117   * must further throttle incoming writes (see dmu_tx_delay() for details).
 116  118   */
 117  119  
 118  120  /*
 119  121   * The maximum number of i/os active to each device.  Ideally, this will be >=
 120  122   * the sum of each queue's max_active.  It must be at least the sum of each
 121  123   * queue's min_active.
 122  124   */
 123  125  uint32_t zfs_vdev_max_active = 1000;
 124  126  
 125  127  /*
 126  128   * Per-queue limits on the number of i/os active to each device.  If the
 127  129   * number of active i/os is < zfs_vdev_max_active, then the
 128  130   * min_active comes into play.  We will send min_active from each queue,
 129  131   * and then select from queues in the order defined by zio_priority_t.
 130  132   *
 131  133   * In general, smaller max_active's will lead to lower latency of synchronous
 132  134   * operations.  Larger max_active's may lead to higher overall throughput,
 133  135   * depending on underlying storage.
 134  136   *
 135  137   * The ratio of the queues' max_actives determines the balance of performance
 136  138   * between reads, writes, and scrubs.  E.g., increasing
 137  139   * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
 138  140   * more quickly, but reads and writes to have higher latency and lower

↓ open down ↓

96 lines elided

↑ open up ↑

 139  141   * throughput.
 140  142   */
 141  143  uint32_t zfs_vdev_sync_read_min_active = 10;
 142  144  uint32_t zfs_vdev_sync_read_max_active = 10;
 143  145  uint32_t zfs_vdev_sync_write_min_active = 10;
 144  146  uint32_t zfs_vdev_sync_write_max_active = 10;
 145  147  uint32_t zfs_vdev_async_read_min_active = 1;
 146  148  uint32_t zfs_vdev_async_read_max_active = 3;
 147  149  uint32_t zfs_vdev_async_write_min_active = 1;
 148  150  uint32_t zfs_vdev_async_write_max_active = 10;
      151 +uint32_t zfs_vdev_resilver_min_active = 1;
      152 +uint32_t zfs_vdev_resilver_max_active = 3;
 149  153  uint32_t zfs_vdev_scrub_min_active = 1;
 150  154  uint32_t zfs_vdev_scrub_max_active = 2;
 151      -uint32_t zfs_vdev_removal_min_active = 1;
 152      -uint32_t zfs_vdev_removal_max_active = 2;
 153  155  
 154  156  /*
 155  157   * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
 156  158   * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
 157  159   * zfs_vdev_async_write_active_max_dirty_percent, use
 158  160   * zfs_vdev_async_write_max_active. The value is linearly interpolated
 159  161   * between min and max.
 160  162   */
 161  163  int zfs_vdev_async_write_active_min_dirty_percent = 30;
 162  164  int zfs_vdev_async_write_active_max_dirty_percent = 60;

 163  165  
 164  166  /*
 165  167   * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
 166  168   * For read I/Os, we also aggregate across small adjacency gaps; for writes
 167  169   * we include spans of optional I/Os to aid aggregation at the disk even when
 168  170   * they aren't able to help us aggregate at this level.
 169  171   */
 170  172  int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 171  173  int zfs_vdev_read_gap_limit = 32 << 10;
 172  174  int zfs_vdev_write_gap_limit = 4 << 10;
 173  175  
 174  176  /*
 175  177   * Define the queue depth percentage for each top-level. This percentage is
 176  178   * used in conjunction with zfs_vdev_async_write_max_active to determine how
 177  179   * many allocations a specific top-level vdev should handle. Once the queue
 178  180   * depth reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active /
 179  181   * 100, the allocator will stop allocating blocks on that top-level device.
 180  182   * The default kernel setting is 1000% which will yield 100 allocations per
 181  183   * device. For userland testing, the default setting is 300% which equates
 182  184   * to 30 allocations per device.
 183  185   */
 184  186  #ifdef _KERNEL
 185  187  int zfs_vdev_queue_depth_pct = 1000;
 186  188  #else
 187  189  int zfs_vdev_queue_depth_pct = 300;
 188  190  #endif
 189  191  
 190  192  
 191  193  int
 192  194  vdev_queue_offset_compare(const void *x1, const void *x2)
 193  195  {
 194  196          const zio_t *z1 = x1;
 195  197          const zio_t *z2 = x2;
 196  198  
 197  199          if (z1->io_offset < z2->io_offset)
 198  200                  return (-1);
 199  201          if (z1->io_offset > z2->io_offset)
 200  202                  return (1);
 201  203  
 202  204          if (z1 < z2)
 203  205                  return (-1);
 204  206          if (z1 > z2)
 205  207                  return (1);
 206  208  
 207  209          return (0);
 208  210  }
 209  211  
 210  212  static inline avl_tree_t *
 211  213  vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 212  214  {
 213  215          return (&vq->vq_class[p].vqc_queued_tree);
 214  216  }
 215  217  
 216  218  static inline avl_tree_t *
 217  219  vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 218  220  {
 219  221          ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
 220  222          if (t == ZIO_TYPE_READ)
 221  223                  return (&vq->vq_read_offset_tree);
 222  224          else
 223  225                  return (&vq->vq_write_offset_tree);
 224  226  }
 225  227  
 226  228  int
 227  229  vdev_queue_timestamp_compare(const void *x1, const void *x2)
 228  230  {
 229  231          const zio_t *z1 = x1;
 230  232          const zio_t *z2 = x2;
 231  233  
 232  234          if (z1->io_timestamp < z2->io_timestamp)
 233  235                  return (-1);
 234  236          if (z1->io_timestamp > z2->io_timestamp)
 235  237                  return (1);
 236  238  
 237  239          if (z1 < z2)
 238  240                  return (-1);
 239  241          if (z1 > z2)
 240  242                  return (1);
 241  243  
 242  244          return (0);
 243  245  }
 244  246  
 245  247  void
 246  248  vdev_queue_init(vdev_t *vd)
 247  249  {
 248  250          vdev_queue_t *vq = &vd->vdev_queue;
 249  251  
 250  252          mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 251  253          vq->vq_vdev = vd;
 252  254  
 253  255          avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
 254  256              sizeof (zio_t), offsetof(struct zio, io_queue_node));
 255  257          avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
 256  258              vdev_queue_offset_compare, sizeof (zio_t),
 257  259              offsetof(struct zio, io_offset_node));
 258  260          avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
 259  261              vdev_queue_offset_compare, sizeof (zio_t),
 260  262              offsetof(struct zio, io_offset_node));
 261  263  
 262  264          for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 263  265                  int (*compfn) (const void *, const void *);
 264  266  
 265  267                  /*
 266  268                   * The synchronous i/o queues are dispatched in FIFO rather
 267  269                   * than LBA order.  This provides more consistent latency for
 268  270                   * these i/os.
 269  271                   */
 270  272                  if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
 271  273                          compfn = vdev_queue_timestamp_compare;
 272  274                  else
 273  275                          compfn = vdev_queue_offset_compare;
 274  276  
 275  277                  avl_create(vdev_queue_class_tree(vq, p), compfn,
 276  278                      sizeof (zio_t), offsetof(struct zio, io_queue_node));
 277  279          }
 278  280  }
 279  281  
 280  282  void
 281  283  vdev_queue_fini(vdev_t *vd)
 282  284  {
 283  285          vdev_queue_t *vq = &vd->vdev_queue;
 284  286  
 285  287          for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
 286  288                  avl_destroy(vdev_queue_class_tree(vq, p));
 287  289          avl_destroy(&vq->vq_active_tree);

↓ open down ↓

125 lines elided

↑ open up ↑

 288  290          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 289  291          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 290  292  
 291  293          mutex_destroy(&vq->vq_lock);
 292  294  }
 293  295  
 294  296  static void
 295  297  vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 296  298  {
 297  299          spa_t *spa = zio->io_spa;
      300 +        hrtime_t t = gethrtime_unscaled();
 298  301  
 299  302          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 300  303          avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 301  304          avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 302  305  
      306 +        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
 303  307          mutex_enter(&spa->spa_iokstat_lock);
 304      -        spa->spa_queue_stats[zio->io_priority].spa_queued++;
 305  308          if (spa->spa_iokstat != NULL)
 306      -                kstat_waitq_enter(spa->spa_iokstat->ks_data);
      309 +                kstat_waitq_enter_time(spa->spa_iokstat->ks_data, t);
      310 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      311 +                kstat_waitq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 307  312          mutex_exit(&spa->spa_iokstat_lock);
 308  313  }
 309  314  
 310  315  static void
 311  316  vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 312  317  {
 313  318          spa_t *spa = zio->io_spa;
      319 +        hrtime_t t = gethrtime_unscaled();
 314  320  
 315  321          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 316  322          avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 317  323          avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 318  324  
 319      -        mutex_enter(&spa->spa_iokstat_lock);
 320  325          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
 321      -        spa->spa_queue_stats[zio->io_priority].spa_queued--;
      326 +        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
      327 +
      328 +        mutex_enter(&spa->spa_iokstat_lock);
 322  329          if (spa->spa_iokstat != NULL)
 323      -                kstat_waitq_exit(spa->spa_iokstat->ks_data);
      330 +                kstat_waitq_exit_time(spa->spa_iokstat->ks_data, t);
      331 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      332 +                kstat_waitq_exit_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 324  333          mutex_exit(&spa->spa_iokstat_lock);
 325  334  }
 326  335  
 327  336  static void
 328  337  vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 329  338  {
 330  339          spa_t *spa = zio->io_spa;
      340 +        hrtime_t t = gethrtime_unscaled();
      341 +
 331  342          ASSERT(MUTEX_HELD(&vq->vq_lock));
 332  343          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 333  344          vq->vq_class[zio->io_priority].vqc_active++;
 334  345          avl_add(&vq->vq_active_tree, zio);
 335  346  
      347 +        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
 336  348          mutex_enter(&spa->spa_iokstat_lock);
 337      -        spa->spa_queue_stats[zio->io_priority].spa_active++;
 338  349          if (spa->spa_iokstat != NULL)
 339      -                kstat_runq_enter(spa->spa_iokstat->ks_data);
      350 +                kstat_runq_enter_time(spa->spa_iokstat->ks_data, t);
      351 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      352 +                kstat_runq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 340  353          mutex_exit(&spa->spa_iokstat_lock);
 341  354  }
 342  355  
 343  356  static void
 344  357  vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 345  358  {
 346  359          spa_t *spa = zio->io_spa;
      360 +        hrtime_t t = gethrtime_unscaled();
      361 +
 347  362          ASSERT(MUTEX_HELD(&vq->vq_lock));
 348  363          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 349  364          vq->vq_class[zio->io_priority].vqc_active--;
 350  365          avl_remove(&vq->vq_active_tree, zio);
 351  366  
 352      -        mutex_enter(&spa->spa_iokstat_lock);
 353  367          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
 354      -        spa->spa_queue_stats[zio->io_priority].spa_active--;
      368 +        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
      369 +
      370 +        mutex_enter(&spa->spa_iokstat_lock);
 355  371          if (spa->spa_iokstat != NULL) {
 356  372                  kstat_io_t *ksio = spa->spa_iokstat->ks_data;
 357  373  
 358      -                kstat_runq_exit(spa->spa_iokstat->ks_data);
      374 +                kstat_runq_exit_time(spa->spa_iokstat->ks_data, t);
 359  375                  if (zio->io_type == ZIO_TYPE_READ) {
 360  376                          ksio->reads++;
 361  377                          ksio->nread += zio->io_size;
 362  378                  } else if (zio->io_type == ZIO_TYPE_WRITE) {
 363  379                          ksio->writes++;
 364  380                          ksio->nwritten += zio->io_size;
 365  381                  }
 366  382          }
      383 +
      384 +        if (vq->vq_vdev->vdev_iokstat != NULL) {
      385 +                kstat_io_t *ksio = vq->vq_vdev->vdev_iokstat->ks_data;
      386 +
      387 +                kstat_runq_exit_time(ksio, t);
      388 +                if (zio->io_type == ZIO_TYPE_READ) {
      389 +                        ksio->reads++;
      390 +                        ksio->nread += zio->io_size;
      391 +                } else if (zio->io_type == ZIO_TYPE_WRITE) {
      392 +                        ksio->writes++;
      393 +                        ksio->nwritten += zio->io_size;
      394 +                }
      395 +        }
 367  396          mutex_exit(&spa->spa_iokstat_lock);
 368  397  }
 369  398  
 370  399  static void
 371  400  vdev_queue_agg_io_done(zio_t *aio)
 372  401  {
 373  402          if (aio->io_type == ZIO_TYPE_READ) {
 374  403                  zio_t *pio;
 375  404                  zio_link_t *zl = NULL;
 376  405                  while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
 377  406                          abd_copy_off(pio->io_abd, aio->io_abd,
 378  407                              0, pio->io_offset - aio->io_offset, pio->io_size);
 379  408                  }
 380  409          }
 381  410  
 382  411          abd_free(aio->io_abd);
 383  412  }
 384  413  
      414 +static uint64_t
      415 +scan_prio2active(uint64_t prio, boolean_t max_active)
      416 +{
      417 +        uint64_t act, act_max;
      418 +
      419 +        if (max_active) {
      420 +                act_max = MAX(MAX(zfs_vdev_sync_read_max_active,
      421 +                    zfs_vdev_sync_write_max_active),
      422 +                    MAX(zfs_vdev_async_read_max_active,
      423 +                    zfs_vdev_async_write_max_active));
      424 +                act = ((prio * (zfs_vdev_sync_read_max_active +
      425 +                    zfs_vdev_sync_write_max_active +
      426 +                    zfs_vdev_async_read_max_active +
      427 +                    zfs_vdev_async_write_max_active)) / 100);
      428 +        } else {
      429 +                act_max = MAX(MAX(zfs_vdev_sync_read_min_active,
      430 +                    zfs_vdev_sync_write_min_active),
      431 +                    MAX(zfs_vdev_async_read_min_active,
      432 +                    zfs_vdev_async_write_min_active));
      433 +                act = ((prio * (zfs_vdev_sync_read_min_active +
      434 +                    zfs_vdev_sync_write_min_active +
      435 +                    zfs_vdev_async_read_min_active +
      436 +                    zfs_vdev_async_write_min_active)) / 100);
      437 +        }
      438 +        act = MAX(MIN(act, act_max), 1);
      439 +
      440 +        return (act);
      441 +}
      442 +
 385  443  static int
 386      -vdev_queue_class_min_active(zio_priority_t p)
      444 +vdev_queue_class_min_active(zio_priority_t p, vdev_queue_t *vq)
 387  445  {
      446 +        int zfs_min_active = 0;
      447 +        int vqc_min_active;
      448 +        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MIN(p);
      449 +
      450 +        ASSERT(VDEV_PROP_MIN_VALID(prop));
      451 +        vqc_min_active = vdev_queue_get_prop_uint64(vq, prop);
      452 +
 388  453          switch (p) {
 389  454          case ZIO_PRIORITY_SYNC_READ:
 390      -                return (zfs_vdev_sync_read_min_active);
      455 +                zfs_min_active = zfs_vdev_sync_read_min_active;
      456 +                break;
 391  457          case ZIO_PRIORITY_SYNC_WRITE:
 392      -                return (zfs_vdev_sync_write_min_active);
      458 +                zfs_min_active = zfs_vdev_sync_write_min_active;
      459 +                break;
 393  460          case ZIO_PRIORITY_ASYNC_READ:
 394      -                return (zfs_vdev_async_read_min_active);
      461 +                zfs_min_active = zfs_vdev_async_read_min_active;
      462 +                break;
 395  463          case ZIO_PRIORITY_ASYNC_WRITE:
 396      -                return (zfs_vdev_async_write_min_active);
 397      -        case ZIO_PRIORITY_SCRUB:
 398      -                return (zfs_vdev_scrub_min_active);
 399      -        case ZIO_PRIORITY_REMOVAL:
 400      -                return (zfs_vdev_removal_min_active);
      464 +                zfs_min_active = zfs_vdev_async_write_min_active;
      465 +                break;
      466 +        case ZIO_PRIORITY_RESILVER: {
      467 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
      468 +                if (prio > 0)
      469 +                        zfs_min_active = scan_prio2active(prio, B_FALSE);
      470 +                else
      471 +                        zfs_min_active = zfs_vdev_resilver_min_active;
      472 +                break;
      473 +        }
      474 +        case ZIO_PRIORITY_SCRUB: {
      475 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
      476 +                if (prio > 0)
      477 +                        zfs_min_active = scan_prio2active(prio, B_FALSE);
      478 +                else
      479 +                        zfs_min_active = zfs_vdev_scrub_min_active;
      480 +                break;
      481 +        }
 401  482          default:
 402  483                  panic("invalid priority %u", p);
 403  484                  return (0);
 404  485          }
      486 +
      487 +        /* zero vdev-specific setting means "use zfs global setting" */
      488 +        return ((vqc_min_active) ? vqc_min_active : zfs_min_active);
 405  489  }
 406  490  
 407  491  static int
 408      -vdev_queue_max_async_writes(spa_t *spa)
      492 +vdev_queue_max_async_writes(spa_t *spa, vdev_queue_t *vq)
 409  493  {
 410  494          int writes;
 411  495          uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
 412  496          uint64_t min_bytes = zfs_dirty_data_max *
 413  497              zfs_vdev_async_write_active_min_dirty_percent / 100;
 414  498          uint64_t max_bytes = zfs_dirty_data_max *
 415  499              zfs_vdev_async_write_active_max_dirty_percent / 100;
 416  500  
 417  501          /*
      502 +         * Vdev-specific properties override the global tunables; a zero
      503 +         * vdev-specific setting means fall back on the global value.
      504 +         */
      505 +        int vqc_min_active =
      506 +            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MINACTIVE);
      507 +        int min_active =
      508 +            (vqc_min_active) ? vqc_min_active : zfs_vdev_async_write_min_active;
      509 +        int vqc_max_active =
      510 +            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MAXACTIVE);
      511 +        int max_active =
      512 +            (vqc_max_active) ? vqc_max_active : zfs_vdev_async_write_max_active;
      513 +
      514 +        /*
 418  515           * Sync tasks correspond to interactive user actions. To reduce the
 419  516           * execution time of those actions we push data out as fast as possible.
 420  517           */
 421  518          if (spa_has_pending_synctask(spa)) {
 422  519                  return (zfs_vdev_async_write_max_active);
 423  520          }
 424  521  
 425  522          if (dirty < min_bytes)
 426      -                return (zfs_vdev_async_write_min_active);
      523 +                return (min_active);
 427  524          if (dirty > max_bytes)
 428      -                return (zfs_vdev_async_write_max_active);
      525 +                return (max_active);
 429  526  
 430  527          /*
 431  528           * linear interpolation:
 432  529           * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
 433  530           * move right by min_bytes
 434  531           * move up by min_writes
 435  532           */
 436      -        writes = (dirty - min_bytes) *
 437      -            (zfs_vdev_async_write_max_active -
 438      -            zfs_vdev_async_write_min_active) /
 439      -            (max_bytes - min_bytes) +
 440      -            zfs_vdev_async_write_min_active;
 441      -        ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
 442      -        ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
      533 +        writes = (dirty - min_bytes) * (max_active - min_active) /
      534 +            (max_bytes - min_bytes) + min_active;
      535 +        ASSERT3U(writes, >=, min_active);
      536 +        ASSERT3U(writes, <=, max_active);
 443  537          return (writes);
 444  538  }
 445  539  
 446  540  static int
 447      -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
      541 +vdev_queue_class_max_active(spa_t *spa, zio_priority_t p, vdev_queue_t *vq)
 448  542  {
      543 +        int zfs_max_active = 0;
      544 +        int vqc_max_active;
      545 +        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MAX(p);
      546 +
      547 +        ASSERT(VDEV_PROP_MAX_VALID(prop));
      548 +        vqc_max_active = vdev_queue_get_prop_uint64(vq, prop);
      549 +
 449  550          switch (p) {
 450  551          case ZIO_PRIORITY_SYNC_READ:
 451      -                return (zfs_vdev_sync_read_max_active);
      552 +                zfs_max_active = zfs_vdev_sync_read_max_active;
      553 +                break;
 452  554          case ZIO_PRIORITY_SYNC_WRITE:
 453      -                return (zfs_vdev_sync_write_max_active);
      555 +                zfs_max_active = zfs_vdev_sync_write_max_active;
      556 +                break;
 454  557          case ZIO_PRIORITY_ASYNC_READ:
 455      -                return (zfs_vdev_async_read_max_active);
      558 +                zfs_max_active = zfs_vdev_async_read_max_active;
      559 +                break;
 456  560          case ZIO_PRIORITY_ASYNC_WRITE:
 457      -                return (vdev_queue_max_async_writes(spa));
 458      -        case ZIO_PRIORITY_SCRUB:
 459      -                return (zfs_vdev_scrub_max_active);
 460      -        case ZIO_PRIORITY_REMOVAL:
 461      -                return (zfs_vdev_removal_max_active);
      561 +                /* takes into account vdev-specific props internally */
      562 +                vqc_max_active = vdev_queue_max_async_writes(spa, vq);
      563 +                ASSERT(vqc_max_active);
      564 +                break;
      565 +        case ZIO_PRIORITY_RESILVER: {
      566 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
      567 +                if (prio > 0)
      568 +                        zfs_max_active = scan_prio2active(prio, B_TRUE);
      569 +                else
      570 +                        zfs_max_active = zfs_vdev_resilver_max_active;
      571 +                break;
      572 +        }
      573 +        case ZIO_PRIORITY_SCRUB: {
      574 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
      575 +                if (prio > 0)
      576 +                        zfs_max_active = scan_prio2active(prio, B_TRUE);
      577 +                else
      578 +                        zfs_max_active = zfs_vdev_scrub_max_active;
      579 +                break;
      580 +        }
 462  581          default:
 463  582                  panic("invalid priority %u", p);
 464  583                  return (0);
 465  584          }
      585 +
      586 +        /* zero vdev-specific setting means "use zfs global setting" */
      587 +        return ((vqc_max_active) ? vqc_max_active : zfs_max_active);
 466  588  }
 467  589  
 468  590  /*
 469  591   * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
 470  592   * there is no eligible class.
 471  593   */
 472  594  static zio_priority_t
 473  595  vdev_queue_class_to_issue(vdev_queue_t *vq)
 474  596  {
 475  597          spa_t *spa = vq->vq_vdev->vdev_spa;
 476  598          zio_priority_t p;
 477  599  
 478  600          if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 479  601                  return (ZIO_PRIORITY_NUM_QUEUEABLE);
 480  602  
 481  603          /* find a queue that has not reached its minimum # outstanding i/os */
 482  604          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 483  605                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 484  606                      vq->vq_class[p].vqc_active <
 485      -                    vdev_queue_class_min_active(p))
      607 +                    vdev_queue_class_min_active(p, vq))
 486  608                          return (p);
 487  609          }
 488  610  
 489  611          /*
 490  612           * If we haven't found a queue, look for one that hasn't reached its
 491  613           * maximum # outstanding i/os.
 492  614           */
 493  615          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 494  616                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 495  617                      vq->vq_class[p].vqc_active <
 496      -                    vdev_queue_class_max_active(spa, p))
      618 +                    vdev_queue_class_max_active(spa, p, vq))
 497  619                          return (p);
 498  620          }
 499  621  
 500  622          /* No eligible queued i/os */
 501  623          return (ZIO_PRIORITY_NUM_QUEUEABLE);
 502  624  }
 503  625  
 504  626  /*
 505  627   * Compute the range spanned by two i/os, which is the endpoint of the last
 506  628   * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).

 507  629   * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 508  630   * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 509  631   */
 510  632  #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
 511  633  #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 512  634  
 513  635  static zio_t *
 514  636  vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 515  637  {
 516  638          zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 517  639          uint64_t maxgap = 0;
 518  640          uint64_t size;
 519  641          boolean_t stretch = B_FALSE;
 520  642          avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
 521  643          enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 522  644  
 523  645          if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 524  646                  return (NULL);
 525  647  
 526  648          first = last = zio;
 527  649  
 528  650          if (zio->io_type == ZIO_TYPE_READ)
 529  651                  maxgap = zfs_vdev_read_gap_limit;
 530  652  
 531  653          /*
 532  654           * We can aggregate I/Os that are sufficiently adjacent and of
 533  655           * the same flavor, as expressed by the AGG_INHERIT flags.
 534  656           * The latter requirement is necessary so that certain
 535  657           * attributes of the I/O, such as whether it's a normal I/O
 536  658           * or a scrub/resilver, can be preserved in the aggregate.
 537  659           * We can include optional I/Os, but don't allow them
 538  660           * to begin a range as they add no benefit in that situation.
 539  661           */
 540  662  
 541  663          /*
 542  664           * We keep track of the last non-optional I/O.

↓ open down ↓

36 lines elided

↑ open up ↑

 543  665           */
 544  666          mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 545  667  
 546  668          /*
 547  669           * Walk backwards through sufficiently contiguous I/Os
 548  670           * recording the last non-optional I/O.
 549  671           */
 550  672          while ((dio = AVL_PREV(t, first)) != NULL &&
 551  673              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 552  674              IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
 553      -            IO_GAP(dio, first) <= maxgap &&
 554      -            dio->io_type == zio->io_type) {
      675 +            IO_GAP(dio, first) <= maxgap) {
 555  676                  first = dio;
 556  677                  if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
 557  678                          mandatory = first;
 558  679          }
 559  680  
 560  681          /*
 561  682           * Skip any initial optional I/Os.
 562  683           */
 563  684          while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
 564  685                  first = AVL_NEXT(t, first);

 565  686                  ASSERT(first != NULL);
 566  687          }
 567  688

↓ open down ↓

3 lines elided

↑ open up ↑

 568  689          /*
 569  690           * Walk forward through sufficiently contiguous I/Os.
 570  691           * The aggregation limit does not apply to optional i/os, so that
 571  692           * we can issue contiguous writes even if they are larger than the
 572  693           * aggregation limit.
 573  694           */
 574  695          while ((dio = AVL_NEXT(t, last)) != NULL &&
 575  696              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 576  697              (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
 577  698              (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
 578      -            IO_GAP(last, dio) <= maxgap &&
 579      -            dio->io_type == zio->io_type) {
      699 +            IO_GAP(last, dio) <= maxgap) {
 580  700                  last = dio;
 581  701                  if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
 582  702                          mandatory = last;
 583  703          }
 584  704  
 585  705          /*
 586  706           * Now that we've established the range of the I/O aggregation
 587  707           * we must decide what to do with trailing optional I/Os.
 588  708           * For reads, there's nothing to do. While we are unable to
 589  709           * aggregate further, it's possible that a trailing optional

 590  710           * I/O would allow the underlying device to aggregate with
 591  711           * subsequent I/Os. We must therefore determine if the next
 592  712           * non-optional I/O is close enough to make aggregation
 593  713           * worthwhile.
 594  714           */
 595  715          if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 596  716                  zio_t *nio = last;
 597  717                  while ((dio = AVL_NEXT(t, nio)) != NULL &&
 598  718                      IO_GAP(nio, dio) == 0 &&
 599  719                      IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
 600  720                          nio = dio;
 601  721                          if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
 602  722                                  stretch = B_TRUE;
 603  723                                  break;
 604  724                          }
 605  725                  }
 606  726          }
 607  727  
 608  728          if (stretch) {
 609  729                  /*
 610  730                   * We are going to include an optional io in our aggregated
 611  731                   * span, thus closing the write gap.  Only mandatory i/os can
 612  732                   * start aggregated spans, so make sure that the next i/o
 613  733                   * after our span is mandatory.
 614  734                   */
 615  735                  dio = AVL_NEXT(t, last);
 616  736                  dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 617  737          } else {
 618  738                  /* do not include the optional i/o */
 619  739                  while (last != mandatory && last != first) {
 620  740                          ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
 621  741                          last = AVL_PREV(t, last);
 622  742                          ASSERT(last != NULL);
 623  743                  }
 624  744          }
 625  745  
 626  746          if (first == last)
 627  747                  return (NULL);
 628  748  
 629  749          size = IO_SPAN(first, last);
 630  750          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 631  751  
 632  752          aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 633  753              abd_alloc_for_io(size, B_TRUE), size, first->io_type,
 634  754              zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 635  755              vdev_queue_agg_io_done, NULL);
 636  756          aio->io_timestamp = first->io_timestamp;
 637  757  
 638  758          nio = first;
 639  759          do {
 640  760                  dio = nio;
 641  761                  nio = AVL_NEXT(t, dio);
 642  762                  ASSERT3U(dio->io_type, ==, aio->io_type);
 643  763  
 644  764                  if (dio->io_flags & ZIO_FLAG_NODATA) {
 645  765                          ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
 646  766                          abd_zero_off(aio->io_abd,
 647  767                              dio->io_offset - aio->io_offset, dio->io_size);
 648  768                  } else if (dio->io_type == ZIO_TYPE_WRITE) {
 649  769                          abd_copy_off(aio->io_abd, dio->io_abd,
 650  770                              dio->io_offset - aio->io_offset, 0, dio->io_size);
 651  771                  }
 652  772  
 653  773                  zio_add_child(dio, aio);
 654  774                  vdev_queue_io_remove(vq, dio);
 655  775                  zio_vdev_io_bypass(dio);
 656  776                  zio_execute(dio);
 657  777          } while (dio != last);
 658  778  
 659  779          return (aio);
 660  780  }
 661  781  
 662  782  static zio_t *
 663  783  vdev_queue_io_to_issue(vdev_queue_t *vq)
 664  784  {
 665  785          zio_t *zio, *aio;
 666  786          zio_priority_t p;
 667  787          avl_index_t idx;
 668  788          avl_tree_t *tree;
 669  789          zio_t search;
 670  790  
 671  791  again:
 672  792          ASSERT(MUTEX_HELD(&vq->vq_lock));
 673  793  
 674  794          p = vdev_queue_class_to_issue(vq);
 675  795  
 676  796          if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
 677  797                  /* No eligible queued i/os */
 678  798                  return (NULL);
 679  799          }
 680  800  
 681  801          /*
 682  802           * For LBA-ordered queues (async / scrub), issue the i/o which follows
 683  803           * the most recently issued i/o in LBA (offset) order.
 684  804           *
 685  805           * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 686  806           */
 687  807          tree = vdev_queue_class_tree(vq, p);
 688  808          search.io_timestamp = 0;
 689  809          search.io_offset = vq->vq_last_offset + 1;
 690  810          VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
 691  811          zio = avl_nearest(tree, idx, AVL_AFTER);
 692  812          if (zio == NULL)
 693  813                  zio = avl_first(tree);
 694  814          ASSERT3U(zio->io_priority, ==, p);
 695  815  
 696  816          aio = vdev_queue_aggregate(vq, zio);
 697  817          if (aio != NULL)
 698  818                  zio = aio;
 699  819          else
 700  820                  vdev_queue_io_remove(vq, zio);
 701  821  
 702  822          /*
 703  823           * If the I/O is or was optional and therefore has no data, we need to
 704  824           * simply discard it. We need to drop the vdev queue's lock to avoid a
 705  825           * deadlock that we could encounter since this I/O will complete
 706  826           * immediately.
 707  827           */
 708  828          if (zio->io_flags & ZIO_FLAG_NODATA) {
 709  829                  mutex_exit(&vq->vq_lock);
 710  830                  zio_vdev_io_bypass(zio);
 711  831                  zio_execute(zio);
 712  832                  mutex_enter(&vq->vq_lock);
 713  833                  goto again;
 714  834          }
 715  835  
 716  836          vdev_queue_pending_add(vq, zio);
 717  837          vq->vq_last_offset = zio->io_offset;
 718  838  
 719  839          return (zio);
 720  840  }
 721  841  
 722  842  zio_t *
 723  843  vdev_queue_io(zio_t *zio)
 724  844  {
 725  845          vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 726  846          zio_t *nio;
 727  847

↓ open down ↓

138 lines elided

↑ open up ↑

 728  848          if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 729  849                  return (zio);
 730  850  
 731  851          /*
 732  852           * Children i/os inherent their parent's priority, which might
 733  853           * not match the child's i/o type.  Fix it up here.
 734  854           */
 735  855          if (zio->io_type == ZIO_TYPE_READ) {
 736  856                  if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 737  857                      zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 738      -                    zio->io_priority != ZIO_PRIORITY_SCRUB &&
 739      -                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
      858 +                    zio->io_priority != ZIO_PRIORITY_SCRUB)
 740  859                          zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 741  860          } else {
 742  861                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 743  862                  if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 744      -                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
 745      -                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
      863 +                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
 746  864                          zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 747  865          }
 748  866  
 749  867          zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 750  868  
 751  869          mutex_enter(&vq->vq_lock);
 752  870          zio->io_timestamp = gethrtime();
 753  871          vdev_queue_io_add(vq, zio);
 754  872          nio = vdev_queue_io_to_issue(vq);
 755  873          mutex_exit(&vq->vq_lock);

 756  874  
 757  875          if (nio == NULL)
 758  876                  return (NULL);
 759  877  
 760  878          if (nio->io_done == vdev_queue_agg_io_done) {
 761  879                  zio_nowait(nio);
 762  880                  return (NULL);
 763  881          }
 764  882  
 765  883          return (nio);
 766  884  }
 767  885  
 768  886  void
 769  887  vdev_queue_io_done(zio_t *zio)
 770  888  {
 771  889          vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 772  890          zio_t *nio;
 773  891  
 774  892          mutex_enter(&vq->vq_lock);
 775  893  
 776  894          vdev_queue_pending_remove(vq, zio);
 777  895  
 778  896          vq->vq_io_complete_ts = gethrtime();
 779  897  
 780  898          while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 781  899                  mutex_exit(&vq->vq_lock);

↓ open down ↓

26 lines elided

↑ open up ↑

 782  900                  if (nio->io_done == vdev_queue_agg_io_done) {
 783  901                          zio_nowait(nio);
 784  902                  } else {
 785  903                          zio_vdev_io_reissue(nio);
 786  904                          zio_execute(nio);
 787  905                  }
 788  906                  mutex_enter(&vq->vq_lock);
 789  907          }
 790  908  
 791  909          mutex_exit(&vq->vq_lock);
      910 +}
      911 +
      912 +uint64_t
      913 +vdev_queue_get_prop_uint64(vdev_queue_t *vq, vdev_prop_t p)
      914 +{
      915 +        uint64_t val = 0;
      916 +        int zprio = 0;
      917 +        cos_t *cos = vq->vq_cos;
      918 +
      919 +        switch (p) {
      920 +        case VDEV_PROP_READ_MINACTIVE:
      921 +        case VDEV_PROP_AREAD_MINACTIVE:
      922 +        case VDEV_PROP_WRITE_MINACTIVE:
      923 +        case VDEV_PROP_AWRITE_MINACTIVE:
      924 +        case VDEV_PROP_SCRUB_MINACTIVE:
      925 +        case VDEV_PROP_RESILVER_MINACTIVE:
      926 +                zprio = VDEV_PROP_TO_ZIO_PRIO_MIN(p);
      927 +                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
      928 +                if (vq->vq_cos != NULL) {
      929 +                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MIN(zprio);
      930 +                        ASSERT(COS_PROP_MIN_VALID(p));
      931 +                        val = cos_get_prop_uint64(vq->vq_cos, p);
      932 +                }
      933 +                if (val == 0)
      934 +                        val = vq->vq_class[zprio].vqc_min_active;
      935 +                break;
      936 +        case VDEV_PROP_READ_MAXACTIVE:
      937 +        case VDEV_PROP_AREAD_MAXACTIVE:
      938 +        case VDEV_PROP_WRITE_MAXACTIVE:
      939 +        case VDEV_PROP_AWRITE_MAXACTIVE:
      940 +        case VDEV_PROP_SCRUB_MAXACTIVE:
      941 +        case VDEV_PROP_RESILVER_MAXACTIVE:
      942 +                zprio = VDEV_PROP_TO_ZIO_PRIO_MAX(p);
      943 +                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
      944 +                if (vq->vq_cos != NULL) {
      945 +                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MAX(zprio);
      946 +                        ASSERT(COS_PROP_MAX_VALID(p));
      947 +                        val = cos_get_prop_uint64(vq->vq_cos, p);
      948 +                }
      949 +                if (val == 0)
      950 +                        val = vq->vq_class[zprio].vqc_max_active;
      951 +                break;
      952 +        case VDEV_PROP_PREFERRED_READ:
      953 +                if (vq->vq_cos != NULL)
      954 +                        val = cos_get_prop_uint64(vq->vq_cos,
      955 +                            COS_PROP_PREFERRED_READ);
      956 +                if (val == 0)
      957 +                        val = vq->vq_preferred_read;
      958 +                break;
      959 +        default:
      960 +                panic("Non-numeric property requested\n");
      961 +                return (0);
      962 +        }
      963 +
      964 +        VERIFY(cos == vq->vq_cos);
      965 +
      966 +        return (val);
 792  967  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX