Print this page
NEX-18069 Unable to get/set VDEV_PROP_RESILVER_MAXACTIVE/VDEV_PROP_RESILVER_MINACTIVE props
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9552 zfs_scan_idle throttling harms performance and needs to be removed
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-13937 Improve kstat performance
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
NEX-3558 KRRP Integration
OS-103 handle CoS descriptor persistent references across vdev operations
OS-80 support for vdev and CoS properties for the new I/O scheduler
OS-95 lint warning introduced by OS-61
re #12643 rb4064 ZFS meta refactoring - vdev utilization tracking, auto-dedup
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/vdev_queue.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_queue.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  /*
  27   28   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28   29   * Copyright (c) 2014 Integros [integros.com]
  29   30   */
  30   31  
  31   32  #include <sys/zfs_context.h>
  32   33  #include <sys/vdev_impl.h>
       34 +#include <sys/cos.h>
  33   35  #include <sys/spa_impl.h>
  34   36  #include <sys/zio.h>
  35   37  #include <sys/avl.h>
  36   38  #include <sys/dsl_pool.h>
  37   39  #include <sys/metaslab_impl.h>
  38   40  #include <sys/abd.h>
  39   41  
  40   42  /*
  41   43   * ZFS I/O Scheduler
  42   44   * ---------------
↓ open down ↓ 96 lines elided ↑ open up ↑
 139  141   * throughput.
 140  142   */
 141  143  uint32_t zfs_vdev_sync_read_min_active = 10;
 142  144  uint32_t zfs_vdev_sync_read_max_active = 10;
 143  145  uint32_t zfs_vdev_sync_write_min_active = 10;
 144  146  uint32_t zfs_vdev_sync_write_max_active = 10;
 145  147  uint32_t zfs_vdev_async_read_min_active = 1;
 146  148  uint32_t zfs_vdev_async_read_max_active = 3;
 147  149  uint32_t zfs_vdev_async_write_min_active = 1;
 148  150  uint32_t zfs_vdev_async_write_max_active = 10;
      151 +uint32_t zfs_vdev_resilver_min_active = 1;
      152 +uint32_t zfs_vdev_resilver_max_active = 3;
 149  153  uint32_t zfs_vdev_scrub_min_active = 1;
 150  154  uint32_t zfs_vdev_scrub_max_active = 2;
 151      -uint32_t zfs_vdev_removal_min_active = 1;
 152      -uint32_t zfs_vdev_removal_max_active = 2;
 153  155  
 154  156  /*
 155  157   * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
 156  158   * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
 157  159   * zfs_vdev_async_write_active_max_dirty_percent, use
 158  160   * zfs_vdev_async_write_max_active. The value is linearly interpolated
 159  161   * between min and max.
 160  162   */
 161  163  int zfs_vdev_async_write_active_min_dirty_percent = 30;
 162  164  int zfs_vdev_async_write_active_max_dirty_percent = 60;
↓ open down ↓ 125 lines elided ↑ open up ↑
 288  290          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 289  291          avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 290  292  
 291  293          mutex_destroy(&vq->vq_lock);
 292  294  }
 293  295  
 294  296  static void
 295  297  vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 296  298  {
 297  299          spa_t *spa = zio->io_spa;
      300 +        hrtime_t t = gethrtime_unscaled();
 298  301  
 299  302          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 300  303          avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 301  304          avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 302  305  
      306 +        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
 303  307          mutex_enter(&spa->spa_iokstat_lock);
 304      -        spa->spa_queue_stats[zio->io_priority].spa_queued++;
 305  308          if (spa->spa_iokstat != NULL)
 306      -                kstat_waitq_enter(spa->spa_iokstat->ks_data);
      309 +                kstat_waitq_enter_time(spa->spa_iokstat->ks_data, t);
      310 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      311 +                kstat_waitq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 307  312          mutex_exit(&spa->spa_iokstat_lock);
 308  313  }
 309  314  
 310  315  static void
 311  316  vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 312  317  {
 313  318          spa_t *spa = zio->io_spa;
      319 +        hrtime_t t = gethrtime_unscaled();
 314  320  
 315  321          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 316  322          avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 317  323          avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 318  324  
 319      -        mutex_enter(&spa->spa_iokstat_lock);
 320  325          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
 321      -        spa->spa_queue_stats[zio->io_priority].spa_queued--;
      326 +        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_queued);
      327 +
      328 +        mutex_enter(&spa->spa_iokstat_lock);
 322  329          if (spa->spa_iokstat != NULL)
 323      -                kstat_waitq_exit(spa->spa_iokstat->ks_data);
      330 +                kstat_waitq_exit_time(spa->spa_iokstat->ks_data, t);
      331 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      332 +                kstat_waitq_exit_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 324  333          mutex_exit(&spa->spa_iokstat_lock);
 325  334  }
 326  335  
 327  336  static void
 328  337  vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 329  338  {
 330  339          spa_t *spa = zio->io_spa;
      340 +        hrtime_t t = gethrtime_unscaled();
      341 +
 331  342          ASSERT(MUTEX_HELD(&vq->vq_lock));
 332  343          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 333  344          vq->vq_class[zio->io_priority].vqc_active++;
 334  345          avl_add(&vq->vq_active_tree, zio);
 335  346  
      347 +        atomic_inc_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
 336  348          mutex_enter(&spa->spa_iokstat_lock);
 337      -        spa->spa_queue_stats[zio->io_priority].spa_active++;
 338  349          if (spa->spa_iokstat != NULL)
 339      -                kstat_runq_enter(spa->spa_iokstat->ks_data);
      350 +                kstat_runq_enter_time(spa->spa_iokstat->ks_data, t);
      351 +        if (vq->vq_vdev->vdev_iokstat != NULL)
      352 +                kstat_runq_enter_time(vq->vq_vdev->vdev_iokstat->ks_data, t);
 340  353          mutex_exit(&spa->spa_iokstat_lock);
 341  354  }
 342  355  
 343  356  static void
 344  357  vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 345  358  {
 346  359          spa_t *spa = zio->io_spa;
      360 +        hrtime_t t = gethrtime_unscaled();
      361 +
 347  362          ASSERT(MUTEX_HELD(&vq->vq_lock));
 348  363          ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 349  364          vq->vq_class[zio->io_priority].vqc_active--;
 350  365          avl_remove(&vq->vq_active_tree, zio);
 351  366  
 352      -        mutex_enter(&spa->spa_iokstat_lock);
 353  367          ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
 354      -        spa->spa_queue_stats[zio->io_priority].spa_active--;
      368 +        atomic_dec_64(&spa->spa_queue_stats[zio->io_priority].spa_active);
      369 +
      370 +        mutex_enter(&spa->spa_iokstat_lock);
 355  371          if (spa->spa_iokstat != NULL) {
 356  372                  kstat_io_t *ksio = spa->spa_iokstat->ks_data;
 357  373  
 358      -                kstat_runq_exit(spa->spa_iokstat->ks_data);
      374 +                kstat_runq_exit_time(spa->spa_iokstat->ks_data, t);
 359  375                  if (zio->io_type == ZIO_TYPE_READ) {
 360  376                          ksio->reads++;
 361  377                          ksio->nread += zio->io_size;
 362  378                  } else if (zio->io_type == ZIO_TYPE_WRITE) {
 363  379                          ksio->writes++;
 364  380                          ksio->nwritten += zio->io_size;
 365  381                  }
 366  382          }
      383 +
      384 +        if (vq->vq_vdev->vdev_iokstat != NULL) {
      385 +                kstat_io_t *ksio = vq->vq_vdev->vdev_iokstat->ks_data;
      386 +
      387 +                kstat_runq_exit_time(ksio, t);
      388 +                if (zio->io_type == ZIO_TYPE_READ) {
      389 +                        ksio->reads++;
      390 +                        ksio->nread += zio->io_size;
      391 +                } else if (zio->io_type == ZIO_TYPE_WRITE) {
      392 +                        ksio->writes++;
      393 +                        ksio->nwritten += zio->io_size;
      394 +                }
      395 +        }
 367  396          mutex_exit(&spa->spa_iokstat_lock);
 368  397  }
 369  398  
 370  399  static void
 371  400  vdev_queue_agg_io_done(zio_t *aio)
 372  401  {
 373  402          if (aio->io_type == ZIO_TYPE_READ) {
 374  403                  zio_t *pio;
 375  404                  zio_link_t *zl = NULL;
 376  405                  while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
 377  406                          abd_copy_off(pio->io_abd, aio->io_abd,
 378  407                              0, pio->io_offset - aio->io_offset, pio->io_size);
 379  408                  }
 380  409          }
 381  410  
 382  411          abd_free(aio->io_abd);
 383  412  }
 384  413  
      414 +static uint64_t
      415 +scan_prio2active(uint64_t prio, boolean_t max_active)
      416 +{
      417 +        uint64_t act, act_max;
      418 +
      419 +        if (max_active) {
      420 +                act_max = MAX(MAX(zfs_vdev_sync_read_max_active,
      421 +                    zfs_vdev_sync_write_max_active),
      422 +                    MAX(zfs_vdev_async_read_max_active,
      423 +                    zfs_vdev_async_write_max_active));
      424 +                act = ((prio * (zfs_vdev_sync_read_max_active +
      425 +                    zfs_vdev_sync_write_max_active +
      426 +                    zfs_vdev_async_read_max_active +
      427 +                    zfs_vdev_async_write_max_active)) / 100);
      428 +        } else {
      429 +                act_max = MAX(MAX(zfs_vdev_sync_read_min_active,
      430 +                    zfs_vdev_sync_write_min_active),
      431 +                    MAX(zfs_vdev_async_read_min_active,
      432 +                    zfs_vdev_async_write_min_active));
      433 +                act = ((prio * (zfs_vdev_sync_read_min_active +
      434 +                    zfs_vdev_sync_write_min_active +
      435 +                    zfs_vdev_async_read_min_active +
      436 +                    zfs_vdev_async_write_min_active)) / 100);
      437 +        }
      438 +        act = MAX(MIN(act, act_max), 1);
      439 +
      440 +        return (act);
      441 +}
      442 +
 385  443  static int
 386      -vdev_queue_class_min_active(zio_priority_t p)
      444 +vdev_queue_class_min_active(zio_priority_t p, vdev_queue_t *vq)
 387  445  {
      446 +        int zfs_min_active = 0;
      447 +        int vqc_min_active;
      448 +        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MIN(p);
      449 +
      450 +        ASSERT(VDEV_PROP_MIN_VALID(prop));
      451 +        vqc_min_active = vdev_queue_get_prop_uint64(vq, prop);
      452 +
 388  453          switch (p) {
 389  454          case ZIO_PRIORITY_SYNC_READ:
 390      -                return (zfs_vdev_sync_read_min_active);
      455 +                zfs_min_active = zfs_vdev_sync_read_min_active;
      456 +                break;
 391  457          case ZIO_PRIORITY_SYNC_WRITE:
 392      -                return (zfs_vdev_sync_write_min_active);
      458 +                zfs_min_active = zfs_vdev_sync_write_min_active;
      459 +                break;
 393  460          case ZIO_PRIORITY_ASYNC_READ:
 394      -                return (zfs_vdev_async_read_min_active);
      461 +                zfs_min_active = zfs_vdev_async_read_min_active;
      462 +                break;
 395  463          case ZIO_PRIORITY_ASYNC_WRITE:
 396      -                return (zfs_vdev_async_write_min_active);
 397      -        case ZIO_PRIORITY_SCRUB:
 398      -                return (zfs_vdev_scrub_min_active);
 399      -        case ZIO_PRIORITY_REMOVAL:
 400      -                return (zfs_vdev_removal_min_active);
      464 +                zfs_min_active = zfs_vdev_async_write_min_active;
      465 +                break;
      466 +        case ZIO_PRIORITY_RESILVER: {
      467 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
      468 +                if (prio > 0)
      469 +                        zfs_min_active = scan_prio2active(prio, B_FALSE);
      470 +                else
      471 +                        zfs_min_active = zfs_vdev_resilver_min_active;
      472 +                break;
      473 +        }
      474 +        case ZIO_PRIORITY_SCRUB: {
      475 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
      476 +                if (prio > 0)
      477 +                        zfs_min_active = scan_prio2active(prio, B_FALSE);
      478 +                else
      479 +                        zfs_min_active = zfs_vdev_scrub_min_active;
      480 +                break;
      481 +        }
 401  482          default:
 402  483                  panic("invalid priority %u", p);
 403  484                  return (0);
 404  485          }
      486 +
      487 +        /* zero vdev-specific setting means "use zfs global setting" */
      488 +        return ((vqc_min_active) ? vqc_min_active : zfs_min_active);
 405  489  }
 406  490  
 407  491  static int
 408      -vdev_queue_max_async_writes(spa_t *spa)
      492 +vdev_queue_max_async_writes(spa_t *spa, vdev_queue_t *vq)
 409  493  {
 410  494          int writes;
 411  495          uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
 412  496          uint64_t min_bytes = zfs_dirty_data_max *
 413  497              zfs_vdev_async_write_active_min_dirty_percent / 100;
 414  498          uint64_t max_bytes = zfs_dirty_data_max *
 415  499              zfs_vdev_async_write_active_max_dirty_percent / 100;
 416  500  
 417  501          /*
      502 +         * Vdev-specific properties override the global tunables; a
      503 +         * zero vdev-specific setting means "fall back on the global".
      504 +         */
      505 +        int vqc_min_active =
      506 +            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MINACTIVE);
      507 +        int min_active =
      508 +            (vqc_min_active) ? vqc_min_active : zfs_vdev_async_write_min_active;
      509 +        int vqc_max_active =
      510 +            vdev_queue_get_prop_uint64(vq, VDEV_PROP_AWRITE_MAXACTIVE);
      511 +        int max_active =
      512 +            (vqc_max_active) ? vqc_max_active : zfs_vdev_async_write_max_active;
      513 +
      514 +        /*
 418  515           * Sync tasks correspond to interactive user actions. To reduce the
 419  516           * execution time of those actions we push data out as fast as possible.
 420  517           */
 421  518          if (spa_has_pending_synctask(spa)) {
 422  519                  return (zfs_vdev_async_write_max_active);
 423  520          }
 424  521  
 425  522          if (dirty < min_bytes)
 426      -                return (zfs_vdev_async_write_min_active);
      523 +                return (min_active);
 427  524          if (dirty > max_bytes)
 428      -                return (zfs_vdev_async_write_max_active);
      525 +                return (max_active);
 429  526  
 430  527          /*
 431  528           * linear interpolation:
 432  529           * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
 433  530           * move right by min_bytes
 434  531           * move up by min_writes
 435  532           */
 436      -        writes = (dirty - min_bytes) *
 437      -            (zfs_vdev_async_write_max_active -
 438      -            zfs_vdev_async_write_min_active) /
 439      -            (max_bytes - min_bytes) +
 440      -            zfs_vdev_async_write_min_active;
 441      -        ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
 442      -        ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
      533 +        writes = (dirty - min_bytes) * (max_active - min_active) /
      534 +            (max_bytes - min_bytes) + min_active;
      535 +        ASSERT3U(writes, >=, min_active);
      536 +        ASSERT3U(writes, <=, max_active);
 443  537          return (writes);
 444  538  }
 445  539  
 446  540  static int
 447      -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
      541 +vdev_queue_class_max_active(spa_t *spa, zio_priority_t p, vdev_queue_t *vq)
 448  542  {
      543 +        int zfs_max_active = 0;
      544 +        int vqc_max_active;
      545 +        vdev_prop_t prop = VDEV_ZIO_PRIO_TO_PROP_MAX(p);
      546 +
      547 +        ASSERT(VDEV_PROP_MAX_VALID(prop));
      548 +        vqc_max_active = vdev_queue_get_prop_uint64(vq, prop);
      549 +
 449  550          switch (p) {
 450  551          case ZIO_PRIORITY_SYNC_READ:
 451      -                return (zfs_vdev_sync_read_max_active);
      552 +                zfs_max_active = zfs_vdev_sync_read_max_active;
      553 +                break;
 452  554          case ZIO_PRIORITY_SYNC_WRITE:
 453      -                return (zfs_vdev_sync_write_max_active);
      555 +                zfs_max_active = zfs_vdev_sync_write_max_active;
      556 +                break;
 454  557          case ZIO_PRIORITY_ASYNC_READ:
 455      -                return (zfs_vdev_async_read_max_active);
      558 +                zfs_max_active = zfs_vdev_async_read_max_active;
      559 +                break;
 456  560          case ZIO_PRIORITY_ASYNC_WRITE:
 457      -                return (vdev_queue_max_async_writes(spa));
 458      -        case ZIO_PRIORITY_SCRUB:
 459      -                return (zfs_vdev_scrub_max_active);
 460      -        case ZIO_PRIORITY_REMOVAL:
 461      -                return (zfs_vdev_removal_max_active);
      561 +                /* takes into account vdev-specific props internally */
      562 +                vqc_max_active = vdev_queue_max_async_writes(spa, vq);
      563 +                ASSERT(vqc_max_active);
      564 +                break;
      565 +        case ZIO_PRIORITY_RESILVER: {
      566 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_resilver_prio;
      567 +                if (prio > 0)
      568 +                        zfs_max_active = scan_prio2active(prio, B_TRUE);
      569 +                else
      570 +                        zfs_max_active = zfs_vdev_resilver_max_active;
      571 +                break;
      572 +        }
      573 +        case ZIO_PRIORITY_SCRUB: {
      574 +                uint64_t prio = vq->vq_vdev->vdev_spa->spa_scrub_prio;
      575 +                if (prio > 0)
      576 +                        zfs_max_active = scan_prio2active(prio, B_TRUE);
      577 +                else
      578 +                        zfs_max_active = zfs_vdev_scrub_max_active;
      579 +                break;
      580 +        }
 462  581          default:
 463  582                  panic("invalid priority %u", p);
 464  583                  return (0);
 465  584          }
      585 +
      586 +        /* zero vdev-specific setting means "use zfs global setting" */
      587 +        return ((vqc_max_active) ? vqc_max_active : zfs_max_active);
 466  588  }
 467  589  
 468  590  /*
 469  591   * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
 470  592   * there is no eligible class.
 471  593   */
 472  594  static zio_priority_t
 473  595  vdev_queue_class_to_issue(vdev_queue_t *vq)
 474  596  {
 475  597          spa_t *spa = vq->vq_vdev->vdev_spa;
 476  598          zio_priority_t p;
 477  599  
 478  600          if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 479  601                  return (ZIO_PRIORITY_NUM_QUEUEABLE);
 480  602  
 481  603          /* find a queue that has not reached its minimum # outstanding i/os */
 482  604          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 483  605                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 484  606                      vq->vq_class[p].vqc_active <
 485      -                    vdev_queue_class_min_active(p))
      607 +                    vdev_queue_class_min_active(p, vq))
 486  608                          return (p);
 487  609          }
 488  610  
 489  611          /*
 490  612           * If we haven't found a queue, look for one that hasn't reached its
 491  613           * maximum # outstanding i/os.
 492  614           */
 493  615          for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 494  616                  if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 495  617                      vq->vq_class[p].vqc_active <
 496      -                    vdev_queue_class_max_active(spa, p))
      618 +                    vdev_queue_class_max_active(spa, p, vq))
 497  619                          return (p);
 498  620          }
 499  621  
 500  622          /* No eligible queued i/os */
 501  623          return (ZIO_PRIORITY_NUM_QUEUEABLE);
 502  624  }
 503  625  
 504  626  /*
 505  627   * Compute the range spanned by two i/os, which is the endpoint of the last
 506  628   * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
↓ open down ↓ 36 lines elided ↑ open up ↑
 543  665           */
 544  666          mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 545  667  
 546  668          /*
 547  669           * Walk backwards through sufficiently contiguous I/Os
 548  670           * recording the last non-optional I/O.
 549  671           */
 550  672          while ((dio = AVL_PREV(t, first)) != NULL &&
 551  673              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 552  674              IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
 553      -            IO_GAP(dio, first) <= maxgap &&
 554      -            dio->io_type == zio->io_type) {
      675 +            IO_GAP(dio, first) <= maxgap) {
 555  676                  first = dio;
 556  677                  if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
 557  678                          mandatory = first;
 558  679          }
 559  680  
 560  681          /*
 561  682           * Skip any initial optional I/Os.
 562  683           */
 563  684          while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
 564  685                  first = AVL_NEXT(t, first);
↓ open down ↓ 3 lines elided ↑ open up ↑
 568  689          /*
 569  690           * Walk forward through sufficiently contiguous I/Os.
 570  691           * The aggregation limit does not apply to optional i/os, so that
 571  692           * we can issue contiguous writes even if they are larger than the
 572  693           * aggregation limit.
 573  694           */
 574  695          while ((dio = AVL_NEXT(t, last)) != NULL &&
 575  696              (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 576  697              (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
 577  698              (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
 578      -            IO_GAP(last, dio) <= maxgap &&
 579      -            dio->io_type == zio->io_type) {
      699 +            IO_GAP(last, dio) <= maxgap) {
 580  700                  last = dio;
 581  701                  if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
 582  702                          mandatory = last;
 583  703          }
 584  704  
 585  705          /*
 586  706           * Now that we've established the range of the I/O aggregation
 587  707           * we must decide what to do with trailing optional I/Os.
 588  708           * For reads, there's nothing to do. While we are unable to
 589  709           * aggregate further, it's possible that a trailing optional
↓ open down ↓ 138 lines elided ↑ open up ↑
 728  848          if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 729  849                  return (zio);
 730  850  
 731  851          /*
 732  852           * Children i/os inherit their parent's priority, which might
 733  853           * not match the child's i/o type.  Fix it up here.
 734  854           */
 735  855          if (zio->io_type == ZIO_TYPE_READ) {
 736  856                  if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 737  857                      zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 738      -                    zio->io_priority != ZIO_PRIORITY_SCRUB &&
 739      -                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
      858 +                    zio->io_priority != ZIO_PRIORITY_SCRUB)
 740  859                          zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 741  860          } else {
 742  861                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 743  862                  if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 744      -                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
 745      -                    zio->io_priority != ZIO_PRIORITY_REMOVAL)
      863 +                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
 746  864                          zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 747  865          }
 748  866  
 749  867          zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 750  868  
 751  869          mutex_enter(&vq->vq_lock);
 752  870          zio->io_timestamp = gethrtime();
 753  871          vdev_queue_io_add(vq, zio);
 754  872          nio = vdev_queue_io_to_issue(vq);
 755  873          mutex_exit(&vq->vq_lock);
↓ open down ↓ 26 lines elided ↑ open up ↑
 782  900                  if (nio->io_done == vdev_queue_agg_io_done) {
 783  901                          zio_nowait(nio);
 784  902                  } else {
 785  903                          zio_vdev_io_reissue(nio);
 786  904                          zio_execute(nio);
 787  905                  }
 788  906                  mutex_enter(&vq->vq_lock);
 789  907          }
 790  908  
 791  909          mutex_exit(&vq->vq_lock);
      910 +}
      911 +
      912 +uint64_t
      913 +vdev_queue_get_prop_uint64(vdev_queue_t *vq, vdev_prop_t p)
      914 +{
      915 +        uint64_t val = 0;
      916 +        int zprio = 0;
      917 +        cos_t *cos = vq->vq_cos;
      918 +
      919 +        switch (p) {
      920 +        case VDEV_PROP_READ_MINACTIVE:
      921 +        case VDEV_PROP_AREAD_MINACTIVE:
      922 +        case VDEV_PROP_WRITE_MINACTIVE:
      923 +        case VDEV_PROP_AWRITE_MINACTIVE:
      924 +        case VDEV_PROP_SCRUB_MINACTIVE:
      925 +        case VDEV_PROP_RESILVER_MINACTIVE:
      926 +                zprio = VDEV_PROP_TO_ZIO_PRIO_MIN(p);
      927 +                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
      928 +                if (vq->vq_cos != NULL) {
      929 +                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MIN(zprio);
      930 +                        ASSERT(COS_PROP_MIN_VALID(p));
      931 +                        val = cos_get_prop_uint64(vq->vq_cos, p);
      932 +                }
      933 +                if (val == 0)
      934 +                        val = vq->vq_class[zprio].vqc_min_active;
      935 +                break;
      936 +        case VDEV_PROP_READ_MAXACTIVE:
      937 +        case VDEV_PROP_AREAD_MAXACTIVE:
      938 +        case VDEV_PROP_WRITE_MAXACTIVE:
      939 +        case VDEV_PROP_AWRITE_MAXACTIVE:
      940 +        case VDEV_PROP_SCRUB_MAXACTIVE:
      941 +        case VDEV_PROP_RESILVER_MAXACTIVE:
      942 +                zprio = VDEV_PROP_TO_ZIO_PRIO_MAX(p);
      943 +                ASSERT(ZIO_PRIORITY_QUEUEABLE_VALID(zprio));
      944 +                if (vq->vq_cos != NULL) {
      945 +                        cos_prop_t p = COS_ZIO_PRIO_TO_PROP_MAX(zprio);
      946 +                        ASSERT(COS_PROP_MAX_VALID(p));
      947 +                        val = cos_get_prop_uint64(vq->vq_cos, p);
      948 +                }
      949 +                if (val == 0)
      950 +                        val = vq->vq_class[zprio].vqc_max_active;
      951 +                break;
      952 +        case VDEV_PROP_PREFERRED_READ:
      953 +                if (vq->vq_cos != NULL)
      954 +                        val = cos_get_prop_uint64(vq->vq_cos,
      955 +                            COS_PROP_PREFERRED_READ);
      956 +                if (val == 0)
      957 +                        val = vq->vq_preferred_read;
      958 +                break;
      959 +        default:
      960 +                panic("Non-numeric property requested\n");
      961 +                return (0);
      962 +        }
      963 +
      964 +        VERIFY(cos == vq->vq_cos);
      965 +
      966 +        return (val);
 792  967  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX