1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013, Joyent, Inc. All rights reserved.
14 */
15
16 /*
17 * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
18 * ZFS I/O resources for each zone.
19 *
* I/O contention can be a major pain point on a multi-tenant system. A single
21 * zone can issue a stream of I/O operations, usually synchronous writes, which
22 * disrupt I/O performance for all other zones. This problem is further
23 * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
24 * a set of blocks which are atomically synced to disk. The process of
25 * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
26 * out any pending read operations.
27 *
* There are two facets to this capability: the throttle and the scheduler.
29 *
30 * Throttle
31 *
32 * The requirements on the throttle are:
33 *
34 * 1) Ensure consistent and predictable I/O latency across all zones.
35 * 2) Sequential and random workloads have very different characteristics,
36 * so it is a non-starter to track IOPS or throughput.
37 * 3) A zone should be able to use the full disk bandwidth if no other zone
38 * is actively using the disk.
39 *
40 * The throttle has two components: one to track and account for each zone's
41 * I/O requests, and another to throttle each zone's operations when it
42 * exceeds its fair share of disk I/O. When the throttle detects that a zone is
43 * consuming more than is appropriate, each read or write system call is
44 * delayed by up to 100 microseconds, which we've found is sufficient to allow
45 * other zones to interleave I/O requests during those delays.
46 *
47 * Note: The throttle will delay each logical I/O (as opposed to the physical
48 * I/O which will likely be issued asynchronously), so it may be easier to
49 * think of the I/O throttle delaying each read/write syscall instead of the
50 * actual I/O operation. For each zone, the throttle tracks an ongoing average
51 * of read and write operations performed to determine the overall I/O
52 * utilization for each zone.
53 *
* The throttle calculates an I/O utilization metric for each zone using the
* following formula (a worked example follows this block comment):
56 *
57 * (# of read syscalls) x (Average read latency) +
58 * (# of write syscalls) x (Average write latency)
59 *
60 * Once each zone has its utilization metric, the I/O throttle will compare I/O
61 * utilization across all zones, and if a zone has a higher-than-average I/O
62 * utilization, system calls from that zone are throttled. That is, if one
63 * zone has a much higher utilization, that zone's delay is increased by 5
64 * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
65 * already throttled and has a lower utilization than average, its delay will
66 * be lowered by 5 microseconds.
67 *
68 * The throttle calculation is driven by IO activity, but since IO does not
69 * happen at fixed intervals, timestamps are used to track when the last update
70 * was made and to drive recalculation.
71 *
72 * The throttle recalculates each zone's I/O usage and throttle delay (if any)
73 * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
74 * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
75 *
76 * Scheduler
77 *
78 * The I/O scheduler manages the vdev queues, the queues of pending I/Os to
79 * issue to the disks. It only makes scheduling decisions for the two
80 * synchronous I/O queues (read & write).
81 *
* The scheduler tracks how many I/Os in the queue are from each zone, and
83 * if one zone has a disproportionately large number of I/Os in the queue, the
84 * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
85 * and pulled from the middle of the queue. This bump allows zones with a small
86 * number of I/Os (so small they may not even be taken into account by the
87 * throttle) to complete quickly instead of waiting behind dozens of I/Os from
88 * other zones.
89 */
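
/*
 * Illustrative example (hypothetical numbers, not measured data): suppose a
 * zone issued 200 read syscalls and 50 write syscalls over the last cycle,
 * and the decayed system-wide averages are 1200 usec per read and 800 usec
 * per write.  Its utilization metric works out to:
 *
 * (200 x 1200) + (50 x 800) = 280,000
 *
 * Note that the implementation (zfs_zone_wait_adjust_calculate_cb) also
 * charges logical write syscalls at the average write latency.  If the
 * metric is above the zone's fair share while the disk is overutilized, the
 * zone's per-syscall delay is stepped up by zfs_zone_delay_step usec toward
 * zfs_zone_delay_ceiling; if it is below its fair share, or the disk is
 * underutilized, the delay is stepped back down.
 */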
90
91 #include <sys/spa.h>
92 #include <sys/vdev_impl.h>
93 #include <sys/zfs_zone.h>
94
95 #ifndef _KERNEL
96
97 /*
98 * Stubs for when compiling for user-land.
99 */
100
101 void
102 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
103 {
104 }
105
106 void
107 zfs_zone_zio_init(zio_t *zp)
108 {
109 }
110
111 void
112 zfs_zone_zio_start(zio_t *zp)
113 {
114 }
115
116 void
117 zfs_zone_zio_done(zio_t *zp)
118 {
119 }
120
121 void
122 zfs_zone_zio_dequeue(zio_t *zp)
123 {
124 }
125
126 void
127 zfs_zone_zio_enqueue(zio_t *zp)
128 {
129 }
130
131 /*ARGSUSED*/
132 void
133 zfs_zone_report_txg_sync(void *dp)
134 {
135 }
136
137 hrtime_t
138 zfs_zone_txg_delay()
139 {
140 return (MSEC2NSEC(10));
141 }
142
143 #else
144
145 /*
146 * The real code.
147 */
148
149 #include <sys/systm.h>
150 #include <sys/thread.h>
151 #include <sys/proc.h>
152 #include <sys/types.h>
153 #include <sys/param.h>
154 #include <sys/time.h>
155 #include <sys/atomic.h>
156 #include <sys/zio.h>
157 #include <sys/zone.h>
158 #include <sys/avl.h>
159 #include <sys/sdt.h>
160 #include <sys/ddi.h>
161
162 /*
163 * The zone throttle delays read and write operations from certain zones based
* on each zone's IO utilization. Once per cycle (defined by zfs_zone_cycle_time
165 * below), the delays for each zone are recalculated based on the utilization
166 * over the previous window.
167 */
168 boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
uint16_t zfs_zone_delay_step = 5; /* usec amount to change delay */
170 uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
171
172 boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
173
174 /*
175 * For certain workloads, one zone may be issuing primarily sequential I/O and
176 * another primarily random I/O. The sequential I/O will complete much more
177 * quickly than the random I/O, driving the average system latency for those
178 * operations way down. As a result, the random I/O may be throttled back, even
179 * though the sequential I/O should be throttled to allow the random I/O more
180 * access to the disk.
181 *
182 * This tunable limits the discrepancy between the read and write system
183 * latency. If one becomes excessively high, this tunable prevents the I/O
184 * throttler from exacerbating the imbalance.
185 */
186 uint_t zfs_zone_rw_lat_limit = 10;
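
/*
 * For example (hypothetical latencies): with the default limit of 10, if the
 * average write latency is 2,000 usec and the average read latency spikes to
 * 50,000 usec, the read latency used in the utilization calculation is
 * clamped to 2,000 x 10 = 20,000 usec (see zfs_zone_wait_adjust()).
 */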
187
188 /*
189 * The I/O throttle will only start delaying zones when it detects disk
190 * utilization has reached a certain level. This tunable controls the
191 * threshold at which the throttle will start delaying zones. When the number
192 * of vdevs is small, the calculation should correspond closely with the %b
193 * column from iostat -- but as the number of vdevs becomes large, it will
194 * correlate less and less to any single device (therefore making it a poor
195 * approximation for the actual I/O utilization on such systems). We
196 * therefore use our derived utilization conservatively: we know that low
* derived utilization does indeed correlate to low I/O use -- but that a high
* rate of derived utilization does not, by itself, necessarily denote saturation;
199 * where we see a high rate of utilization, we also look for laggard I/Os to
200 * attempt to detect saturation.
201 */
202 uint_t zfs_zone_util_threshold = 80;
203 uint_t zfs_zone_underutil_threshold = 60;
204
205 /*
206 * There are three important tunables here: zfs_zone_laggard_threshold denotes
207 * the threshold at which an I/O is considered to be of notably high latency;
208 * zfs_zone_laggard_recent denotes the number of microseconds before the
209 * current time after which the last laggard is considered to be sufficiently
210 * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
211 * the microseconds before the current time before which the last laggard is
212 * considered to be sufficiently old to merit decreasing the throttle. The
213 * most important tunable of these three is the zfs_zone_laggard_threshold: in
214 * modeling data from a large public cloud, this tunable was found to have a
215 * much greater effect on the throttle than the two time-based thresholds.
216 * This must be set high enough to not result in spurious throttling, but not
217 * so high as to allow pathological I/O to persist in the system.
218 */
219 uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
220 uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
221 uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
222
223 /*
224 * Throughout this subsystem, our timestamps are in microseconds. Our system
225 * average cycle is one second or 1 million microseconds. Our zone counter
226 * update cycle is two seconds or 2 million microseconds. We use a longer
227 * duration for that cycle because some ops can see a little over two seconds of
228 * latency when they are being starved by another zone.
229 */
230 uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
231 uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
232
233 /*
234 * How often the I/O throttle will reevaluate each zone's utilization, in
235 * microseconds. Default is 1/4 sec.
236 */
237 uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
238
239 typedef struct {
240 hrtime_t cycle_start;
241 int cycle_cnt;
242 hrtime_t cycle_lat;
243 hrtime_t sys_avg_lat;
244 } sys_lat_cycle_t;
245
246 typedef struct {
247 hrtime_t zi_now;
248 uint_t zi_avgrlat;
249 uint_t zi_avgwlat;
250 uint64_t zi_totpri;
251 uint64_t zi_totutil;
252 int zi_active;
253 uint_t zi_diskutil;
254 boolean_t zi_underutil;
255 boolean_t zi_overutil;
256 } zoneio_stats_t;
257
258 static sys_lat_cycle_t rd_lat;
259 static sys_lat_cycle_t wr_lat;
260
261 /*
262 * Some basic disk stats to determine disk utilization. The utilization info
263 * for all disks on the system is aggregated into these values.
264 *
265 * Overall disk utilization for the current cycle is calculated as:
266 *
267 * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
268 * ----------------------------------------------
269 * ((now - zfs_zone_last_checked) * 1000);
270 */
271 kmutex_t zfs_disk_lock; /* protects the following: */
272 uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
274 hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
275
276 hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
277 /* time that we last updated per-zone throttle info */
278 hrtime_t zfs_zone_last_checked = 0;
279 hrtime_t zfs_disk_last_laggard = 0;
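
/*
 * Worked example (hypothetical values): zfs_disk_rtime is kept in nanoseconds
 * while the adjustment window is measured in microseconds, so if the disks
 * accumulated 200 ms of busy time (an rtime delta of 200,000,000 ns) over a
 * 250,000 usec window, the derived utilization is:
 *
 * (200000000 * 100) / (250000 * 1000) = 80
 *
 * i.e. 80% busy for that window.
 */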
280
281 /*
282 * Data used to keep track of how often txg sync is running.
283 */
284 extern int zfs_txg_timeout;
285 static uint_t txg_last_check;
286 static uint_t txg_cnt;
287 static uint_t txg_sync_rate;
288
289 boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
290 /*
291 * Threshold for when zio scheduling should kick in.
292 *
293 * This threshold is based on the zfs_vdev_sync_read_max_active value for the
294 * number of I/Os that can be pending on a device. If there are more than the
295 * max_active ops already queued up, beyond those already issued to the vdev,
296 * then use zone-based scheduling to get the next synchronous zio.
297 */
298 uint32_t zfs_zone_schedule_thresh = 10;
299
300 /*
301 * On each pass of the scheduler we increment the zone's weight (up to this
302 * maximum). The weight is used by the scheduler to prevent starvation so
* that zones which haven't been able to do any IO over many iterations
* will max out their weight at this value.
305 */
306 #define SCHED_WEIGHT_MAX 20
307
308 /*
309 * Tunables for delay throttling when TXG sync is occurring.
310 *
311 * If the zone is performing a write and we're doing above normal TXG syncing,
312 * then throttle for longer than normal. The zone's wait time is multiplied
313 * by the scale (zfs_zone_txg_throttle_scale).
314 */
315 int zfs_zone_txg_throttle_scale = 2;
316 hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
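
/*
 * For example, a zone whose current delay is 60 usec would wait
 * 60 * zfs_zone_txg_throttle_scale = 120 usec per logical write while TXG
 * syncing is running above the normal rate (see zfs_zone_io_throttle()).
 */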
317
318 typedef struct {
319 int zq_qdepth;
320 zio_priority_t zq_queue;
321 int zq_priority;
322 int zq_wt;
323 zoneid_t zq_zoneid;
324 } zone_q_bump_t;
325
326 /*
327 * This uses gethrtime() but returns a value in usecs.
328 */
329 #define GET_USEC_TIME (gethrtime() / 1000)
#define NANO_TO_MICRO(x) ((x) / (NANOSEC / MICROSEC))
331
332 /*
333 * Keep track of the zone's ZFS IOPs.
334 *
335 * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
336 * accounted for.
337 *
* If the number of ops is greater than 1 then we can just use that value.
* However, with only 0 or 1 ops we might have a zone which is trying to do
* IO but is not able to get many ops through the system. We don't want to
* lose track of such a zone, so we factor its decayed historical count into
* the current count.
342 *
343 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
344 * However, since this calculation is driven by IO activity and since IO does
345 * not happen at fixed intervals, we use a timestamp to see when the last update
346 * was made. If it was more than one cycle ago, then we need to decay the
347 * historical count by the proper number of additional cycles in which no IO was
348 * performed.
349 *
350 * Return a time delta indicating how far into the current cycle we are or 0
351 * if the last IO was more than a cycle ago.
352 */
353 static hrtime_t
354 compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
355 {
356 hrtime_t delta;
357 int gen_cnt;
358
359 /*
* Check if it's time to recompute the zone count.  If we're still collecting
* data for the current cycle, just return how far into the cycle we are.
362 */
363 delta = unow - cp->cycle_start;
364 if (delta < zfs_zone_cycle_time)
365 return (delta);
366
367 /* A previous cycle is past, compute the new zone count. */
368
369 /*
370 * Figure out how many generations we have to decay the historical
371 * count, since multiple cycles may have elapsed since our last IO.
372 * We depend on int rounding here.
373 */
374 gen_cnt = (int)(delta / zfs_zone_cycle_time);
375
/* If more than 5 cycles since the last IO, reset the count. */
377 if (gen_cnt > 5) {
378 cp->zone_avg_cnt = 0;
379 } else {
380 /* Update the count. */
381 int i;
382
383 /*
384 * If the zone did more than 1 IO, just use its current count
385 * as the historical value, otherwise decay the historical
386 * count and factor that into the new historical count. We
387 * pick a threshold > 1 so that we don't lose track of IO due
388 * to int rounding.
389 */
390 if (cp->cycle_cnt > 1)
391 cp->zone_avg_cnt = cp->cycle_cnt;
392 else
393 cp->zone_avg_cnt = cp->cycle_cnt +
394 (cp->zone_avg_cnt / 2);
395
396 /*
397 * If more than one generation has elapsed since the last
398 * update, decay the values further.
399 */
400 for (i = 1; i < gen_cnt; i++)
401 cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
402 }
403
404 /* A new cycle begins. */
405 cp->cycle_start = unow;
406 cp->cycle_cnt = 0;
407
408 return (0);
409 }
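
/*
 * Worked example of the decay above (hypothetical counts): suppose a zone's
 * historical zone_avg_cnt is 100, it performed a single IO in the cycle that
 * just ended, and three full cycles have elapsed since the last update
 * (gen_cnt == 3).  The new historical count becomes 1 + (100 / 2) = 51,
 * which is then halved twice more for the two idle cycles, leaving 12.
 */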
410
411 /*
412 * Add IO op data to the zone.
413 */
414 static void
415 add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
416 {
417 switch (op) {
418 case ZFS_ZONE_IOP_READ:
419 (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
420 zonep->zone_rd_ops.cycle_cnt++;
421 break;
422 case ZFS_ZONE_IOP_WRITE:
423 (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
424 zonep->zone_wr_ops.cycle_cnt++;
425 break;
426 case ZFS_ZONE_IOP_LOGICAL_WRITE:
427 (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
428 zonep->zone_lwr_ops.cycle_cnt++;
429 break;
430 }
431 }
432
433 /*
434 * Use a decaying average to keep track of the overall system latency.
435 *
436 * We want to have the recent activity heavily weighted, but if the
437 * activity decreases or stops, then the average should quickly decay
438 * down to the new value.
439 *
440 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
441 * However, since this calculation is driven by IO activity and since IO does
442 * not happen at fixed intervals, we use a timestamp to see when the last
443 * update was made. If it was more than one cycle ago, then we need to decay
444 * the average by the proper number of additional cycles in which no IO was
445 * performed.
446 *
447 * Return true if we actually computed a new system average.
448 * If we're still within an active cycle there is nothing to do, return false.
449 */
450 static boolean_t
451 compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
452 {
453 hrtime_t delta;
454 int gen_cnt;
455
456 /*
* Check if it's time to recompute a new average.
458 * If we're still collecting data for the current cycle, return false.
459 */
460 delta = unow - cp->cycle_start;
461 if (delta < zfs_zone_sys_avg_cycle)
462 return (B_FALSE);
463
464 /* A previous cycle is past, compute a new system average. */
465
466 /*
467 * Figure out how many generations we have to decay, since multiple
468 * cycles may have elapsed since our last IO.
469 * We count on int rounding here.
470 */
471 gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
472
/* If more than 5 cycles since the last IO, reset the average. */
474 if (gen_cnt > 5) {
475 cp->sys_avg_lat = 0;
476 } else {
477 /* Update the average. */
478 int i;
479
480 cp->sys_avg_lat =
481 (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
482
483 /*
484 * If more than one generation has elapsed since the last
485 * update, decay the values further.
486 */
487 for (i = 1; i < gen_cnt; i++)
488 cp->sys_avg_lat = cp->sys_avg_lat / 2;
489 }
490
491 /* A new cycle begins. */
492 cp->cycle_start = unow;
493 cp->cycle_cnt = 0;
494 cp->cycle_lat = 0;
495
496 return (B_TRUE);
497 }
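
/*
 * Worked example (hypothetical latencies): if the previous decayed average
 * was 1,000 usec and the cycle that just ended saw 4 IOs with a cumulative
 * latency of 8,000 usec, the new decayed average is
 * (1000 + 8000) / (1 + 4) = 1,800 usec.  Each additional idle cycle then
 * halves that value.
 */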
498
499 static void
500 add_sys_iop(hrtime_t unow, int op, int lat)
501 {
502 switch (op) {
503 case ZFS_ZONE_IOP_READ:
504 (void) compute_new_sys_avg(unow, &rd_lat);
505 rd_lat.cycle_cnt++;
506 rd_lat.cycle_lat += lat;
507 break;
508 case ZFS_ZONE_IOP_WRITE:
509 (void) compute_new_sys_avg(unow, &wr_lat);
510 wr_lat.cycle_cnt++;
511 wr_lat.cycle_lat += lat;
512 break;
513 }
514 }
515
516 /*
517 * Get the zone IO counts.
518 */
519 static uint_t
520 calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
521 {
522 hrtime_t delta;
523 uint_t cnt;
524
525 if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
526 /*
* No activity yet in the current cycle; use the historical
* data we already have.
529 */
530 cnt = cp->zone_avg_cnt;
531 } else {
532 /*
533 * If we're less than half way through the cycle then use
534 * the current count plus half the historical count, otherwise
535 * just use the current count.
536 */
537 if (delta < (zfs_zone_cycle_time / 2))
538 cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
539 else
540 cnt = cp->cycle_cnt;
541 }
542
543 return (cnt);
544 }
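
/*
 * For example (hypothetical counts): less than halfway through a cycle, with
 * 3 ops so far in the current cycle and a historical zone_avg_cnt of 10, the
 * count reported above is 3 + (10 / 2) = 8; past the halfway point only the
 * 3 current ops would be reported.
 */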
545
546 /*
* Get the decayed average latency, in usecs, for the given operation type.
548 */
549 static uint_t
550 calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
551 {
552 if (compute_new_sys_avg(unow, cp)) {
553 /*
* No activity yet in the current cycle; use the historical
* data we already have.
556 */
557 return (cp->sys_avg_lat);
558 } else {
559 /*
560 * We're within a cycle; weight the current activity higher
561 * compared to the historical data and use that.
562 */
563 DTRACE_PROBE3(zfs__zone__calc__wt__avg,
564 uintptr_t, cp->sys_avg_lat,
565 uintptr_t, cp->cycle_lat,
566 uintptr_t, cp->cycle_cnt);
567
568 return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
569 (1 + (cp->cycle_cnt * 8)));
570 }
571 }
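
/*
 * For example (hypothetical values): with a historical sys_avg_lat of 1,000
 * usec and a partial cycle containing 3 IOs totalling 6,000 usec of latency,
 * the weighted average above is (1000 + 6000 * 8) / (1 + 3 * 8) = 1,960
 * usec, i.e. heavily biased toward the current cycle.
 */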
572
573 /*
574 * Account for the current IOP on the zone and for the system as a whole.
575 * The latency parameter is in usecs.
576 */
577 static void
578 add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
579 {
580 /* Add op to zone */
581 add_zone_iop(zonep, unow, op);
582
583 /* Track system latency */
584 if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
585 add_sys_iop(unow, op, lat);
586 }
587
588 /*
589 * Calculate and return the total number of read ops, write ops and logical
590 * write ops for the given zone. If the zone has issued operations of any type
591 * return a non-zero value, otherwise return 0.
592 */
593 static int
594 get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
595 uint_t *lwops)
596 {
597 *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
598 *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
599 *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
600
601 DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
602 uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
603
604 return (*rops | *wops | *lwops);
605 }
606
607 /*
608 * Get the average read/write latency in usecs for the system.
609 */
610 static void
611 get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
612 {
613 *rlat = calc_avg_lat(unow, &rd_lat);
614 *wlat = calc_avg_lat(unow, &wr_lat);
615
616 /*
617 * In an attempt to improve the accuracy of the throttling algorithm,
618 * assume that IO operations can't have zero latency. Instead, assume
619 * a reasonable lower bound for each operation type. If the actual
620 * observed latencies are non-zero, use those latency values instead.
621 */
622 if (*rlat == 0)
623 *rlat = 1000;
624 if (*wlat == 0)
625 *wlat = 1000;
626
627 DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
628 uintptr_t, *wlat);
629 }
630
631 /*
* Calculate the I/O utilization for each zone and accumulate the totals
* across all active zones.
634 */
635 static int
636 zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
637 {
638 zoneio_stats_t *sp = arg;
639 uint_t rops, wops, lwops;
640
641 if (zonep->zone_id == GLOBAL_ZONEID ||
642 get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
643 zonep->zone_io_util = 0;
644 return (0);
645 }
646
647 zonep->zone_io_util = (rops * sp->zi_avgrlat) +
648 (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
649 sp->zi_totutil += zonep->zone_io_util;
650
651 if (zonep->zone_io_util > 0) {
652 sp->zi_active++;
653 sp->zi_totpri += zonep->zone_zfs_io_pri;
654 }
655
656 /*
657 * sdt:::zfs-zone-utilization
658 *
659 * arg0: zone ID
660 * arg1: read operations observed during time window
661 * arg2: physical write operations observed during time window
662 * arg3: logical write ops observed during time window
663 * arg4: calculated utilization given read and write ops
664 * arg5: I/O priority assigned to this zone
665 */
666 DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
667 uint_t, rops, uint_t, wops, uint_t, lwops,
668 uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
669
670 return (0);
671 }
672
673 static void
674 zfs_zone_delay_inc(zone_t *zonep)
675 {
676 if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
677 zonep->zone_io_delay += zfs_zone_delay_step;
678 }
679
680 static void
681 zfs_zone_delay_dec(zone_t *zonep)
682 {
683 if (zonep->zone_io_delay > 0)
684 zonep->zone_io_delay -= zfs_zone_delay_step;
685 }
686
687 /*
* For all zones "far enough" above the average utilization, increase that
* zone's delay. Otherwise, reduce its delay.
690 */
691 static int
692 zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
693 {
694 zoneio_stats_t *sp = arg;
695 uint16_t delay = zonep->zone_io_delay;
696 uint_t fairutil = 0;
697
698 zonep->zone_io_util_above_avg = B_FALSE;
699
700 /*
* Given the calculated total utilization for all zones, calculate the
702 * fair share of I/O for this zone.
703 */
704 if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
705 fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
706 sp->zi_totpri;
707 } else if (sp->zi_active > 0) {
708 fairutil = sp->zi_totutil / sp->zi_active;
709 }
710
711 /*
* Adjust the zone's delay (which is applied to each of its IOs). If the
* delay is already at the ceiling, don't increase it any further.
714 */
715 if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
716 zonep->zone_io_util_above_avg = B_TRUE;
717
718 if (sp->zi_active > 1)
719 zfs_zone_delay_inc(zonep);
720 } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
721 sp->zi_active <= 1) {
722 zfs_zone_delay_dec(zonep);
723 }
724
725 /*
726 * sdt:::zfs-zone-throttle
727 *
728 * arg0: zone ID
729 * arg1: old delay for this zone
730 * arg2: new delay for this zone
731 * arg3: calculated fair I/O utilization
732 * arg4: actual I/O utilization
733 */
734 DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
735 uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
736 uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
737
738 return (0);
739 }
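
/*
 * Worked example of the fair-share calculation above (hypothetical zones):
 * with two active zones whose combined utilization is 300,000, zone A at I/O
 * priority 100 and zone B at priority 50, the fair shares are
 * 300000 * 100 / 150 = 200,000 for A and 300000 * 50 / 150 = 100,000 for B.
 * A zone is only delayed further when it exceeds its share while overall I/O
 * is overutilized and more than one zone is active.
 */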
740
741 /*
742 * Examine the utilization between different zones, and adjust the delay for
743 * each zone appropriately.
744 */
745 static void
746 zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
747 {
748 zoneio_stats_t stats;
749 hrtime_t laggard_udelta = 0;
750
751 (void) bzero(&stats, sizeof (stats));
752
753 stats.zi_now = unow;
754 get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
755
756 if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
757 stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
758 else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
759 stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
760
761 if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
762 return;
763
764 /*
765 * Calculate disk utilization for the most recent period.
766 */
767 if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
768 stats.zi_diskutil = 0;
769 } else {
770 stats.zi_diskutil =
771 ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
772 ((unow - last_checked) * 1000);
773 }
774 zfs_disk_last_rtime = zfs_disk_rtime;
775
776 if (unow > zfs_disk_last_laggard)
777 laggard_udelta = unow - zfs_disk_last_laggard;
778
779 /*
780 * To minimize porpoising, we have three separate states for our
781 * assessment of I/O performance: overutilized, underutilized, and
782 * neither overutilized nor underutilized. We will increment the
783 * throttle if a zone is using more than its fair share _and_ I/O
784 * is overutilized; we will decrement the throttle if a zone is using
785 * less than its fair share _or_ I/O is underutilized.
786 */
787 stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
788 laggard_udelta > zfs_zone_laggard_ancient;
789
790 stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
791 laggard_udelta < zfs_zone_laggard_recent;
792
793 /*
794 * sdt:::zfs-zone-stats
795 *
796 * Statistics observed over the last period:
797 *
798 * arg0: average system read latency
799 * arg1: average system write latency
800 * arg2: number of active zones
801 * arg3: total I/O 'utilization' for all zones
802 * arg4: total I/O priority of all active zones
803 * arg5: calculated disk utilization
804 */
805 DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
806 uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
807 uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
808 uintptr_t, stats.zi_diskutil);
809
810 (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
811 }
812
813 /*
814 * Callback used to calculate a zone's IO schedule priority.
815 *
816 * We scan the zones looking for ones with ops in the queue. Out of those,
817 * we pick the one that calculates to the highest schedule priority.
818 */
819 static int
820 get_sched_pri_cb(zone_t *zonep, void *arg)
821 {
822 int pri;
823 uint_t cnt;
824 zone_q_bump_t *qbp = arg;
825 zio_priority_t p = qbp->zq_queue;
826
827 cnt = zonep->zone_zfs_queued[p];
828 if (cnt == 0) {
829 zonep->zone_zfs_weight = 0;
830 return (0);
831 }
832
833 /*
834 * On each pass, increment the zone's weight. We use this as input
835 * to the calculation to prevent starvation. The value is reset
836 * each time we issue an IO for this zone so zones which haven't
837 * done any IO over several iterations will see their weight max
838 * out.
839 */
840 if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
841 zonep->zone_zfs_weight++;
842
843 /*
844 * This zone's IO priority is the inverse of the number of IOs
845 * the zone has enqueued * zone's configured priority * weight.
846 * The queue depth has already been scaled by 10 to avoid problems
847 * with int rounding.
848 *
849 * This means that zones with fewer IOs in the queue will get
850 * preference unless other zone's assigned priority pulls them
851 * ahead. The weight is factored in to help ensure that zones
852 * which haven't done IO in a while aren't getting starved.
853 */
854 pri = (qbp->zq_qdepth / cnt) *
855 zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
856
857 /*
858 * If this zone has a higher priority than what we found so far,
859 * it becomes the new leading contender.
860 */
861 if (pri > qbp->zq_priority) {
862 qbp->zq_zoneid = zonep->zone_id;
863 qbp->zq_priority = pri;
864 qbp->zq_wt = zonep->zone_zfs_weight;
865 }
866 return (0);
867 }
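
/*
 * Worked example (hypothetical zones): with a queue depth of 30 (scaled to
 * zq_qdepth == 300), a zone with 20 queued zios, configured priority 100 and
 * weight 1 scores (300 / 20) * 100 * 1 = 1500, while a zone with a single
 * queued zio, the same priority and weight 3 scores
 * (300 / 1) * 100 * 3 = 90000 and would be selected.
 */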
868
869 /*
870 * See if we need to bump a zone's zio to the head of the queue. This is only
871 * done on the two synchronous I/O queues (see the block comment on the
872 * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
873 * queue depth from our caller.
874 *
* A zone issuing synchronous I/O from a single-threaded process can have at
* most one op in the queue at a time unless it is running multiple processes
* in parallel. This can cause an imbalance in performance if there are zones
878 * with many parallel processes (and ops in the queue) vs. other zones which
879 * are doing simple single-threaded processes, such as interactive tasks in the
880 * shell. These zones can get backed up behind a deep queue and their IO
* performance will appear to be very poor as a result, making the zone
* feel unresponsive for interactive use.
883 *
884 * The scheduling algorithm kicks in once we start to get a deeper queue.
885 * Once that occurs, we look at all of the zones to see which one calculates
886 * to the highest priority. We bump that zone's first zio to the head of the
887 * queue.
888 *
889 * We use a counter on the zone so that we can quickly find how many ops each
890 * zone has in the queue without having to search the entire queue itself.
891 * This scales better since the number of zones is expected to be on the
892 * order of 10-100 whereas the queue depth can be in the range of 50-2000.
* In addition, since the zios in the queue only have the zoneid, we would
894 * have to look up the zone for each zio enqueued and that means the overhead
895 * for scanning the queue each time would be much higher.
896 *
897 * In all cases, we fall back to simply pulling the next op off the queue
898 * if something should go wrong.
899 */
900 static zio_t *
901 get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
902 {
903 zone_q_bump_t qbump;
904 zio_t *zp = NULL, *zphead;
905 int cnt = 0;
906
907 /* To avoid problems with int rounding, scale the queue depth by 10 */
908 qbump.zq_qdepth = qdepth * 10;
909 qbump.zq_priority = 0;
910 qbump.zq_zoneid = 0;
911 qbump.zq_queue = p;
912 (void) zone_walk(get_sched_pri_cb, &qbump);
913
914 zphead = avl_first(&vqc->vqc_queued_tree);
915
/* If the scheduler picked a zone, look for that zone's first queued zio. */
917 if (qbump.zq_zoneid != 0) {
918 for (zp = avl_first(&vqc->vqc_queued_tree); zp != NULL;
919 zp = avl_walk(&vqc->vqc_queued_tree, zp, AVL_AFTER)) {
920 if (zp->io_zoneid == qbump.zq_zoneid)
921 break;
922 cnt++;
923 }
924 }
925
926 if (zp == NULL) {
927 zp = zphead;
928 } else if (zp != zphead) {
929 /*
930 * Only fire the probe if we actually picked a different zio
931 * than the one already at the head of the queue.
932 */
933 DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
934 uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
935 }
936
937 return (zp);
938 }
939
940 /*
941 * Add our zone ID to the zio so we can keep track of which zones are doing
942 * what, even when the current thread processing the zio is not associated
* with the zone (e.g. the kernel taskq which pushes out TXGs).
944 */
945 void
946 zfs_zone_zio_init(zio_t *zp)
947 {
948 zone_t *zonep = curzone;
949
950 zp->io_zoneid = zonep->zone_id;
951 }
952
953 /*
954 * Track IO operations per zone. Called from dmu_tx_count_write for write ops
955 * and dmu_read_uio for read ops. For each operation, increment that zone's
956 * counter based on the type of operation.
957 *
958 * There are three basic ways that we can see write ops:
959 * 1) An application does write syscalls. Those ops go into a TXG which
960 * we'll count here. Sometime later a kernel taskq thread (we'll see the
961 * vdev IO as zone 0) will perform some number of physical writes to commit
962 * the TXG to disk. Those writes are not associated with the zone which
963 * made the write syscalls and the number of operations is not correlated
964 * between the taskq and the zone.
965 * 2) An application opens a file with O_SYNC. Each write will result in
966 * an operation which we'll see here plus a low-level vdev write from
967 * that zone.
968 * 3) An application does write syscalls followed by an fsync(). We'll
969 * count the writes going into a TXG here. We'll also see some number
970 * (usually much smaller, maybe only 1) of low-level vdev writes from this
971 * zone when the fsync is performed, plus some other low-level vdev writes
972 * from the taskq in zone 0 (are these metadata writes?).
973 *
974 * 4) In addition to the above, there are misc. system-level writes, such as
975 * writing out dirty pages to swap, or sync(2) calls, which will be handled
976 * by the global zone and which we count but don't generally worry about.
977 *
* Because of the above, we can count a write twice: once here, at a high
* level in the zone's thread, and again when the physical write completes
* and is accounted in zfs_zone_zio_done.
981 *
982 * Without this, it can look like a non-global zone never writes (case 1).
983 * Depending on when the TXG is synced, the counts may be in the same sample
984 * bucket or in a different one.
985 *
986 * Tracking read operations is simpler due to their synchronous semantics. The
987 * zfs_read function -- called as a result of a read(2) syscall -- will always
988 * retrieve the data to be read through dmu_read_uio.
989 */
990 void
991 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
992 {
993 zone_t *zonep = curzone;
994 hrtime_t unow, last_checked;
995 uint16_t wait;
996
997 unow = GET_USEC_TIME;
998
999 /*
1000 * Only bump the counters for logical operations here. The counters for
1001 * tracking physical IO operations are handled in zfs_zone_zio_done.
1002 */
1003 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
1004 mutex_enter(&zonep->zone_stg_io_lock);
1005 add_iop(zonep, unow, type, 0);
1006 mutex_exit(&zonep->zone_stg_io_lock);
1007 }
1008
1009 if (!zfs_zone_delay_enable)
1010 return;
1011
1012 /*
1013 * If the zone's I/O priority is set to zero, don't throttle that zone's
1014 * operations at all.
1015 */
1016 if (zonep->zone_zfs_io_pri == 0)
1017 return;
1018
1019 /*
1020 * XXX There's a potential race here in that more than one thread may
1021 * update the zone delays concurrently. The worst outcome is corruption
1022 * of our data to track each zone's IO, so the algorithm may make
1023 * incorrect throttling decisions until the data is refreshed.
1024 */
1025 last_checked = zfs_zone_last_checked;
1026 if ((unow - last_checked) > zfs_zone_adjust_time) {
1027 zfs_zone_last_checked = unow;
1028 zfs_zone_wait_adjust(unow, last_checked);
1029 }
1030
1031 if ((wait = zonep->zone_io_delay) > 0) {
1032 /*
1033 * If this is a write and we're doing above normal TXG
1034 * syncing, then throttle for longer than normal.
1035 */
1036 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
1037 (txg_cnt > 1 || txg_sync_rate > 1))
1038 wait *= zfs_zone_txg_throttle_scale;
1039
1040 /*
1041 * sdt:::zfs-zone-wait
1042 *
1043 * arg0: zone ID
1044 * arg1: type of IO operation
1045 * arg2: time to delay (in us)
1046 */
1047 DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
1048 uintptr_t, type, uintptr_t, wait);
1049
1050 drv_usecwait(wait);
1051 }
1052 }
1053
1054 /*
1055 * XXX Ignore the pool pointer parameter for now.
1056 *
1057 * Keep track to see if the TXG sync rate is running above the expected rate.
1058 * If so, this implies that we are filling TXG's at a high rate due to a heavy
1059 * write workload. We use this as input into the zone throttle.
1060 *
1061 * This function is called every 5 seconds (zfs_txg_timeout) under a normal
1062 * write load. In this case, the sync rate is going to be 1. When there
1063 * is a heavy write load, TXG's fill up fast and the sync thread will write
1064 * the TXG more frequently (perhaps once a second). In this case the rate
1065 * will be > 1. The sync rate is a lagging indicator since it can be up
1066 * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
1067 * current 5 second interval and txg_sync_rate to keep track of the previous
1068 * 5 second interval. In that way we don't have a period (1 or more seconds)
1069 * where the txg_cnt == 0 and we cut back on throttling even though the rate
1070 * is still high.
1071 */
1072 /*ARGSUSED*/
1073 void
1074 zfs_zone_report_txg_sync(void *dp)
1075 {
1076 uint_t now;
1077
1078 txg_cnt++;
1079 now = (uint_t)(gethrtime() / NANOSEC);
1080 if ((now - txg_last_check) >= zfs_txg_timeout) {
1081 txg_sync_rate = txg_cnt / 2;
1082 txg_cnt = 0;
1083 txg_last_check = now;
1084 }
1085 }
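
/*
 * For example (hypothetical load): with zfs_txg_timeout at 5 seconds, a
 * heavy write load that syncs 10 TXGs within one 5-second window yields
 * txg_sync_rate = 10 / 2 = 5, so logical writes keep being throttled at the
 * scaled (longer) delay until both txg_cnt and txg_sync_rate fall back to 1
 * or below.
 */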
1086
1087 hrtime_t
1088 zfs_zone_txg_delay()
1089 {
1090 if (curzone->zone_io_util_above_avg)
1091 return (zfs_zone_txg_delay_nsec);
1092
1093 return (MSEC2NSEC(10));
1094 }
1095
1096 /*
1097 * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
1098 * and is issued.
1099 * Keep track of start time for latency calculation in zfs_zone_zio_done.
1100 */
1101 void
1102 zfs_zone_zio_start(zio_t *zp)
1103 {
1104 zone_t *zonep;
1105
1106 /*
* I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
* an actual I/O operation. Ignore those operations for the purposes of
* throttling and scheduling.
1110 */
1111 if (zp->io_type == ZIO_TYPE_IOCTL)
1112 return;
1113
1114 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1115 return;
1116
1117 zonep->zone_zfs_weight = 0;
1118
1119 mutex_enter(&zfs_disk_lock);
1120 zp->io_dispatched = gethrtime();
1121
1122 if (zfs_disk_rcnt++ != 0)
1123 zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
1124 zfs_disk_rlastupdate = zp->io_dispatched;
1125 mutex_exit(&zfs_disk_lock);
1126
1127 zone_rele(zonep);
1128 }
1129
1130 /*
1131 * Called from vdev_disk_io_done when an IO completes.
1132 * Increment our counter for zone ops.
1133 * Calculate the IO latency avg. for this zone.
1134 */
1135 void
1136 zfs_zone_zio_done(zio_t *zp)
1137 {
1138 zone_t *zonep;
1139 hrtime_t now, unow, udelta;
1140
1141 if (zp->io_type == ZIO_TYPE_IOCTL)
1142 return;
1143
if (zp->io_dispatched == 0)
return;

if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
return;
1149
1150 now = gethrtime();
1151 unow = NANO_TO_MICRO(now);
1152 udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
1153
1154 mutex_enter(&zfs_disk_lock);
1155 zfs_disk_rcnt--;
1156 zfs_disk_rtime += (now - zfs_disk_rlastupdate);
1157 zfs_disk_rlastupdate = now;
1158
1159 if (udelta > zfs_zone_laggard_threshold)
1160 zfs_disk_last_laggard = unow;
1161
1162 mutex_exit(&zfs_disk_lock);
1163
1164 if (zfs_zone_delay_enable) {
1165 mutex_enter(&zonep->zone_stg_io_lock);
1166 add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
1167 ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
1168 mutex_exit(&zonep->zone_stg_io_lock);
1169 }
1170
1171 zone_rele(zonep);
1172
1173 /*
1174 * sdt:::zfs-zone-latency
1175 *
1176 * arg0: zone ID
1177 * arg1: type of I/O operation
1178 * arg2: I/O latency (in us)
1179 */
1180 DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
1181 uintptr_t, zp->io_type, uintptr_t, udelta);
1182 }
1183
1184 void
1185 zfs_zone_zio_dequeue(zio_t *zp)
1186 {
1187 zio_priority_t p;
1188 zone_t *zonep;
1189
1190 p = zp->io_priority;
1191 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1192 return;
1193
1194 /* We depend on p being defined as either 0 or 1 */
1195 ASSERT(p < 2);
1196
1197 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1198 return;
1199
1200 mutex_enter(&zonep->zone_stg_io_lock);
1201 ASSERT(zonep->zone_zfs_queued[p] > 0);
1202 if (zonep->zone_zfs_queued[p] == 0)
1203 cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
1204 else
1205 zonep->zone_zfs_queued[p]--;
1206 mutex_exit(&zonep->zone_stg_io_lock);
1207 zone_rele(zonep);
1208 }
1209
1210 void
1211 zfs_zone_zio_enqueue(zio_t *zp)
1212 {
1213 zio_priority_t p;
1214 zone_t *zonep;
1215
1216 p = zp->io_priority;
1217 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1218 return;
1219
1220 /* We depend on p being defined as either 0 or 1 */
1221 ASSERT(p < 2);
1222
1223 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1224 return;
1225
1226 mutex_enter(&zonep->zone_stg_io_lock);
1227 zonep->zone_zfs_queued[p]++;
1228 mutex_exit(&zonep->zone_stg_io_lock);
1229 zone_rele(zonep);
1230 }
1231
1232 /*
* Called from vdev_queue_io_to_issue. That function is where zios are listed
1234 * in FIFO order on one of the sync queues, then pulled off (by
1235 * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling
1236 * here to find a zone's zio deeper in the sync queue and issue that instead
1237 * of simply doing FIFO.
1238 *
1239 * We only do zone-based zio scheduling for the two synchronous I/O queues
1240 * (read & write). These queues are normally serviced in FIFO order but we
1241 * may decide to move a zone's zio to the head of the line. A typical I/O
1242 * load will be mostly synchronous reads and some asynchronous writes (which
1243 * are scheduled differently due to transaction groups). There will also be
1244 * some synchronous writes for those apps which want to ensure their data is on
1245 * disk. We want to make sure that a zone with a single-threaded app (e.g. the
1246 * shell) that is doing synchronous I/O (typically reads) isn't penalized by
1247 * other zones which are doing lots of synchronous I/O because they have many
1248 * running threads.
1249 *
1250 * The vq->vq_lock mutex is held when we're executing this function so we
1251 * can safely access the "last zone" variable on the queue.
1252 */
1253 zio_t *
1254 zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
1255 {
1256 vdev_queue_class_t *vqc = &vq->vq_class[p];
1257 uint_t cnt;
1258 zoneid_t last_zone;
1259 zio_t *zio;
1260
1261 ASSERT(MUTEX_HELD(&vq->vq_lock));
1262
1263 /* Don't change the order on the LBA ordered queues. */
1264 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1265 return (avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER));
1266
1267 /* We depend on p being defined as either 0 or 1 */
1268 ASSERT(p < 2);
1269
1270 cnt = avl_numnodes(&vqc->vqc_queued_tree);
1271 last_zone = vq->vq_last_zone_id;
1272
1273 /*
1274 * If there are only a few zios in the queue then just issue the head.
1275 * If there are more than a few zios already queued up, then use
1276 * scheduling to get the next zio.
1277 */
1278 if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
1279 zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
1280 else
1281 zio = get_next_zio(vqc, cnt, p);
1282
1283 vq->vq_last_zone_id = zio->io_zoneid;
1284
1285 /*
1286 * Probe with 4 args; the number of IOs in the queue, the zone that
1287 * was last scheduled off this queue, the zone that was associated
1288 * with the next IO that is scheduled, and which queue (priority).
1289 */
1290 DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
1291 uint_t, zio->io_zoneid, uint_t, p);
1292
1293 return (zio);
1294 }
1295
1296 #endif