1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013, Joyent, Inc. All rights reserved.
14 */
15
16 /*
17 * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
18 * ZFS I/O resources for each zone.
19 *
* I/O contention can be a major pain point on a multi-tenant system. A single
21 * zone can issue a stream of I/O operations, usually synchronous writes, which
22 * disrupt I/O performance for all other zones. This problem is further
23 * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
24 * a set of blocks which are atomically synced to disk. The process of
25 * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
26 * out any pending read operations.
27 *
* There are two facets to this capability: the throttle and the scheduler.
29 *
30 * Throttle
31 *
32 * The requirements on the throttle are:
33 *
34 * 1) Ensure consistent and predictable I/O latency across all zones.
35 * 2) Sequential and random workloads have very different characteristics,
36 * so it is a non-starter to track IOPS or throughput.
37 * 3) A zone should be able to use the full disk bandwidth if no other zone
38 * is actively using the disk.
39 *
40 * The throttle has two components: one to track and account for each zone's
41 * I/O requests, and another to throttle each zone's operations when it
42 * exceeds its fair share of disk I/O. When the throttle detects that a zone is
43 * consuming more than is appropriate, each read or write system call is
44 * delayed by up to 100 microseconds, which we've found is sufficient to allow
45 * other zones to interleave I/O requests during those delays.
46 *
47 * Note: The throttle will delay each logical I/O (as opposed to the physical
48 * I/O which will likely be issued asynchronously), so it may be easier to
49 * think of the I/O throttle delaying each read/write syscall instead of the
50 * actual I/O operation. For each zone, the throttle tracks an ongoing average
51 * of read and write operations performed to determine the overall I/O
52 * utilization for each zone.
53 *
* The throttle calculates an I/O utilization metric for each zone using the
* following formula (a worked example follows this block comment):
56 *
57 * (# of read syscalls) x (Average read latency) +
58 * (# of write syscalls) x (Average write latency)
59 *
60 * Once each zone has its utilization metric, the I/O throttle will compare I/O
61 * utilization across all zones, and if a zone has a higher-than-average I/O
62 * utilization, system calls from that zone are throttled. That is, if one
63 * zone has a much higher utilization, that zone's delay is increased by 5
64 * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
65 * already throttled and has a lower utilization than average, its delay will
66 * be lowered by 5 microseconds.
67 *
68 * The throttle calculation is driven by IO activity, but since IO does not
69 * happen at fixed intervals, timestamps are used to track when the last update
70 * was made and to drive recalculation.
71 *
72 * The throttle recalculates each zone's I/O usage and throttle delay (if any)
73 * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
74 * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
75 *
76 * Scheduler
77 *
78 * The I/O scheduler manages the vdev queues, the queues of pending I/Os to
79 * issue to the disks. It only makes scheduling decisions for the two
80 * synchronous I/O queues (read & write).
81 *
* The scheduler tracks how many I/Os in the queue are from each zone, and
83 * if one zone has a disproportionately large number of I/Os in the queue, the
84 * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
85 * and pulled from the middle of the queue. This bump allows zones with a small
86 * number of I/Os (so small they may not even be taken into account by the
87 * throttle) to complete quickly instead of waiting behind dozens of I/Os from
88 * other zones.
89 */
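
/*
 * Illustrative example (hypothetical numbers, not measured data): suppose a
 * zone issued 200 read syscalls and 50 write syscalls over the last cycle,
 * and the decayed system-wide averages are 1200 usec per read and 800 usec
 * per write.  Its utilization metric works out to:
 *
 * (200 x 1200) + (50 x 800) = 280,000
 *
 * Note that the implementation (zfs_zone_wait_adjust_calculate_cb) also
 * charges logical write syscalls at the average write latency.  If the
 * metric is above the zone's fair share while the disk is overutilized, the
 * zone's per-syscall delay is stepped up by zfs_zone_delay_step usec toward
 * zfs_zone_delay_ceiling; if it is below its fair share, or the disk is
 * underutilized, the delay is stepped back down.
 */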
90
91 #include <sys/spa.h>
92 #include <sys/vdev_impl.h>
93 #include <sys/zfs_zone.h>
94
95 #ifndef _KERNEL
96
97 /*
98 * Stubs for when compiling for user-land.
99 */
100
101 void
102 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
103 {
104 }
105
106 void
107 zfs_zone_zio_init(zio_t *zp)
108 {
109 }
110
111 void
112 zfs_zone_zio_start(zio_t *zp)
113 {
114 }
115
116 void
117 zfs_zone_zio_done(zio_t *zp)
118 {
119 }
120
121 void
122 zfs_zone_zio_dequeue(zio_t *zp)
123 {
124 }
125
126 void
127 zfs_zone_zio_enqueue(zio_t *zp)
128 {
129 }
130
131 /*ARGSUSED*/
132 void
133 zfs_zone_report_txg_sync(void *dp)
134 {
135 }
136
137 hrtime_t
138 zfs_zone_txg_delay()
139 {
140 return (MSEC2NSEC(10));
141 }
142
143 #else
144
145 /*
146 * The real code.
147 */
148
149 #include <sys/systm.h>
150 #include <sys/thread.h>
151 #include <sys/proc.h>
152 #include <sys/types.h>
153 #include <sys/param.h>
154 #include <sys/time.h>
155 #include <sys/atomic.h>
156 #include <sys/zio.h>
157 #include <sys/zone.h>
158 #include <sys/avl.h>
159 #include <sys/sdt.h>
160 #include <sys/ddi.h>
161
162 /*
163 * The zone throttle delays read and write operations from certain zones based
* on each zone's IO utilization. Once per cycle (defined by zfs_zone_cycle_time
165 * below), the delays for each zone are recalculated based on the utilization
166 * over the previous window.
167 */
168 boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
uint16_t zfs_zone_delay_step = 5; /* usec amount to change delay */
170 uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
171
172 boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
173
174 /*
175 * For certain workloads, one zone may be issuing primarily sequential I/O and
176 * another primarily random I/O. The sequential I/O will complete much more
177 * quickly than the random I/O, driving the average system latency for those
178 * operations way down. As a result, the random I/O may be throttled back, even
179 * though the sequential I/O should be throttled to allow the random I/O more
180 * access to the disk.
181 *
182 * This tunable limits the discrepancy between the read and write system
183 * latency. If one becomes excessively high, this tunable prevents the I/O
184 * throttler from exacerbating the imbalance.
185 */
186 uint_t zfs_zone_rw_lat_limit = 10;
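
/*
 * For example (hypothetical latencies): with the default limit of 10, if the
 * average write latency is 2,000 usec and the average read latency spikes to
 * 50,000 usec, the read latency used in the utilization calculation is
 * clamped to 2,000 x 10 = 20,000 usec (see zfs_zone_wait_adjust()).
 */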
187
188 /*
189 * The I/O throttle will only start delaying zones when it detects disk
190 * utilization has reached a certain level. This tunable controls the
191 * threshold at which the throttle will start delaying zones. When the number
192 * of vdevs is small, the calculation should correspond closely with the %b
193 * column from iostat -- but as the number of vdevs becomes large, it will
194 * correlate less and less to any single device (therefore making it a poor
195 * approximation for the actual I/O utilization on such systems). We
196 * therefore use our derived utilization conservatively: we know that low
* derived utilization does indeed correlate to low I/O use -- but that a high
* rate of derived utilization does not, by itself, necessarily denote saturation;
199 * where we see a high rate of utilization, we also look for laggard I/Os to
200 * attempt to detect saturation.
201 */
202 uint_t zfs_zone_util_threshold = 80;
203 uint_t zfs_zone_underutil_threshold = 60;
204
205 /*
206 * There are three important tunables here: zfs_zone_laggard_threshold denotes
207 * the threshold at which an I/O is considered to be of notably high latency;
208 * zfs_zone_laggard_recent denotes the number of microseconds before the
209 * current time after which the last laggard is considered to be sufficiently
210 * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
211 * the microseconds before the current time before which the last laggard is
212 * considered to be sufficiently old to merit decreasing the throttle. The
213 * most important tunable of these three is the zfs_zone_laggard_threshold: in
214 * modeling data from a large public cloud, this tunable was found to have a
215 * much greater effect on the throttle than the two time-based thresholds.
216 * This must be set high enough to not result in spurious throttling, but not
217 * so high as to allow pathological I/O to persist in the system.
218 */
219 uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
220 uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
221 uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
222
223 /*
224 * Throughout this subsystem, our timestamps are in microseconds. Our system
225 * average cycle is one second or 1 million microseconds. Our zone counter
226 * update cycle is two seconds or 2 million microseconds. We use a longer
227 * duration for that cycle because some ops can see a little over two seconds of
228 * latency when they are being starved by another zone.
229 */
230 uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
231 uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
232
233 /*
234 * How often the I/O throttle will reevaluate each zone's utilization, in
235 * microseconds. Default is 1/4 sec.
236 */
237 uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
238
239 typedef struct {
240 hrtime_t cycle_start;
241 int cycle_cnt;
242 hrtime_t cycle_lat;
243 hrtime_t sys_avg_lat;
244 } sys_lat_cycle_t;
245
246 typedef struct {
247 hrtime_t zi_now;
248 uint_t zi_avgrlat;
249 uint_t zi_avgwlat;
250 uint64_t zi_totpri;
251 uint64_t zi_totutil;
252 int zi_active;
253 uint_t zi_diskutil;
254 boolean_t zi_underutil;
255 boolean_t zi_overutil;
256 } zoneio_stats_t;
257
258 static sys_lat_cycle_t rd_lat;
259 static sys_lat_cycle_t wr_lat;
260
261 /*
262 * Some basic disk stats to determine disk utilization. The utilization info
263 * for all disks on the system is aggregated into these values.
264 *
265 * Overall disk utilization for the current cycle is calculated as:
266 *
267 * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
268 * ----------------------------------------------
269 * ((now - zfs_zone_last_checked) * 1000);
270 */
271 kmutex_t zfs_disk_lock; /* protects the following: */
272 uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
274 hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
275
276 hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
277 /* time that we last updated per-zone throttle info */
278 hrtime_t zfs_zone_last_checked = 0;
279 hrtime_t zfs_disk_last_laggard = 0;
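
/*
 * Worked example (hypothetical values): zfs_disk_rtime is kept in nanoseconds
 * while the adjustment window is measured in microseconds, so if the disks
 * accumulated 200 ms of busy time (an rtime delta of 200,000,000 ns) over a
 * 250,000 usec window, the derived utilization is:
 *
 * (200000000 * 100) / (250000 * 1000) = 80
 *
 * i.e. 80% busy for that window.
 */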
280
281 /*
282 * Data used to keep track of how often txg sync is running.
283 */
284 extern int zfs_txg_timeout;
285 static uint_t txg_last_check;
286 static uint_t txg_cnt;
287 static uint_t txg_sync_rate;
288
289 boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
290 /*
291 * Threshold for when zio scheduling should kick in.
292 *
293 * This threshold is based on the zfs_vdev_sync_read_max_active value for the
294 * number of I/Os that can be pending on a device. If there are more than the
295 * max_active ops already queued up, beyond those already issued to the vdev,
296 * then use zone-based scheduling to get the next synchronous zio.
297 */
298 uint32_t zfs_zone_schedule_thresh = 10;
299
300 /*
301 * On each pass of the scheduler we increment the zone's weight (up to this
302 * maximum). The weight is used by the scheduler to prevent starvation so
* that zones which haven't been able to do any IO over many iterations
* will max out their weight at this value.
305 */
306 #define SCHED_WEIGHT_MAX 20
307
308 /*
309 * Tunables for delay throttling when TXG sync is occurring.
310 *
311 * If the zone is performing a write and we're doing above normal TXG syncing,
312 * then throttle for longer than normal. The zone's wait time is multiplied
313 * by the scale (zfs_zone_txg_throttle_scale).
314 */
315 int zfs_zone_txg_throttle_scale = 2;
316 hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
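
/*
 * For example, a zone whose current delay is 60 usec would wait
 * 60 * zfs_zone_txg_throttle_scale = 120 usec per logical write while TXG
 * syncing is running above the normal rate (see zfs_zone_io_throttle()).
 */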
317
318 typedef struct {
319 int zq_qdepth;
320 zio_priority_t zq_queue;
321 int zq_priority;
322 int zq_wt;
323 zoneid_t zq_zoneid;
324 } zone_q_bump_t;
325
326 /*
327 * This uses gethrtime() but returns a value in usecs.
328 */
329 #define GET_USEC_TIME (gethrtime() / 1000)
#define NANO_TO_MICRO(x) ((x) / (NANOSEC / MICROSEC))
331
332 /*
333 * Keep track of the zone's ZFS IOPs.
334 *
335 * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
336 * accounted for.
337 *
* If the number of ops is greater than 1 then we can just use that value.
* However, with only 0 or 1 ops we might have a zone which is trying to do
* IO but is not able to get many ops through the system. We don't want to
* lose track of such a zone, so we factor its decayed historical count into
* the current count.
342 *
343 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
344 * However, since this calculation is driven by IO activity and since IO does
345 * not happen at fixed intervals, we use a timestamp to see when the last update
346 * was made. If it was more than one cycle ago, then we need to decay the
347 * historical count by the proper number of additional cycles in which no IO was
348 * performed.
349 *
350 * Return a time delta indicating how far into the current cycle we are or 0
351 * if the last IO was more than a cycle ago.
352 */
353 static hrtime_t
354 compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
355 {
356 hrtime_t delta;
357 int gen_cnt;
358
359 /*
* Check if it's time to recompute the zone count.  If we're still collecting
* data for the current cycle, just return how far into the cycle we are.
362 */
363 delta = unow - cp->cycle_start;
364 if (delta < zfs_zone_cycle_time)
365 return (delta);
366
367 /* A previous cycle is past, compute the new zone count. */
368
369 /*
370 * Figure out how many generations we have to decay the historical
371 * count, since multiple cycles may have elapsed since our last IO.
372 * We depend on int rounding here.
373 */
374 gen_cnt = (int)(delta / zfs_zone_cycle_time);
375
/* If more than 5 cycles since the last IO, reset the count. */
377 if (gen_cnt > 5) {
378 cp->zone_avg_cnt = 0;
379 } else {
380 /* Update the count. */
381 int i;
382
383 /*
384 * If the zone did more than 1 IO, just use its current count
385 * as the historical value, otherwise decay the historical
386 * count and factor that into the new historical count. We
387 * pick a threshold > 1 so that we don't lose track of IO due
388 * to int rounding.
389 */
390 if (cp->cycle_cnt > 1)
391 cp->zone_avg_cnt = cp->cycle_cnt;
392 else
393 cp->zone_avg_cnt = cp->cycle_cnt +
394 (cp->zone_avg_cnt / 2);
395
396 /*
397 * If more than one generation has elapsed since the last
398 * update, decay the values further.
399 */
400 for (i = 1; i < gen_cnt; i++)
401 cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
402 }
403
404 /* A new cycle begins. */
405 cp->cycle_start = unow;
406 cp->cycle_cnt = 0;
407
408 return (0);
409 }
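
/*
 * Worked example of the decay above (hypothetical counts): suppose a zone's
 * historical zone_avg_cnt is 100, it performed a single IO in the cycle that
 * just ended, and three full cycles have elapsed since the last update
 * (gen_cnt == 3).  The new historical count becomes 1 + (100 / 2) = 51,
 * which is then halved twice more for the two idle cycles, leaving 12.
 */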
410
411 /*
412 * Add IO op data to the zone.
413 */
414 static void
415 add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
416 {
417 switch (op) {
418 case ZFS_ZONE_IOP_READ:
419 (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
420 zonep->zone_rd_ops.cycle_cnt++;
421 break;
422 case ZFS_ZONE_IOP_WRITE:
423 (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
424 zonep->zone_wr_ops.cycle_cnt++;
425 break;
426 case ZFS_ZONE_IOP_LOGICAL_WRITE:
427 (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
428 zonep->zone_lwr_ops.cycle_cnt++;
429 break;
430 }
431 }
432
433 /*
434 * Use a decaying average to keep track of the overall system latency.
435 *
436 * We want to have the recent activity heavily weighted, but if the
437 * activity decreases or stops, then the average should quickly decay
438 * down to the new value.
439 *
440 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
441 * However, since this calculation is driven by IO activity and since IO does
442 * not happen at fixed intervals, we use a timestamp to see when the last
443 * update was made. If it was more than one cycle ago, then we need to decay
444 * the average by the proper number of additional cycles in which no IO was
445 * performed.
446 *
447 * Return true if we actually computed a new system average.
448 * If we're still within an active cycle there is nothing to do, return false.
449 */
450 static boolean_t
451 compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
452 {
453 hrtime_t delta;
454 int gen_cnt;
455
456 /*
* Check if it's time to recompute a new average.
458 * If we're still collecting data for the current cycle, return false.
459 */
460 delta = unow - cp->cycle_start;
461 if (delta < zfs_zone_sys_avg_cycle)
462 return (B_FALSE);
463
464 /* A previous cycle is past, compute a new system average. */
465
466 /*
467 * Figure out how many generations we have to decay, since multiple
468 * cycles may have elapsed since our last IO.
469 * We count on int rounding here.
470 */
471 gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
472
/* If more than 5 cycles since the last IO, reset the average. */
474 if (gen_cnt > 5) {
475 cp->sys_avg_lat = 0;
476 } else {
477 /* Update the average. */
478 int i;
479
480 cp->sys_avg_lat =
481 (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
482
483 /*
484 * If more than one generation has elapsed since the last
485 * update, decay the values further.
486 */
487 for (i = 1; i < gen_cnt; i++)
488 cp->sys_avg_lat = cp->sys_avg_lat / 2;
489 }
490
491 /* A new cycle begins. */
492 cp->cycle_start = unow;
493 cp->cycle_cnt = 0;
494 cp->cycle_lat = 0;
495
496 return (B_TRUE);
497 }
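
/*
 * Worked example (hypothetical latencies): if the previous decayed average
 * was 1,000 usec and the cycle that just ended saw 4 IOs with a cumulative
 * latency of 8,000 usec, the new decayed average is
 * (1000 + 8000) / (1 + 4) = 1,800 usec.  Each additional idle cycle then
 * halves that value.
 */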
498
499 static void
500 add_sys_iop(hrtime_t unow, int op, int lat)
501 {
502 switch (op) {
503 case ZFS_ZONE_IOP_READ:
504 (void) compute_new_sys_avg(unow, &rd_lat);
505 rd_lat.cycle_cnt++;
506 rd_lat.cycle_lat += lat;
507 break;
508 case ZFS_ZONE_IOP_WRITE:
509 (void) compute_new_sys_avg(unow, &wr_lat);
510 wr_lat.cycle_cnt++;
511 wr_lat.cycle_lat += lat;
512 break;
513 }
514 }
515
516 /*
517 * Get the zone IO counts.
518 */
519 static uint_t
520 calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
521 {
522 hrtime_t delta;
523 uint_t cnt;
524
525 if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
526 /*
* No activity yet in the current cycle; use the historical
* data we already have.
529 */
530 cnt = cp->zone_avg_cnt;
531 } else {
532 /*
533 * If we're less than half way through the cycle then use
534 * the current count plus half the historical count, otherwise
535 * just use the current count.
536 */
537 if (delta < (zfs_zone_cycle_time / 2))
538 cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
539 else
540 cnt = cp->cycle_cnt;
541 }
542
543 return (cnt);
544 }
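
/*
 * For example (hypothetical counts): less than halfway through a cycle, with
 * 3 ops so far in the current cycle and a historical zone_avg_cnt of 10, the
 * count reported above is 3 + (10 / 2) = 8; past the halfway point only the
 * 3 current ops would be reported.
 */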
545
546 /*
* Get the decayed average latency, in usecs, for the given operation type.
548 */
549 static uint_t
550 calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
551 {
552 if (compute_new_sys_avg(unow, cp)) {
553 /*
* No activity yet in the current cycle; use the historical
* data we already have.
556 */
557 return (cp->sys_avg_lat);
558 } else {
559 /*
560 * We're within a cycle; weight the current activity higher
561 * compared to the historical data and use that.
562 */
563 DTRACE_PROBE3(zfs__zone__calc__wt__avg,
564 uintptr_t, cp->sys_avg_lat,
565 uintptr_t, cp->cycle_lat,
566 uintptr_t, cp->cycle_cnt);
567
568 return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
569 (1 + (cp->cycle_cnt * 8)));
570 }
571 }
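
/*
 * For example (hypothetical values): with a historical sys_avg_lat of 1,000
 * usec and a partial cycle containing 3 IOs totalling 6,000 usec of latency,
 * the weighted average above is (1000 + 6000 * 8) / (1 + 3 * 8) = 1,960
 * usec, i.e. heavily biased toward the current cycle.
 */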
572
573 /*
574 * Account for the current IOP on the zone and for the system as a whole.
575 * The latency parameter is in usecs.
576 */
577 static void
578 add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
579 {
580 /* Add op to zone */
581 add_zone_iop(zonep, unow, op);
582
583 /* Track system latency */
584 if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
585 add_sys_iop(unow, op, lat);
586 }
587
588 /*
589 * Calculate and return the total number of read ops, write ops and logical
590 * write ops for the given zone. If the zone has issued operations of any type
591 * return a non-zero value, otherwise return 0.
592 */
593 static int
594 get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
595 uint_t *lwops)
596 {
597 *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
598 *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
599 *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
600
601 DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
602 uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
603
604 return (*rops | *wops | *lwops);
605 }
606
607 /*
608 * Get the average read/write latency in usecs for the system.
609 */
610 static void
611 get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
612 {
613 *rlat = calc_avg_lat(unow, &rd_lat);
614 *wlat = calc_avg_lat(unow, &wr_lat);
615
616 /*
617 * In an attempt to improve the accuracy of the throttling algorithm,
618 * assume that IO operations can't have zero latency. Instead, assume
619 * a reasonable lower bound for each operation type. If the actual
620 * observed latencies are non-zero, use those latency values instead.
621 */
622 if (*rlat == 0)
623 *rlat = 1000;
624 if (*wlat == 0)
625 *wlat = 1000;
626
627 DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
628 uintptr_t, *wlat);
629 }
630
631 /*
* Calculate the I/O utilization for each zone and accumulate the totals
* across all active zones.
634 */
635 static int
636 zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
637 {
638 zoneio_stats_t *sp = arg;
639 uint_t rops, wops, lwops;
640
641 if (zonep->zone_id == GLOBAL_ZONEID ||
642 get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
643 zonep->zone_io_util = 0;
644 return (0);
645 }
646
647 zonep->zone_io_util = (rops * sp->zi_avgrlat) +
648 (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
649 sp->zi_totutil += zonep->zone_io_util;
650
651 if (zonep->zone_io_util > 0) {
652 sp->zi_active++;
653 sp->zi_totpri += zonep->zone_zfs_io_pri;
654 }
655
656 /*
657 * sdt:::zfs-zone-utilization
658 *
659 * arg0: zone ID
660 * arg1: read operations observed during time window
661 * arg2: physical write operations observed during time window
662 * arg3: logical write ops observed during time window
663 * arg4: calculated utilization given read and write ops
664 * arg5: I/O priority assigned to this zone
665 */
666 DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
667 uint_t, rops, uint_t, wops, uint_t, lwops,
668 uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
669
670 return (0);
671 }
672
673 static void
674 zfs_zone_delay_inc(zone_t *zonep)
675 {
676 if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
677 zonep->zone_io_delay += zfs_zone_delay_step;
678 }
679
680 static void
681 zfs_zone_delay_dec(zone_t *zonep)
682 {
683 if (zonep->zone_io_delay > 0)
684 zonep->zone_io_delay -= zfs_zone_delay_step;
685 }
686
687 /*
* For all zones "far enough" above the average utilization, increase that
* zone's delay. Otherwise, reduce its delay.
690 */
691 static int
692 zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
693 {
694 zoneio_stats_t *sp = arg;
695 uint16_t delay = zonep->zone_io_delay;
696 uint_t fairutil = 0;
697
698 zonep->zone_io_util_above_avg = B_FALSE;
699
700 /*
* Given the calculated total utilization for all zones, calculate the
702 * fair share of I/O for this zone.
703 */
704 if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
705 fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
706 sp->zi_totpri;
707 } else if (sp->zi_active > 0) {
708 fairutil = sp->zi_totutil / sp->zi_active;
709 }
710
711 /*
* Adjust the zone's delay (which is applied to each of its IOs). If the
* delay is already at the ceiling, don't increase it any further.
714 */
715 if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
716 zonep->zone_io_util_above_avg = B_TRUE;
717
718 if (sp->zi_active > 1)
719 zfs_zone_delay_inc(zonep);
720 } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
721 sp->zi_active <= 1) {
722 zfs_zone_delay_dec(zonep);
723 }
724
725 /*
726 * sdt:::zfs-zone-throttle
727 *
728 * arg0: zone ID
729 * arg1: old delay for this zone
730 * arg2: new delay for this zone
731 * arg3: calculated fair I/O utilization
732 * arg4: actual I/O utilization
733 */
734 DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
735 uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
736 uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
737
738 return (0);
739 }
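
/*
 * Worked example of the fair-share calculation above (hypothetical zones):
 * with two active zones whose combined utilization is 300,000, zone A at I/O
 * priority 100 and zone B at priority 50, the fair shares are
 * 300000 * 100 / 150 = 200,000 for A and 300000 * 50 / 150 = 100,000 for B.
 * A zone is only delayed further when it exceeds its share while overall I/O
 * is overutilized and more than one zone is active.
 */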
740
741 /*
742 * Examine the utilization between different zones, and adjust the delay for
743 * each zone appropriately.
744 */
745 static void
746 zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
747 {
748 zoneio_stats_t stats;
749 hrtime_t laggard_udelta = 0;
750
751 (void) bzero(&stats, sizeof (stats));
752
753 stats.zi_now = unow;
754 get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
755
756 if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
757 stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
758 else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
759 stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
760
761 if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
762 return;
763
764 /*
765 * Calculate disk utilization for the most recent period.
766 */
767 if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
768 stats.zi_diskutil = 0;
769 } else {
770 stats.zi_diskutil =
771 ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
772 ((unow - last_checked) * 1000);
773 }
774 zfs_disk_last_rtime = zfs_disk_rtime;
775
776 if (unow > zfs_disk_last_laggard)
777 laggard_udelta = unow - zfs_disk_last_laggard;
778
779 /*
780 * To minimize porpoising, we have three separate states for our
781 * assessment of I/O performance: overutilized, underutilized, and
782 * neither overutilized nor underutilized. We will increment the
783 * throttle if a zone is using more than its fair share _and_ I/O
784 * is overutilized; we will decrement the throttle if a zone is using
785 * less than its fair share _or_ I/O is underutilized.
786 */
787 stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
788 laggard_udelta > zfs_zone_laggard_ancient;
789
790 stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
791 laggard_udelta < zfs_zone_laggard_recent;
792
793 /*
794 * sdt:::zfs-zone-stats
795 *
796 * Statistics observed over the last period:
797 *
798 * arg0: average system read latency
799 * arg1: average system write latency
800 * arg2: number of active zones
801 * arg3: total I/O 'utilization' for all zones
802 * arg4: total I/O priority of all active zones
803 * arg5: calculated disk utilization
804 */
805 DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
806 uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
807 uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
808 uintptr_t, stats.zi_diskutil);
809
810 (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
811 }
812
813 /*
814 * Callback used to calculate a zone's IO schedule priority.
815 *
816 * We scan the zones looking for ones with ops in the queue. Out of those,
817 * we pick the one that calculates to the highest schedule priority.
818 */
819 static int
820 get_sched_pri_cb(zone_t *zonep, void *arg)
821 {
822 int pri;
823 uint_t cnt;
824 zone_q_bump_t *qbp = arg;
825 zio_priority_t p = qbp->zq_queue;
826
827 cnt = zonep->zone_zfs_queued[p];
828 if (cnt == 0) {
829 zonep->zone_zfs_weight = 0;
830 return (0);
831 }
832
833 /*
834 * On each pass, increment the zone's weight. We use this as input
835 * to the calculation to prevent starvation. The value is reset
836 * each time we issue an IO for this zone so zones which haven't
837 * done any IO over several iterations will see their weight max
838 * out.
839 */
840 if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
841 zonep->zone_zfs_weight++;
842
843 /*
844 * This zone's IO priority is the inverse of the number of IOs
845 * the zone has enqueued * zone's configured priority * weight.
846 * The queue depth has already been scaled by 10 to avoid problems
847 * with int rounding.
848 *
849 * This means that zones with fewer IOs in the queue will get
850 * preference unless other zone's assigned priority pulls them
851 * ahead. The weight is factored in to help ensure that zones
852 * which haven't done IO in a while aren't getting starved.
853 */
854 pri = (qbp->zq_qdepth / cnt) *
855 zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
856
857 /*
858 * If this zone has a higher priority than what we found so far,
859 * it becomes the new leading contender.
860 */
861 if (pri > qbp->zq_priority) {
862 qbp->zq_zoneid = zonep->zone_id;
863 qbp->zq_priority = pri;
864 qbp->zq_wt = zonep->zone_zfs_weight;
865 }
866 return (0);
867 }
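
/*
 * Worked example (hypothetical zones): with a queue depth of 30 (scaled to
 * zq_qdepth == 300), a zone with 20 queued zios, configured priority 100 and
 * weight 1 scores (300 / 20) * 100 * 1 = 1500, while a zone with a single
 * queued zio, the same priority and weight 3 scores
 * (300 / 1) * 100 * 3 = 90000 and would be selected.
 */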
868
869 /*
870 * See if we need to bump a zone's zio to the head of the queue. This is only
871 * done on the two synchronous I/O queues (see the block comment on the
872 * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
873 * queue depth from our caller.
874 *
* A zone issuing synchronous I/O from a single-threaded process can have at
* most one op in the queue at a time unless it is running multiple processes
* in parallel. This can cause an imbalance in performance if there are zones
878 * with many parallel processes (and ops in the queue) vs. other zones which
879 * are doing simple single-threaded processes, such as interactive tasks in the
880 * shell. These zones can get backed up behind a deep queue and their IO
* performance will appear to be very poor as a result, making the zone
* feel unresponsive for interactive use.
883 *
884 * The scheduling algorithm kicks in once we start to get a deeper queue.
885 * Once that occurs, we look at all of the zones to see which one calculates
886 * to the highest priority. We bump that zone's first zio to the head of the
887 * queue.
888 *
889 * We use a counter on the zone so that we can quickly find how many ops each
890 * zone has in the queue without having to search the entire queue itself.
891 * This scales better since the number of zones is expected to be on the
892 * order of 10-100 whereas the queue depth can be in the range of 50-2000.
* In addition, since the zios in the queue only have the zoneid, we would
894 * have to look up the zone for each zio enqueued and that means the overhead
895 * for scanning the queue each time would be much higher.
896 *
897 * In all cases, we fall back to simply pulling the next op off the queue
898 * if something should go wrong.
899 */
900 static zio_t *
901 get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
902 {
903 zone_q_bump_t qbump;
904 zio_t *zp = NULL, *zphead;
905 int cnt = 0;
906
907 /* To avoid problems with int rounding, scale the queue depth by 10 */
908 qbump.zq_qdepth = qdepth * 10;
909 qbump.zq_priority = 0;
910 qbump.zq_zoneid = 0;
911 qbump.zq_queue = p;
912 (void) zone_walk(get_sched_pri_cb, &qbump);
913
914 zphead = avl_first(&vqc->vqc_queued_tree);
915
/* If the scheduler picked a zone, look for that zone's first queued zio. */
917 if (qbump.zq_zoneid != 0) {
918 for (zp = avl_first(&vqc->vqc_queued_tree); zp != NULL;
919 zp = avl_walk(&vqc->vqc_queued_tree, zp, AVL_AFTER)) {
920 if (zp->io_zoneid == qbump.zq_zoneid)
921 break;
922 cnt++;
923 }
924 }
925
926 if (zp == NULL) {
927 zp = zphead;
928 } else if (zp != zphead) {
929 /*
930 * Only fire the probe if we actually picked a different zio
931 * than the one already at the head of the queue.
932 */
933 DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
934 uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
935 }
936
937 return (zp);
938 }
939
940 /*
941 * Add our zone ID to the zio so we can keep track of which zones are doing
942 * what, even when the current thread processing the zio is not associated
* with the zone (e.g. the kernel taskq which pushes out TXGs).
944 */
945 void
946 zfs_zone_zio_init(zio_t *zp)
947 {
948 zone_t *zonep = curzone;
949
950 zp->io_zoneid = zonep->zone_id;
951 }
952
953 /*
954 * Track IO operations per zone. Called from dmu_tx_count_write for write ops
955 * and dmu_read_uio for read ops. For each operation, increment that zone's
956 * counter based on the type of operation.
957 *
958 * There are three basic ways that we can see write ops:
959 * 1) An application does write syscalls. Those ops go into a TXG which
960 * we'll count here. Sometime later a kernel taskq thread (we'll see the
961 * vdev IO as zone 0) will perform some number of physical writes to commit
962 * the TXG to disk. Those writes are not associated with the zone which
963 * made the write syscalls and the number of operations is not correlated
964 * between the taskq and the zone.
965 * 2) An application opens a file with O_SYNC. Each write will result in
966 * an operation which we'll see here plus a low-level vdev write from
967 * that zone.
968 * 3) An application does write syscalls followed by an fsync(). We'll
969 * count the writes going into a TXG here. We'll also see some number
970 * (usually much smaller, maybe only 1) of low-level vdev writes from this
971 * zone when the fsync is performed, plus some other low-level vdev writes
972 * from the taskq in zone 0 (are these metadata writes?).
973 *
974 * 4) In addition to the above, there are misc. system-level writes, such as
975 * writing out dirty pages to swap, or sync(2) calls, which will be handled
976 * by the global zone and which we count but don't generally worry about.
977 *
* Because of the above, we can count a write twice: once here, at a high
* level in the zone's thread, and again when the physical write completes
* and is accounted in zfs_zone_zio_done.
981 *
982 * Without this, it can look like a non-global zone never writes (case 1).
983 * Depending on when the TXG is synced, the counts may be in the same sample
984 * bucket or in a different one.
985 *
986 * Tracking read operations is simpler due to their synchronous semantics. The
987 * zfs_read function -- called as a result of a read(2) syscall -- will always
988 * retrieve the data to be read through dmu_read_uio.
989 */
990 void
991 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
992 {
993 zone_t *zonep = curzone;
994 hrtime_t unow, last_checked;
995 uint16_t wait;
996
997 unow = GET_USEC_TIME;
998
999 /*
1000 * Only bump the counters for logical operations here. The counters for
1001 * tracking physical IO operations are handled in zfs_zone_zio_done.
1002 */
1003 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
1004 mutex_enter(&zonep->zone_stg_io_lock);
1005 add_iop(zonep, unow, type, 0);
1006 mutex_exit(&zonep->zone_stg_io_lock);
1007 }
1008
1009 if (!zfs_zone_delay_enable)
1010 return;
1011
1012 /*
1013 * If the zone's I/O priority is set to zero, don't throttle that zone's
1014 * operations at all.
1015 */
1016 if (zonep->zone_zfs_io_pri == 0)
1017 return;
1018
1019 /*
1020 * XXX There's a potential race here in that more than one thread may
1021 * update the zone delays concurrently. The worst outcome is corruption
1022 * of our data to track each zone's IO, so the algorithm may make
1023 * incorrect throttling decisions until the data is refreshed.
1024 */
1025 last_checked = zfs_zone_last_checked;
1026 if ((unow - last_checked) > zfs_zone_adjust_time) {
1027 zfs_zone_last_checked = unow;
1028 zfs_zone_wait_adjust(unow, last_checked);
1029 }
1030
1031 if ((wait = zonep->zone_io_delay) > 0) {
1032 /*
1033 * If this is a write and we're doing above normal TXG
1034 * syncing, then throttle for longer than normal.
1035 */
1036 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
1037 (txg_cnt > 1 || txg_sync_rate > 1))
1038 wait *= zfs_zone_txg_throttle_scale;
1039
1040 /*
1041 * sdt:::zfs-zone-wait
1042 *
1043 * arg0: zone ID
1044 * arg1: type of IO operation
1045 * arg2: time to delay (in us)
1046 */
1047 DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
1048 uintptr_t, type, uintptr_t, wait);
1049
1050 drv_usecwait(wait);
1051 }
1052 }
1053
1054 /*
1055 * XXX Ignore the pool pointer parameter for now.
1056 *
1057 * Keep track to see if the TXG sync rate is running above the expected rate.
1058 * If so, this implies that we are filling TXG's at a high rate due to a heavy
1059 * write workload. We use this as input into the zone throttle.
1060 *
1061 * This function is called every 5 seconds (zfs_txg_timeout) under a normal
1062 * write load. In this case, the sync rate is going to be 1. When there
1063 * is a heavy write load, TXG's fill up fast and the sync thread will write
1064 * the TXG more frequently (perhaps once a second). In this case the rate
1065 * will be > 1. The sync rate is a lagging indicator since it can be up
1066 * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
1067 * current 5 second interval and txg_sync_rate to keep track of the previous
1068 * 5 second interval. In that way we don't have a period (1 or more seconds)
1069 * where the txg_cnt == 0 and we cut back on throttling even though the rate
1070 * is still high.
1071 */
1072 /*ARGSUSED*/
1073 void
1074 zfs_zone_report_txg_sync(void *dp)
1075 {
1076 uint_t now;
1077
1078 txg_cnt++;
1079 now = (uint_t)(gethrtime() / NANOSEC);
1080 if ((now - txg_last_check) >= zfs_txg_timeout) {
1081 txg_sync_rate = txg_cnt / 2;
1082 txg_cnt = 0;
1083 txg_last_check = now;
1084 }
1085 }
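
/*
 * For example (hypothetical load): with zfs_txg_timeout at 5 seconds, a
 * heavy write load that syncs 10 TXGs within one 5-second window yields
 * txg_sync_rate = 10 / 2 = 5, so logical writes keep being throttled at the
 * scaled (longer) delay until both txg_cnt and txg_sync_rate fall back to 1
 * or below.
 */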
1086
1087 hrtime_t
1088 zfs_zone_txg_delay()
1089 {
1090 if (curzone->zone_io_util_above_avg)
1091 return (zfs_zone_txg_delay_nsec);
1092
1093 return (MSEC2NSEC(10));
1094 }
1095
1096 /*
1097 * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
1098 * and is issued.
1099 * Keep track of start time for latency calculation in zfs_zone_zio_done.
1100 */
1101 void
1102 zfs_zone_zio_start(zio_t *zp)
1103 {
1104 zone_t *zonep;
1105
1106 /*
* I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
* an actual I/O operation. Ignore those operations for the purposes of
* throttling and scheduling.
1110 */
1111 if (zp->io_type == ZIO_TYPE_IOCTL)
1112 return;
1113
1114 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1115 return;
1116
1117 zonep->zone_zfs_weight = 0;
1118
1119 mutex_enter(&zfs_disk_lock);
1120 zp->io_dispatched = gethrtime();
1121
1122 if (zfs_disk_rcnt++ != 0)
1123 zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
1124 zfs_disk_rlastupdate = zp->io_dispatched;
1125 mutex_exit(&zfs_disk_lock);
1126
1127 zone_rele(zonep);
1128 }
1129
1130 /*
1131 * Called from vdev_disk_io_done when an IO completes.
1132 * Increment our counter for zone ops.
1133 * Calculate the IO latency avg. for this zone.
1134 */
1135 void
1136 zfs_zone_zio_done(zio_t *zp)
1137 {
1138 zone_t *zonep;
1139 hrtime_t now, unow, udelta;
1140
1141 if (zp->io_type == ZIO_TYPE_IOCTL)
1142 return;
1143
if (zp->io_dispatched == 0)
return;

if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
return;
1149
1150 now = gethrtime();
1151 unow = NANO_TO_MICRO(now);
1152 udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
1153
1154 mutex_enter(&zfs_disk_lock);
1155 zfs_disk_rcnt--;
1156 zfs_disk_rtime += (now - zfs_disk_rlastupdate);
1157 zfs_disk_rlastupdate = now;
1158
1159 if (udelta > zfs_zone_laggard_threshold)
1160 zfs_disk_last_laggard = unow;
1161
1162 mutex_exit(&zfs_disk_lock);
1163
1164 if (zfs_zone_delay_enable) {
1165 mutex_enter(&zonep->zone_stg_io_lock);
1166 add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
1167 ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
1168 mutex_exit(&zonep->zone_stg_io_lock);
1169 }
1170
1171 zone_rele(zonep);
1172
1173 /*
1174 * sdt:::zfs-zone-latency
1175 *
1176 * arg0: zone ID
1177 * arg1: type of I/O operation
1178 * arg2: I/O latency (in us)
1179 */
1180 DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
1181 uintptr_t, zp->io_type, uintptr_t, udelta);
1182 }
1183
1184 void
1185 zfs_zone_zio_dequeue(zio_t *zp)
1186 {
1187 zio_priority_t p;
1188 zone_t *zonep;
1189
1190 p = zp->io_priority;
1191 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1192 return;
1193
1194 /* We depend on p being defined as either 0 or 1 */
1195 ASSERT(p < 2);
1196
1197 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1198 return;
1199
1200 mutex_enter(&zonep->zone_stg_io_lock);
1201 ASSERT(zonep->zone_zfs_queued[p] > 0);
1202 if (zonep->zone_zfs_queued[p] == 0)
1203 cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
1204 else
1205 zonep->zone_zfs_queued[p]--;
1206 mutex_exit(&zonep->zone_stg_io_lock);
1207 zone_rele(zonep);
1208 }
1209
1210 void
1211 zfs_zone_zio_enqueue(zio_t *zp)
1212 {
1213 zio_priority_t p;
1214 zone_t *zonep;
1215
1216 p = zp->io_priority;
1217 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1218 return;
1219
1220 /* We depend on p being defined as either 0 or 1 */
1221 ASSERT(p < 2);
1222
1223 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1224 return;
1225
1226 mutex_enter(&zonep->zone_stg_io_lock);
1227 zonep->zone_zfs_queued[p]++;
1228 mutex_exit(&zonep->zone_stg_io_lock);
1229 zone_rele(zonep);
1230 }
1231
1232 /*
* Called from vdev_queue_io_to_issue. That function is where zios are listed
1234 * in FIFO order on one of the sync queues, then pulled off (by
1235 * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling
1236 * here to find a zone's zio deeper in the sync queue and issue that instead
1237 * of simply doing FIFO.
1238 *
1239 * We only do zone-based zio scheduling for the two synchronous I/O queues
1240 * (read & write). These queues are normally serviced in FIFO order but we
1241 * may decide to move a zone's zio to the head of the line. A typical I/O
1242 * load will be mostly synchronous reads and some asynchronous writes (which
1243 * are scheduled differently due to transaction groups). There will also be
1244 * some synchronous writes for those apps which want to ensure their data is on
1245 * disk. We want to make sure that a zone with a single-threaded app (e.g. the
1246 * shell) that is doing synchronous I/O (typically reads) isn't penalized by
1247 * other zones which are doing lots of synchronous I/O because they have many
1248 * running threads.
1249 *
1250 * The vq->vq_lock mutex is held when we're executing this function so we
1251 * can safely access the "last zone" variable on the queue.
1252 */
1253 zio_t *
1254 zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
1255 {
1256 vdev_queue_class_t *vqc = &vq->vq_class[p];
1257 uint_t cnt;
1258 zoneid_t last_zone;
1259 zio_t *zio;
1260
1261 ASSERT(MUTEX_HELD(&vq->vq_lock));
1262
1263 /* Don't change the order on the LBA ordered queues. */
1264 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1265 return (avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER));
1266
1267 /* We depend on p being defined as either 0 or 1 */
1268 ASSERT(p < 2);
1269
1270 cnt = avl_numnodes(&vqc->vqc_queued_tree);
1271 last_zone = vq->vq_last_zone_id;
1272
1273 /*
1274 * If there are only a few zios in the queue then just issue the head.
1275 * If there are more than a few zios already queued up, then use
1276 * scheduling to get the next zio.
1277 */
1278 if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
1279 zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
1280 else
1281 zio = get_next_zio(vqc, cnt, p);
1282
1283 vq->vq_last_zone_id = zio->io_zoneid;
1284
1285 /*
1286 * Probe with 4 args; the number of IOs in the queue, the zone that
1287 * was last scheduled off this queue, the zone that was associated
1288 * with the next IO that is scheduled, and which queue (priority).
1289 */
1290 DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
1291 uint_t, zio->io_zoneid, uint_t, p);
1292
1293 return (zio);
1294 }
1295
1296 #endif