Side-port of OS-2943 zone stuck 'down': references still extant (illumos-joyent 4cb09b44b4f851905a0e8cccbd9bfc834acc2041)
--- old/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ new/usr/src/uts/common/fs/zfs/zfs_zone.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 - * Copyright 2013, Joyent, Inc. All rights reserved.
13 + * Copyright 2014, Joyent, Inc. All rights reserved.
14 14 */
15 15
16 16 /*
17 17 * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
18 18 * ZFS I/O resources for each zone.
19 19 *
20 20 * I/O contention can be a major pain point on a multi-tenant system. A single
21 21 * zone can issue a stream of I/O operations, usually synchronous writes, which
22 22 * disrupt I/O performance for all other zones. This problem is further
23 23 * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
24 24 * a set of blocks which are atomically synced to disk. The process of
25 25 * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
26 26 * out any pending read operations.
27 27 *
28 28 * There are two facets to this capability; the throttle and the scheduler.
29 29 *
30 30 * Throttle
31 31 *
32 32 * The requirements on the throttle are:
33 33 *
34 34 * 1) Ensure consistent and predictable I/O latency across all zones.
35 35 * 2) Sequential and random workloads have very different characteristics,
36 36 * so it is a non-starter to track IOPS or throughput.
37 37 * 3) A zone should be able to use the full disk bandwidth if no other zone
38 38 * is actively using the disk.
39 39 *
40 40 * The throttle has two components: one to track and account for each zone's
41 41 * I/O requests, and another to throttle each zone's operations when it
42 42 * exceeds its fair share of disk I/O. When the throttle detects that a zone is
43 43 * consuming more than is appropriate, each read or write system call is
44 44 * delayed by up to 100 microseconds, which we've found is sufficient to allow
45 45 * other zones to interleave I/O requests during those delays.
46 46 *
47 47 * Note: The throttle will delay each logical I/O (as opposed to the physical
48 48 * I/O which will likely be issued asynchronously), so it may be easier to
49 49 * think of the I/O throttle delaying each read/write syscall instead of the
50 50 * actual I/O operation. For each zone, the throttle tracks an ongoing average
51 51 * of read and write operations performed to determine the overall I/O
52 52 * utilization for each zone.
53 53 *
54 54 * The throttle calculates an I/O utilization metric for each zone using the
55 55 * following formula:
56 56 *
57 57 * (# of read syscalls) x (Average read latency) +
58 58 * (# of write syscalls) x (Average write latency)
59 59 *
60 60 * Once each zone has its utilization metric, the I/O throttle will compare I/O
61 61 * utilization across all zones, and if a zone has a higher-than-average I/O
62 62 * utilization, system calls from that zone are throttled. That is, if one
63 63 * zone has a much higher utilization, that zone's delay is increased by 5
64 64 * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
65 65 * already throttled and has a lower utilization than average, its delay will
66 66 * be lowered by 5 microseconds.
67 67 *
68 68 * The throttle calculation is driven by IO activity, but since IO does not
69 69 * happen at fixed intervals, timestamps are used to track when the last update
70 70 * was made and to drive recalculation.
71 71 *
72 72 * The throttle recalculates each zone's I/O usage and throttle delay (if any)
73 73 * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
74 74 * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
75 75 *
76 76 * Scheduler
77 77 *
78 78 * The I/O scheduler manages the vdev queues, the queues of pending I/Os to
79 79 * issue to the disks. It only makes scheduling decisions for the two
80 80 * synchronous I/O queues (read & write).
81 81 *
82 82 * The scheduler maintains how many I/Os in the queue are from each zone, and
83 83 * if one zone has a disproportionately large number of I/Os in the queue, the
84 84 * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
85 85 * and pulled from the middle of the queue. This bump allows zones with a small
86 86 * number of I/Os (so small they may not even be taken into account by the
87 87 * throttle) to complete quickly instead of waiting behind dozens of I/Os from
88 88 * other zones.
89 89 */
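As a rough, standalone illustration of the throttle logic described above, the sketch below restates the utilization metric and the 5-usec delay adjustment in isolation. It is only a sketch: the function names and parameters are hypothetical, it assumes <sys/types.h> for the fixed-width types, and the real code below derives these quantities from decayed per-zone counters and system-wide latency averages.

/*
 * Hypothetical sketch only -- not part of this file.
 * Utilization = reads * avg read latency + writes * avg write latency,
 * with all latencies in microseconds.
 */
static uint64_t
example_zone_io_utilization(uint_t rd_ops, uint_t wr_ops,
    uint_t avg_rd_lat_us, uint_t avg_wr_lat_us)
{
	return ((uint64_t)rd_ops * avg_rd_lat_us +
	    (uint64_t)wr_ops * avg_wr_lat_us);
}

/*
 * Hypothetical sketch only: move a zone's per-syscall delay up or down
 * by 5 usec, bounded by a 100 usec ceiling, depending on whether the
 * zone is above or below its fair share of utilization.
 */
static uint16_t
example_adjust_delay(uint64_t zone_util, uint64_t fair_util,
    uint16_t cur_delay_us)
{
	if (zone_util > fair_util && cur_delay_us < 100)
		return (cur_delay_us + 5);
	if (zone_util < fair_util && cur_delay_us >= 5)
		return (cur_delay_us - 5);
	return (cur_delay_us);
}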
90 90
91 91 #include <sys/spa.h>
92 92 #include <sys/vdev_impl.h>
93 93 #include <sys/zfs_zone.h>
94 94
95 95 #ifndef _KERNEL
96 96
97 97 /*
98 98 * Stubs for when compiling for user-land.
99 99 */
100 100
101 101 void
102 102 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
103 103 {
104 104 }
105 105
106 106 void
107 107 zfs_zone_zio_init(zio_t *zp)
108 108 {
109 109 }
110 110
111 111 void
112 112 zfs_zone_zio_start(zio_t *zp)
113 113 {
114 114 }
115 115
116 116 void
117 117 zfs_zone_zio_done(zio_t *zp)
118 118 {
119 119 }
120 120
121 121 void
122 122 zfs_zone_zio_dequeue(zio_t *zp)
123 123 {
124 124 }
125 125
126 126 void
127 127 zfs_zone_zio_enqueue(zio_t *zp)
128 128 {
129 129 }
130 130
131 131 /*ARGSUSED*/
132 132 void
133 133 zfs_zone_report_txg_sync(void *dp)
134 134 {
135 135 }
136 136
137 137 hrtime_t
138 138 zfs_zone_txg_delay()
139 139 {
140 140 return (MSEC2NSEC(10));
141 141 }
142 142
143 143 #else
144 144
145 145 /*
146 146 * The real code.
147 147 */
148 148
149 149 #include <sys/systm.h>
150 150 #include <sys/thread.h>
151 151 #include <sys/proc.h>
152 152 #include <sys/types.h>
153 153 #include <sys/param.h>
154 154 #include <sys/time.h>
155 155 #include <sys/atomic.h>
156 156 #include <sys/zio.h>
157 157 #include <sys/zone.h>
158 158 #include <sys/avl.h>
159 159 #include <sys/sdt.h>
160 160 #include <sys/ddi.h>
161 161
162 162 /*
163 163 * The zone throttle delays read and write operations from certain zones based
164 164 * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time
165 165 * below), the delays for each zone are recalculated based on the utilization
166 166 * over the previous window.
167 167 */
168 168 boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
169 169 uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */
170 170 uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
171 171
172 172 boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
173 173
174 174 /*
175 175 * For certain workloads, one zone may be issuing primarily sequential I/O and
176 176 * another primarily random I/O. The sequential I/O will complete much more
177 177 * quickly than the random I/O, driving the average system latency for those
178 178 * operations way down. As a result, the random I/O may be throttled back, even
179 179 * though the sequential I/O should be throttled to allow the random I/O more
180 180 * access to the disk.
181 181 *
182 182 * This tunable limits the discrepancy between the read and write system
183 183 * latency. If one becomes excessively high, this tunable prevents the I/O
184 184 * throttler from exacerbating the imbalance.
185 185 */
186 186 uint_t zfs_zone_rw_lat_limit = 10;
187 187
188 188 /*
189 189 * The I/O throttle will only start delaying zones when it detects disk
190 190 * utilization has reached a certain level. This tunable controls the
191 191 * threshold at which the throttle will start delaying zones. When the number
192 192 * of vdevs is small, the calculation should correspond closely with the %b
193 193 * column from iostat -- but as the number of vdevs becomes large, it will
194 194 * correlate less and less to any single device (therefore making it a poor
195 195 * approximation for the actual I/O utilization on such systems). We
196 196 * therefore use our derived utilization conservatively: we know that low
197 197 * derived utilization does indeed correlate to low I/O use -- but that a high
198 198 * rate of derived utilization does not necessarily alone denote saturation;
199 199 * where we see a high rate of utilization, we also look for laggard I/Os to
200 200 * attempt to detect saturation.
201 201 */
202 202 uint_t zfs_zone_util_threshold = 80;
203 203 uint_t zfs_zone_underutil_threshold = 60;
204 204
205 205 /*
206 206 * There are three important tunables here: zfs_zone_laggard_threshold denotes
207 207 * the threshold at which an I/O is considered to be of notably high latency;
208 208 * zfs_zone_laggard_recent denotes the number of microseconds before the
209 209 * current time after which the last laggard is considered to be sufficiently
210 210 * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
211 211 * the microseconds before the current time before which the last laggard is
212 212 * considered to be sufficiently old to merit decreasing the throttle. The
213 213 * most important tunable of these three is the zfs_zone_laggard_threshold: in
214 214 * modeling data from a large public cloud, this tunable was found to have a
215 215 * much greater effect on the throttle than the two time-based thresholds.
216 216 * This must be set high enough to not result in spurious throttling, but not
217 217 * so high as to allow pathological I/O to persist in the system.
218 218 */
219 219 uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
220 220 uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
221 221 uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
222 222
223 223 /*
224 224 * Throughout this subsystem, our timestamps are in microseconds. Our system
225 225 * average cycle is one second or 1 million microseconds. Our zone counter
226 226 * update cycle is two seconds or 2 million microseconds. We use a longer
227 227 * duration for that cycle because some ops can see a little over two seconds of
228 228 * latency when they are being starved by another zone.
229 229 */
230 230 uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
231 231 uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
232 232
233 233 /*
234 234 * How often the I/O throttle will reevaluate each zone's utilization, in
235 235 * microseconds. Default is 1/4 sec.
236 236 */
237 237 uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
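As an aside, kernel variables like the ones above are ordinarily adjusted on illumos either at boot time via /etc/system or live with mdb -kw. A minimal /etc/system sketch follows, assuming these symbols are reachable as zfs module tunables; the values are purely illustrative, not recommendations.

* Re-evaluate zone utilization every 500 ms instead of 250 ms, and only
* begin throttling above 90% derived disk utilization (illustrative only).
set zfs:zfs_zone_adjust_time = 500000
set zfs:zfs_zone_util_threshold = 90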
238 238
239 239 typedef struct {
240 240 hrtime_t cycle_start;
241 241 int cycle_cnt;
242 242 hrtime_t cycle_lat;
243 243 hrtime_t sys_avg_lat;
244 244 } sys_lat_cycle_t;
245 245
246 246 typedef struct {
247 247 hrtime_t zi_now;
248 248 uint_t zi_avgrlat;
249 249 uint_t zi_avgwlat;
250 250 uint64_t zi_totpri;
251 251 uint64_t zi_totutil;
252 252 int zi_active;
253 253 uint_t zi_diskutil;
254 254 boolean_t zi_underutil;
255 255 boolean_t zi_overutil;
256 256 } zoneio_stats_t;
257 257
258 258 static sys_lat_cycle_t rd_lat;
259 259 static sys_lat_cycle_t wr_lat;
260 260
261 261 /*
262 262 * Some basic disk stats to determine disk utilization. The utilization info
263 263 * for all disks on the system is aggregated into these values.
264 264 *
265 265 * Overall disk utilization for the current cycle is calculated as:
266 266 *
267 267 * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
268 268 * ----------------------------------------------
269 269 * ((now - zfs_zone_last_checked) * 1000);
270 270 */
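A worked instance of that formula, with purely illustrative numbers: if the aggregated busy time (zfs_disk_rtime) advanced by 200 ms worth of nanoseconds since the last check, and the check interval (now - zfs_zone_last_checked) was 250,000 microseconds, the derived utilization is (200,000,000 * 100) / (250,000 * 1000) = 80, i.e. right at the default zfs_zone_util_threshold.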
271 271 kmutex_t zfs_disk_lock; /* protects the following: */
272 272 uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
273 273 hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
274 274 hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
275 275
276 276 hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
277 277 /* time that we last updated per-zone throttle info */
278 278 hrtime_t zfs_zone_last_checked = 0;
279 279 hrtime_t zfs_disk_last_laggard = 0;
280 280
281 281 /*
282 282 * Data used to keep track of how often txg sync is running.
283 283 */
284 284 extern int zfs_txg_timeout;
285 285 static uint_t txg_last_check;
286 286 static uint_t txg_cnt;
287 287 static uint_t txg_sync_rate;
288 288
289 289 boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
290 290 /*
291 291 * Threshold for when zio scheduling should kick in.
292 292 *
293 293 * This threshold is based on the zfs_vdev_sync_read_max_active value for the
294 294 * number of I/Os that can be pending on a device. If there are more than the
295 295 * max_active ops already queued up, beyond those already issued to the vdev,
296 296 * then use zone-based scheduling to get the next synchronous zio.
297 297 */
298 298 uint32_t zfs_zone_schedule_thresh = 10;
299 299
300 300 /*
301 301 * On each pass of the scheduler we increment the zone's weight (up to this
302 302 * maximum). The weight is used by the scheduler to prevent starvation so
303 303 * that zones which haven't been able to do any IO over many iterations
304 304 * will max out their weight to this value.
305 305 */
306 306 #define SCHED_WEIGHT_MAX 20
307 307
308 308 /*
309 309 * Tunables for delay throttling when TXG sync is occurring.
310 310 *
311 311 * If the zone is performing a write and we're doing above normal TXG syncing,
312 312 * then throttle for longer than normal. The zone's wait time is multiplied
313 313 * by the scale (zfs_zone_txg_throttle_scale).
314 314 */
315 315 int zfs_zone_txg_throttle_scale = 2;
316 316 hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
317 317
318 318 typedef struct {
319 319 int zq_qdepth;
320 320 zio_priority_t zq_queue;
321 321 int zq_priority;
322 322 int zq_wt;
323 323 zoneid_t zq_zoneid;
324 324 } zone_q_bump_t;
325 325
326 326 /*
327 327 * This uses gethrtime() but returns a value in usecs.
328 328 */
329 329 #define GET_USEC_TIME (gethrtime() / 1000)
330 330 #define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC))
331 331
332 332 /*
333 333 * Keep track of the zone's ZFS IOPs.
334 334 *
335 335 * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
336 336 * accounted for.
337 337 *
338 338 * If the number of ops is >1 then we can just use that value. However,
339 339 * if the number of ops is <2 then we might have a zone which is trying to do
340 340 * IO but is not able to get any ops through the system. We don't want to lose
341 341 * track of this zone so we factor in its decayed count into the current count.
342 342 *
343 343 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
344 344 * However, since this calculation is driven by IO activity and since IO does
345 345 * not happen at fixed intervals, we use a timestamp to see when the last update
346 346 * was made. If it was more than one cycle ago, then we need to decay the
347 347 * historical count by the proper number of additional cycles in which no IO was
348 348 * performed.
349 349 *
350 350 * Return a time delta indicating how far into the current cycle we are or 0
351 351 * if the last IO was more than a cycle ago.
352 352 */
353 353 static hrtime_t
354 354 compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
355 355 {
356 356 hrtime_t delta;
357 357 int gen_cnt;
358 358
359 359 /*
360 360 * Check if it's time to recompute a new zone count.
361 361 * If we're still collecting data for the current cycle, return the delta.
362 362 */
363 363 delta = unow - cp->cycle_start;
364 364 if (delta < zfs_zone_cycle_time)
365 365 return (delta);
366 366
367 367 /* A previous cycle is past, compute the new zone count. */
368 368
369 369 /*
370 370 * Figure out how many generations we have to decay the historical
371 371 * count, since multiple cycles may have elapsed since our last IO.
372 372 * We depend on int rounding here.
373 373 */
374 374 gen_cnt = (int)(delta / zfs_zone_cycle_time);
375 375
376 376 /* If more than 5 cycles since the last IO, reset count. */
377 377 if (gen_cnt > 5) {
378 378 cp->zone_avg_cnt = 0;
379 379 } else {
380 380 /* Update the count. */
381 381 int i;
382 382
383 383 /*
384 384 * If the zone did more than 1 IO, just use its current count
385 385 * as the historical value, otherwise decay the historical
386 386 * count and factor that into the new historical count. We
387 387 * pick a threshold > 1 so that we don't lose track of IO due
388 388 * to int rounding.
389 389 */
390 390 if (cp->cycle_cnt > 1)
391 391 cp->zone_avg_cnt = cp->cycle_cnt;
392 392 else
393 393 cp->zone_avg_cnt = cp->cycle_cnt +
394 394 (cp->zone_avg_cnt / 2);
395 395
396 396 /*
397 397 * If more than one generation has elapsed since the last
398 398 * update, decay the values further.
399 399 */
400 400 for (i = 1; i < gen_cnt; i++)
401 401 cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
402 402 }
403 403
404 404 /* A new cycle begins. */
405 405 cp->cycle_start = unow;
406 406 cp->cycle_cnt = 0;
407 407
408 408 return (0);
409 409 }
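To make the decay concrete with illustrative numbers: suppose a zone's historical count (zone_avg_cnt) is 6, it issued a single op in the cycle that just ended, and three full cycles have elapsed since the last update (gen_cnt == 3). Because cycle_cnt is not greater than 1, the new historical count starts at 1 + 6/2 = 4 and is then halved twice more by the generation loop, ending at 1, so the zone is still tracked rather than dropping straight to zero.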
410 410
411 411 /*
412 412 * Add IO op data to the zone.
413 413 */
414 414 static void
415 415 add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
416 416 {
417 417 switch (op) {
418 418 case ZFS_ZONE_IOP_READ:
419 419 (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
420 420 zonep->zone_rd_ops.cycle_cnt++;
421 421 break;
422 422 case ZFS_ZONE_IOP_WRITE:
423 423 (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
424 424 zonep->zone_wr_ops.cycle_cnt++;
425 425 break;
426 426 case ZFS_ZONE_IOP_LOGICAL_WRITE:
427 427 (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
428 428 zonep->zone_lwr_ops.cycle_cnt++;
429 429 break;
430 430 }
431 431 }
432 432
433 433 /*
434 434 * Use a decaying average to keep track of the overall system latency.
435 435 *
436 436 * We want to have the recent activity heavily weighted, but if the
437 437 * activity decreases or stops, then the average should quickly decay
438 438 * down to the new value.
439 439 *
440 440 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
441 441 * However, since this calculation is driven by IO activity and since IO does
442 442 * not happen at fixed intervals, we use a timestamp to see when the last
443 443 * update was made. If it was more than one cycle ago, then we need to decay
444 444 * the average by the proper number of additional cycles in which no IO was
445 445 * performed.
446 446 *
447 447 * Return true if we actually computed a new system average.
448 448 * If we're still within an active cycle there is nothing to do, return false.
449 449 */
450 450 static boolean_t
451 451 compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
452 452 {
453 453 hrtime_t delta;
454 454 int gen_cnt;
455 455
456 456 /*
457 457 * Check if it's time to recompute a new average.
458 458 * If we're still collecting data for the current cycle, return false.
459 459 */
460 460 delta = unow - cp->cycle_start;
461 461 if (delta < zfs_zone_sys_avg_cycle)
462 462 return (B_FALSE);
463 463
464 464 /* A previous cycle is past, compute a new system average. */
465 465
466 466 /*
467 467 * Figure out how many generations we have to decay, since multiple
468 468 * cycles may have elapsed since our last IO.
469 469 * We count on int rounding here.
470 470 */
471 471 gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
472 472
473 473 /* If more than 5 cycles since the last IO, reset average. */
474 474 if (gen_cnt > 5) {
475 475 cp->sys_avg_lat = 0;
476 476 } else {
477 477 /* Update the average. */
478 478 int i;
479 479
480 480 cp->sys_avg_lat =
481 481 (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
482 482
483 483 /*
484 484 * If more than one generation has elapsed since the last
485 485 * update, decay the values further.
486 486 */
487 487 for (i = 1; i < gen_cnt; i++)
488 488 cp->sys_avg_lat = cp->sys_avg_lat / 2;
489 489 }
490 490
491 491 /* A new cycle begins. */
492 492 cp->cycle_start = unow;
493 493 cp->cycle_cnt = 0;
494 494 cp->cycle_lat = 0;
495 495
496 496 return (B_TRUE);
497 497 }
498 498
499 499 static void
500 500 add_sys_iop(hrtime_t unow, int op, int lat)
501 501 {
502 502 switch (op) {
503 503 case ZFS_ZONE_IOP_READ:
504 504 (void) compute_new_sys_avg(unow, &rd_lat);
505 505 rd_lat.cycle_cnt++;
506 506 rd_lat.cycle_lat += lat;
507 507 break;
508 508 case ZFS_ZONE_IOP_WRITE:
509 509 (void) compute_new_sys_avg(unow, &wr_lat);
510 510 wr_lat.cycle_cnt++;
511 511 wr_lat.cycle_lat += lat;
512 512 break;
513 513 }
514 514 }
515 515
516 516 /*
517 517 * Get the zone IO counts.
518 518 */
519 519 static uint_t
520 520 calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
521 521 {
522 522 hrtime_t delta;
523 523 uint_t cnt;
524 524
525 525 if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
526 526 /*
527 527 * No activity in the current cycle, we already have the
528 528 * historical data so we'll use that.
529 529 */
530 530 cnt = cp->zone_avg_cnt;
531 531 } else {
532 532 /*
533 533 * If we're less than half way through the cycle then use
534 534 * the current count plus half the historical count, otherwise
535 535 * just use the current count.
536 536 */
537 537 if (delta < (zfs_zone_cycle_time / 2))
538 538 cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
539 539 else
540 540 cnt = cp->cycle_cnt;
541 541 }
542 542
543 543 return (cnt);
544 544 }
545 545
546 546 /*
547 547 * Get the average read/write latency in usecs for the system.
548 548 */
549 549 static uint_t
550 550 calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
551 551 {
552 552 if (compute_new_sys_avg(unow, cp)) {
553 553 /*
554 554 * No activity in the current cycle, we already have the
555 555 * historical data so we'll use that.
556 556 */
557 557 return (cp->sys_avg_lat);
558 558 } else {
559 559 /*
560 560 * We're within a cycle; weight the current activity higher
561 561 * compared to the historical data and use that.
562 562 */
563 563 DTRACE_PROBE3(zfs__zone__calc__wt__avg,
564 564 uintptr_t, cp->sys_avg_lat,
565 565 uintptr_t, cp->cycle_lat,
566 566 uintptr_t, cp->cycle_cnt);
567 567
568 568 return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
569 569 (1 + (cp->cycle_cnt * 8)));
570 570 }
571 571 }
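As a purely illustrative instance of that weighting: with a historical average (sys_avg_lat) of 1000 usec and a current cycle holding 4 ops totaling 2000 usec of latency, the function returns (1000 + 2000 * 8) / (1 + 4 * 8) = 17000 / 33, roughly 515 usec, so the estimate is pulled quickly toward the current cycle's roughly 500 usec average.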
572 572
573 573 /*
574 574 * Account for the current IOP on the zone and for the system as a whole.
575 575 * The latency parameter is in usecs.
576 576 */
577 577 static void
578 578 add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
579 579 {
580 580 /* Add op to zone */
581 581 add_zone_iop(zonep, unow, op);
582 582
583 583 /* Track system latency */
584 584 if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
585 585 add_sys_iop(unow, op, lat);
586 586 }
587 587
588 588 /*
589 589 * Calculate and return the total number of read ops, write ops and logical
590 590 * write ops for the given zone. If the zone has issued operations of any type
591 591 * return a non-zero value, otherwise return 0.
592 592 */
593 593 static int
594 594 get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
595 595 uint_t *lwops)
596 596 {
597 597 *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
598 598 *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
599 599 *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
600 600
601 601 DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
602 602 uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
603 603
604 604 return (*rops | *wops | *lwops);
605 605 }
606 606
607 607 /*
608 608 * Get the average read/write latency in usecs for the system.
609 609 */
610 610 static void
611 611 get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
612 612 {
613 613 *rlat = calc_avg_lat(unow, &rd_lat);
614 614 *wlat = calc_avg_lat(unow, &wr_lat);
615 615
616 616 /*
617 617 * In an attempt to improve the accuracy of the throttling algorithm,
618 618 * assume that IO operations can't have zero latency. Instead, assume
619 619 * a reasonable lower bound for each operation type. If the actual
620 620 * observed latencies are non-zero, use those latency values instead.
621 621 */
622 622 if (*rlat == 0)
623 623 *rlat = 1000;
624 624 if (*wlat == 0)
625 625 *wlat = 1000;
626 626
627 627 DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
628 628 uintptr_t, *wlat);
629 629 }
630 630
631 631 /*
632 632 * Find disk utilization for each zone and average utilization for all active
633 633 * zones.
634 634 */
635 635 static int
636 636 zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
637 637 {
638 638 zoneio_stats_t *sp = arg;
639 639 uint_t rops, wops, lwops;
640 640
641 641 if (zonep->zone_id == GLOBAL_ZONEID ||
642 642 get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
643 643 zonep->zone_io_util = 0;
644 644 return (0);
645 645 }
646 646
647 647 zonep->zone_io_util = (rops * sp->zi_avgrlat) +
648 648 (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
649 649 sp->zi_totutil += zonep->zone_io_util;
650 650
651 651 if (zonep->zone_io_util > 0) {
652 652 sp->zi_active++;
653 653 sp->zi_totpri += zonep->zone_zfs_io_pri;
654 654 }
655 655
656 656 /*
657 657 * sdt:::zfs-zone-utilization
658 658 *
659 659 * arg0: zone ID
660 660 * arg1: read operations observed during time window
661 661 * arg2: physical write operations observed during time window
662 662 * arg3: logical write ops observed during time window
663 663 * arg4: calculated utilization given read and write ops
664 664 * arg5: I/O priority assigned to this zone
665 665 */
666 666 DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
667 667 uint_t, rops, uint_t, wops, uint_t, lwops,
668 668 uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
669 669
670 670 return (0);
671 671 }
672 672
673 673 static void
674 674 zfs_zone_delay_inc(zone_t *zonep)
675 675 {
676 676 if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
677 677 zonep->zone_io_delay += zfs_zone_delay_step;
678 678 }
679 679
680 680 static void
681 681 zfs_zone_delay_dec(zone_t *zonep)
682 682 {
683 683 if (zonep->zone_io_delay > 0)
684 684 zonep->zone_io_delay -= zfs_zone_delay_step;
685 685 }
686 686
687 687 /*
688 688 * For all zones "far enough" away from the average utilization, increase that
689 689 * zone's delay. Otherwise, reduce its delay.
690 690 */
691 691 static int
692 692 zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
693 693 {
694 694 zoneio_stats_t *sp = arg;
695 695 uint16_t delay = zonep->zone_io_delay;
696 696 uint_t fairutil = 0;
697 697
698 698 zonep->zone_io_util_above_avg = B_FALSE;
699 699
700 700 /*
701 701 * Given the calculated total utilization for all zones, calculate the
702 702 * fair share of I/O for this zone.
703 703 */
704 704 if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
705 705 fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
706 706 sp->zi_totpri;
707 707 } else if (sp->zi_active > 0) {
708 708 fairutil = sp->zi_totutil / sp->zi_active;
709 709 }
710 710
711 711 /*
712 712 * Adjust each IO's delay. If the overall delay becomes too high, avoid
713 713 * increasing beyond the ceiling value.
714 714 */
715 715 if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
716 716 zonep->zone_io_util_above_avg = B_TRUE;
717 717
718 718 if (sp->zi_active > 1)
719 719 zfs_zone_delay_inc(zonep);
720 720 } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
721 721 sp->zi_active <= 1) {
722 722 zfs_zone_delay_dec(zonep);
723 723 }
724 724
725 725 /*
726 726 * sdt:::zfs-zone-throttle
727 727 *
728 728 * arg0: zone ID
729 729 * arg1: old delay for this zone
730 730 * arg2: new delay for this zone
731 731 * arg3: calculated fair I/O utilization
732 732 * arg4: actual I/O utilization
733 733 */
734 734 DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
735 735 uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
736 736 uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
737 737
738 738 return (0);
739 739 }
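To make the fair-share split concrete with illustrative numbers: if three zones are active with zone_zfs_io_pri values of 100, 100 and 800, and the summed utilization (zi_totutil) is 1,000,000, their fair shares are 100,000, 100,000 and 800,000 respectively. A zone is then only pushed toward the delay ceiling when it is above its share while the disk is in the overutilized state, and is backed off when it is below its share or the disk is underutilized.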
740 740
741 741 /*
742 742 * Examine the utilization between different zones, and adjust the delay for
743 743 * each zone appropriately.
744 744 */
745 745 static void
746 746 zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
747 747 {
748 748 zoneio_stats_t stats;
749 749 hrtime_t laggard_udelta = 0;
750 750
751 751 (void) bzero(&stats, sizeof (stats));
752 752
753 753 stats.zi_now = unow;
754 754 get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
755 755
756 756 if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
757 757 stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
758 758 else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
759 759 stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
760 760
761 761 if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
762 762 return;
763 763
764 764 /*
765 765 * Calculate disk utilization for the most recent period.
766 766 */
767 767 if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
768 768 stats.zi_diskutil = 0;
769 769 } else {
770 770 stats.zi_diskutil =
771 771 ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
772 772 ((unow - last_checked) * 1000);
773 773 }
774 774 zfs_disk_last_rtime = zfs_disk_rtime;
775 775
776 776 if (unow > zfs_disk_last_laggard)
777 777 laggard_udelta = unow - zfs_disk_last_laggard;
778 778
779 779 /*
780 780 * To minimize porpoising, we have three separate states for our
781 781 * assessment of I/O performance: overutilized, underutilized, and
782 782 * neither overutilized nor underutilized. We will increment the
783 783 * throttle if a zone is using more than its fair share _and_ I/O
784 784 * is overutilized; we will decrement the throttle if a zone is using
785 785 * less than its fair share _or_ I/O is underutilized.
786 786 */
787 787 stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
788 788 laggard_udelta > zfs_zone_laggard_ancient;
789 789
790 790 stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
791 791 laggard_udelta < zfs_zone_laggard_recent;
792 792
793 793 /*
794 794 * sdt:::zfs-zone-stats
795 795 *
796 796 * Statistics observed over the last period:
797 797 *
798 798 * arg0: average system read latency
799 799 * arg1: average system write latency
800 800 * arg2: number of active zones
801 801 * arg3: total I/O 'utilization' for all zones
802 802 * arg4: total I/O priority of all active zones
803 803 * arg5: calculated disk utilization
804 804 */
805 805 DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
806 806 uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
807 807 uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
808 808 uintptr_t, stats.zi_diskutil);
809 809
810 810 (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
811 811 }
812 812
813 813 /*
814 814 * Callback used to calculate a zone's IO schedule priority.
815 815 *
816 816 * We scan the zones looking for ones with ops in the queue. Out of those,
817 817 * we pick the one that calculates to the highest schedule priority.
818 818 */
819 819 static int
820 820 get_sched_pri_cb(zone_t *zonep, void *arg)
821 821 {
822 822 int pri;
823 823 uint_t cnt;
824 824 zone_q_bump_t *qbp = arg;
825 825 zio_priority_t p = qbp->zq_queue;
826 826
827 827 cnt = zonep->zone_zfs_queued[p];
828 828 if (cnt == 0) {
829 829 zonep->zone_zfs_weight = 0;
830 830 return (0);
831 831 }
832 832
833 833 /*
834 834 * On each pass, increment the zone's weight. We use this as input
835 835 * to the calculation to prevent starvation. The value is reset
836 836 * each time we issue an IO for this zone so zones which haven't
837 837 * done any IO over several iterations will see their weight max
838 838 * out.
839 839 */
840 840 if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
841 841 zonep->zone_zfs_weight++;
842 842
843 843 /*
844 844 * This zone's IO priority is the inverse of the number of IOs
845 845 * the zone has enqueued * zone's configured priority * weight.
846 846 * The queue depth has already been scaled by 10 to avoid problems
847 847 * with int rounding.
848 848 *
849 849 * This means that zones with fewer IOs in the queue will get
850 850 * preference unless other zone's assigned priority pulls them
851 851 * ahead. The weight is factored in to help ensure that zones
852 852 * which haven't done IO in a while aren't getting starved.
853 853 */
854 854 pri = (qbp->zq_qdepth / cnt) *
855 855 zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
856 856
857 857 /*
858 858 * If this zone has a higher priority than what we found so far,
859 859 * it becomes the new leading contender.
860 860 */
861 861 if (pri > qbp->zq_priority) {
862 862 qbp->zq_zoneid = zonep->zone_id;
863 863 qbp->zq_priority = pri;
864 864 qbp->zq_wt = zonep->zone_zfs_weight;
865 865 }
866 866 return (0);
867 867 }
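An illustrative instance of that priority calculation: with 20 zios queued (zq_qdepth scaled to 200), a zone holding 2 of them with a configured I/O priority of 100 and a weight of 3 scores (200 / 2) * 100 * 3 = 30000, while a zone holding 15 of them at the same configured priority but a weight of 1 scores (200 / 15) * 100 * 1 = 1300 after integer division, so the lightly queued zone wins the bump.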
868 868
869 869 /*
870 870 * See if we need to bump a zone's zio to the head of the queue. This is only
871 871 * done on the two synchronous I/O queues (see the block comment on the
872 872 * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
873 873 * queue depth from our caller.
874 874 *
875 875 * For single-threaded synchronous processes a zone cannot get more than
876 876 * 1 op into the queue at a time unless the zone is running multiple processes
877 877 * in parallel. This can cause an imbalance in performance if there are zones
878 878 * with many parallel processes (and ops in the queue) vs. other zones which
879 879 * are doing simple single-threaded processes, such as interactive tasks in the
880 880 * shell. These zones can get backed up behind a deep queue and their IO
881 881 * performance will appear to be very poor as a result. This can make the
882 882 * zone work badly for interactive behavior.
883 883 *
884 884 * The scheduling algorithm kicks in once we start to get a deeper queue.
885 885 * Once that occurs, we look at all of the zones to see which one calculates
886 886 * to the highest priority. We bump that zone's first zio to the head of the
887 887 * queue.
888 888 *
889 889 * We use a counter on the zone so that we can quickly find how many ops each
890 890 * zone has in the queue without having to search the entire queue itself.
891 891 * This scales better since the number of zones is expected to be on the
892 892 * order of 10-100 whereas the queue depth can be in the range of 50-2000.
893 893 * In addition, since the zio's in the queue only have the zoneid, we would
894 894 * have to look up the zone for each zio enqueued and that means the overhead
895 895 * for scanning the queue each time would be much higher.
896 896 *
897 897 * In all cases, we fall back to simply pulling the next op off the queue
898 898 * if something should go wrong.
899 899 */
900 900 static zio_t *
901 901 get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
902 902 {
903 903 zone_q_bump_t qbump;
904 904 zio_t *zp = NULL, *zphead;
905 905 int cnt = 0;
906 906
907 907 /* To avoid problems with int rounding, scale the queue depth by 10 */
908 908 qbump.zq_qdepth = qdepth * 10;
909 909 qbump.zq_priority = 0;
910 910 qbump.zq_zoneid = 0;
911 911 qbump.zq_queue = p;
912 912 (void) zone_walk(get_sched_pri_cb, &qbump);
913 913
914 914 zphead = avl_first(&vqc->vqc_queued_tree);
915 915
916 916 /* Check if the scheduler didn't pick a zone for some reason!? */
917 917 if (qbump.zq_zoneid != 0) {
918 918 for (zp = avl_first(&vqc->vqc_queued_tree); zp != NULL;
919 919 zp = avl_walk(&vqc->vqc_queued_tree, zp, AVL_AFTER)) {
920 920 if (zp->io_zoneid == qbump.zq_zoneid)
921 921 break;
922 922 cnt++;
923 923 }
924 924 }
925 925
926 926 if (zp == NULL) {
927 927 zp = zphead;
928 928 } else if (zp != zphead) {
929 929 /*
930 930 * Only fire the probe if we actually picked a different zio
931 931 * than the one already at the head of the queue.
932 932 */
933 933 DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
934 934 uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
935 935 }
936 936
937 937 return (zp);
938 938 }
939 939
940 940 /*
941 941 * Add our zone ID to the zio so we can keep track of which zones are doing
942 942 * what, even when the current thread processing the zio is not associated
943 943 * with the zone (e.g. the kernel taskq which pushes out TX groups).
944 944 */
945 945 void
946 946 zfs_zone_zio_init(zio_t *zp)
947 947 {
948 948 zone_t *zonep = curzone;
949 949
950 950 zp->io_zoneid = zonep->zone_id;
951 951 }
952 952
953 953 /*
954 954 * Track IO operations per zone. Called from dmu_tx_count_write for write ops
955 955 * and dmu_read_uio for read ops. For each operation, increment that zone's
956 956 * counter based on the type of operation.
957 957 *
958 958 * There are three basic ways that we can see write ops:
959 959 * 1) An application does write syscalls. Those ops go into a TXG which
960 960 * we'll count here. Sometime later a kernel taskq thread (we'll see the
961 961 * vdev IO as zone 0) will perform some number of physical writes to commit
962 962 * the TXG to disk. Those writes are not associated with the zone which
963 963 * made the write syscalls and the number of operations is not correlated
964 964 * between the taskq and the zone.
965 965 * 2) An application opens a file with O_SYNC. Each write will result in
966 966 * an operation which we'll see here plus a low-level vdev write from
967 967 * that zone.
968 968 * 3) An application does write syscalls followed by an fsync(). We'll
969 969 * count the writes going into a TXG here. We'll also see some number
970 970 * (usually much smaller, maybe only 1) of low-level vdev writes from this
971 971 * zone when the fsync is performed, plus some other low-level vdev writes
972 972 * from the taskq in zone 0 (are these metadata writes?).
973 973 *
974 974 * 4) In addition to the above, there are misc. system-level writes, such as
975 975 * writing out dirty pages to swap, or sync(2) calls, which will be handled
976 976 * by the global zone and which we count but don't generally worry about.
977 977 *
978 978 * Because of the above, we can see writes twice because this is called
979 979 * at a high level by a zone thread, but we also will count the phys. writes
980 980 * that are performed at a low level via zfs_zone_zio_start.
981 981 *
982 982 * Without this, it can look like a non-global zone never writes (case 1).
983 983 * Depending on when the TXG is synced, the counts may be in the same sample
984 984 * bucket or in a different one.
985 985 *
986 986 * Tracking read operations is simpler due to their synchronous semantics. The
987 987 * zfs_read function -- called as a result of a read(2) syscall -- will always
988 988 * retrieve the data to be read through dmu_read_uio.
989 989 */
990 990 void
991 991 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
992 992 {
993 993 zone_t *zonep = curzone;
994 994 hrtime_t unow, last_checked;
995 995 uint16_t wait;
996 996
997 997 unow = GET_USEC_TIME;
998 998
999 999 /*
1000 1000 * Only bump the counters for logical operations here. The counters for
1001 1001 * tracking physical IO operations are handled in zfs_zone_zio_done.
1002 1002 */
1003 1003 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
1004 1004 mutex_enter(&zonep->zone_stg_io_lock);
1005 1005 add_iop(zonep, unow, type, 0);
1006 1006 mutex_exit(&zonep->zone_stg_io_lock);
1007 1007 }
1008 1008
1009 1009 if (!zfs_zone_delay_enable)
1010 1010 return;
1011 1011
1012 1012 /*
1013 1013 * If the zone's I/O priority is set to zero, don't throttle that zone's
1014 1014 * operations at all.
1015 1015 */
1016 1016 if (zonep->zone_zfs_io_pri == 0)
1017 1017 return;
1018 1018
1019 1019 /*
1020 1020 * XXX There's a potential race here in that more than one thread may
1021 1021 * update the zone delays concurrently. The worst outcome is corruption
1022 1022 * of the data we use to track each zone's IO, so the algorithm may make
1023 1023 * incorrect throttling decisions until the data is refreshed.
1024 1024 */
1025 1025 last_checked = zfs_zone_last_checked;
1026 1026 if ((unow - last_checked) > zfs_zone_adjust_time) {
1027 1027 zfs_zone_last_checked = unow;
1028 1028 zfs_zone_wait_adjust(unow, last_checked);
1029 1029 }
1030 1030
1031 1031 if ((wait = zonep->zone_io_delay) > 0) {
1032 1032 /*
1033 1033 * If this is a write and we're doing above normal TXG
1034 1034 * syncing, then throttle for longer than normal.
1035 1035 */
1036 1036 if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
1037 1037 (txg_cnt > 1 || txg_sync_rate > 1))
1038 1038 wait *= zfs_zone_txg_throttle_scale;
1039 1039
1040 1040 /*
1041 1041 * sdt:::zfs-zone-wait
1042 1042 *
1043 1043 * arg0: zone ID
1044 1044 * arg1: type of IO operation
1045 1045 * arg2: time to delay (in us)
1046 1046 */
1047 1047 DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
1048 1048 uintptr_t, type, uintptr_t, wait);
1049 1049
1050 1050 drv_usecwait(wait);
1051 1051 }
1052 1052 }
1053 1053
1054 1054 /*
1055 1055 * XXX Ignore the pool pointer parameter for now.
1056 1056 *
1057 1057 * Keep track to see if the TXG sync rate is running above the expected rate.
1058 1058 * If so, this implies that we are filling TXG's at a high rate due to a heavy
1059 1059 * write workload. We use this as input into the zone throttle.
1060 1060 *
1061 1061 * This function is called every 5 seconds (zfs_txg_timeout) under a normal
1062 1062 * write load. In this case, the sync rate is going to be 1. When there
1063 1063 * is a heavy write load, TXG's fill up fast and the sync thread will write
1064 1064 * the TXG more frequently (perhaps once a second). In this case the rate
1065 1065 * will be > 1. The sync rate is a lagging indicator since it can be up
1066 1066 * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
1067 1067 * current 5 second interval and txg_sync_rate to keep track of the previous
1068 1068 * 5 second interval. In that way we don't have a period (1 or more seconds)
1069 1069 * where the txg_cnt == 0 and we cut back on throttling even though the rate
1070 1070 * is still high.
1071 1071 */
1072 1072 /*ARGSUSED*/
1073 1073 void
1074 1074 zfs_zone_report_txg_sync(void *dp)
1075 1075 {
1076 1076 uint_t now;
1077 1077
1078 1078 txg_cnt++;
1079 1079 now = (uint_t)(gethrtime() / NANOSEC);
1080 1080 if ((now - txg_last_check) >= zfs_txg_timeout) {
1081 1081 txg_sync_rate = txg_cnt / 2;
1082 1082 txg_cnt = 0;
1083 1083 txg_last_check = now;
1084 1084 }
1085 1085 }
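Illustratively: if a heavy write load reported six TXG syncs within the most recent zfs_txg_timeout window, txg_sync_rate becomes 6 / 2 = 3 for the following window, and logical writes from already-throttled zones will have their wait multiplied by zfs_zone_txg_throttle_scale until the measured rate drops back down.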
1086 1086
1087 1087 hrtime_t
1088 1088 zfs_zone_txg_delay()
1089 1089 {
1090 1090 if (curzone->zone_io_util_above_avg)
1091 1091 return (zfs_zone_txg_delay_nsec);
1092 1092
1093 1093 return (MSEC2NSEC(10));
1094 1094 }
1095 1095
1096 1096 /*
1097 1097 * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
1098 1098 * and is issued.
1099 1099 * Keep track of start time for latency calculation in zfs_zone_zio_done.
1100 1100 */
1101 1101 void
1102 1102 zfs_zone_zio_start(zio_t *zp)
1103 1103 {
1104 1104 zone_t *zonep;
1105 1105
1106 1106 /*
1107 1107 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
1108 1108 * an actual I/O operation. Ignore those operations for the purposes of
1109 1109 * throttling and scheduling.
1110 1110 */
1111 1111 if (zp->io_type == ZIO_TYPE_IOCTL)
1112 1112 return;
1113 1113
1114 1114 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1115 1115 return;
1116 1116
1117 1117 zonep->zone_zfs_weight = 0;
1118 1118
1119 1119 mutex_enter(&zfs_disk_lock);
1120 1120 zp->io_dispatched = gethrtime();
1121 1121
1122 1122 if (zfs_disk_rcnt++ != 0)
1123 1123 zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
1124 1124 zfs_disk_rlastupdate = zp->io_dispatched;
1125 1125 mutex_exit(&zfs_disk_lock);
1126 1126
1127 1127 zone_rele(zonep);
1128 1128 }
1129 1129
1130 1130 /*
1131 1131 * Called from vdev_disk_io_done when an IO completes.
1132 1132 * Increment our counter for zone ops.
1133 1133 * Calculate the IO latency avg. for this zone.
1134 1134 */
1135 1135 void
1136 1136 zfs_zone_zio_done(zio_t *zp)
1137 1137 {
1138 1138 zone_t *zonep;
1139 1139 hrtime_t now, unow, udelta;
1140 1140
1141 1141 if (zp->io_type == ZIO_TYPE_IOCTL)
1142 1142 return;
1143 1143
1144 - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1144 + if (zp->io_dispatched == 0)
1145 1145 return;
1146 1146
1147 - if (zp->io_dispatched == 0)
1147 + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1148 1148 return;
1149 1149
1150 1150 now = gethrtime();
1151 1151 unow = NANO_TO_MICRO(now);
1152 1152 udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
1153 1153
1154 1154 mutex_enter(&zfs_disk_lock);
1155 1155 zfs_disk_rcnt--;
1156 1156 zfs_disk_rtime += (now - zfs_disk_rlastupdate);
1157 1157 zfs_disk_rlastupdate = now;
1158 1158
1159 1159 if (udelta > zfs_zone_laggard_threshold)
1160 1160 zfs_disk_last_laggard = unow;
1161 1161
1162 1162 mutex_exit(&zfs_disk_lock);
1163 1163
1164 1164 if (zfs_zone_delay_enable) {
1165 1165 mutex_enter(&zonep->zone_stg_io_lock);
1166 1166 add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
1167 1167 ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
1168 1168 mutex_exit(&zonep->zone_stg_io_lock);
1169 1169 }
1170 1170
1171 1171 zone_rele(zonep);
1172 1172
1173 1173 /*
1174 1174 * sdt:::zfs-zone-latency
1175 1175 *
1176 1176 * arg0: zone ID
1177 1177 * arg1: type of I/O operation
1178 1178 * arg2: I/O latency (in us)
1179 1179 */
1180 1180 DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
1181 1181 uintptr_t, zp->io_type, uintptr_t, udelta);
1182 1182 }
1183 1183
1184 1184 void
1185 1185 zfs_zone_zio_dequeue(zio_t *zp)
1186 1186 {
1187 1187 zio_priority_t p;
1188 1188 zone_t *zonep;
1189 1189
1190 1190 p = zp->io_priority;
1191 1191 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1192 1192 return;
1193 1193
1194 1194 /* We depend on p being defined as either 0 or 1 */
1195 1195 ASSERT(p < 2);
1196 1196
1197 1197 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1198 1198 return;
1199 1199
1200 1200 mutex_enter(&zonep->zone_stg_io_lock);
1201 1201 ASSERT(zonep->zone_zfs_queued[p] > 0);
1202 1202 if (zonep->zone_zfs_queued[p] == 0)
1203 1203 cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
1204 1204 else
1205 1205 zonep->zone_zfs_queued[p]--;
1206 1206 mutex_exit(&zonep->zone_stg_io_lock);
1207 1207 zone_rele(zonep);
1208 1208 }
1209 1209
1210 1210 void
1211 1211 zfs_zone_zio_enqueue(zio_t *zp)
1212 1212 {
1213 1213 zio_priority_t p;
1214 1214 zone_t *zonep;
1215 1215
1216 1216 p = zp->io_priority;
1217 1217 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1218 1218 return;
1219 1219
1220 1220 /* We depend on p being defined as either 0 or 1 */
1221 1221 ASSERT(p < 2);
1222 1222
1223 1223 if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1224 1224 return;
1225 1225
1226 1226 mutex_enter(&zonep->zone_stg_io_lock);
1227 1227 zonep->zone_zfs_queued[p]++;
1228 1228 mutex_exit(&zonep->zone_stg_io_lock);
1229 1229 zone_rele(zonep);
1230 1230 }
1231 1231
1232 1232 /*
1233 1233 * Called from vdev_queue_io_to_issue. That function is where zio's are listed
1234 1234 * in FIFO order on one of the sync queues, then pulled off (by
1235 1235 * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling
1236 1236 * here to find a zone's zio deeper in the sync queue and issue that instead
1237 1237 * of simply doing FIFO.
1238 1238 *
1239 1239 * We only do zone-based zio scheduling for the two synchronous I/O queues
1240 1240 * (read & write). These queues are normally serviced in FIFO order but we
1241 1241 * may decide to move a zone's zio to the head of the line. A typical I/O
1242 1242 * load will be mostly synchronous reads and some asynchronous writes (which
1243 1243 * are scheduled differently due to transaction groups). There will also be
1244 1244 * some synchronous writes for those apps which want to ensure their data is on
1245 1245 * disk. We want to make sure that a zone with a single-threaded app (e.g. the
1246 1246 * shell) that is doing synchronous I/O (typically reads) isn't penalized by
1247 1247 * other zones which are doing lots of synchronous I/O because they have many
1248 1248 * running threads.
1249 1249 *
1250 1250 * The vq->vq_lock mutex is held when we're executing this function so we
1251 1251 * can safely access the "last zone" variable on the queue.
1252 1252 */
1253 1253 zio_t *
1254 1254 zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
1255 1255 {
1256 1256 vdev_queue_class_t *vqc = &vq->vq_class[p];
1257 1257 uint_t cnt;
1258 1258 zoneid_t last_zone;
1259 1259 zio_t *zio;
1260 1260
1261 1261 ASSERT(MUTEX_HELD(&vq->vq_lock));
1262 1262
1263 1263 /* Don't change the order on the LBA ordered queues. */
1264 1264 if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1265 1265 return (avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER));
1266 1266
1267 1267 /* We depend on p being defined as either 0 or 1 */
1268 1268 ASSERT(p < 2);
1269 1269
1270 1270 cnt = avl_numnodes(&vqc->vqc_queued_tree);
1271 1271 last_zone = vq->vq_last_zone_id;
1272 1272
1273 1273 /*
1274 1274 * If there are only a few zios in the queue then just issue the head.
1275 1275 * If there are more than a few zios already queued up, then use
1276 1276 * scheduling to get the next zio.
1277 1277 */
1278 1278 if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
1279 1279 zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
1280 1280 else
1281 1281 zio = get_next_zio(vqc, cnt, p);
1282 1282
1283 1283 vq->vq_last_zone_id = zio->io_zoneid;
1284 1284
1285 1285 /*
1286 1286 * Probe with 4 args; the number of IOs in the queue, the zone that
1287 1287 * was last scheduled off this queue, the zone that was associated
1288 1288 * with the next IO that is scheduled, and which queue (priority).
1289 1289 */
1290 1290 DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
1291 1291 uint_t, zio->io_zoneid, uint_t, p);
1292 1292
1293 1293 return (zio);
1294 1294 }
1295 1295
1296 1296 #endif