1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
14 */
15
16 #include <sys/zfs_context.h>
17 #include <sys/zap.h>
18 #include <sys/dmu.h>
19 #include <sys/dmu_objset.h>
20 #include <sys/dbuf.h>
21 #include <sys/special_impl.h>
22 #include <sys/metaslab_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/spa_impl.h>
25 #include <sys/zio.h>
26 #ifdef _KERNEL
27 #include <sys/instance.h>
28 #endif
29
30 #include <sys/sysevent/eventdefs.h>
31 /*
32 * There already exist several types of "special" vdevs in zpool:
33 * log, cache, and spare. However, there are other dimensions of
34 * the issue that could be addressed in a similar fashion:
35 * - vdevs for storing ZFS metadata, including DDTs
36 * - vdevs for storing important ZFS data
37 * - vdevs that absorb write load spikes and move the data
38 * to regular devices during load valleys
39 *
40 * Clearly, there are lots of options. So, a generalized "special"
41 * vdev class is introduced that can be configured to assume the
42 * following personalities:
43 * - ZIL - store ZIL blocks in a way quite similar to SLOG
44 * - META - in addition to ZIL blocks, store ZFS metadata
45 * - WBC - in addition to ZIL blocks and ZFS metadata, also
46 * absorb write load spikes (store data blocks),
47 * and move the data blocks to "regular" vdevs
48 * when the system is not too busy
49 *
50 * The ZIL personality is self-explanatory. The remaining two
51 * personalities are also given additional parameters:
52 * - low/high watermarks for space use
53 * - enable/disable special device
54 *
 * The watermarks for the META personality determine whether metadata
 * can be placed on the special device, with hysteresis:
 * until the space used grows above the high watermark, metadata
 * goes to the special vdev; it then stops going to that vdev
 * until the space used drops below the low watermark.
60 *
 * For WBC, the watermarks also gradually reduce the load
 * on the special vdev once the space consumption grows beyond
 * the low watermark yet is still below the high watermark:
 * the closer the space consumption gets to the high watermark,
 * the smaller the percentage of writes that goes to the special vdev,
 * and once the high watermark is reached, all the data goes to
 * the regular vdevs.
68 *
69 * Additionally, WBC moves the data off the special device
70 * when the system write load subsides, and the amount of data
71 * moved off the special device increases as the load falls. Note
72 * that metadata is not moved off the WBC vdevs.
73 *
74 * The pool configuration parameters that describe special vdevs
75 * are stored as nvlist in the vdevs' labels along with other
76 * standard pool and vdev properties. These parameters include:
77 * - class of special vdevs in the pool (ZIL, META, WBC)
78 * - whether special vdevs are enabled or not
79 * - low and high watermarks for META and WBC
80 * - a flag that marks special vdevs
81 *
 * The currently supported modes are ZIL and META
 * (see usr/src/common/zfs/zpool_prop.c); WBC support will
 * be provided soon.
85 */
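
/*
 * A small worked example of the META hysteresis described above, using
 * hypothetical numbers (60/80 are not defaults, purely an illustration):
 * with a 100 GiB special class, lowat=60 and hiwat=80 give watermarks of
 * 60 GiB and 80 GiB. Metadata keeps landing on the special vdev until the
 * allocated space exceeds 80 GiB; it then goes to the normal class until
 * the allocated space drops back below 60 GiB.
 */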
86
87 /*
88 * Initial percentage of total write traffic routed to the
89 * special vdev when the latter is working as writeback cache.
90 * See spa->spa_special_to_normal_ratio.
 * Changing this variable affects only new or imported pools.
 * Valid range: 0% - 100%
93 */
94 uint64_t spa_special_to_normal_ratio = 50;
95
/*
 * Re-routing delta: the default value that gets added to
 * or subtracted from spa->spa_special_to_normal_ratio.
 * The setting below works as the initial step, which gets
 * reduced as we close in on the load-balancing optimum.
 */
102 int64_t spa_special_to_normal_delta = 15;
103
104 /*
105 * Initialize special vdev load balancing wares when the pool gets
106 * created or imported
107 */
108 void
109 spa_special_init(spa_t *spa)
110 {
111 mutex_init(&spa->spa_perfmon.perfmon_lock, NULL, MUTEX_DEFAULT, NULL);
112 cv_init(&spa->spa_perfmon.perfmon_cv, NULL, CV_DEFAULT, NULL);
113
114 bzero(&spa->spa_avg_stat, sizeof (spa_avg_stat_t));
115
116 spa->spa_special_to_normal_ratio = spa_special_to_normal_ratio;
117 spa->spa_special_to_normal_delta = 0;
118 spa->spa_dedup_percentage = 100;
119 spa->spa_avg_stat_rotor = 0;
120 spa->spa_dedup_rotor = 0;
121
122 spa->spa_perfmon.perfmon_thread = NULL;
123 spa->spa_perfmon.perfmon_thr_exit = B_FALSE;
124 }
125
/*
 * spa_special_fini() is symmetric to spa_special_init() above
 * and is called when the pool gets destroyed or exported.
 */
131 void
132 spa_special_fini(spa_t *spa)
133 {
134 spa->spa_perfmon.perfmon_thread = NULL;
135 cv_destroy(&spa->spa_perfmon.perfmon_cv);
136 mutex_destroy(&spa->spa_perfmon.perfmon_lock);
137 }
138
139 static void
140 spa_enable_special(spa_t *spa, boolean_t usesc)
141 {
142 ASSERT(spa != NULL);
143
144 if (!spa_has_special(spa) || usesc == spa->spa_usesc)
145 return;
146
147 spa->spa_usesc = usesc;
148 }
149
150 /*
151 * Determine whether we should consider writing data synchronously to
152 * special vdevs. See comments in zvol_log_write() and zfs_log_write()
153 */
154 boolean_t
155 spa_write_data_to_special(spa_t *spa, objset_t *os)
156 {
157 ASSERT(os != NULL);
158 return ((spa_has_special(spa)) &&
159 (spa->spa_usesc) &&
160 (spa->spa_watermark == SPA_WM_NONE) &&
161 (os->os_wbc_mode != ZFS_WBC_MODE_OFF));
162 }
163
164 boolean_t
165 spa_can_special_be_used(spa_t *spa)
166 {
167 return (spa_has_special(spa) && spa->spa_usesc &&
168 (spa->spa_watermark == SPA_WM_NONE));
169 }
170
171 static uint64_t
172 spa_special_space_perc(spa_t *spa, uint64_t perc)
173 {
174 metaslab_class_t *mc;
175
176 ASSERT(spa_has_special(spa));
177 mc = spa_special_class(spa);
178 return (metaslab_class_get_space(mc) * perc / 100);
179 }
180
181 /*
182 * Checks whether used space on a special device
183 * has exceeded either low or high watermarks.
184 */
185 static void
186 spa_check_watermarks(spa_t *spa)
187 {
188 metaslab_class_t *mc;
189 uint64_t aspace, lspace;
190 vdev_t *vd = NULL;
191
192 if (!spa_has_special(spa))
193 return;
194
/* The control logic will not work if either value is 0 */
196 if (spa->spa_lowat == 0 || spa->spa_hiwat == 0)
197 return;
198
199 mc = spa_special_class(spa);
200 vd = mc->mc_rotor->mg_vd;
201 aspace = metaslab_class_get_alloc(mc);
202 spa->spa_lwm_space = spa_special_space_perc(spa, spa->spa_lowat);
203 spa->spa_hwm_space = spa_special_space_perc(spa, spa->spa_hiwat);
204 spa->spa_wbc_wm_range = spa->spa_hwm_space - spa->spa_lwm_space;
205
206 if (aspace <= spa->spa_lwm_space) {
207 if (spa->spa_watermark != SPA_WM_NONE) {
208 spa->spa_watermark = SPA_WM_NONE;
209 spa_event_notify(spa, vd, NULL, ESC_ZFS_NONE_WATERMARK);
210 }
211 spa_enable_special(spa, B_TRUE);
212 } else if (aspace > spa->spa_hwm_space) {
213 if (spa->spa_watermark != SPA_WM_HIGH) {
214 spa->spa_watermark = SPA_WM_HIGH;
215 spa_enable_special(spa, B_FALSE);
216 spa_event_notify(spa, vd, NULL, ESC_ZFS_HIGH_WATERMARK);
217 }
218 } else {
219 if (spa->spa_watermark != SPA_WM_LOW) {
220 if (spa->spa_watermark == SPA_WM_NONE)
221 spa_enable_special(spa, B_TRUE);
222 spa->spa_watermark = SPA_WM_LOW;
223 spa_event_notify(spa, vd, NULL, ESC_ZFS_LOW_WATERMARK);
224 }
225
/*
 * correction_rate is used by spa_special_adjust_routing().
 * The coefficient changes proportionally to the space on the
 * special vdev utilized beyond the low watermark:
 * from 0% - when we are at the low watermark
 * to 100% - at the high watermark
 */
233 spa->spa_special_vdev_correction_rate =
234 ((aspace - spa->spa_lwm_space) * 100) /
235 (spa->spa_hwm_space - spa->spa_lwm_space);
236
237 if (spa->spa_wbc.wbc_thread != NULL) {
/*
 * Unlike the META device, the write cache is re-enabled
 * when we move from SPA_WM_HIGH to SPA_WM_LOW, and the
 * throttling logic below then kicks in.
 */
243 if (spa->spa_watermark == SPA_WM_HIGH)
244 spa_enable_special(spa, B_TRUE);
245 lspace = aspace - spa->spa_lwm_space;
246 if (spa->spa_wbc_wm_range) {
247 spa->spa_wbc_perc = (uint8_t)(lspace * 100 /
248 spa->spa_wbc_wm_range);
249 } else {
250 spa->spa_wbc_perc = 50;
251 }
252 }
253 }
254
255 DTRACE_PROBE1(check_wm, spa_t *, spa);
256 }
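
/*
 * Worked example for the branch between the watermarks (hypothetical
 * numbers): with spa_lwm_space = 60 GiB, spa_hwm_space = 80 GiB and
 * aspace = 75 GiB, the correction rate is (75 - 60) * 100 / (80 - 60) = 75,
 * and, when the WBC thread is running, spa_wbc_perc is likewise
 * 15 GiB * 100 / 20 GiB = 75.
 */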
257
258 static int
259 spa_check_special_degraded(spa_t *spa)
260 {
261 metaslab_class_t *mc;
262 metaslab_group_t *mg;
263 vdev_t *vd;
264
265 if (!spa_has_special(spa))
266 return (0);
267
268 mc = spa_special_class(spa);
269 /*
270 * Must hold one of the spa_config locks.
271 */
272 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) != 0 ||
273 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER) != 0);
274
275 if ((mg = mc->mc_rotor) == NULL)
276 return (0);
277
278 do {
279 vd = mg->mg_vd;
280 if (vd->vdev_state == VDEV_STATE_DEGRADED ||
281 vd->vdev_state == VDEV_STATE_FAULTED)
282 return (1);
283 } while ((mg = mg->mg_next) != mc->mc_rotor);
284
285 return (0);
286 }
287
288 void
289 spa_check_special(spa_t *spa)
290 {
291 if (!spa_has_special(spa))
292 return;
293
/*
 * If the special class has degraded vdevs, disable it
 */
297 if (spa_check_special_degraded(spa) != 0) {
298 spa_enable_special(spa, B_FALSE);
299 return;
300 }
301
302 spa_check_watermarks(spa);
303 }
304
305 /* returns B_TRUE if placed on special and B_FALSE if placed elsewhere */
306 static boolean_t
307 spa_refine_meta_placement(spa_t *spa, uint64_t zpl_meta_to_special,
308 dmu_object_type_t ot)
309 {
310 spa_meta_placement_t *mp = &spa->spa_meta_policy;
311 boolean_t isddt = DMU_OT_IS_DDT_META(ot),
312 iszpl = DMU_OT_IS_ZPL_META(ot);
313
314 if (isddt && (mp->spa_ddt_meta_to_special == META_PLACEMENT_OFF))
315 return (B_FALSE);
316 else if (iszpl && (zpl_meta_to_special == META_PLACEMENT_OFF))
317 return (B_FALSE);
318 else if (!isddt && !iszpl && (mp->spa_zfs_meta_to_special ==
319 META_PLACEMENT_OFF))
320 return (B_FALSE);
321 else
322 return (B_TRUE);
323 }
324
325 /* returns B_TRUE if can be placed on cache and B_FALSE otherwise */
326 static boolean_t
327 spa_meta_is_dual(spa_t *spa, uint64_t zpl_meta_to_special, dmu_object_type_t ot)
328 {
329 spa_meta_placement_t *mp = &spa->spa_meta_policy;
330 boolean_t isddt = DMU_OT_IS_DDT_META(ot),
331 iszpl = DMU_OT_IS_ZPL_META(ot);
332
333 if (isddt && (mp->spa_ddt_meta_to_special != META_PLACEMENT_DUAL))
334 return (B_FALSE);
335 else if (iszpl && (zpl_meta_to_special != META_PLACEMENT_DUAL))
336 return (B_FALSE);
337 else if (!isddt && !iszpl && (mp->spa_zfs_meta_to_special !=
338 META_PLACEMENT_DUAL))
339 return (B_FALSE);
340 else
341 return (B_TRUE);
342 }
343
/*
 * Tunable: special load balancing goal
 * selects among special and normal vdevs in order to optimize a specific
 * system parameter, e.g. latency or throughput/utilization
 *
 * ASSUMPTION: we assume that special vdevs are much faster than regular
 * vdevs. If this is not the case, the system will work better if all the
 * vdevs are made normal, as there is no reason to differentiate.
 */
353 spa_special_selection_t spa_special_selection =
354 SPA_SPECIAL_SELECTION_UTILIZATION;
355
356 /*
357 * Tunable: factor used to adjust the ratio up/down
358 * Range: 0 - 100
359 * Units: percents
360 */
361 uint64_t spa_special_factor = 5;
362
/*
 * Distribute writes across the special and normal vdevs so that
 * roughly spa_special_to_normal_ratio percent of them go to the
 * special class
 */
367 static boolean_t
368 spa_refine_data_placement(spa_t *spa, zio_t *zio)
369 {
370 uint64_t rotor = atomic_inc_64_nv(&spa->spa_avg_stat_rotor);
371 spa_meta_placement_t *mp = &spa->spa_meta_policy;
372 boolean_t result = B_FALSE;
373
/*
 * For the "balanced" sync-writes the load balancing is already done;
 * see the comment in zfs_log_write()
 */
378 if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
379 if (spa->spa_watermark == SPA_WM_NONE &&
380 (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
381 mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED)) {
382 result = B_TRUE;
383 }
384 } else {
385 result = ((rotor % 100) < spa->spa_special_to_normal_ratio);
386 }
387
388 return (result);
389 }
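
/*
 * For non-sync writes the rotor above effectively implements weighted
 * round-robin: e.g., with spa_special_to_normal_ratio = 50, (rotor % 100)
 * is below 50 for roughly half of the calls, so about 50% of the data
 * writes are steered to the special class and the rest to the normal one.
 */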
390
391 static boolean_t
392 spa_meta_to_special(spa_t *spa, objset_t *os, dmu_object_type_t ot)
393 {
394 boolean_t result = B_FALSE;
395
396 ASSERT(os != NULL);
397 /* some duplication of the spa_select_class() here */
398
399 if (spa_has_special(spa) && spa->spa_usesc) {
400 result = spa_refine_meta_placement(spa,
401 os->os_zpl_meta_to_special, ot);
402 }
403
404 return (result);
405 }
406
/*
 * Decide whether the block should be l2cached. Returns B_TRUE if the block's
 * metadata type is l2cacheable or the block isn't metadata at all
 */
411 boolean_t
412 dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db)
413 {
414 boolean_t is_metadata, is_to_special;
415 dmu_object_type_t ot = DMU_OT_NONE;
416 spa_t *spa = db->db_objset->os_spa;
417
418 DB_DNODE_ENTER(db);
419 ot = DB_DNODE(db)->dn_type;
420 DB_DNODE_EXIT(db);
421
422 is_metadata = dmu_ot[ot].ot_metadata;
423
424 if (!is_metadata)
425 return (B_TRUE);
426
427 is_to_special = spa_meta_to_special(spa, db->db_objset, ot);
428
429 if (!is_to_special)
430 return (B_TRUE);
431
432 return (spa_meta_is_dual(spa, db->db_objset->os_zpl_meta_to_special,
433 ot));
434 }
435
/*
 * Decide whether the block should be l2cached. Returns B_TRUE if the block is
 * DDT metadata and DDT metadata is cacheable, or if the block isn't DDT
 * metadata at all
 */
440 boolean_t
441 dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db)
442 {
443 dmu_object_type_t ot;
444 spa_t *spa = db->db_objset->os_spa;
445 spa_meta_placement_t *mp = &spa->spa_meta_policy;
446
447 if (!spa_has_special(spa))
448 return (B_TRUE);
449
450 DB_DNODE_ENTER(db);
451 ot = DB_DNODE(db)->dn_type;
452 DB_DNODE_EXIT(db);
453
454 if (!DMU_OT_IS_DDT_META(ot))
455 return (B_TRUE);
456
457 return (mp->spa_ddt_meta_to_special != META_PLACEMENT_ON);
458 }
459
/*
 * Select whether to direct the zio to the special or the normal storage
 * class. Even when the top-level criteria match (for placement into the
 * special class), consider refining data and metadata placement based on
 * additional information about the system's behavior
 */
466 metaslab_class_t *
467 spa_select_class(spa_t *spa, zio_t *zio)
468 {
469 zio_prop_t *zp = &zio->io_prop;
470 spa_meta_placement_t *mp = &spa->spa_meta_policy;
471 boolean_t match = B_FALSE;
472
473 if (!zp->zp_usesc || !spa_has_special(spa) ||
474 spa->spa_special_has_errors) {
475 match = B_FALSE;
476 } else if (zp->zp_metadata) {
477 match = mp->spa_enable_meta_placement_selection &&
478 spa_refine_meta_placement(spa, zp->zp_zpl_meta_to_special,
479 zp->zp_type);
480 } else if (BP_GET_PSIZE(zio->io_bp) <= mp->spa_small_data_to_special) {
481 match = B_TRUE;
482 } else {
483 match = zp->zp_usewbc && spa->spa_wbc.wbc_ready_to_use &&
484 spa_refine_data_placement(spa, zio);
485 }
486
487 if (match)
488 return (spa_special_class(spa));
489
490 /* drop 'use special class' */
491 zp->zp_usesc = B_FALSE;
492 zp->zp_usewbc = B_FALSE;
493 return (spa_normal_class(spa));
494 }
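
/*
 * Illustration (hypothetical values): with spa_small_data_to_special set
 * to 32K, a 16K data block whose zp_usesc is set lands in the special
 * class regardless of the WBC routing ratio, while a 128K block is only
 * considered for the special class when zp_usewbc is set, the WBC is
 * ready to use, and spa_refine_data_placement() picks the special class.
 */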
495
/*
 * Tunable: enables or disables the automatic spa special selection
 * logic and sets a static routing value for spa_special_to_normal_ratio
 *
 * Range: 0 - 100 (disables the automatic logic and sets static routing)
 * or
 * Default value: UINT64_MAX (enables the automatic logic)
 */
504 uint64_t spa_static_routing_percentage = UINT64_MAX;
505
/*
 * Tunable: minimal delta between the current class-averaged latencies
 * Range: 0 - 100
 * Units: percent
 */
511 uint64_t spa_min_latency_delta = 15;
512
513 /*
514 * spa_special_adjust_routing() tunables that control re-balancing of the
515 * write traffic between the two spa classes: special and normal.
516 *
517 * Specific SPA_SPECIAL_SELECTION_UTILIZATION mechanism here includes
518 * the following steps executed by the spa_perfmon_thread():
519 * 1) sample vdev utilization
520 * 2) every so many (spa_rotor_load_adjusting) samples: aggregate on a
521 * per-class basis
 * 3) load-balance depending on where the aggregated per-class utilization
 *    falls relative to (... idle ... busy ...), where "idle" and "busy"
 *    are the corresponding per-class boundaries specified below:
526 */
527
/*
 * class-averaged "busy" and "idle" constants
 * E.g., the special class is considered idle when its average utilization
 * falls below spa_special_class_idle
 */
533 static int spa_special_class_busy = 70;
534 static int spa_normal_class_busy = 70;
535 static int spa_fairly_busy_delta = 10;
536 static int spa_special_class_idle = 30;
537 static int spa_normal_class_idle = 30;
538
539 static boolean_t
540 spa_class_is_busy(int ut, int busy)
541 {
542 return (ut > busy);
543 }
544
545 static boolean_t
546 spa_class_is_idle(int ut, int idle)
547 {
548 return (ut < idle);
549 }
550
551 static boolean_t
552 spa_class_is_fairly_busy(int ut, int busy)
553 {
554 if (busy < spa_fairly_busy_delta)
555 return (B_FALSE);
556 return (ut > busy - spa_fairly_busy_delta);
557 }
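
/*
 * With the defaults above (busy = 70, spa_fairly_busy_delta = 10) a class
 * counts as "fairly busy" once its average utilization exceeds 60%, while
 * "busy" requires more than 70% and "idle" less than 30%.
 */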
558
559 /*
560 * This specific load-balancer implements the following strategy:
561 * when selecting between normal and special classes, bias "more"
562 * load to the class with a smaller average latency
563 */
564 static void
565 spa_special_adjust_routing_latency(spa_t *spa)
566 {
567 /*
568 * average perf counters
569 * computed for the current spa_perfmon_thread iteration
570 */
571 spa_avg_stat_t *stat = &spa->spa_avg_stat;
572
573 /*
574 * class latencies:
575 * normal and special, min and max
576 */
577 uint64_t norm_svct = stat->normal_latency;
578 uint64_t spec_svct = stat->special_latency;
579 uint64_t svct_min = MIN(norm_svct, spec_svct);
580 uint64_t svct_max = MAX(norm_svct, spec_svct);
581
582 /* no rebalancing: do nothing if idle */
583 if (norm_svct == 0 && spec_svct == 0)
584 return;
585
586 /*
587 * normalized difference between the per-class average latencies
588 */
589 uint64_t svct_delta = 100 * (svct_max - svct_min) / svct_max;
590
591 /*
592 * do nothing if the difference between class-averaged latencies
593 * is less than configured
594 */
595 if (svct_delta < spa_min_latency_delta)
596 return;
597
598 /*
599 * current special to normal load balancing ratio and its
600 * current "delta" - note that both values are recomputed below
601 */
602 int64_t ratio = spa->spa_special_to_normal_ratio;
603 int64_t ratio_delta = spa->spa_special_to_normal_delta;
604
605 /*
606 * Recompute special-to-normal load balancing ratio:
607 * 1) given non-zero rerouting delta, consider the current
608 * class-average latencies to possibly change the re-balancing
609 * direction; halve the delta to close on the local optimum
610 * 2) otherwise, reset rerouting delta depending again
611 * on the relationship between average latencies
612 * (2nd and 3rd if)
613 */
614 if ((norm_svct > spec_svct && ratio_delta < 0) ||
615 (norm_svct < spec_svct && ratio_delta > 0))
616 ratio_delta /= -2;
617 else if (norm_svct > spec_svct && ratio_delta == 0)
618 ratio_delta = spa_special_to_normal_delta;
619 else if (norm_svct < spec_svct && ratio_delta == 0)
620 ratio_delta = -spa_special_to_normal_delta;
621
622 ratio += ratio_delta;
623 ratio = MAX(MIN(ratio, 100), 0);
624 spa->spa_special_to_normal_delta = ratio_delta;
625 spa->spa_special_to_normal_ratio = ratio;
626 }
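
/*
 * Worked example (hypothetical latencies): with normal latency 800 and
 * special latency 400, the normalized delta is 100 * (800 - 400) / 800 =
 * 50%, which exceeds spa_min_latency_delta. Starting from ratio = 50 and
 * ratio_delta = 0, the delta becomes +15 and the ratio moves to 65; if a
 * later iteration finds the special class slower, the delta is halved and
 * reversed (15 / -2 == -7 in integer arithmetic), closing in on the
 * local optimum.
 */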
627
628 static void
629 spa_special_adjust_routing_utilization(spa_t *spa)
630 {
631 /*
632 * average perf counters
633 * computed for the current spa_perfmon_thread iteration
634 */
635 spa_avg_stat_t *stat = &spa->spa_avg_stat;
636
637 /* class utilizations: normal and special */
638 uint64_t norm_util = stat->normal_utilization;
639 uint64_t spec_util = stat->special_utilization;
640
641 /*
642 * current special to normal load balancing ratio and its
643 * current "delta" - note that both values are recomputed below
644 *
645 * the first two 'if's below deal with the idle/busy situation,
646 * while the remaining two rebalance between classes as long as
647 * the "other" class is not idle
648 */
649 int64_t ratio = spa->spa_special_to_normal_ratio;
650 int64_t ratio_delta = spa->spa_special_to_normal_delta;
651
652 /* 1. special is fairly busy while normal is idle */
653 if (spa_class_is_fairly_busy(spec_util, spa_special_class_busy) &&
654 spa_class_is_idle(norm_util, spa_normal_class_idle))
655 ratio_delta = -spa_special_factor;
656 /* 2. normal is fairly busy while special is idle */
657 else if (spa_class_is_fairly_busy(norm_util, spa_normal_class_busy) &&
658 spa_class_is_idle(spec_util, spa_special_class_idle))
659 ratio_delta = spa_special_factor;
660 /* 3. normal is not busy and special is not idling as well */
661 else if (!spa_class_is_busy(norm_util, spa_normal_class_busy) &&
662 !spa_class_is_idle(spec_util, spa_special_class_idle))
663 ratio_delta = -spa_special_factor;
664 /* 4. special is not busy and normal is not idling as well */
665 else if (!spa_class_is_busy(spec_util, spa_special_class_busy) &&
666 !spa_class_is_idle(norm_util, spa_normal_class_idle))
667 ratio_delta = spa_special_factor;
668
669 ratio += ratio_delta;
670 ratio = MAX(MIN(ratio, 100), 0);
671 spa->spa_special_to_normal_delta = ratio_delta;
672 spa->spa_special_to_normal_ratio = ratio;
673 }
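
/*
 * Worked example (hypothetical utilizations): with special utilization at
 * 80% and normal at 20%, case 1 applies (special is fairly busy, normal is
 * idle), so ratio_delta = -spa_special_factor = -5 and a ratio of 50 drops
 * to 45, shifting load back to the normal class. Conversely, 20%/80% would
 * raise the ratio to 55.
 */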
674
675 static void
676 spa_special_adjust_routing(spa_t *spa)
677 {
678 spa_avg_stat_t *stat = &spa->spa_avg_stat;
679
680 /*
681 * setting this spa_static_routing_percentage to a value
682 * in the range (0, 100) will cause the system to abide
683 * by this statically defined load balancing, and will
684 * therefore totally disable all the dynamic latency and
685 * throughput (default) balancing logic in this function
686 */
687 if (spa_static_routing_percentage <= 100) {
688 spa->spa_special_to_normal_ratio =
689 spa_static_routing_percentage;
690 goto out;
691 }
692
693 if (spa->spa_watermark == SPA_WM_HIGH) {
/*
 * Free space on the special device is too low,
 * so we need to offload it
 */
698 spa->spa_special_to_normal_ratio = 0;
699 goto out;
700 }
701
702 ASSERT(SPA_SPECIAL_SELECTION_VALID(spa_special_selection));
703
704 switch (spa_special_selection) {
705 case SPA_SPECIAL_SELECTION_LATENCY:
706 spa_special_adjust_routing_latency(spa);
707 break;
708 case SPA_SPECIAL_SELECTION_UTILIZATION:
709 spa_special_adjust_routing_utilization(spa);
710 break;
711 }
712
/*
 * Adjust the special/normal load balancing ratio by taking
 * into account used space vs. the configurable watermarks
 * (see spa_check_watermarks() for details).
 * Note that new writes are *not* routed to the special
 * vdev when usage is above the high watermark (SPA_WM_HIGH)
 */
720 if (spa->spa_watermark == SPA_WM_LOW)
721 spa->spa_special_to_normal_ratio -=
722 spa->spa_special_to_normal_ratio *
723 spa->spa_special_vdev_correction_rate / 100;
724
725 out:
726 #ifdef _KERNEL
727 DTRACE_PROBE7(spa_adjust_routing,
728 uint64_t, spa->spa_special_to_normal_ratio,
729 uint64_t, stat->special_utilization,
730 uint64_t, stat->normal_utilization,
731 uint64_t, stat->special_latency,
732 uint64_t, stat->normal_latency,
733 uint64_t, stat->special_throughput,
734 uint64_t, stat->normal_throughput);
735 #endif
736 ASSERT(spa->spa_special_to_normal_ratio <= 100);
737 }
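
/*
 * Worked example of the watermark correction above (hypothetical numbers):
 * if the utilization-based logic yields a ratio of 60 and the special
 * class sits three quarters of the way between the watermarks
 * (spa_special_vdev_correction_rate == 75), the ratio is reduced by
 * 60 * 75 / 100 = 45, leaving 15% of new writes routed to the special
 * class; at or above the high watermark the ratio is forced to 0.
 */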
738
739 typedef void (*spa_load_cb)(vdev_t *, cos_acc_stat_t *);
740
/*
 * Recursively walk a top-level vdev's tree,
 * invoking the callback on each leaf (physical) vdev
 */
745 static void
746 spa_vdev_walk_stats(vdev_t *pvd, spa_load_cb func,
747 cos_acc_stat_t *cos_acc)
748 {
749 if (pvd->vdev_children == 0) {
750 /* Single vdev (itself) */
751 ASSERT(pvd->vdev_ops->vdev_op_leaf);
752 DTRACE_PROBE1(spa_vdev_walk_lf, vdev_t *, pvd);
753 func(pvd, cos_acc);
754 } else {
755 int i;
756 /* Not a leaf-level vdev, has children */
757 ASSERT(!pvd->vdev_ops->vdev_op_leaf);
758 for (i = 0; i < pvd->vdev_children; i++) {
759 vdev_t *vd = pvd->vdev_child[i];
760 ASSERT(vd != NULL);
761
762 if (vd->vdev_islog || vd->vdev_ishole ||
763 vd->vdev_isspare || vd->vdev_isl2cache)
764 continue;
765
766 if (vd->vdev_ops->vdev_op_leaf) {
767 DTRACE_PROBE1(spa_vdev_walk_lf, vdev_t *, vd);
768 func(vd, cos_acc);
769 } else {
770 DTRACE_PROBE1(spa_vdev_walk_nl, vdev_t *, vd);
771 spa_vdev_walk_stats(vd, func, cos_acc);
772 }
773 }
774 }
775 }
776
/*
 * Tunable: period, in sampling iterations (each iteration lasts
 * spa_avg_stat_update_ticks ticks), for adjusting the load distribution
 * Range: 1 - UINT64_MAX
 * Units: iterations
 */
783 uint64_t spa_rotor_load_adjusting = 1;
784
785 /*
786 * Tunable: weighted average over period
787 * Range: 0-1
788 * Units: boolean
789 * 1: weighted average over spa_rotor_load_adjusting period
790 * 0: (default): regular average
791 */
792 boolean_t spa_rotor_use_weight = B_FALSE;
793
794 /*
795 * Retrieve current kstat vdev statistics
796 * Calculate delta values for all statistics
797 * Calculate utilization and latency based on the received values
798 * Update vdev_aux with current kstat values
799 * Accumulate class utilization, latency and throughput into cos_acc
800 */
801 static void
802 spa_vdev_process_stat(vdev_t *vd, cos_acc_stat_t *cos_acc)
803 {
804 uint64_t nread; /* number of bytes read */
805 uint64_t nwritten; /* number of bytes written */
806 uint64_t reads; /* number of read operations */
807 uint64_t writes; /* number of write operations */
808 uint64_t rtime; /* cumulative run (service) time */
809 uint64_t wtime; /* cumulative wait (pre-service) time */
810 uint64_t rlentime; /* cumulative run length*time product */
811 uint64_t wlentime; /* cumulative wait length*time product */
812 uint64_t rlastupdate; /* last time run queue changed */
813 uint64_t wlastupdate; /* last time wait queue changed */
814 uint64_t rcnt; /* count of elements in run state */
815 uint64_t wcnt; /* count of elements in wait state */
816
817 /*
818 * average vdev utilization, measured as the percentage
819 * of time for which the device was busy servicing I/O
820 * requests during the sample interval
821 */
822 uint64_t utilization = 0;
823
824 /*
825 * average vdev throughput for read and write
826 * in kilobytes per second
827 */
828 uint64_t throughput = 0;
829
830 /* average vdev input/output operations per second */
831 uint64_t iops = 0;
832
833 /*
834 * average number of commands being processed in the active
835 * queue that the vdev is working on simultaneously
836 */
837 uint64_t run_len = 0;
838
839 /*
840 * average number of commands waiting in the queues that
841 * have not been sent to the vdev yet
842 */
843 uint64_t wait_len = 0;
844
845 /* average total queue: wait_len + run_len */
846 uint64_t queue_len = 0;
847
848 /*
849 * average time for an operation to complete after
850 * it has been dequeued from the wait queue
851 */
852 uint64_t run_time = 0;
853
854 /* average time for which operations are queued before they are run */
855 uint64_t wait_time = 0;
856
857 /* average time to queue and complete an I/O operation */
858 uint64_t service_time = 0;
859
860 vdev_aux_stat_t *vdev_aux = &vd->vdev_aux_stat;
861 kstat_t *kstat = vd->vdev_iokstat;
862 kstat_io_t *kdata = kstat->ks_data;
863
864 /* retrieve current kstat values for vdev */
865 mutex_enter(kstat->ks_lock);
866
867 nread = kdata->nread;
868 nwritten = kdata->nwritten;
869 reads = kdata->reads;
870 writes = kdata->writes;
871 rtime = kdata->rtime;
872 wtime = kdata->wtime;
873 rlentime = kdata->rlentime;
874 wlentime = kdata->wlentime;
875 rlastupdate = kdata->rlastupdate;
876 wlastupdate = kdata->wlastupdate;
877 rcnt = kdata->rcnt;
878 wcnt = kdata->wcnt;
879
880 mutex_exit(kstat->ks_lock);
881
882 /* convert high-res time to nanoseconds */
883 #ifdef _KERNEL
884 scalehrtime((hrtime_t *)&rtime);
885 scalehrtime((hrtime_t *)&wtime);
886 scalehrtime((hrtime_t *)&rlentime);
887 scalehrtime((hrtime_t *)&wlentime);
888 scalehrtime((hrtime_t *)&rlastupdate);
889 scalehrtime((hrtime_t *)&wlastupdate);
890 #endif
891
/*
 * On the very first iteration (vdev_aux->wlastupdate == 0) we only
 * initialize the saved counters below; otherwise compute the deltas
 */
896 if (vdev_aux->wlastupdate != 0) {
897 /* Calculate deltas for vdev statistics */
898 uint64_t nread_delta = nread - vdev_aux->nread;
899 uint64_t nwritten_delta = nwritten - vdev_aux->nwritten;
900 uint64_t reads_delta = reads - vdev_aux->reads;
901 uint64_t writes_delta = writes - vdev_aux->writes;
902 uint64_t rtime_delta = rtime - vdev_aux->rtime;
903 uint64_t rlentime_delta = rlentime - vdev_aux->rlentime;
904 uint64_t wlentime_delta = wlentime - vdev_aux->wlentime;
905 uint64_t wlastupdate_delta = wlastupdate -
906 vdev_aux->wlastupdate;
907
908 if (wlastupdate_delta != 0) {
909 /* busy: proportion of the time as a percentage */
910 utilization = 100 * rtime_delta / wlastupdate_delta;
911 if (utilization > 100)
912 utilization = 100;
913 /* throughput: KiloBytes per second */
914 throughput = NANOSEC * (nread_delta + nwritten_delta) /
915 wlastupdate_delta / 1024;
916 /* input/output operations per second */
917 iops = NANOSEC * (reads_delta + writes_delta) /
918 wlastupdate_delta;
919 run_len = rlentime_delta / wlastupdate_delta;
920 wait_len = wlentime_delta / wlastupdate_delta;
921 queue_len = run_len + wait_len;
922 }
923
924 if (iops != 0) {
925 /* latency: microseconds */
926 run_time = 1000 * run_len / iops;
927 wait_time = 1000 * wait_len / iops;
928 service_time = run_time + wait_time;
929 }
930 }
931
932 /* update previous kstat values */
933 vdev_aux->nread = nread;
934 vdev_aux->nwritten = nwritten;
935 vdev_aux->reads = reads;
936 vdev_aux->writes = writes;
937 vdev_aux->rtime = rtime;
938 vdev_aux->wtime = wtime;
939 vdev_aux->rlentime = rlentime;
940 vdev_aux->wlentime = wlentime;
941 vdev_aux->rlastupdate = rlastupdate;
942 vdev_aux->wlastupdate = wlastupdate;
943 vdev_aux->rcnt = rcnt;
944 vdev_aux->wcnt = wcnt;
945
946 /* accumulate current class values */
947 cos_acc->utilization += utilization;
948 cos_acc->throughput += throughput;
949 cos_acc->iops += iops;
950 cos_acc->run_len += run_len;
951 cos_acc->wait_len += wait_len;
952 cos_acc->queue_len += queue_len;
953 cos_acc->run_time += run_time;
954 cos_acc->wait_time += wait_time;
955 cos_acc->service_time += service_time;
956 cos_acc->count++;
957
958 #ifdef _KERNEL
959 DTRACE_PROBE8(spa_vdev_stat,
960 char *, vd->vdev_path,
961 uint64_t, utilization,
962 uint64_t, throughput,
963 uint64_t, iops,
964 uint64_t, run_len,
965 uint64_t, wait_len,
966 uint64_t, run_time,
967 uint64_t, wait_time);
968 #endif
969 }
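
/*
 * A quick sanity check of the delta math above (hypothetical sample): over
 * a 1-second interval (wlastupdate_delta == NANOSEC) in which the vdev was
 * busy for 0.5 s (rtime_delta == NANOSEC / 2), transferred 10 MiB and
 * completed 100 operations, the code computes utilization = 50,
 * throughput = 10240 (KB/s) and iops = 100.
 */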
970
/*
 * Gather and accumulate spa average statistics for the special and normal
 * classes
 */
974 static void
975 spa_class_collect_stats(spa_t *spa, spa_acc_stat_t *spa_acc, uint64_t weight)
976 {
977 vdev_t *rvd = spa->spa_root_vdev;
978 cos_acc_stat_t special_acc, normal_acc;
979 int i;
980
981 ASSERT(rvd != NULL);
982
983 bzero(&special_acc, sizeof (cos_acc_stat_t));
984 bzero(&normal_acc, sizeof (cos_acc_stat_t));
985
986 /*
987 * Walk the top level vdevs and calculate average
988 * stats for the normal and special classes
989 */
990 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
991
992 for (i = 0; i < rvd->vdev_children; i++) {
993 vdev_t *vd = rvd->vdev_child[i];
994 ASSERT(vd != NULL);
995
996 if (vd->vdev_islog || vd->vdev_ishole ||
997 vd->vdev_isspare || vd->vdev_isl2cache)
998 continue;
999
1000 if (vd->vdev_isspecial)
1001 spa_vdev_walk_stats(vd, spa_vdev_process_stat,
1002 &special_acc);
1003 else
1004 spa_vdev_walk_stats(vd, spa_vdev_process_stat,
1005 &normal_acc);
1006 }
1007
1008 spa_config_exit(spa, SCL_VDEV, FTAG);
1009
1010 if (special_acc.count == 0 || normal_acc.count == 0)
1011 return;
1012
/*
 * Locally accumulate (sum up) spa and per-class throughput, latency
 * and utilization stats. At the end of each iteration the resulting
 * sums are divided by the number of samples to get the averages
 */
1018
1019 spa_acc->spa_utilization +=
1020 weight * (special_acc.utilization + normal_acc.utilization) /
1021 (special_acc.count + normal_acc.count);
1022
1023 spa_acc->special_utilization +=
1024 weight * special_acc.utilization / special_acc.count;
1025 spa_acc->special_latency +=
1026 weight * special_acc.service_time / special_acc.count;
1027 spa_acc->special_throughput +=
1028 weight * special_acc.throughput / special_acc.count;
1029
1030 spa_acc->normal_utilization +=
1031 weight * normal_acc.utilization / normal_acc.count;
1032 spa_acc->normal_latency +=
1033 weight * normal_acc.service_time / normal_acc.count;
1034 spa_acc->normal_throughput +=
1035 weight * normal_acc.throughput / normal_acc.count;
1036
1037 spa_acc->count += weight;
1038 }
1039
/*
 * Updates the spa statistics for the special and normal classes
 * every spa_rotor_load_adjusting-th iteration
 */
1044 static void
1045 spa_load_stats_update(spa_t *spa, spa_acc_stat_t *spa_acc, uint64_t rotor)
1046 {
1047 spa_avg_stat_t *spa_avg = &spa->spa_avg_stat;
1048 uint64_t residue, weight = 1;
1049
1050 residue = rotor % spa_rotor_load_adjusting;
1051
1052 if (spa_rotor_use_weight)
1053 weight = residue ? residue : spa_rotor_load_adjusting;
1054
1055 spa_class_collect_stats(spa, spa_acc, weight);
1056
1057 if (residue == 0 && spa_acc->count != 0) {
1058 spa_avg->spa_utilization =
1059 spa_acc->spa_utilization / spa_acc->count;
1060
1061 spa_avg->special_utilization =
1062 spa_acc->special_utilization / spa_acc->count;
1063 spa_avg->normal_utilization =
1064 spa_acc->normal_utilization / spa_acc->count;
1065
1066 spa_avg->special_latency =
1067 spa_acc->special_latency / spa_acc->count;
1068 spa_avg->normal_latency =
1069 spa_acc->normal_latency / spa_acc->count;
1070
1071 spa_avg->special_throughput =
1072 spa_acc->special_throughput / spa_acc->count;
1073 spa_avg->normal_throughput =
1074 spa_acc->normal_throughput / spa_acc->count;
1075 }
1076 }
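
/*
 * For example, with spa_rotor_load_adjusting = 4 and spa_rotor_use_weight
 * enabled, the four samples of a cycle are taken with weights 1, 2, 3 and
 * 4 (most recent sample weighted highest) and the averages are computed
 * when the rotor residue wraps back to 0; with weighting disabled every
 * sample contributes equally.
 */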
1077
1078 static void
1079 spa_special_dedup_adjust(spa_t *spa)
1080 {
1081 spa_avg_stat_t *spa_avg = &spa->spa_avg_stat;
1082 int percentage;
1083
1084 /*
1085 * if special_utilization < dedup_lo, then percentage = 100;
1086 * if special_utilization > dedup_hi, then percentage = 0;
1087 * otherwise, the percentage changes linearly from 100 to 0
1088 * as special_utilization moves from dedup_lo to dedup_hi
1089 */
1090 percentage = 100 - 100 *
1091 (spa_avg->special_utilization - spa->spa_dedup_lo_best_effort) /
1092 (spa->spa_dedup_hi_best_effort - spa->spa_dedup_lo_best_effort);
1093 /* enforce proper percentage limits */
1094 percentage = MIN(percentage, 100);
1095 percentage = MAX(percentage, 0);
1096
1097 spa->spa_dedup_percentage = percentage;
1098 }
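
/*
 * Worked example (hypothetical thresholds): with spa_dedup_lo_best_effort
 * = 30 and spa_dedup_hi_best_effort = 70, a special-class utilization of
 * 50% yields 100 - 100 * (50 - 30) / (70 - 30) = 50, so about half of the
 * dedup-eligible writes keep deduplication enabled (see
 * zio_best_effort_dedup() below).
 */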
1099
/*
 * Tunable: period (~10ms per tick) for updating spa vdev stats
 * Range: 1 - UINT64_MAX
 * Units: ticks (~10 milliseconds each)
 * For most use cases "75" is an optimal value;
 * the recommended range is 50...200
 */
1107 clock_t spa_avg_stat_update_ticks = 75;
1108
1109 /* Performance monitor thread */
1110 static void
1111 spa_perfmon_thread(void *void_spa)
1112 {
1113 spa_t *spa = void_spa;
1114 spa_perfmon_data_t *data = &spa->spa_perfmon;
1115 spa_acc_stat_t spa_acc;
1116 uint64_t rotor = 0;
1117
1118 ASSERT(data != NULL);
1119
1120 DTRACE_PROBE1(spa_pm_start, spa_t *, spa);
1121
1122 /* take a reference against spa */
1123 mutex_enter(&spa_namespace_lock);
1124 spa_open_ref(spa, FTAG);
1125 mutex_exit(&spa_namespace_lock);
1126 bzero(&spa_acc, sizeof (spa_acc_stat_t));
1127
1128 while (spa->spa_state != POOL_STATE_UNINITIALIZED &&
1129 !data->perfmon_thr_exit) {
1130 clock_t deadline, timeleft = 1;
1131
1132 /*
1133 * do the monitoring work here: gather average
1134 * spa utilization, latency and throughput statistics
1135 */
1136 DTRACE_PROBE1(spa_pm_work, spa_t *, spa);
1137 spa_load_stats_update(spa, &spa_acc, rotor);
1138
1139 /* we can adjust load and dedup at the same time */
1140 if (rotor % spa_rotor_load_adjusting == 0) {
1141 spa_special_adjust_routing(spa);
1142 bzero(&spa_acc, sizeof (spa_acc_stat_t));
1143 }
1144 if (spa->spa_dedup_best_effort)
1145 spa_special_dedup_adjust(spa);
1146
1147 /* wait for the next tick */
1148 DTRACE_PROBE1(spa_pm_sleep, spa_t *, spa);
1149 deadline = ddi_get_lbolt() + spa_avg_stat_update_ticks;
1150 mutex_enter(&data->perfmon_lock);
1151 while (timeleft > 0 &&
1152 spa->spa_state != POOL_STATE_UNINITIALIZED &&
1153 !data->perfmon_thr_exit) {
1154 timeleft = cv_timedwait(&data->perfmon_cv,
1155 &data->perfmon_lock, deadline);
1156 }
1157 mutex_exit(&data->perfmon_lock);
1158 ++rotor;
1159 }
1160
1161 /* release the reference against spa */
1162 mutex_enter(&spa_namespace_lock);
1163 spa_close(spa, FTAG);
1164 mutex_exit(&spa_namespace_lock);
1165
1166 DTRACE_PROBE1(spa_pm_stop, spa_t *, spa);
1167 thread_exit();
1168 }
1169
1170 void
1171 spa_start_perfmon_thread(spa_t *spa)
1172 {
1173 spa_perfmon_data_t *data = &spa->spa_perfmon;
1174
1175 /* not a "real" spa import/create, do not start the thread */
1176 if (strcmp(spa->spa_name, TRYIMPORT_NAME) == 0)
1177 return;
1178
1179 mutex_enter(&data->perfmon_lock);
1180
1181 if (data->perfmon_thread == NULL) {
1182 DTRACE_PROBE1(spa_start_perfmon_act, spa_t *, spa);
1183 data->perfmon_thr_exit = B_FALSE;
1184 #ifdef _KERNEL
1185 data->perfmon_thread = thread_create(NULL, 0,
1186 spa_perfmon_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
1187 #endif
1188 }
1189
1190 mutex_exit(&data->perfmon_lock);
1191 }
1192
1193 boolean_t
1194 spa_stop_perfmon_thread(spa_t *spa)
1195 {
1196 spa_perfmon_data_t *data = &spa->spa_perfmon;
1197 mutex_enter(&data->perfmon_lock);
1198
1199 if (data->perfmon_thread != NULL) {
1200 DTRACE_PROBE1(spa_stop_perfmon_act, spa_t *, spa);
1201 data->perfmon_thr_exit = B_TRUE;
1202 cv_signal(&data->perfmon_cv);
1203 mutex_exit(&data->perfmon_lock);
1204 #ifdef _KERNEL
1205 thread_join(data->perfmon_thread->t_did);
1206 #endif
1207 data->perfmon_thread = NULL;
1208 return (B_TRUE);
1209 }
1210
1211 mutex_exit(&data->perfmon_lock);
1212 return (B_FALSE);
1213 }
1214
/* Helper functions called from other facilities */
1216 void
1217 zio_best_effort_dedup(zio_t *zio)
1218 {
1219 spa_t *spa = zio->io_spa;
1220 zio_prop_t *zp = &zio->io_prop;
1221 uint64_t val;
1222
1223 if (spa->spa_dedup_best_effort == 0)
1224 return;
1225
1226 val = atomic_inc_64_nv(&spa->spa_dedup_rotor);
1227 if ((val % 100) >= spa->spa_dedup_percentage)
1228 zp->zp_dedup = 0;
1229 }
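
/*
 * The rotor above spreads the effect evenly: e.g., with
 * spa_dedup_percentage = 50, (val % 100) is >= 50 for roughly every other
 * call, so dedup is dropped for about half of the writes when the special
 * class is under moderate load.
 */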
1230
1231 static boolean_t
1232 spa_has_special_child_errors(vdev_t *vd)
1233 {
1234 vdev_stat_t *vs = &vd->vdev_stat;
1235
1236 return (vs->vs_checksum_errors != 0 || vs->vs_read_errors != 0 ||
1237 vs->vs_write_errors != 0 || !vdev_readable(vd) ||
1238 !vdev_writeable(vd));
1239 }
1240
1241 static int
1242 spa_special_check_errors_children(vdev_t *pvd)
1243 {
1244 int rc = 0;
1245
1246 if (pvd->vdev_children == 0) {
1247 if (spa_has_special_child_errors(pvd))
1248 rc = -1;
1249 } else {
1250 ASSERT(!pvd->vdev_ops->vdev_op_leaf);
1251 for (size_t i = 0; i < pvd->vdev_children; i++) {
1252 vdev_t *vd = pvd->vdev_child[i];
1253 ASSERT(vd != NULL);
1254
1255 if (vd->vdev_ops->vdev_op_leaf) {
1256 if (spa_has_special_child_errors(vd)) {
1257 rc = -1;
1258 break;
1259 }
1260 } else {
1261 rc = spa_special_check_errors_children(vd);
1262 if (rc != 0)
1263 break;
1264 }
1265 }
1266 }
1267
1268 return (rc);
1269 }
1270
/*
 * This function is called from dsl_scan_done(),
 * which executes in syncing context.
 * Here we walk over all vdevs to find the
 * special vdevs and check them for errors.
 *
 * If the special vdevs do not have errors, we clear
 * the flag that prevents writes to the special class
 */
1280 void
1281 spa_special_check_errors(spa_t *spa)
1282 {
1283 vdev_t *rvd;
1284 boolean_t clean_special_err_flag = B_TRUE;
1285
1286 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1287
1288 rvd = spa->spa_root_vdev;
1289 for (size_t i = 0; i < rvd->vdev_children; i++) {
1290 vdev_t *vd = rvd->vdev_child[i];
1291 ASSERT(vd != NULL);
1292
1293 if (!vd->vdev_isspecial)
1294 continue;
1295
1296 if (spa_special_check_errors_children(vd) != 0) {
1297 clean_special_err_flag = B_FALSE;
1298 break;
1299 }
1300 }
1301
1302 spa_config_exit(spa, SCL_VDEV, FTAG);
1303
1304 if (clean_special_err_flag)
1305 spa->spa_special_has_errors = B_FALSE;
1306 }
1307
1308 int
1309 spa_special_vdev_remove(spa_t *spa, vdev_t *vd, uint64_t *txg)
1310 {
1311 int err;
1312 metaslab_group_t *mg;
1313
1314 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1315 ASSERT(vdev_is_special(vd));
1316
1317 if (vd != vd->vdev_top)
1318 return (SET_ERROR(ENOTSUP));
1319
1320 if (spa_feature_is_active(spa, SPA_FEATURE_WBC)) {
1321 /*
1322 * WBC still active, so we cannot remove
1323 * special at this time
1324 */
1325 return (SET_ERROR(EBUSY));
1326 }
1327
1328 mg = vd->vdev_mg;
1329
1330 /*
1331 * Stop allocating from this vdev.
1332 */
1333 metaslab_group_passivate(mg);
1334
1335 /*
1336 * Wait for the youngest allocations and frees to sync,
1337 * and then wait for the deferral of those frees to finish.
1338 */
1339 spa_vdev_config_exit(spa, NULL,
1340 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
1341
1342 if (vd->vdev_stat.vs_alloc != 0) {
1343 /* Make sure that special does not have ZIL data */
1344 err = spa_offline_log(spa);
1345
1346 if (err != 0 || vd->vdev_stat.vs_alloc != 0) {
/*
 * err == 0 means all ZIL data is gone, but since we
 * got here, the special vdev still contains metadata
 * that we cannot migrate.
 * It is possible that the user has enabled some of
 * the *_to_metadev properties, but we cannot migrate
 * metadata from the special vdev to a normal vdev in
 * this case
 */
1356 if (err == 0)
1357 err = SET_ERROR(EEXIST);
1358
1359 *txg = spa_vdev_config_enter(spa);
1360 metaslab_group_activate(mg);
1361 return (err);
1362 }
1363 }
1364
1365 *txg = spa_vdev_config_enter(spa);
1366
1367 vd->vdev_removing = B_TRUE;
1368 vdev_dirty_leaves(vd, VDD_DTL, *txg);
1369 vdev_config_dirty(vd);
1370
1371 /* This exit is required to sync dirty configuration */
1372 spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
1373
1374 if (spa_feature_is_active(spa, SPA_FEATURE_META_DEVICES)) {
1375 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1376 spa_last_synced_txg(spa) + 1);
1377
1378 spa_feature_decr(spa, SPA_FEATURE_META_DEVICES, tx);
1379 dmu_tx_commit(tx);
1380 }
1381
1382 *txg = spa_vdev_config_enter(spa);
1383
1384 /*
1385 * Release the references to CoS descriptors if any
1386 */
1387 if (vd->vdev_queue.vq_cos) {
1388 cos_rele(vd->vdev_queue.vq_cos);
1389 vd->vdev_queue.vq_cos = NULL;
1390 }
1391
1392 return (0);
1393 }