/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dbuf.h>
#include <sys/special_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#ifdef _KERNEL
#include <sys/instance.h>
#endif

#include <sys/sysevent/eventdefs.h>

/*
 * There already exist several types of "special" vdevs in zpool:
 * log, cache, and spare. However, there are other dimensions of
 * the issue that could be addressed in a similar fashion:
 *  - vdevs for storing ZFS metadata, including DDTs
 *  - vdevs for storing important ZFS data
 *  - vdevs that absorb write load spikes and move the data
 *    to regular devices during load valleys
 *
 * Clearly, there are lots of options. So, a generalized "special"
 * vdev class is introduced that can be configured to assume the
 * following personalities:
 *  - ZIL     - store ZIL blocks in a way quite similar to SLOG
 *  - META    - in addition to ZIL blocks, store ZFS metadata
 *  - WBC     - in addition to ZIL blocks and ZFS metadata, also
 *              absorb write load spikes (store data blocks),
 *              and move the data blocks to "regular" vdevs
 *              when the system is not too busy
 *
 * The ZIL personality is self-explanatory. The remaining two
 * personalities are also given additional parameters:
 *  - low/high watermarks for space use
 *  - enable/disable special device
 *
 * The watermarks for the META personality determine whether metadata
 * can be placed on the special device, with hysteresis:
 * metadata goes to the special vdev until the space used grows
 * above the high watermark, then it stops going to the vdev
 * until the space used drops below the low watermark.
 *
 * For WBC, the watermarks also gradually reduce the load
 * on the special vdev once the space consumption grows beyond
 * the low watermark yet is still below the high watermark:
 * the closer to the high watermark the space consumption gets,
 * the smaller the percentage of writes that goes to the special
 * vdev, and once the high watermark is reached, all the data goes
 * to the regular vdevs.
 *
 * Additionally, WBC moves the data off the special device
 * when the system write load subsides, and the amount of data
 * moved off the special device increases as the load falls. Note
 * that metadata is not moved off the WBC vdevs.
 *
 * The pool configuration parameters that describe special vdevs
 * are stored as an nvlist in the vdevs' labels along with other
 * standard pool and vdev properties. These parameters include:
 * - class of special vdevs in the pool (ZIL, META, WBC)
 * - whether special vdevs are enabled or not
 * - low and high watermarks for META and WBC
 * - a flag that marks special vdevs
 *
 * The currently supported modes are ZIL and META
 * (see usr/src/common/zfs/zpool_prop.c); WBC support will
 * be provided soon.
 */
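/*
 * To illustrate the watermark behavior with hypothetical numbers (not
 * defaults): given a 100G special class with lowat=60 and hiwat=80,
 * metadata keeps landing on the special vdevs until allocated space
 * exceeds 80G, after which it is redirected to the normal class until
 * allocations fall back below 60G.  For WBC the same 60G-80G span acts
 * as a throttle ramp: at 70G roughly half of the eligible data writes
 * are expected to be diverted to the normal class, and at 80G all of
 * them are.
 */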

/*
 * Initial percentage of total write traffic routed to the
 * special vdev when the latter is working as a writeback cache.
 * See spa->spa_special_to_normal_ratio.
 * Changing this variable affects only new or imported pools.
 * Valid range: 0% - 100%
 */
uint64_t spa_special_to_normal_ratio = 50;

/*
 * Re-routing delta: the default value that gets added to
 * or subtracted from spa->spa_special_to_normal_ratio.
 * The setting below works as the initial step, which gets
 * reduced as we close in on the load-balancing optimum.
 */
int64_t spa_special_to_normal_delta = 15;

/*
 * Initialize the special vdev load-balancing machinery when the pool
 * gets created or imported
 */
void
spa_special_init(spa_t *spa)
{
        mutex_init(&spa->spa_perfmon.perfmon_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&spa->spa_perfmon.perfmon_cv, NULL, CV_DEFAULT, NULL);

        bzero(&spa->spa_avg_stat, sizeof (spa_avg_stat_t));

        spa->spa_special_to_normal_ratio = spa_special_to_normal_ratio;
        spa->spa_special_to_normal_delta = 0;
        spa->spa_dedup_percentage = 100;
        spa->spa_avg_stat_rotor = 0;
        spa->spa_dedup_rotor = 0;

        spa->spa_perfmon.perfmon_thread = NULL;
        spa->spa_perfmon.perfmon_thr_exit = B_FALSE;
}

/*
 * spa_special_fini() is symmetric to spa_special_init() above and is
 * called when the pool gets destroyed or exported.
 */
void
spa_special_fini(spa_t *spa)
{
        spa->spa_perfmon.perfmon_thread = NULL;
        cv_destroy(&spa->spa_perfmon.perfmon_cv);
        mutex_destroy(&spa->spa_perfmon.perfmon_lock);
}

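/*
 * Turn the use of the special class on or off.  This only flips
 * spa_usesc; it is a no-op if the pool has no special vdevs or if the
 * requested state matches the current one.
 */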
static void
spa_enable_special(spa_t *spa, boolean_t usesc)
{
        ASSERT(spa != NULL);

        if (!spa_has_special(spa) || usesc == spa->spa_usesc)
                return;

        spa->spa_usesc = usesc;
}

/*
 * Determine whether we should consider writing data synchronously to
 * special vdevs. See comments in zvol_log_write() and zfs_log_write()
 */
boolean_t
spa_write_data_to_special(spa_t *spa, objset_t *os)
{
        ASSERT(os != NULL);
        return ((spa_has_special(spa)) &&
            (spa->spa_usesc) &&
            (spa->spa_watermark == SPA_WM_NONE) &&
            (os->os_wbc_mode != ZFS_WBC_MODE_OFF));
}

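/*
 * Returns B_TRUE when the special class is present, enabled, and not
 * throttled by the watermarks, i.e. it may be used for new allocations.
 */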
boolean_t
spa_can_special_be_used(spa_t *spa)
{
        return (spa_has_special(spa) && spa->spa_usesc &&
            (spa->spa_watermark == SPA_WM_NONE));
}

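/*
 * Return the given percentage of the special class's total space;
 * used below to convert the watermark percentages into byte counts.
 */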
static uint64_t
spa_special_space_perc(spa_t *spa, uint64_t perc)
{
        metaslab_class_t *mc;

        ASSERT(spa_has_special(spa));
        mc = spa_special_class(spa);
        return (metaslab_class_get_space(mc) * perc / 100);
}

/*
 * Checks whether the space used on the special device
 * has exceeded either the low or the high watermark.
 */
static void
spa_check_watermarks(spa_t *spa)
{
        metaslab_class_t *mc;
        uint64_t aspace, lspace;
        vdev_t *vd = NULL;

        if (!spa_has_special(spa))
                return;

        /* The control logic will not work if either value is 0 */
        if (spa->spa_lowat == 0 || spa->spa_hiwat == 0)
                return;

        mc = spa_special_class(spa);
        vd = mc->mc_rotor->mg_vd;
        aspace = metaslab_class_get_alloc(mc);
        spa->spa_lwm_space = spa_special_space_perc(spa, spa->spa_lowat);
        spa->spa_hwm_space = spa_special_space_perc(spa, spa->spa_hiwat);
        spa->spa_wbc_wm_range = spa->spa_hwm_space - spa->spa_lwm_space;

        if (aspace <= spa->spa_lwm_space) {
                if (spa->spa_watermark != SPA_WM_NONE) {
                        spa->spa_watermark = SPA_WM_NONE;
                        spa_event_notify(spa, vd, NULL, ESC_ZFS_NONE_WATERMARK);
                }
                spa_enable_special(spa, B_TRUE);
        } else if (aspace > spa->spa_hwm_space) {
                if (spa->spa_watermark != SPA_WM_HIGH) {
                        spa->spa_watermark = SPA_WM_HIGH;
                        spa_enable_special(spa, B_FALSE);
                        spa_event_notify(spa, vd, NULL, ESC_ZFS_HIGH_WATERMARK);
                }
        } else {
                if (spa->spa_watermark != SPA_WM_LOW) {
                        if (spa->spa_watermark == SPA_WM_NONE)
                                spa_enable_special(spa, B_TRUE);
                        spa->spa_watermark = SPA_WM_LOW;
                        spa_event_notify(spa, vd, NULL, ESC_ZFS_LOW_WATERMARK);
                }

                /*
                 * correction_rate is used by spa_special_adjust_routing().
                 * The coefficient changes proportionally to the space on the
                 * special vdev utilized beyond the low watermark:
                 *      from 0% - at the low watermark
                 *      to 100% - at the high watermark
                 */
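                /*
                 * For example (hypothetical numbers): with spa_lwm_space at
                 * 60G and spa_hwm_space at 80G, an allocation of 70G yields
                 * a correction rate of (70 - 60) * 100 / (80 - 60) = 50%.
                 */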
                spa->spa_special_vdev_correction_rate =
                    ((aspace - spa->spa_lwm_space) * 100) /
                    (spa->spa_hwm_space - spa->spa_lwm_space);

                if (spa->spa_wbc.wbc_thread != NULL) {
                        /*
                         * Unlike the meta device, the write cache is
                         * re-enabled when we go from SPA_WM_HIGH to
                         * SPA_WM_LOW, and the throttling logic below
                         * then takes over.
                         */
                        if (spa->spa_watermark == SPA_WM_HIGH)
                                spa_enable_special(spa, B_TRUE);
                        lspace = aspace - spa->spa_lwm_space;
                        if (spa->spa_wbc_wm_range) {
                                spa->spa_wbc_perc = (uint8_t)(lspace * 100 /
                                    spa->spa_wbc_wm_range);
                        } else {
                                spa->spa_wbc_perc = 50;
                        }
                }
        }

        DTRACE_PROBE1(check_wm, spa_t *, spa);
}

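/*
 * Returns 1 if any top-level vdev in the special class is degraded or
 * faulted, 0 otherwise (including the case of no special class at all).
 */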
static int
spa_check_special_degraded(spa_t *spa)
{
        metaslab_class_t *mc;
        metaslab_group_t *mg;
        vdev_t *vd;

        if (!spa_has_special(spa))
                return (0);

        mc = spa_special_class(spa);
        /*
         * Must hold one of the spa_config locks.
         */
        ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) != 0 ||
            spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER) != 0);

        if ((mg = mc->mc_rotor) == NULL)
                return (0);

        do {
                vd = mg->mg_vd;
                if (vd->vdev_state == VDEV_STATE_DEGRADED ||
                    vd->vdev_state == VDEV_STATE_FAULTED)
                        return (1);
        } while ((mg = mg->mg_next) != mc->mc_rotor);

        return (0);
}

void
spa_check_special(spa_t *spa)
{
        if (!spa_has_special(spa))
                return;

        /*
         * If the special class has degraded vdevs, disable it
         */
        if (spa_check_special_degraded(spa) != 0) {
                spa_enable_special(spa, B_FALSE);
                return;
        }

        spa_check_watermarks(spa);
}

/* returns B_TRUE if to be placed on special, B_FALSE otherwise */
static boolean_t
spa_refine_meta_placement(spa_t *spa, uint64_t zpl_meta_to_special,
    dmu_object_type_t ot)
{
        spa_meta_placement_t *mp = &spa->spa_meta_policy;
        boolean_t isddt = DMU_OT_IS_DDT_META(ot),
            iszpl = DMU_OT_IS_ZPL_META(ot);

        if (isddt && (mp->spa_ddt_meta_to_special == META_PLACEMENT_OFF))
                return (B_FALSE);
        else if (iszpl && (zpl_meta_to_special == META_PLACEMENT_OFF))
                return (B_FALSE);
        else if (!isddt && !iszpl && (mp->spa_zfs_meta_to_special ==
            META_PLACEMENT_OFF))
                return (B_FALSE);
        else
                return (B_TRUE);
}

/* returns B_TRUE if the metadata can be placed on cache, B_FALSE otherwise */
static boolean_t
spa_meta_is_dual(spa_t *spa, uint64_t zpl_meta_to_special, dmu_object_type_t ot)
{
        spa_meta_placement_t *mp = &spa->spa_meta_policy;
        boolean_t isddt = DMU_OT_IS_DDT_META(ot),
            iszpl = DMU_OT_IS_ZPL_META(ot);

        if (isddt && (mp->spa_ddt_meta_to_special != META_PLACEMENT_DUAL))
                return (B_FALSE);
        else if (iszpl && (zpl_meta_to_special != META_PLACEMENT_DUAL))
                return (B_FALSE);
        else if (!isddt && !iszpl && (mp->spa_zfs_meta_to_special !=
            META_PLACEMENT_DUAL))
                return (B_FALSE);
        else
                return (B_TRUE);
}

/*
 * Tunable: special load balancing goal.
 * Selects among special and normal vdevs in order to optimize a specific
 * system parameter, e.g. latency or throughput/utilization.
 *
 * ASSUMPTION: we assume that special vdevs are much faster than regular
 * vdevs.  If this is not the case, the system will work better if all
 * the vdevs are made normal, as there is no reason to differentiate.
 */
spa_special_selection_t spa_special_selection =
    SPA_SPECIAL_SELECTION_UTILIZATION;

/*
 * Tunable: factor used to adjust the ratio up/down
 * Range: 0 - 100
 * Units: percent
 */
uint64_t spa_special_factor = 5;

/*
 * Distribute writes across the special and normal classes so that roughly
 * spa_special_to_normal_ratio percent of the asynchronous ones go to the
 * special class
 */
static boolean_t
spa_refine_data_placement(spa_t *spa, zio_t *zio)
{
        uint64_t rotor = atomic_inc_64_nv(&spa->spa_avg_stat_rotor);
        spa_meta_placement_t *mp = &spa->spa_meta_policy;
        boolean_t result = B_FALSE;

        /*
         * For the "balanced" sync writes the load balancing is already done;
         * see the comment in zfs_log_write()
         */
        if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
                if (spa->spa_watermark == SPA_WM_NONE &&
                    (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
                    mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED)) {
                        result = B_TRUE;
                }
        } else {
                result = ((rotor % 100) < spa->spa_special_to_normal_ratio);
        }

        return (result);
}

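/*
 * Returns B_TRUE if this metadata block should go to the special class,
 * i.e. the pool has an enabled special class and the placement policy
 * for this metadata type allows it.
 */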
static boolean_t
spa_meta_to_special(spa_t *spa, objset_t *os, dmu_object_type_t ot)
{
        boolean_t result = B_FALSE;

        ASSERT(os != NULL);
        /* there is some duplication of the spa_select_class() logic here */

        if (spa_has_special(spa) && spa->spa_usesc) {
                result = spa_refine_meta_placement(spa,
                    os->os_zpl_meta_to_special, ot);
        }

        return (result);
}

/*
 * Decide whether the block should be l2cached. Returns B_TRUE if the
 * block is not metadata, or if its metadata type is l2cacheable
 */
boolean_t
dbuf_meta_is_l2cacheable(dmu_buf_impl_t *db)
{
        boolean_t is_metadata, is_to_special;
        dmu_object_type_t ot = DMU_OT_NONE;
        spa_t *spa = db->db_objset->os_spa;

        DB_DNODE_ENTER(db);
        ot = DB_DNODE(db)->dn_type;
        DB_DNODE_EXIT(db);

        is_metadata = dmu_ot[ot].ot_metadata;

        if (!is_metadata)
                return (B_TRUE);

        is_to_special = spa_meta_to_special(spa, db->db_objset, ot);

        if (!is_to_special)
                return (B_TRUE);

        return (spa_meta_is_dual(spa, db->db_objset->os_zpl_meta_to_special,
            ot));
}

/*
 * Decide whether the block should be l2cached. Returns B_TRUE if the block
 * is not DDT metadata, or if it is DDT metadata and DDT metadata is
 * l2cacheable
 */
boolean_t
dbuf_ddt_is_l2cacheable(dmu_buf_impl_t *db)
{
        dmu_object_type_t ot;
        spa_t *spa = db->db_objset->os_spa;
        spa_meta_placement_t *mp = &spa->spa_meta_policy;

        if (!spa_has_special(spa))
                return (B_TRUE);

        DB_DNODE_ENTER(db);
        ot = DB_DNODE(db)->dn_type;
        DB_DNODE_EXIT(db);

        if (!DMU_OT_IS_DDT_META(ot))
                return (B_TRUE);

        return (mp->spa_ddt_meta_to_special != META_PLACEMENT_ON);
}

/*
 * Select whether to direct the zio to the special or to the normal storage
 * class.  Even when the top-level criteria match (for placement to the
 * special class), consider refining data and metadata placement based on
 * additional information about the system's behavior.
 */
metaslab_class_t *
spa_select_class(spa_t *spa, zio_t *zio)
{
        zio_prop_t *zp = &zio->io_prop;
        spa_meta_placement_t *mp = &spa->spa_meta_policy;
        boolean_t match = B_FALSE;

        if (!zp->zp_usesc || !spa_has_special(spa) ||
            spa->spa_special_has_errors) {
                match = B_FALSE;
        } else if (zp->zp_metadata) {
                match = mp->spa_enable_meta_placement_selection &&
                    spa_refine_meta_placement(spa, zp->zp_zpl_meta_to_special,
                    zp->zp_type);
        } else if (BP_GET_PSIZE(zio->io_bp) <= mp->spa_small_data_to_special) {
                match = B_TRUE;
        } else {
                match = zp->zp_usewbc && spa->spa_wbc.wbc_ready_to_use &&
                    spa_refine_data_placement(spa, zio);
        }

        if (match)
                return (spa_special_class(spa));

        /* drop 'use special class' */
        zp->zp_usesc = B_FALSE;
        zp->zp_usewbc = B_FALSE;
        return (spa_normal_class(spa));
}

/*
 * Tunable: enables or disables the automatic special-class selection logic
 * and sets a static routing value for spa_special_to_normal_ratio.
 *
 * Range: 0 - 100 (disables the automatic logic and sets static routing)
 * Default value: UINT64_MAX (enables the automatic logic)
 */
uint64_t spa_static_routing_percentage = UINT64_MAX;

/*
 * Tunable: minimal delta between the current class-averaged latencies
 * Range: 0 - 100
 * Units: percent
 */
uint64_t spa_min_latency_delta = 15;

/*
 * spa_special_adjust_routing() tunables that control re-balancing of the
 * write traffic between the two spa classes: special and normal.
 *
 * The specific SPA_SPECIAL_SELECTION_UTILIZATION mechanism here includes
 * the following steps executed by spa_perfmon_thread():
 * 1) sample vdev utilization
 * 2) every so many (spa_rotor_load_adjusting) samples: aggregate on a
 *    per-class basis
 * 3) load-balance depending on where the aggregate falls relative to the
 *    per-class "idle" and "busy" boundaries specified below
 */

/*
 * class-averaged "busy" and "idle" constants
 * E.g., the special class is considered idle when its average utilization
 * is at or below spa_special_class_idle
 */
static int spa_special_class_busy = 70;
static int spa_normal_class_busy = 70;
static int spa_fairly_busy_delta = 10;
static int spa_special_class_idle = 30;
static int spa_normal_class_idle = 30;

static boolean_t
spa_class_is_busy(int ut, int busy)
{
        return (ut > busy);
}

static boolean_t
spa_class_is_idle(int ut, int idle)
{
        return (ut < idle);
}

static boolean_t
spa_class_is_fairly_busy(int ut, int busy)
{
        if (busy < spa_fairly_busy_delta)
                return (B_FALSE);
        return (ut > busy - spa_fairly_busy_delta);
}

/*
 * This specific load-balancer implements the following strategy:
 * when selecting between the normal and special classes, bias "more"
 * load to the class with the smaller average latency
 */
static void
spa_special_adjust_routing_latency(spa_t *spa)
{
        /*
         * average perf counters
         * computed for the current spa_perfmon_thread iteration
         */
        spa_avg_stat_t *stat = &spa->spa_avg_stat;

        /*
         * class latencies:
         * normal and special, min and max
         */
        uint64_t norm_svct = stat->normal_latency;
        uint64_t spec_svct = stat->special_latency;
        uint64_t svct_min = MIN(norm_svct, spec_svct);
        uint64_t svct_max = MAX(norm_svct, spec_svct);

        /* no rebalancing: do nothing if idle */
        if (norm_svct == 0 && spec_svct == 0)
                return;

        /*
         * normalized difference between the per-class average latencies
         */
        uint64_t svct_delta = 100 * (svct_max - svct_min) / svct_max;

        /*
         * do nothing if the difference between class-averaged latencies
         * is less than configured
         */
        if (svct_delta < spa_min_latency_delta)
                return;

        /*
         * current special-to-normal load balancing ratio and its
         * current "delta" - note that both values are recomputed below
         */
        int64_t ratio = spa->spa_special_to_normal_ratio;
        int64_t ratio_delta = spa->spa_special_to_normal_delta;

        /*
         * Recompute the special-to-normal load balancing ratio:
         * 1) given a non-zero rerouting delta, consider the current
         *    class-average latencies to possibly change the re-balancing
         *    direction; halve the delta to close in on the local optimum
         * 2) otherwise, reset the rerouting delta depending again
         *    on the relationship between the average latencies
         *    (2nd and 3rd if)
         */
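        /*
         * For example (hypothetical numbers): with ratio = 50 and
         * ratio_delta = +15, a slower normal class (norm_svct > spec_svct)
         * keeps the delta at +15 and moves the ratio to 65; if the normal
         * class then becomes the faster one, the delta flips sign and
         * halves to -7, pulling the ratio back to 58, and so on toward
         * the local optimum.
         */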
        if ((norm_svct > spec_svct && ratio_delta < 0) ||
            (norm_svct < spec_svct && ratio_delta > 0))
                ratio_delta /= -2;
        else if (norm_svct > spec_svct && ratio_delta == 0)
                ratio_delta = spa_special_to_normal_delta;
        else if (norm_svct < spec_svct && ratio_delta == 0)
                ratio_delta = -spa_special_to_normal_delta;

        ratio += ratio_delta;
        ratio = MAX(MIN(ratio, 100), 0);
        spa->spa_special_to_normal_delta = ratio_delta;
        spa->spa_special_to_normal_ratio = ratio;
}

static void
spa_special_adjust_routing_utilization(spa_t *spa)
{
        /*
         * average perf counters
         * computed for the current spa_perfmon_thread iteration
         */
        spa_avg_stat_t *stat = &spa->spa_avg_stat;

        /* class utilizations: normal and special */
        uint64_t norm_util = stat->normal_utilization;
        uint64_t spec_util = stat->special_utilization;

        /*
         * current special-to-normal load balancing ratio and its
         * current "delta" - note that both values are recomputed below
         *
         * the first two 'if's below deal with the idle/busy situation,
         * while the remaining two rebalance between classes as long as
         * the "other" class is not idle
         */
        int64_t ratio = spa->spa_special_to_normal_ratio;
        int64_t ratio_delta = spa->spa_special_to_normal_delta;

        /* 1. special is fairly busy while normal is idle */
        if (spa_class_is_fairly_busy(spec_util, spa_special_class_busy) &&
            spa_class_is_idle(norm_util, spa_normal_class_idle))
                ratio_delta = -spa_special_factor;
        /* 2. normal is fairly busy while special is idle */
        else if (spa_class_is_fairly_busy(norm_util, spa_normal_class_busy) &&
            spa_class_is_idle(spec_util, spa_special_class_idle))
                ratio_delta = spa_special_factor;
        /* 3. normal is not busy and special is not idling either */
        else if (!spa_class_is_busy(norm_util, spa_normal_class_busy) &&
            !spa_class_is_idle(spec_util, spa_special_class_idle))
                ratio_delta = -spa_special_factor;
        /* 4. special is not busy and normal is not idling either */
        else if (!spa_class_is_busy(spec_util, spa_special_class_busy) &&
            !spa_class_is_idle(norm_util, spa_normal_class_idle))
                ratio_delta = spa_special_factor;

        ratio += ratio_delta;
        ratio = MAX(MIN(ratio, 100), 0);
        spa->spa_special_to_normal_delta = ratio_delta;
        spa->spa_special_to_normal_ratio = ratio;
}

static void
spa_special_adjust_routing(spa_t *spa)
{
        spa_avg_stat_t *stat = &spa->spa_avg_stat;

        /*
         * setting spa_static_routing_percentage to a value in the
         * range [0, 100] will cause the system to abide by this
         * statically defined load balancing, and will therefore
         * disable all the dynamic latency and throughput (default)
         * balancing logic in this function
         */
        if (spa_static_routing_percentage <= 100) {
                spa->spa_special_to_normal_ratio =
                    spa_static_routing_percentage;
                goto out;
        }

        if (spa->spa_watermark == SPA_WM_HIGH) {
                /*
                 * Free space on the special device is too low,
                 * so we need to offload it
                 */
                spa->spa_special_to_normal_ratio = 0;
                goto out;
        }

        ASSERT(SPA_SPECIAL_SELECTION_VALID(spa_special_selection));

        switch (spa_special_selection) {
        case SPA_SPECIAL_SELECTION_LATENCY:
                spa_special_adjust_routing_latency(spa);
                break;
        case SPA_SPECIAL_SELECTION_UTILIZATION:
                spa_special_adjust_routing_utilization(spa);
                break;
        }

        /*
         * Adjust the special/normal load balancing ratio by taking
         * into account used space vs. the configurable watermarks
         * (see spa_check_watermarks() for details).
         * Note that new writes are *not* routed to the special
         * vdev when usage is above SPA_WM_HIGH
         */
        if (spa->spa_watermark == SPA_WM_LOW)
                spa->spa_special_to_normal_ratio -=
                    spa->spa_special_to_normal_ratio *
                    spa->spa_special_vdev_correction_rate / 100;

out:
#ifdef _KERNEL
        DTRACE_PROBE7(spa_adjust_routing,
            uint64_t, spa->spa_special_to_normal_ratio,
            uint64_t, stat->special_utilization,
            uint64_t, stat->normal_utilization,
            uint64_t, stat->special_latency,
            uint64_t, stat->normal_latency,
            uint64_t, stat->special_throughput,
            uint64_t, stat->normal_throughput);
#endif
        ASSERT(spa->spa_special_to_normal_ratio <= 100);
}

typedef void (*spa_load_cb)(vdev_t *, cos_acc_stat_t *);

/*
 * Recursively walk a top-level vdev's tree and invoke the callback on
 * each leaf (physical) vdev
 */
static void
spa_vdev_walk_stats(vdev_t *pvd, spa_load_cb func,
    cos_acc_stat_t *cos_acc)
{
        if (pvd->vdev_children == 0) {
                /* Single vdev (itself) */
                ASSERT(pvd->vdev_ops->vdev_op_leaf);
                DTRACE_PROBE1(spa_vdev_walk_lf, vdev_t *, pvd);
                func(pvd, cos_acc);
        } else {
                int i;
                /* Not a leaf-level vdev, has children */
                ASSERT(!pvd->vdev_ops->vdev_op_leaf);
                for (i = 0; i < pvd->vdev_children; i++) {
                        vdev_t *vd = pvd->vdev_child[i];
                        ASSERT(vd != NULL);

                        if (vd->vdev_islog || vd->vdev_ishole ||
                            vd->vdev_isspare || vd->vdev_isl2cache)
                                continue;

                        if (vd->vdev_ops->vdev_op_leaf) {
                                DTRACE_PROBE1(spa_vdev_walk_lf, vdev_t *, vd);
                                func(vd, cos_acc);
                        } else {
                                DTRACE_PROBE1(spa_vdev_walk_nl, vdev_t *, vd);
                                spa_vdev_walk_stats(vd, func, cos_acc);
                        }
                }
        }
}

/*
 * Tunable: period, in units of the stat-update interval
 * (spa_avg_stat_update_ticks), for adjusting the load distribution
 * Range: 1 - UINT64_MAX
 * Units: stat-update intervals
 */
uint64_t spa_rotor_load_adjusting = 1;

/*
 * Tunable: weighted average over the period
 * Range: 0 - 1
 * Units: boolean
 * 1: weighted average over the spa_rotor_load_adjusting period
 * 0 (default): regular average
 */
boolean_t spa_rotor_use_weight = B_FALSE;

/*
 * Retrieve the current kstat vdev statistics
 * Calculate delta values for all statistics
 * Calculate utilization and latency based on the received values
 * Update vdev_aux with the current kstat values
 * Accumulate class utilization, latency and throughput into cos_acc
 */
static void
spa_vdev_process_stat(vdev_t *vd, cos_acc_stat_t *cos_acc)
{
        uint64_t nread;         /* number of bytes read */
        uint64_t nwritten;      /* number of bytes written */
        uint64_t reads;         /* number of read operations */
        uint64_t writes;        /* number of write operations */
        uint64_t rtime;         /* cumulative run (service) time */
        uint64_t wtime;         /* cumulative wait (pre-service) time */
        uint64_t rlentime;      /* cumulative run length*time product */
        uint64_t wlentime;      /* cumulative wait length*time product */
        uint64_t rlastupdate;   /* last time run queue changed */
        uint64_t wlastupdate;   /* last time wait queue changed */
        uint64_t rcnt;          /* count of elements in run state */
        uint64_t wcnt;          /* count of elements in wait state */

        /*
         * average vdev utilization, measured as the percentage
         * of time for which the device was busy servicing I/O
         * requests during the sample interval
         */
        uint64_t utilization = 0;

        /*
         * average vdev throughput for read and write
         * in kilobytes per second
         */
        uint64_t throughput = 0;

        /* average vdev input/output operations per second */
        uint64_t iops = 0;

        /*
         * average number of commands being processed in the active
         * queue that the vdev is working on simultaneously
         */
        uint64_t run_len = 0;

        /*
         * average number of commands waiting in the queues that
         * have not been sent to the vdev yet
         */
        uint64_t wait_len = 0;

        /* average total queue: wait_len + run_len */
        uint64_t queue_len = 0;

        /*
         * average time for an operation to complete after
         * it has been dequeued from the wait queue
         */
        uint64_t run_time = 0;

        /* average time for which operations are queued before they are run */
        uint64_t wait_time = 0;

        /* average time to queue and complete an I/O operation */
        uint64_t service_time = 0;

        vdev_aux_stat_t *vdev_aux = &vd->vdev_aux_stat;
        kstat_t *kstat = vd->vdev_iokstat;
        kstat_io_t *kdata = kstat->ks_data;

        /* retrieve current kstat values for the vdev */
        mutex_enter(kstat->ks_lock);

        nread = kdata->nread;
        nwritten = kdata->nwritten;
        reads = kdata->reads;
        writes = kdata->writes;
        rtime = kdata->rtime;
        wtime = kdata->wtime;
        rlentime = kdata->rlentime;
        wlentime = kdata->wlentime;
        rlastupdate = kdata->rlastupdate;
        wlastupdate = kdata->wlastupdate;
        rcnt = kdata->rcnt;
        wcnt = kdata->wcnt;

        mutex_exit(kstat->ks_lock);

        /* convert high-res time to nanoseconds */
#ifdef _KERNEL
        scalehrtime((hrtime_t *)&rtime);
        scalehrtime((hrtime_t *)&wtime);
        scalehrtime((hrtime_t *)&rlentime);
        scalehrtime((hrtime_t *)&wlentime);
        scalehrtime((hrtime_t *)&rlastupdate);
        scalehrtime((hrtime_t *)&wlastupdate);
#endif

        /*
         * On the first stats-updating iteration (saved wlastupdate == 0)
         * we only initialize the saved counters; otherwise compute the
         * deltas against the previous sample
         */
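        /*
         * Example with hypothetical numbers: 500ms of busy time
         * (rtime_delta) accumulated over a 1s sample (wlastupdate_delta)
         * gives a utilization of 50%; 10MB transferred over that same
         * second gives a throughput of 10240 KB/s.
         */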
        if (vdev_aux->wlastupdate != 0) {
                /* Calculate deltas for the vdev statistics */
                uint64_t nread_delta = nread - vdev_aux->nread;
                uint64_t nwritten_delta = nwritten - vdev_aux->nwritten;
                uint64_t reads_delta = reads - vdev_aux->reads;
                uint64_t writes_delta = writes - vdev_aux->writes;
                uint64_t rtime_delta = rtime - vdev_aux->rtime;
                uint64_t rlentime_delta = rlentime - vdev_aux->rlentime;
                uint64_t wlentime_delta = wlentime - vdev_aux->wlentime;
                uint64_t wlastupdate_delta = wlastupdate -
                    vdev_aux->wlastupdate;

                if (wlastupdate_delta != 0) {
                        /* busy: proportion of the time as a percentage */
                        utilization = 100 * rtime_delta / wlastupdate_delta;
                        if (utilization > 100)
                                utilization = 100;
                        /* throughput: kilobytes per second */
                        throughput = NANOSEC * (nread_delta + nwritten_delta) /
                            wlastupdate_delta / 1024;
                        /* input/output operations per second */
                        iops = NANOSEC * (reads_delta + writes_delta) /
                            wlastupdate_delta;
                        run_len = rlentime_delta / wlastupdate_delta;
                        wait_len = wlentime_delta / wlastupdate_delta;
                        queue_len = run_len + wait_len;
                }

                if (iops != 0) {
                        /* latency: milliseconds */
                        run_time = 1000 * run_len / iops;
                        wait_time = 1000 * wait_len / iops;
                        service_time = run_time + wait_time;
                }
        }

        /* update the previously saved kstat values */
        vdev_aux->nread = nread;
        vdev_aux->nwritten = nwritten;
        vdev_aux->reads = reads;
        vdev_aux->writes = writes;
        vdev_aux->rtime = rtime;
        vdev_aux->wtime = wtime;
        vdev_aux->rlentime = rlentime;
        vdev_aux->wlentime = wlentime;
        vdev_aux->rlastupdate = rlastupdate;
        vdev_aux->wlastupdate = wlastupdate;
        vdev_aux->rcnt = rcnt;
        vdev_aux->wcnt = wcnt;

        /* accumulate the current class values */
        cos_acc->utilization += utilization;
        cos_acc->throughput += throughput;
        cos_acc->iops += iops;
        cos_acc->run_len += run_len;
        cos_acc->wait_len += wait_len;
        cos_acc->queue_len += queue_len;
        cos_acc->run_time += run_time;
        cos_acc->wait_time += wait_time;
        cos_acc->service_time += service_time;
        cos_acc->count++;

#ifdef _KERNEL
        DTRACE_PROBE8(spa_vdev_stat,
            char *, vd->vdev_path,
            uint64_t, utilization,
            uint64_t, throughput,
            uint64_t, iops,
            uint64_t, run_len,
            uint64_t, wait_len,
            uint64_t, run_time,
            uint64_t, wait_time);
#endif
}

/*
 * Gather and accumulate spa average statistics for the special and
 * normal classes
 */
static void
spa_class_collect_stats(spa_t *spa, spa_acc_stat_t *spa_acc, uint64_t weight)
{
        vdev_t *rvd = spa->spa_root_vdev;
        cos_acc_stat_t special_acc, normal_acc;
        int i;

        ASSERT(rvd != NULL);

        bzero(&special_acc, sizeof (cos_acc_stat_t));
        bzero(&normal_acc, sizeof (cos_acc_stat_t));

        /*
         * Walk the top-level vdevs and calculate average
         * stats for the normal and special classes
         */
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

        for (i = 0; i < rvd->vdev_children; i++) {
                vdev_t *vd = rvd->vdev_child[i];
                ASSERT(vd != NULL);

                if (vd->vdev_islog || vd->vdev_ishole ||
                    vd->vdev_isspare || vd->vdev_isl2cache)
                        continue;

                if (vd->vdev_isspecial)
                        spa_vdev_walk_stats(vd, spa_vdev_process_stat,
                            &special_acc);
                else
                        spa_vdev_walk_stats(vd, spa_vdev_process_stat,
                            &normal_acc);
        }

        spa_config_exit(spa, SCL_VDEV, FTAG);

        if (special_acc.count == 0 || normal_acc.count == 0)
                return;

        /*
         * Locally accumulate (sum up) the spa and per-class throughput,
         * latency and utilization stats.  At the end of each iteration
         * the resulting sums are divided by the number of samples to
         * produce the averages.
         */

        spa_acc->spa_utilization +=
            weight * (special_acc.utilization + normal_acc.utilization) /
            (special_acc.count + normal_acc.count);

        spa_acc->special_utilization +=
            weight * special_acc.utilization / special_acc.count;
        spa_acc->special_latency +=
            weight * special_acc.service_time / special_acc.count;
        spa_acc->special_throughput +=
            weight * special_acc.throughput / special_acc.count;

        spa_acc->normal_utilization +=
            weight * normal_acc.utilization / normal_acc.count;
        spa_acc->normal_latency +=
            weight * normal_acc.service_time / normal_acc.count;
        spa_acc->normal_throughput +=
            weight * normal_acc.throughput / normal_acc.count;

        spa_acc->count += weight;
}

/*
 * Update spa statistics for the special and normal classes; the
 * accumulated averages are published every spa_rotor_load_adjusting-th
 * invocation
 */
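/*
 * When spa_rotor_use_weight is set, later samples within the
 * spa_rotor_load_adjusting window carry proportionally larger weights
 * (1, 2, ... up to spa_rotor_load_adjusting), so the published averages
 * are biased toward the most recent measurements.
 */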
static void
spa_load_stats_update(spa_t *spa, spa_acc_stat_t *spa_acc, uint64_t rotor)
{
        spa_avg_stat_t *spa_avg = &spa->spa_avg_stat;
        uint64_t residue, weight = 1;

        residue = rotor % spa_rotor_load_adjusting;

        if (spa_rotor_use_weight)
                weight = residue ? residue : spa_rotor_load_adjusting;

        spa_class_collect_stats(spa, spa_acc, weight);

        if (residue == 0 && spa_acc->count != 0) {
                spa_avg->spa_utilization =
                    spa_acc->spa_utilization / spa_acc->count;

                spa_avg->special_utilization =
                    spa_acc->special_utilization / spa_acc->count;
                spa_avg->normal_utilization =
                    spa_acc->normal_utilization / spa_acc->count;

                spa_avg->special_latency =
                    spa_acc->special_latency / spa_acc->count;
                spa_avg->normal_latency =
                    spa_acc->normal_latency / spa_acc->count;

                spa_avg->special_throughput =
                    spa_acc->special_throughput / spa_acc->count;
                spa_avg->normal_throughput =
                    spa_acc->normal_throughput / spa_acc->count;
        }
}

static void
spa_special_dedup_adjust(spa_t *spa)
{
        spa_avg_stat_t *spa_avg = &spa->spa_avg_stat;
        int percentage;

        /*
         * if special_utilization < dedup_lo, then percentage = 100;
         * if special_utilization > dedup_hi, then percentage = 0;
         * otherwise, the percentage changes linearly from 100 to 0
         * as special_utilization moves from dedup_lo to dedup_hi
         */
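        /*
         * For example (hypothetical thresholds): with
         * spa_dedup_lo_best_effort = 30 and spa_dedup_hi_best_effort = 70,
         * a special-class utilization of 50% results in
         * 100 - 100 * (50 - 30) / (70 - 30) = 50, i.e. dedup is attempted
         * for about half of the eligible writes.
         */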
        percentage = 100 - 100 *
            (spa_avg->special_utilization - spa->spa_dedup_lo_best_effort) /
            (spa->spa_dedup_hi_best_effort - spa->spa_dedup_lo_best_effort);
        /* enforce proper percentage limits */
        percentage = MIN(percentage, 100);
        percentage = MAX(percentage, 0);

        spa->spa_dedup_percentage = percentage;
}

/*
 * Tunable: period (~10ms per tick) for updating the spa vdev stats
 * Range: 1 - UINT64_MAX
 * Units: clock ticks
 * In most cases 75 is a good value; the recommended range is 50 - 200.
 */
clock_t spa_avg_stat_update_ticks = 75;

/* Performance monitor thread */
static void
spa_perfmon_thread(void *void_spa)
{
        spa_t *spa = void_spa;
        spa_perfmon_data_t *data = &spa->spa_perfmon;
        spa_acc_stat_t spa_acc;
        uint64_t rotor = 0;

        ASSERT(data != NULL);

        DTRACE_PROBE1(spa_pm_start, spa_t *, spa);

        /* take a reference against the spa */
        mutex_enter(&spa_namespace_lock);
        spa_open_ref(spa, FTAG);
        mutex_exit(&spa_namespace_lock);
        bzero(&spa_acc, sizeof (spa_acc_stat_t));

        while (spa->spa_state != POOL_STATE_UNINITIALIZED &&
            !data->perfmon_thr_exit) {
                clock_t deadline, timeleft = 1;

                /*
                 * do the monitoring work here: gather average
                 * spa utilization, latency and throughput statistics
                 */
                DTRACE_PROBE1(spa_pm_work, spa_t *, spa);
                spa_load_stats_update(spa, &spa_acc, rotor);

                /* we can adjust load and dedup at the same time */
                if (rotor % spa_rotor_load_adjusting == 0) {
                        spa_special_adjust_routing(spa);
                        bzero(&spa_acc, sizeof (spa_acc_stat_t));
                }
                if (spa->spa_dedup_best_effort)
                        spa_special_dedup_adjust(spa);

                /* wait for the next tick */
                DTRACE_PROBE1(spa_pm_sleep, spa_t *, spa);
                deadline = ddi_get_lbolt() + spa_avg_stat_update_ticks;
                mutex_enter(&data->perfmon_lock);
                while (timeleft > 0 &&
                    spa->spa_state != POOL_STATE_UNINITIALIZED &&
                    !data->perfmon_thr_exit) {
                        timeleft = cv_timedwait(&data->perfmon_cv,
                            &data->perfmon_lock, deadline);
                }
                mutex_exit(&data->perfmon_lock);
                ++rotor;
        }

        /* release the reference against the spa */
        mutex_enter(&spa_namespace_lock);
        spa_close(spa, FTAG);
        mutex_exit(&spa_namespace_lock);

        DTRACE_PROBE1(spa_pm_stop, spa_t *, spa);
        thread_exit();
}

void
spa_start_perfmon_thread(spa_t *spa)
{
        spa_perfmon_data_t *data = &spa->spa_perfmon;

        /* not a "real" spa import/create, do not start the thread */
        if (strcmp(spa->spa_name, TRYIMPORT_NAME) == 0)
                return;

        mutex_enter(&data->perfmon_lock);

        if (data->perfmon_thread == NULL) {
                DTRACE_PROBE1(spa_start_perfmon_act, spa_t *, spa);
                data->perfmon_thr_exit = B_FALSE;
#ifdef _KERNEL
                data->perfmon_thread = thread_create(NULL, 0,
                    spa_perfmon_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
#endif
        }

        mutex_exit(&data->perfmon_lock);
}

boolean_t
spa_stop_perfmon_thread(spa_t *spa)
{
        spa_perfmon_data_t *data = &spa->spa_perfmon;
        mutex_enter(&data->perfmon_lock);

        if (data->perfmon_thread != NULL) {
                DTRACE_PROBE1(spa_stop_perfmon_act, spa_t *, spa);
                data->perfmon_thr_exit = B_TRUE;
                cv_signal(&data->perfmon_cv);
                mutex_exit(&data->perfmon_lock);
#ifdef _KERNEL
                thread_join(data->perfmon_thread->t_did);
#endif
                data->perfmon_thread = NULL;
                return (B_TRUE);
        }

        mutex_exit(&data->perfmon_lock);
        return (B_FALSE);
}

/* Functions called from other facilities */
void
zio_best_effort_dedup(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        zio_prop_t *zp = &zio->io_prop;
        uint64_t val;

        if (spa->spa_dedup_best_effort == 0)
                return;

        val = atomic_inc_64_nv(&spa->spa_dedup_rotor);
        if ((val % 100) >= spa->spa_dedup_percentage)
                zp->zp_dedup = 0;
}

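/*
 * Returns B_TRUE if the given leaf vdev has recorded checksum, read or
 * write errors, or is currently unreadable or unwriteable.
 */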
static boolean_t
spa_has_special_child_errors(vdev_t *vd)
{
        vdev_stat_t *vs = &vd->vdev_stat;

        return (vs->vs_checksum_errors != 0 || vs->vs_read_errors != 0 ||
            vs->vs_write_errors != 0 || !vdev_readable(vd) ||
            !vdev_writeable(vd));
}

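/*
 * Recursively check a special top-level vdev and its children for errors.
 * Returns 0 if the subtree is error-free and -1 otherwise.
 */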
static int
spa_special_check_errors_children(vdev_t *pvd)
{
        int rc = 0;

        if (pvd->vdev_children == 0) {
                if (spa_has_special_child_errors(pvd))
                        rc = -1;
        } else {
                ASSERT(!pvd->vdev_ops->vdev_op_leaf);
                for (size_t i = 0; i < pvd->vdev_children; i++) {
                        vdev_t *vd = pvd->vdev_child[i];
                        ASSERT(vd != NULL);

                        if (vd->vdev_ops->vdev_op_leaf) {
                                if (spa_has_special_child_errors(vd)) {
                                        rc = -1;
                                        break;
                                }
                        } else {
                                rc = spa_special_check_errors_children(vd);
                                if (rc != 0)
                                        break;
                        }
                }
        }

        return (rc);
}

/*
 * This function is called from dsl_scan_done(), which executes in
 * syncing context.  Here we walk over all top-level vdevs to find the
 * special vdevs and check them for errors.
 *
 * If the special vdevs do not have errors, we clear the flag that
 * prevents writes to the special class
 */
void
spa_special_check_errors(spa_t *spa)
{
        vdev_t *rvd;
        boolean_t clean_special_err_flag = B_TRUE;

        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

        rvd = spa->spa_root_vdev;
        for (size_t i = 0; i < rvd->vdev_children; i++) {
                vdev_t *vd = rvd->vdev_child[i];
                ASSERT(vd != NULL);

                if (!vd->vdev_isspecial)
                        continue;

                if (spa_special_check_errors_children(vd) != 0) {
                        clean_special_err_flag = B_FALSE;
                        break;
                }
        }

        spa_config_exit(spa, SCL_VDEV, FTAG);

        if (clean_special_err_flag)
                spa->spa_special_has_errors = B_FALSE;
}

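/*
 * Remove a special top-level vdev from the pool.  The removal is refused
 * if WBC is still active or if the device holds metadata that cannot be
 * migrated.  Called with the spa_namespace_lock held in the vdev
 * configuration transaction identified by *txg.
 */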
int
spa_special_vdev_remove(spa_t *spa, vdev_t *vd, uint64_t *txg)
{
        int err;
        metaslab_group_t *mg;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(vdev_is_special(vd));

        if (vd != vd->vdev_top)
                return (SET_ERROR(ENOTSUP));

        if (spa_feature_is_active(spa, SPA_FEATURE_WBC)) {
                /*
                 * WBC is still active, so we cannot remove
                 * the special vdev at this time
                 */
                return (SET_ERROR(EBUSY));
        }

        mg = vd->vdev_mg;

        /*
         * Stop allocating from this vdev.
         */
        metaslab_group_passivate(mg);

        /*
         * Wait for the youngest allocations and frees to sync,
         * and then wait for the deferral of those frees to finish.
         */
        spa_vdev_config_exit(spa, NULL,
            *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

        if (vd->vdev_stat.vs_alloc != 0) {
                /* Make sure that the special vdev does not have ZIL data */
                err = spa_offline_log(spa);

                if (err != 0 || vd->vdev_stat.vs_alloc != 0) {
                        /*
                         * err == 0 means all ZIL data is gone, but we are
                         * here because the special vdev still contains
                         * metadata that we cannot migrate.
                         * It is possible that the user has enabled some of
                         * the *_to_metadev properties, but we cannot migrate
                         * metadata from the special vdev to a normal vdev
                         * in this case.
                         */
                        if (err == 0)
                                err = SET_ERROR(EEXIST);

                        *txg = spa_vdev_config_enter(spa);
                        metaslab_group_activate(mg);
                        return (err);
                }
        }

        *txg = spa_vdev_config_enter(spa);

        vd->vdev_removing = B_TRUE;
        vdev_dirty_leaves(vd, VDD_DTL, *txg);
        vdev_config_dirty(vd);

        /* This exit is required to sync the dirty configuration */
        spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);

        if (spa_feature_is_active(spa, SPA_FEATURE_META_DEVICES)) {
                dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
                    spa_last_synced_txg(spa) + 1);

                spa_feature_decr(spa, SPA_FEATURE_META_DEVICES, tx);
                dmu_tx_commit(tx);
        }

        *txg = spa_vdev_config_enter(spa);

        /*
         * Release the references to CoS descriptors, if any
         */
        if (vd->vdev_queue.vq_cos) {
                cos_rele(vd->vdev_queue.vq_cos);
                vd->vdev_queue.vq_cos = NULL;
        }

        return (0);
}