Side-port of OS-2943 zone stuck 'down': references still extant (illumos-joyent 4cb09b44b4f851905a0e8cccbd9bfc834acc2041)
    
      
          --- old/usr/src/uts/common/fs/zfs/zfs_zone.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_zone.c
   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
  
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13      - * Copyright 2013, Joyent, Inc. All rights reserved.
       13 + * Copyright 2014, Joyent, Inc. All rights reserved.
  14   14   */
  15   15  
  16   16  /*
  17   17   * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
  18   18   * ZFS I/O resources for each zone.
  19   19   *
   20   20   * I/O contention can be a major pain point on a multi-tenant system. A single
  21   21   * zone can issue a stream of I/O operations, usually synchronous writes, which
  22   22   * disrupt I/O performance for all other zones. This problem is further
  23   23   * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
  24   24   * a set of blocks which are atomically synced to disk. The process of
  25   25   * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
  26   26   * out any pending read operations.
  27   27   *
  28   28   * There are two facets to this capability; the throttle and the scheduler.
  29   29   *
  30   30   * Throttle
  31   31   *
  32   32   * The requirements on the throttle are:
  33   33   *
  34   34   *     1) Ensure consistent and predictable I/O latency across all zones.
  35   35   *     2) Sequential and random workloads have very different characteristics,
  36   36   *        so it is a non-starter to track IOPS or throughput.
  37   37   *     3) A zone should be able to use the full disk bandwidth if no other zone
  38   38   *        is actively using the disk.
  39   39   *
  40   40   * The throttle has two components: one to track and account for each zone's
  41   41   * I/O requests, and another to throttle each zone's operations when it
  42   42   * exceeds its fair share of disk I/O. When the throttle detects that a zone is
  43   43   * consuming more than is appropriate, each read or write system call is
  44   44   * delayed by up to 100 microseconds, which we've found is sufficient to allow
  45   45   * other zones to interleave I/O requests during those delays.
  46   46   *
  47   47   * Note: The throttle will delay each logical I/O (as opposed to the physical
  48   48   * I/O which will likely be issued asynchronously), so it may be easier to
  49   49   * think of the I/O throttle delaying each read/write syscall instead of the
  50   50   * actual I/O operation. For each zone, the throttle tracks an ongoing average
  51   51   * of read and write operations performed to determine the overall I/O
  52   52   * utilization for each zone.
  53   53   *
   54   54   * The throttle calculates an I/O utilization metric for each zone using the
  55   55   * following formula:
  56   56   *
  57   57   *     (# of read syscalls) x (Average read latency) +
  58   58   *     (# of write syscalls) x (Average write latency)
  59   59   *
  60   60   * Once each zone has its utilization metric, the I/O throttle will compare I/O
  61   61   * utilization across all zones, and if a zone has a higher-than-average I/O
  62   62   * utilization, system calls from that zone are throttled. That is, if one
  63   63   * zone has a much higher utilization, that zone's delay is increased by 5
  64   64   * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
  65   65   * already throttled and has a lower utilization than average, its delay will
  66   66   * be lowered by 5 microseconds.
  67   67   *
  68   68   * The throttle calculation is driven by IO activity, but since IO does not
  69   69   * happen at fixed intervals, timestamps are used to track when the last update
  70   70   * was made and to drive recalculation.
  71   71   *
  72   72   * The throttle recalculates each zone's I/O usage and throttle delay (if any)
  73   73   * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
  74   74   * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
  75   75   *
  76   76   * Scheduler
  77   77   *
  78   78   * The I/O scheduler manages the vdev queues, the queues of pending I/Os to
  79   79   * issue to the disks. It only makes scheduling decisions for the two
  80   80   * synchronous I/O queues (read & write).
  81   81   *
   82   82   * The scheduler tracks how many I/Os in the queue are from each zone, and
  83   83   * if one zone has a disproportionately large number of I/Os in the queue, the
  84   84   * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
  85   85   * and pulled from the middle of the queue. This bump allows zones with a small
  86   86   * number of I/Os (so small they may not even be taken into account by the
  87   87   * throttle) to complete quickly instead of waiting behind dozens of I/Os from
  88   88   * other zones.
  89   89   */
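/*
 * Illustrative sketch (not part of this file; all names are hypothetical):
 * the utilization metric and the 5 us step / 100 us ceiling described
 * above, reduced to plain arithmetic.  The real throttle additionally
 * gates the increment on overall disk overutilization and on more than
 * one zone being active (see zfs_zone_wait_adjust_delay_cb below).
 */
#include <stdint.h>

#define	EXAMPLE_DELAY_STEP	5	/* usec added/removed per adjustment */
#define	EXAMPLE_DELAY_CEILING	100	/* usec maximum per-syscall delay */

static uint64_t
example_zone_util(uint64_t rd_ops, uint64_t avg_rd_lat_us,
    uint64_t wr_ops, uint64_t avg_wr_lat_us)
{
	/* e.g. 100 reads at 1200 us + 50 writes at 800 us = 160000 */
	return (rd_ops * avg_rd_lat_us + wr_ops * avg_wr_lat_us);
}

static uint16_t
example_adjust_delay(uint16_t cur_delay, uint64_t zone_util, uint64_t fair_util)
{
	if (zone_util > fair_util && cur_delay < EXAMPLE_DELAY_CEILING)
		return (cur_delay + EXAMPLE_DELAY_STEP);
	if (zone_util < fair_util && cur_delay > 0)
		return (cur_delay - EXAMPLE_DELAY_STEP);
	return (cur_delay);
}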
  90   90  
  91   91  #include <sys/spa.h>
  92   92  #include <sys/vdev_impl.h>
  93   93  #include <sys/zfs_zone.h>
  94   94  
  95   95  #ifndef _KERNEL
  96   96  
  97   97  /*
  98   98   * Stubs for when compiling for user-land.
  99   99   */
 100  100  
 101  101  void
 102  102  zfs_zone_io_throttle(zfs_zone_iop_type_t type)
 103  103  {
 104  104  }
 105  105  
 106  106  void
 107  107  zfs_zone_zio_init(zio_t *zp)
 108  108  {
 109  109  }
 110  110  
 111  111  void
 112  112  zfs_zone_zio_start(zio_t *zp)
 113  113  {
 114  114  }
 115  115  
 116  116  void
 117  117  zfs_zone_zio_done(zio_t *zp)
 118  118  {
 119  119  }
 120  120  
 121  121  void
 122  122  zfs_zone_zio_dequeue(zio_t *zp)
 123  123  {
 124  124  }
 125  125  
 126  126  void
 127  127  zfs_zone_zio_enqueue(zio_t *zp)
 128  128  {
 129  129  }
 130  130  
 131  131  /*ARGSUSED*/
 132  132  void
 133  133  zfs_zone_report_txg_sync(void *dp)
 134  134  {
 135  135  }
 136  136  
 137  137  hrtime_t
 138  138  zfs_zone_txg_delay()
 139  139  {
 140  140          return (MSEC2NSEC(10));
 141  141  }
 142  142  
 143  143  #else
 144  144  
 145  145  /*
 146  146   * The real code.
 147  147   */
 148  148  
 149  149  #include <sys/systm.h>
 150  150  #include <sys/thread.h>
 151  151  #include <sys/proc.h>
 152  152  #include <sys/types.h>
 153  153  #include <sys/param.h>
 154  154  #include <sys/time.h>
 155  155  #include <sys/atomic.h>
 156  156  #include <sys/zio.h>
 157  157  #include <sys/zone.h>
 158  158  #include <sys/avl.h>
 159  159  #include <sys/sdt.h>
 160  160  #include <sys/ddi.h>
 161  161  
 162  162  /*
 163  163   * The zone throttle delays read and write operations from certain zones based
  164  164   * on each zone's IO utilization.  Once a cycle (defined by zfs_zone_cycle_time
 165  165   * below), the delays for each zone are recalculated based on the utilization
 166  166   * over the previous window.
 167  167   */
 168  168  boolean_t       zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
 169  169  uint16_t        zfs_zone_delay_step = 5;        /* usec amnt to change delay */
 170  170  uint16_t        zfs_zone_delay_ceiling = 100;   /* usec delay max */
 171  171  
 172  172  boolean_t       zfs_zone_priority_enable = B_TRUE;  /* enable IO priority */
 173  173  
 174  174  /*
 175  175   * For certain workloads, one zone may be issuing primarily sequential I/O and
 176  176   * another primarily random I/O.  The sequential I/O will complete much more
 177  177   * quickly than the random I/O, driving the average system latency for those
 178  178   * operations way down.  As a result, the random I/O may be throttled back, even
 179  179   * though the sequential I/O should be throttled to allow the random I/O more
 180  180   * access to the disk.
 181  181   *
 182  182   * This tunable limits the discrepancy between the read and write system
 183  183   * latency.  If one becomes excessively high, this tunable prevents the I/O
 184  184   * throttler from exacerbating the imbalance.
 185  185   */
 186  186  uint_t          zfs_zone_rw_lat_limit = 10;
 187  187  
 188  188  /*
 189  189   * The I/O throttle will only start delaying zones when it detects disk
 190  190   * utilization has reached a certain level.  This tunable controls the
 191  191   * threshold at which the throttle will start delaying zones.  When the number
 192  192   * of vdevs is small, the calculation should correspond closely with the %b
 193  193   * column from iostat -- but as the number of vdevs becomes large, it will
 194  194   * correlate less and less to any single device (therefore making it a poor
 195  195   * approximation for the actual I/O utilization on such systems).  We
 196  196   * therefore use our derived utilization conservatively:  we know that low
 197  197   * derived utilization does indeed correlate to low I/O use -- but that a high
  198  198   * rate of derived utilization does not necessarily alone denote saturation;
 199  199   * where we see a high rate of utilization, we also look for laggard I/Os to
 200  200   * attempt to detect saturation.
 201  201   */
 202  202  uint_t          zfs_zone_util_threshold = 80;
 203  203  uint_t          zfs_zone_underutil_threshold = 60;
 204  204  
 205  205  /*
 206  206   * There are three important tunables here:  zfs_zone_laggard_threshold denotes
 207  207   * the threshold at which an I/O is considered to be of notably high latency;
 208  208   * zfs_zone_laggard_recent denotes the number of microseconds before the
 209  209   * current time after which the last laggard is considered to be sufficiently
 210  210   * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
 211  211   * the microseconds before the current time before which the last laggard is
 212  212   * considered to be sufficiently old to merit decreasing the throttle.  The
 213  213   * most important tunable of these three is the zfs_zone_laggard_threshold: in
 214  214   * modeling data from a large public cloud, this tunable was found to have a
 215  215   * much greater effect on the throttle than the two time-based thresholds.
 216  216   * This must be set high enough to not result in spurious throttling, but not
 217  217   * so high as to allow pathological I/O to persist in the system.
 218  218   */
 219  219  uint_t          zfs_zone_laggard_threshold = 50000;     /* 50 ms */
 220  220  uint_t          zfs_zone_laggard_recent = 1000000;      /* 1000 ms */
 221  221  uint_t          zfs_zone_laggard_ancient = 5000000;     /* 5000 ms */
 222  222  
 223  223  /*
 224  224   * Throughout this subsystem, our timestamps are in microseconds.  Our system
 225  225   * average cycle is one second or 1 million microseconds.  Our zone counter
 226  226   * update cycle is two seconds or 2 million microseconds.  We use a longer
 227  227   * duration for that cycle because some ops can see a little over two seconds of
 228  228   * latency when they are being starved by another zone.
 229  229   */
 230  230  uint_t          zfs_zone_sys_avg_cycle = 1000000;       /* 1 s */
 231  231  uint_t          zfs_zone_cycle_time = 2000000;          /* 2 s */
 232  232  
 233  233  /*
 234  234   * How often the I/O throttle will reevaluate each zone's utilization, in
 235  235   * microseconds. Default is 1/4 sec.
 236  236   */
 237  237  uint_t          zfs_zone_adjust_time = 250000;          /* 250 ms */
 238  238  
 239  239  typedef struct {
 240  240          hrtime_t        cycle_start;
 241  241          int             cycle_cnt;
 242  242          hrtime_t        cycle_lat;
 243  243          hrtime_t        sys_avg_lat;
 244  244  } sys_lat_cycle_t;
 245  245  
 246  246  typedef struct {
 247  247          hrtime_t zi_now;
 248  248          uint_t zi_avgrlat;
 249  249          uint_t zi_avgwlat;
 250  250          uint64_t zi_totpri;
 251  251          uint64_t zi_totutil;
 252  252          int zi_active;
 253  253          uint_t zi_diskutil;
 254  254          boolean_t zi_underutil;
 255  255          boolean_t zi_overutil;
 256  256  } zoneio_stats_t;
 257  257  
 258  258  static sys_lat_cycle_t  rd_lat;
 259  259  static sys_lat_cycle_t  wr_lat;
 260  260  
 261  261  /*
 262  262   * Some basic disk stats to determine disk utilization. The utilization info
 263  263   * for all disks on the system is aggregated into these values.
 264  264   *
 265  265   * Overall disk utilization for the current cycle is calculated as:
 266  266   *
 267  267   * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
 268  268   * ----------------------------------------------
 269  269   *    ((now - zfs_zone_last_checked) * 1000);
 270  270   */
 271  271  kmutex_t        zfs_disk_lock;          /* protects the following: */
 272  272  uint_t          zfs_disk_rcnt;          /* Number of outstanding IOs */
  273  273  hrtime_t        zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
 274  274  hrtime_t        zfs_disk_rlastupdate = 0; /* time last IO dispatched */
 275  275  
 276  276  hrtime_t        zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
 277  277  /* time that we last updated per-zone throttle info */
 278  278  hrtime_t        zfs_zone_last_checked = 0;
 279  279  hrtime_t        zfs_disk_last_laggard = 0;
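/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * the derived disk utilization formula shown above with concrete numbers.
 * The rtime delta is in nanoseconds and the wall-clock window is in
 * microseconds, hence the extra factor of 1000 in the denominator.
 */
static unsigned int
example_disk_util(long long rtime_delta_ns, long long window_us)
{
	/* e.g. 150 ms of busy time over a 250 ms window => 60 (percent) */
	return ((unsigned int)((rtime_delta_ns * 100) / (window_us * 1000)));
}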
 280  280  
 281  281  /*
 282  282   * Data used to keep track of how often txg sync is running.
 283  283   */
 284  284  extern int      zfs_txg_timeout;
 285  285  static uint_t   txg_last_check;
 286  286  static uint_t   txg_cnt;
 287  287  static uint_t   txg_sync_rate;
 288  288  
 289  289  boolean_t       zfs_zone_schedule_enable = B_TRUE;      /* enable IO sched. */
 290  290  /*
 291  291   * Threshold for when zio scheduling should kick in.
 292  292   *
 293  293   * This threshold is based on the zfs_vdev_sync_read_max_active value for the
 294  294   * number of I/Os that can be pending on a device.  If there are more than the
 295  295   * max_active ops already queued up, beyond those already issued to the vdev,
 296  296   * then use zone-based scheduling to get the next synchronous zio.
 297  297   */
 298  298  uint32_t        zfs_zone_schedule_thresh = 10;
 299  299  
 300  300  /*
 301  301   * On each pass of the scheduler we increment the zone's weight (up to this
 302  302   * maximum). The weight is used by the scheduler to prevent starvation so
 303  303   * that zones which haven't been able to do any IO over many iterations
  304  304   * will max out their weight to this value.
 305  305   */
 306  306  #define SCHED_WEIGHT_MAX        20
 307  307  
 308  308  /*
 309  309   * Tunables for delay throttling when TXG sync is occurring.
 310  310   *
 311  311   * If the zone is performing a write and we're doing above normal TXG syncing,
 312  312   * then throttle for longer than normal. The zone's wait time is multiplied
 313  313   * by the scale (zfs_zone_txg_throttle_scale).
 314  314   */
 315  315  int             zfs_zone_txg_throttle_scale = 2;
 316  316  hrtime_t        zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
 317  317  
 318  318  typedef struct {
 319  319          int             zq_qdepth;
 320  320          zio_priority_t  zq_queue;
 321  321          int             zq_priority;
 322  322          int             zq_wt;
 323  323          zoneid_t        zq_zoneid;
 324  324  } zone_q_bump_t;
 325  325  
 326  326  /*
 327  327   * This uses gethrtime() but returns a value in usecs.
 328  328   */
 329  329  #define GET_USEC_TIME           (gethrtime() / 1000)
 330  330  #define NANO_TO_MICRO(x)        (x / (NANOSEC / MICROSEC))
 331  331  
 332  332  /*
 333  333   * Keep track of the zone's ZFS IOPs.
 334  334   *
 335  335   * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
 336  336   * accounted for.
 337  337   *
 338  338   * If the number of ops is >1 then we can just use that value.  However,
 339  339   * if the number of ops is <2 then we might have a zone which is trying to do
 340  340   * IO but is not able to get any ops through the system.  We don't want to lose
 341  341   * track of this zone so we factor in its decayed count into the current count.
 342  342   *
  343  343   * Each cycle (zfs_zone_cycle_time) we want to update the decayed count.
 344  344   * However, since this calculation is driven by IO activity and since IO does
 345  345   * not happen at fixed intervals, we use a timestamp to see when the last update
 346  346   * was made.  If it was more than one cycle ago, then we need to decay the
 347  347   * historical count by the proper number of additional cycles in which no IO was
 348  348   * performed.
 349  349   *
 350  350   * Return a time delta indicating how far into the current cycle we are or 0
 351  351   * if the last IO was more than a cycle ago.
 352  352   */
 353  353  static hrtime_t
 354  354  compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
 355  355  {
 356  356          hrtime_t delta;
 357  357          int     gen_cnt;
 358  358  
 359  359          /*
  360  360           * Check if it's time to recompute a new zone count.
 361  361           * If we're still collecting data for the current cycle, return false.
 362  362           */
 363  363          delta = unow - cp->cycle_start;
 364  364          if (delta < zfs_zone_cycle_time)
 365  365                  return (delta);
 366  366  
 367  367          /* A previous cycle is past, compute the new zone count. */
 368  368  
 369  369          /*
 370  370           * Figure out how many generations we have to decay the historical
 371  371           * count, since multiple cycles may have elapsed since our last IO.
 372  372           * We depend on int rounding here.
 373  373           */
 374  374          gen_cnt = (int)(delta / zfs_zone_cycle_time);
 375  375  
  376  376          /* If more than 5 cycles since the last IO, reset count. */
 377  377          if (gen_cnt > 5) {
 378  378                  cp->zone_avg_cnt = 0;
 379  379          } else {
 380  380                  /* Update the count. */
 381  381                  int     i;
 382  382  
 383  383                  /*
 384  384                   * If the zone did more than 1 IO, just use its current count
 385  385                   * as the historical value, otherwise decay the historical
 386  386                   * count and factor that into the new historical count.  We
 387  387                   * pick a threshold > 1 so that we don't lose track of IO due
 388  388                   * to int rounding.
 389  389                   */
 390  390                  if (cp->cycle_cnt > 1)
 391  391                          cp->zone_avg_cnt = cp->cycle_cnt;
 392  392                  else
 393  393                          cp->zone_avg_cnt = cp->cycle_cnt +
 394  394                              (cp->zone_avg_cnt / 2);
 395  395  
 396  396                  /*
 397  397                   * If more than one generation has elapsed since the last
 398  398                   * update, decay the values further.
 399  399                   */
 400  400                  for (i = 1; i < gen_cnt; i++)
 401  401                          cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
 402  402          }
 403  403  
 404  404          /* A new cycle begins. */
 405  405          cp->cycle_start = unow;
 406  406          cp->cycle_cnt = 0;
 407  407  
 408  408          return (0);
 409  409  }
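/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * the decay that compute_historical_zone_cnt() applies once at least one
 * full cycle has elapsed, shown with concrete numbers.
 */
static unsigned int
example_decay_zone_cnt(unsigned int cycle_cnt, unsigned int zone_avg_cnt,
    int gen_cnt)
{
	int i;

	/* More than 5 idle cycles: forget the zone's history entirely. */
	if (gen_cnt > 5)
		return (0);

	/* e.g. cycle_cnt = 1, zone_avg_cnt = 8 => 1 + 8/2 = 5 */
	if (cycle_cnt > 1)
		zone_avg_cnt = cycle_cnt;
	else
		zone_avg_cnt = cycle_cnt + (zone_avg_cnt / 2);

	/* e.g. gen_cnt = 3 halves the result twice more: 5 -> 2 -> 1 */
	for (i = 1; i < gen_cnt; i++)
		zone_avg_cnt /= 2;

	return (zone_avg_cnt);
}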
 410  410  
 411  411  /*
 412  412   * Add IO op data to the zone.
 413  413   */
 414  414  static void
 415  415  add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
 416  416  {
 417  417          switch (op) {
 418  418          case ZFS_ZONE_IOP_READ:
 419  419                  (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
 420  420                  zonep->zone_rd_ops.cycle_cnt++;
 421  421                  break;
 422  422          case ZFS_ZONE_IOP_WRITE:
 423  423                  (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
 424  424                  zonep->zone_wr_ops.cycle_cnt++;
 425  425                  break;
 426  426          case ZFS_ZONE_IOP_LOGICAL_WRITE:
 427  427                  (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
 428  428                  zonep->zone_lwr_ops.cycle_cnt++;
 429  429                  break;
 430  430          }
 431  431  }
 432  432  
 433  433  /*
 434  434   * Use a decaying average to keep track of the overall system latency.
 435  435   *
 436  436   * We want to have the recent activity heavily weighted, but if the
 437  437   * activity decreases or stops, then the average should quickly decay
 438  438   * down to the new value.
 439  439   *
 440  440   * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
 441  441   * However, since this calculation is driven by IO activity and since IO does
 442  442   * not happen at fixed intervals, we use a timestamp to see when the last
 443  443   * update was made. If it was more than one cycle ago, then we need to decay
 444  444   * the average by the proper number of additional cycles in which no IO was
 445  445   * performed.
 446  446   *
 447  447   * Return true if we actually computed a new system average.
 448  448   * If we're still within an active cycle there is nothing to do, return false.
 449  449   */
 450  450  static boolean_t
 451  451  compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
 452  452  {
 453  453          hrtime_t delta;
 454  454          int     gen_cnt;
 455  455  
 456  456          /*
  457  457           * Check if it's time to recompute a new average.
 458  458           * If we're still collecting data for the current cycle, return false.
 459  459           */
 460  460          delta = unow - cp->cycle_start;
 461  461          if (delta < zfs_zone_sys_avg_cycle)
 462  462                  return (B_FALSE);
 463  463  
 464  464          /* A previous cycle is past, compute a new system average. */
 465  465  
 466  466          /*
 467  467           * Figure out how many generations we have to decay, since multiple
 468  468           * cycles may have elapsed since our last IO.
 469  469           * We count on int rounding here.
 470  470           */
 471  471          gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
 472  472  
  473  473          /* If more than 5 cycles since the last IO, reset average. */
 474  474          if (gen_cnt > 5) {
 475  475                  cp->sys_avg_lat = 0;
 476  476          } else {
 477  477                  /* Update the average. */
 478  478                  int     i;
 479  479  
 480  480                  cp->sys_avg_lat =
 481  481                      (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
 482  482  
 483  483                  /*
 484  484                   * If more than one generation has elapsed since the last
 485  485                   * update, decay the values further.
 486  486                   */
 487  487                  for (i = 1; i < gen_cnt; i++)
 488  488                          cp->sys_avg_lat = cp->sys_avg_lat / 2;
 489  489          }
 490  490  
 491  491          /* A new cycle begins. */
 492  492          cp->cycle_start = unow;
 493  493          cp->cycle_cnt = 0;
 494  494          cp->cycle_lat = 0;
 495  495  
 496  496          return (B_TRUE);
 497  497  }
 498  498  
 499  499  static void
 500  500  add_sys_iop(hrtime_t unow, int op, int lat)
 501  501  {
 502  502          switch (op) {
 503  503          case ZFS_ZONE_IOP_READ:
 504  504                  (void) compute_new_sys_avg(unow, &rd_lat);
 505  505                  rd_lat.cycle_cnt++;
 506  506                  rd_lat.cycle_lat += lat;
 507  507                  break;
 508  508          case ZFS_ZONE_IOP_WRITE:
 509  509                  (void) compute_new_sys_avg(unow, &wr_lat);
 510  510                  wr_lat.cycle_cnt++;
 511  511                  wr_lat.cycle_lat += lat;
 512  512                  break;
 513  513          }
 514  514  }
 515  515  
 516  516  /*
 517  517   * Get the zone IO counts.
 518  518   */
 519  519  static uint_t
 520  520  calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
 521  521  {
 522  522          hrtime_t delta;
 523  523          uint_t cnt;
 524  524  
 525  525          if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
 526  526                  /*
 527  527                   * No activity in the current cycle, we already have the
 528  528                   * historical data so we'll use that.
 529  529                   */
 530  530                  cnt = cp->zone_avg_cnt;
 531  531          } else {
 532  532                  /*
 533  533                   * If we're less than half way through the cycle then use
 534  534                   * the current count plus half the historical count, otherwise
 535  535                   * just use the current count.
 536  536                   */
 537  537                  if (delta < (zfs_zone_cycle_time / 2))
 538  538                          cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
 539  539                  else
 540  540                          cnt = cp->cycle_cnt;
 541  541          }
 542  542  
 543  543          return (cnt);
 544  544  }
 545  545  
 546  546  /*
 547  547   * Get the average read/write latency in usecs for the system.
 548  548   */
 549  549  static uint_t
 550  550  calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
 551  551  {
 552  552          if (compute_new_sys_avg(unow, cp)) {
 553  553                  /*
 554  554                   * No activity in the current cycle, we already have the
 555  555                   * historical data so we'll use that.
 556  556                   */
 557  557                  return (cp->sys_avg_lat);
 558  558          } else {
 559  559                  /*
 560  560                   * We're within a cycle; weight the current activity higher
 561  561                   * compared to the historical data and use that.
 562  562                   */
 563  563                  DTRACE_PROBE3(zfs__zone__calc__wt__avg,
 564  564                      uintptr_t, cp->sys_avg_lat,
 565  565                      uintptr_t, cp->cycle_lat,
 566  566                      uintptr_t, cp->cycle_cnt);
 567  567  
 568  568                  return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
 569  569                      (1 + (cp->cycle_cnt * 8)));
 570  570          }
 571  571  }
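/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * the in-cycle weighting used by calc_avg_lat() above, with concrete
 * numbers.  Recent samples count eight times as much as the decayed
 * historical average.
 */
static unsigned long long
example_weighted_lat(unsigned long long sys_avg_lat,
    unsigned long long cycle_lat, unsigned long long cycle_cnt)
{
	/*
	 * e.g. a historical average of 1000 us plus one new 500 us op:
	 * (1000 + 500 * 8) / (1 + 1 * 8) = 5000 / 9 ~= 555 us.
	 */
	return ((sys_avg_lat + (cycle_lat * 8)) / (1 + (cycle_cnt * 8)));
}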
 572  572  
 573  573  /*
 574  574   * Account for the current IOP on the zone and for the system as a whole.
 575  575   * The latency parameter is in usecs.
 576  576   */
 577  577  static void
 578  578  add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
 579  579  {
 580  580          /* Add op to zone */
 581  581          add_zone_iop(zonep, unow, op);
 582  582  
 583  583          /* Track system latency */
 584  584          if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
 585  585                  add_sys_iop(unow, op, lat);
 586  586  }
 587  587  
 588  588  /*
 589  589   * Calculate and return the total number of read ops, write ops and logical
 590  590   * write ops for the given zone.  If the zone has issued operations of any type
 591  591   * return a non-zero value, otherwise return 0.
 592  592   */
 593  593  static int
 594  594  get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
 595  595      uint_t *lwops)
 596  596  {
 597  597          *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
 598  598          *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
 599  599          *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
 600  600  
 601  601          DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
 602  602              uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
 603  603  
 604  604          return (*rops | *wops | *lwops);
 605  605  }
 606  606  
 607  607  /*
 608  608   * Get the average read/write latency in usecs for the system.
 609  609   */
 610  610  static void
 611  611  get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
 612  612  {
 613  613          *rlat = calc_avg_lat(unow, &rd_lat);
 614  614          *wlat = calc_avg_lat(unow, &wr_lat);
 615  615  
 616  616          /*
 617  617           * In an attempt to improve the accuracy of the throttling algorithm,
 618  618           * assume that IO operations can't have zero latency.  Instead, assume
 619  619           * a reasonable lower bound for each operation type. If the actual
 620  620           * observed latencies are non-zero, use those latency values instead.
 621  621           */
 622  622          if (*rlat == 0)
 623  623                  *rlat = 1000;
 624  624          if (*wlat == 0)
 625  625                  *wlat = 1000;
 626  626  
 627  627          DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
 628  628              uintptr_t, *wlat);
 629  629  }
 630  630  
 631  631  /*
 632  632   * Find disk utilization for each zone and average utilization for all active
 633  633   * zones.
 634  634   */
 635  635  static int
 636  636  zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
 637  637  {
 638  638          zoneio_stats_t *sp = arg;
 639  639          uint_t rops, wops, lwops;
 640  640  
 641  641          if (zonep->zone_id == GLOBAL_ZONEID ||
 642  642              get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
 643  643                  zonep->zone_io_util = 0;
 644  644                  return (0);
 645  645          }
 646  646  
 647  647          zonep->zone_io_util = (rops * sp->zi_avgrlat) +
 648  648              (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
 649  649          sp->zi_totutil += zonep->zone_io_util;
 650  650  
 651  651          if (zonep->zone_io_util > 0) {
 652  652                  sp->zi_active++;
 653  653                  sp->zi_totpri += zonep->zone_zfs_io_pri;
 654  654          }
 655  655  
 656  656          /*
 657  657           * sdt:::zfs-zone-utilization
 658  658           *
 659  659           *      arg0: zone ID
 660  660           *      arg1: read operations observed during time window
 661  661           *      arg2: physical write operations observed during time window
 662  662           *      arg3: logical write ops observed during time window
 663  663           *      arg4: calculated utilization given read and write ops
 664  664           *      arg5: I/O priority assigned to this zone
 665  665           */
 666  666          DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
 667  667              uint_t, rops, uint_t, wops, uint_t, lwops,
 668  668              uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
 669  669  
 670  670          return (0);
 671  671  }
 672  672  
 673  673  static void
 674  674  zfs_zone_delay_inc(zone_t *zonep)
 675  675  {
 676  676          if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
 677  677                  zonep->zone_io_delay += zfs_zone_delay_step;
 678  678  }
 679  679  
 680  680  static void
 681  681  zfs_zone_delay_dec(zone_t *zonep)
 682  682  {
 683  683          if (zonep->zone_io_delay > 0)
 684  684                  zonep->zone_io_delay -= zfs_zone_delay_step;
 685  685  }
 686  686  
 687  687  /*
 688  688   * For all zones "far enough" away from the average utilization, increase that
  689  689   * zone's delay.  Otherwise, reduce its delay.
 690  690   */
 691  691  static int
 692  692  zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
 693  693  {
 694  694          zoneio_stats_t *sp = arg;
 695  695          uint16_t delay = zonep->zone_io_delay;
 696  696          uint_t fairutil = 0;
 697  697  
 698  698          zonep->zone_io_util_above_avg = B_FALSE;
 699  699  
 700  700          /*
  701  701           * Given the calculated total utilization for all zones, calculate the
 702  702           * fair share of I/O for this zone.
 703  703           */
 704  704          if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
 705  705                  fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
 706  706                      sp->zi_totpri;
 707  707          } else if (sp->zi_active > 0) {
 708  708                  fairutil = sp->zi_totutil / sp->zi_active;
 709  709          }
 710  710  
 711  711          /*
 712  712           * Adjust each IO's delay.  If the overall delay becomes too high, avoid
 713  713           * increasing beyond the ceiling value.
 714  714           */
 715  715          if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
 716  716                  zonep->zone_io_util_above_avg = B_TRUE;
 717  717  
 718  718                  if (sp->zi_active > 1)
 719  719                          zfs_zone_delay_inc(zonep);
 720  720          } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
 721  721              sp->zi_active <= 1) {
 722  722                  zfs_zone_delay_dec(zonep);
 723  723          }
 724  724  
 725  725          /*
 726  726           * sdt:::zfs-zone-throttle
 727  727           *
 728  728           *      arg0: zone ID
 729  729           *      arg1: old delay for this zone
 730  730           *      arg2: new delay for this zone
 731  731           *      arg3: calculated fair I/O utilization
 732  732           *      arg4: actual I/O utilization
 733  733           */
 734  734          DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
 735  735              uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
 736  736              uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
 737  737  
 738  738          return (0);
 739  739  }
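/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * the fair-share calculation from zfs_zone_wait_adjust_delay_cb() above,
 * with concrete numbers.  The zfs_zone_priority_enable check is omitted
 * for brevity.
 */
#include <stdint.h>

static uint64_t
example_fair_share(uint64_t tot_util, uint64_t tot_pri, uint64_t zone_pri,
    int active_zones)
{
	/*
	 * e.g. total utilization 300000 and total priority 300: a zone with
	 * priority 100 gets a fair share of 100000.  A zone currently at
	 * 150000 is above its share and (if the disks are overutilized)
	 * will have its delay stepped up.
	 */
	if (tot_pri > 0)
		return ((tot_util * zone_pri) / tot_pri);
	if (active_zones > 0)
		return (tot_util / active_zones);
	return (0);
}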
 740  740  
 741  741  /*
 742  742   * Examine the utilization between different zones, and adjust the delay for
 743  743   * each zone appropriately.
 744  744   */
 745  745  static void
 746  746  zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
 747  747  {
 748  748          zoneio_stats_t stats;
 749  749          hrtime_t laggard_udelta = 0;
 750  750  
 751  751          (void) bzero(&stats, sizeof (stats));
 752  752  
 753  753          stats.zi_now = unow;
 754  754          get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
 755  755  
 756  756          if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
 757  757                  stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
 758  758          else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
 759  759                  stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
 760  760  
 761  761          if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
 762  762                  return;
 763  763  
 764  764          /*
 765  765           * Calculate disk utilization for the most recent period.
 766  766           */
 767  767          if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
 768  768                  stats.zi_diskutil = 0;
 769  769          } else {
 770  770                  stats.zi_diskutil =
 771  771                      ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
 772  772                      ((unow - last_checked) * 1000);
 773  773          }
 774  774          zfs_disk_last_rtime = zfs_disk_rtime;
 775  775  
 776  776          if (unow > zfs_disk_last_laggard)
 777  777                  laggard_udelta = unow - zfs_disk_last_laggard;
 778  778  
 779  779          /*
 780  780           * To minimize porpoising, we have three separate states for our
 781  781           * assessment of I/O performance:  overutilized, underutilized, and
 782  782           * neither overutilized nor underutilized.  We will increment the
 783  783           * throttle if a zone is using more than its fair share _and_ I/O
 784  784           * is overutilized; we will decrement the throttle if a zone is using
 785  785           * less than its fair share _or_ I/O is underutilized.
 786  786           */
 787  787          stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
 788  788              laggard_udelta > zfs_zone_laggard_ancient;
 789  789  
 790  790          stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
 791  791              laggard_udelta < zfs_zone_laggard_recent;
 792  792  
 793  793          /*
 794  794           * sdt:::zfs-zone-stats
 795  795           *
 796  796           * Statistics observed over the last period:
 797  797           *
 798  798           *      arg0: average system read latency
 799  799           *      arg1: average system write latency
 800  800           *      arg2: number of active zones
 801  801           *      arg3: total I/O 'utilization' for all zones
 802  802           *      arg4: total I/O priority of all active zones
 803  803           *      arg5: calculated disk utilization
 804  804           */
 805  805          DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
 806  806              uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
 807  807              uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
 808  808              uintptr_t, stats.zi_diskutil);
 809  809  
 810  810          (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
 811  811  }
 812  812  
 813  813  /*
 814  814   * Callback used to calculate a zone's IO schedule priority.
 815  815   *
 816  816   * We scan the zones looking for ones with ops in the queue.  Out of those,
 817  817   * we pick the one that calculates to the highest schedule priority.
 818  818   */
 819  819  static int
 820  820  get_sched_pri_cb(zone_t *zonep, void *arg)
 821  821  {
 822  822          int pri;
 823  823          uint_t cnt;
 824  824          zone_q_bump_t *qbp = arg;
 825  825          zio_priority_t p = qbp->zq_queue;
 826  826  
 827  827          cnt = zonep->zone_zfs_queued[p];
 828  828          if (cnt == 0) {
 829  829                  zonep->zone_zfs_weight = 0;
 830  830                  return (0);
 831  831          }
 832  832  
 833  833          /*
 834  834           * On each pass, increment the zone's weight.  We use this as input
 835  835           * to the calculation to prevent starvation.  The value is reset
 836  836           * each time we issue an IO for this zone so zones which haven't
 837  837           * done any IO over several iterations will see their weight max
 838  838           * out.
 839  839           */
 840  840          if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
 841  841                  zonep->zone_zfs_weight++;
 842  842  
 843  843          /*
 844  844           * This zone's IO priority is the inverse of the number of IOs
 845  845           * the zone has enqueued * zone's configured priority * weight.
 846  846           * The queue depth has already been scaled by 10 to avoid problems
 847  847           * with int rounding.
 848  848           *
 849  849           * This means that zones with fewer IOs in the queue will get
 850  850           * preference unless other zone's assigned priority pulls them
 851  851           * ahead.  The weight is factored in to help ensure that zones
 852  852           * which haven't done IO in a while aren't getting starved.
 853  853           */
 854  854          pri = (qbp->zq_qdepth / cnt) *
 855  855              zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
 856  856  
 857  857          /*
 858  858           * If this zone has a higher priority than what we found so far,
 859  859           * it becomes the new leading contender.
 860  860           */
 861  861          if (pri > qbp->zq_priority) {
 862  862                  qbp->zq_zoneid = zonep->zone_id;
 863  863                  qbp->zq_priority = pri;
 864  864                  qbp->zq_wt = zonep->zone_zfs_weight;
 865  865          }
 866  866          return (0);
 867  867  }
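/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * the schedule priority computed by get_sched_pri_cb() above, with
 * concrete numbers.  The queue depth arrives pre-scaled by 10 so the
 * integer division does not lose everything.
 */
static int
example_sched_pri(int scaled_qdepth, int zone_queued_cnt, int zone_io_pri,
    int zone_weight)
{
	/*
	 * e.g. a 20-deep queue (scaled to 200): a zone with 2 ops queued,
	 * priority 100 and weight 3 scores (200 / 2) * 100 * 3 = 30000,
	 * beating a zone with 10 ops queued, priority 100 and weight 1,
	 * which scores (200 / 10) * 100 * 1 = 2000.
	 */
	return ((scaled_qdepth / zone_queued_cnt) * zone_io_pri * zone_weight);
}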
 868  868  
 869  869  /*
 870  870   * See if we need to bump a zone's zio to the head of the queue. This is only
 871  871   * done on the two synchronous I/O queues (see the block comment on the
 872  872   * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
 873  873   * queue depth from our caller.
 874  874   *
 875  875   * For single-threaded synchronous processes a zone cannot get more than
 876  876   * 1 op into the queue at a time unless the zone is running multiple processes
 877  877   * in parallel.  This can cause an imbalance in performance if there are zones
 878  878   * with many parallel processes (and ops in the queue) vs. other zones which
 879  879   * are doing simple single-threaded processes, such as interactive tasks in the
 880  880   * shell.  These zones can get backed up behind a deep queue and their IO
 881  881   * performance will appear to be very poor as a result.  This can make the
 882  882   * zone work badly for interactive behavior.
 883  883   *
 884  884   * The scheduling algorithm kicks in once we start to get a deeper queue.
 885  885   * Once that occurs, we look at all of the zones to see which one calculates
 886  886   * to the highest priority.  We bump that zone's first zio to the head of the
 887  887   * queue.
 888  888   *
 889  889   * We use a counter on the zone so that we can quickly find how many ops each
 890  890   * zone has in the queue without having to search the entire queue itself.
 891  891   * This scales better since the number of zones is expected to be on the
 892  892   * order of 10-100 whereas the queue depth can be in the range of 50-2000.
 893  893   * In addition, since the zio's in the queue only have the zoneid, we would
 894  894   * have to look up the zone for each zio enqueued and that means the overhead
 895  895   * for scanning the queue each time would be much higher.
 896  896   *
 897  897   * In all cases, we fall back to simply pulling the next op off the queue
 898  898   * if something should go wrong.
 899  899   */
 900  900  static zio_t *
 901  901  get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
 902  902  {
 903  903          zone_q_bump_t qbump;
 904  904          zio_t *zp = NULL, *zphead;
 905  905          int cnt = 0;
 906  906  
 907  907          /* To avoid problems with int rounding, scale the queue depth by 10 */
 908  908          qbump.zq_qdepth = qdepth * 10;
 909  909          qbump.zq_priority = 0;
 910  910          qbump.zq_zoneid = 0;
 911  911          qbump.zq_queue = p;
 912  912          (void) zone_walk(get_sched_pri_cb, &qbump);
 913  913  
 914  914          zphead = avl_first(&vqc->vqc_queued_tree);
 915  915  
 916  916          /* Check if the scheduler didn't pick a zone for some reason!? */
 917  917          if (qbump.zq_zoneid != 0) {
 918  918                  for (zp = avl_first(&vqc->vqc_queued_tree); zp != NULL;
 919  919                      zp = avl_walk(&vqc->vqc_queued_tree, zp, AVL_AFTER)) {
 920  920                          if (zp->io_zoneid == qbump.zq_zoneid)
 921  921                                  break;
 922  922                          cnt++;
 923  923                  }
 924  924          }
 925  925  
 926  926          if (zp == NULL) {
 927  927                  zp = zphead;
 928  928          } else if (zp != zphead) {
 929  929                  /*
 930  930                   * Only fire the probe if we actually picked a different zio
 931  931                   * than the one already at the head of the queue.
 932  932                   */
 933  933                  DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
 934  934                      uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
 935  935          }
 936  936  
 937  937          return (zp);
 938  938  }
 939  939  
 940  940  /*
 941  941   * Add our zone ID to the zio so we can keep track of which zones are doing
 942  942   * what, even when the current thread processing the zio is not associated
  943  943   * with the zone (e.g. the kernel taskq which pushes out TX groups).
 944  944   */
 945  945  void
 946  946  zfs_zone_zio_init(zio_t *zp)
 947  947  {
 948  948          zone_t  *zonep = curzone;
 949  949  
 950  950          zp->io_zoneid = zonep->zone_id;
 951  951  }
 952  952  
 953  953  /*
 954  954   * Track IO operations per zone.  Called from dmu_tx_count_write for write ops
 955  955   * and dmu_read_uio for read ops.  For each operation, increment that zone's
 956  956   * counter based on the type of operation.
 957  957   *
 958  958   * There are three basic ways that we can see write ops:
 959  959   * 1) An application does write syscalls.  Those ops go into a TXG which
 960  960   *    we'll count here.  Sometime later a kernel taskq thread (we'll see the
 961  961   *    vdev IO as zone 0) will perform some number of physical writes to commit
 962  962   *    the TXG to disk.  Those writes are not associated with the zone which
 963  963   *    made the write syscalls and the number of operations is not correlated
 964  964   *    between the taskq and the zone.
 965  965   * 2) An application opens a file with O_SYNC.  Each write will result in
 966  966   *    an operation which we'll see here plus a low-level vdev write from
 967  967   *    that zone.
 968  968   * 3) An application does write syscalls followed by an fsync().  We'll
 969  969   *    count the writes going into a TXG here.  We'll also see some number
 970  970   *    (usually much smaller, maybe only 1) of low-level vdev writes from this
 971  971   *    zone when the fsync is performed, plus some other low-level vdev writes
 972  972   *    from the taskq in zone 0 (are these metadata writes?).
 973  973   *
 974  974   * 4) In addition to the above, there are misc. system-level writes, such as
 975  975   *    writing out dirty pages to swap, or sync(2) calls, which will be handled
 976  976   *    by the global zone and which we count but don't generally worry about.
 977  977   *
  978  978   * Because of the above, we can see writes twice: this function is called
  979  979   * at a high level by a zone thread, but we also count the phys. writes
 980  980   * that are performed at a low level via zfs_zone_zio_start.
 981  981   *
 982  982   * Without this, it can look like a non-global zone never writes (case 1).
 983  983   * Depending on when the TXG is synced, the counts may be in the same sample
 984  984   * bucket or in a different one.
 985  985   *
 986  986   * Tracking read operations is simpler due to their synchronous semantics.  The
 987  987   * zfs_read function -- called as a result of a read(2) syscall -- will always
 988  988   * retrieve the data to be read through dmu_read_uio.
 989  989   */
 990  990  void
 991  991  zfs_zone_io_throttle(zfs_zone_iop_type_t type)
 992  992  {
 993  993          zone_t *zonep = curzone;
 994  994          hrtime_t unow, last_checked;
 995  995          uint16_t wait;
 996  996  
 997  997          unow = GET_USEC_TIME;
 998  998  
 999  999          /*
1000 1000           * Only bump the counters for logical operations here.  The counters for
1001 1001           * tracking physical IO operations are handled in zfs_zone_zio_done.
1002 1002           */
1003 1003          if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
1004 1004                  mutex_enter(&zonep->zone_stg_io_lock);
1005 1005                  add_iop(zonep, unow, type, 0);
1006 1006                  mutex_exit(&zonep->zone_stg_io_lock);
1007 1007          }
1008 1008  
1009 1009          if (!zfs_zone_delay_enable)
1010 1010                  return;
1011 1011  
1012 1012          /*
1013 1013           * If the zone's I/O priority is set to zero, don't throttle that zone's
1014 1014           * operations at all.
1015 1015           */
1016 1016          if (zonep->zone_zfs_io_pri == 0)
1017 1017                  return;
1018 1018  
1019 1019          /*
1020 1020           * XXX There's a potential race here in that more than one thread may
1021 1021           * update the zone delays concurrently.  The worst outcome is corruption
1022 1022           * of our data to track each zone's IO, so the algorithm may make
1023 1023           * incorrect throttling decisions until the data is refreshed.
1024 1024           */
1025 1025          last_checked = zfs_zone_last_checked;
1026 1026          if ((unow - last_checked) > zfs_zone_adjust_time) {
1027 1027                  zfs_zone_last_checked = unow;
1028 1028                  zfs_zone_wait_adjust(unow, last_checked);
1029 1029          }
1030 1030  
1031 1031          if ((wait = zonep->zone_io_delay) > 0) {
1032 1032                  /*
1033 1033                   * If this is a write and we're doing above normal TXG
1034 1034                   * syncing, then throttle for longer than normal.
1035 1035                   */
1036 1036                  if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
1037 1037                      (txg_cnt > 1 || txg_sync_rate > 1))
1038 1038                          wait *= zfs_zone_txg_throttle_scale;
1039 1039  
1040 1040                  /*
1041 1041                   * sdt:::zfs-zone-wait
1042 1042                   *
1043 1043                   *      arg0: zone ID
1044 1044                   *      arg1: type of IO operation
1045 1045                   *      arg2: time to delay (in us)
1046 1046                   */
1047 1047                  DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
1048 1048                      uintptr_t, type, uintptr_t, wait);
1049 1049  
1050 1050                  drv_usecwait(wait);
1051 1051          }
1052 1052  }
1053 1053  
1054 1054  /*
1055 1055   * XXX Ignore the pool pointer parameter for now.
1056 1056   *
 1057 1057   * Keep track of whether the TXG sync rate is running above the expected rate.
1058 1058   * If so, this implies that we are filling TXG's at a high rate due to a heavy
1059 1059   * write workload.  We use this as input into the zone throttle.
1060 1060   *
1061 1061   * This function is called every 5 seconds (zfs_txg_timeout) under a normal
1062 1062   * write load.  In this case, the sync rate is going to be 1.  When there
1063 1063   * is a heavy write load, TXG's fill up fast and the sync thread will write
1064 1064   * the TXG more frequently (perhaps once a second).  In this case the rate
1065 1065   * will be > 1.  The sync rate is a lagging indicator since it can be up
1066 1066   * to 5 seconds old.  We use the txg_cnt to keep track of the rate in the
1067 1067   * current 5 second interval and txg_sync_rate to keep track of the previous
1068 1068   * 5 second interval.  In that way we don't have a period (1 or more seconds)
1069 1069   * where the txg_cnt == 0 and we cut back on throttling even though the rate
1070 1070   * is still high.
1071 1071   */
1072 1072  /*ARGSUSED*/
1073 1073  void
1074 1074  zfs_zone_report_txg_sync(void *dp)
1075 1075  {
1076 1076          uint_t now;
1077 1077  
1078 1078          txg_cnt++;
1079 1079          now = (uint_t)(gethrtime() / NANOSEC);
1080 1080          if ((now - txg_last_check) >= zfs_txg_timeout) {
1081 1081                  txg_sync_rate = txg_cnt / 2;
1082 1082                  txg_cnt = 0;
1083 1083                  txg_last_check = now;
1084 1084          }
1085 1085  }
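/*
 * Illustrative sketch (not part of this file; names are hypothetical):
 * how the txg counters above feed back into the logical-write throttle in
 * zfs_zone_io_throttle(), with concrete numbers.
 */
static unsigned short
example_txg_scaled_wait(unsigned short wait_us, unsigned int txg_cnt,
    unsigned int txg_sync_rate, int throttle_scale)
{
	/*
	 * e.g. under a heavy write load a TXG syncs roughly once a second,
	 * so over the 5 second zfs_txg_timeout window txg_cnt reaches ~5 and
	 * txg_sync_rate becomes 5 / 2 = 2.  A logical write that would have
	 * waited 50 us then waits 50 * 2 = 100 us.
	 */
	if (txg_cnt > 1 || txg_sync_rate > 1)
		return (wait_us * throttle_scale);
	return (wait_us);
}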
1086 1086  
1087 1087  hrtime_t
1088 1088  zfs_zone_txg_delay()
1089 1089  {
1090 1090          if (curzone->zone_io_util_above_avg)
1091 1091                  return (zfs_zone_txg_delay_nsec);
1092 1092  
1093 1093          return (MSEC2NSEC(10));
1094 1094  }
1095 1095  
1096 1096  /*
1097 1097   * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
1098 1098   * and is issued.
1099 1099   * Keep track of start time for latency calculation in zfs_zone_zio_done.
1100 1100   */
1101 1101  void
1102 1102  zfs_zone_zio_start(zio_t *zp)
1103 1103  {
1104 1104          zone_t  *zonep;
1105 1105  
1106 1106          /*
1107 1107           * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
 1108 1108           * an actual I/O operation.  Ignore those operations for the purposes of
1109 1109           * throttling and scheduling.
1110 1110           */
1111 1111          if (zp->io_type == ZIO_TYPE_IOCTL)
1112 1112                  return;
1113 1113  
1114 1114          if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1115 1115                  return;
1116 1116  
1117 1117          zonep->zone_zfs_weight = 0;
1118 1118  
1119 1119          mutex_enter(&zfs_disk_lock);
1120 1120          zp->io_dispatched = gethrtime();
1121 1121  
1122 1122          if (zfs_disk_rcnt++ != 0)
1123 1123                  zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
1124 1124          zfs_disk_rlastupdate = zp->io_dispatched;
1125 1125          mutex_exit(&zfs_disk_lock);
1126 1126  
1127 1127          zone_rele(zonep);
1128 1128  }
1129 1129  
1130 1130  /*
1131 1131   * Called from vdev_disk_io_done when an IO completes.
1132 1132   * Increment our counter for zone ops.
1133 1133   * Calculate the IO latency avg. for this zone.
  
1134 1134   */
1135 1135  void
1136 1136  zfs_zone_zio_done(zio_t *zp)
1137 1137  {
1138 1138          zone_t  *zonep;
1139 1139          hrtime_t now, unow, udelta;
1140 1140  
1141 1141          if (zp->io_type == ZIO_TYPE_IOCTL)
1142 1142                  return;
1143 1143  
1144      -        if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
     1144 +        if (zp->io_dispatched == 0)
1145 1145                  return;
1146 1146  
1147      -        if (zp->io_dispatched == 0)
     1147 +        if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1148 1148                  return;
1149 1149  
1150 1150          now = gethrtime();
1151 1151          unow = NANO_TO_MICRO(now);
1152 1152          udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
1153 1153  
1154 1154          mutex_enter(&zfs_disk_lock);
1155 1155          zfs_disk_rcnt--;
1156 1156          zfs_disk_rtime += (now - zfs_disk_rlastupdate);
1157 1157          zfs_disk_rlastupdate = now;
1158 1158  
1159 1159          if (udelta > zfs_zone_laggard_threshold)
1160 1160                  zfs_disk_last_laggard = unow;
1161 1161  
1162 1162          mutex_exit(&zfs_disk_lock);
1163 1163  
1164 1164          if (zfs_zone_delay_enable) {
1165 1165                  mutex_enter(&zonep->zone_stg_io_lock);
1166 1166                  add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
1167 1167                      ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
1168 1168                  mutex_exit(&zonep->zone_stg_io_lock);
1169 1169          }
1170 1170  
1171 1171          zone_rele(zonep);
1172 1172  
1173 1173          /*
1174 1174           * sdt:::zfs-zone-latency
1175 1175           *
1176 1176           *      arg0: zone ID
1177 1177           *      arg1: type of I/O operation
1178 1178           *      arg2: I/O latency (in us)
1179 1179           */
1180 1180          DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
1181 1181              uintptr_t, zp->io_type, uintptr_t, udelta);
1182 1182  }
1183 1183  
1184 1184  void
1185 1185  zfs_zone_zio_dequeue(zio_t *zp)
1186 1186  {
1187 1187          zio_priority_t p;
1188 1188          zone_t  *zonep;
1189 1189  
1190 1190          p = zp->io_priority;
1191 1191          if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1192 1192                  return;
1193 1193  
1194 1194          /* We depend on p being defined as either 0 or 1 */
1195 1195          ASSERT(p < 2);
1196 1196  
1197 1197          if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1198 1198                  return;
1199 1199  
1200 1200          mutex_enter(&zonep->zone_stg_io_lock);
1201 1201          ASSERT(zonep->zone_zfs_queued[p] > 0);
1202 1202          if (zonep->zone_zfs_queued[p] == 0)
1203 1203                  cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
1204 1204          else
1205 1205                  zonep->zone_zfs_queued[p]--;
1206 1206          mutex_exit(&zonep->zone_stg_io_lock);
1207 1207          zone_rele(zonep);
1208 1208  }
1209 1209  
1210 1210  void
1211 1211  zfs_zone_zio_enqueue(zio_t *zp)
1212 1212  {
1213 1213          zio_priority_t p;
1214 1214          zone_t  *zonep;
1215 1215  
1216 1216          p = zp->io_priority;
1217 1217          if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1218 1218                  return;
1219 1219  
1220 1220          /* We depend on p being defined as either 0 or 1 */
1221 1221          ASSERT(p < 2);
1222 1222  
1223 1223          if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
1224 1224                  return;
1225 1225  
1226 1226          mutex_enter(&zonep->zone_stg_io_lock);
1227 1227          zonep->zone_zfs_queued[p]++;
1228 1228          mutex_exit(&zonep->zone_stg_io_lock);
1229 1229          zone_rele(zonep);
1230 1230  }
1231 1231  
1232 1232  /*
1233 1233   * Called from vdev_queue_io_to_issue. That function is where zio's are listed
1234 1234   * in FIFO order on one of the sync queues, then pulled off (by
1235 1235   * vdev_queue_io_remove) and issued.  We potentially do zone-based scheduling
1236 1236   * here to find a zone's zio deeper in the sync queue and issue that instead
1237 1237   * of simply doing FIFO.
1238 1238   *
1239 1239   * We only do zone-based zio scheduling for the two synchronous I/O queues
1240 1240   * (read & write). These queues are normally serviced in FIFO order but we
1241 1241   * may decide to move a zone's zio to the head of the line. A typical I/O
1242 1242   * load will be mostly synchronous reads and some asynchronous writes (which
1243 1243   * are scheduled differently due to transaction groups). There will also be
1244 1244   * some synchronous writes for those apps which want to ensure their data is on
1245 1245   * disk. We want to make sure that a zone with a single-threaded app (e.g. the
1246 1246   * shell) that is doing synchronous I/O (typically reads) isn't penalized by
1247 1247   * other zones which are doing lots of synchronous I/O because they have many
1248 1248   * running threads.
1249 1249   *
1250 1250   * The vq->vq_lock mutex is held when we're executing this function so we
1251 1251   * can safely access the "last zone" variable on the queue.
1252 1252   */
1253 1253  zio_t *
1254 1254  zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
1255 1255  {
1256 1256          vdev_queue_class_t *vqc = &vq->vq_class[p];
1257 1257          uint_t cnt;
1258 1258          zoneid_t last_zone;
1259 1259          zio_t *zio;
1260 1260  
1261 1261          ASSERT(MUTEX_HELD(&vq->vq_lock));
1262 1262  
1263 1263          /* Don't change the order on the LBA ordered queues. */
1264 1264          if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
1265 1265                  return (avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER));
1266 1266  
1267 1267          /* We depend on p being defined as either 0 or 1 */
1268 1268          ASSERT(p < 2);
1269 1269  
1270 1270          cnt = avl_numnodes(&vqc->vqc_queued_tree);
1271 1271          last_zone = vq->vq_last_zone_id;
1272 1272  
1273 1273          /*
1274 1274           * If there are only a few zios in the queue then just issue the head.
1275 1275           * If there are more than a few zios already queued up, then use
1276 1276           * scheduling to get the next zio.
1277 1277           */
1278 1278          if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
1279 1279                  zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
1280 1280          else
1281 1281                  zio = get_next_zio(vqc, cnt, p);
1282 1282  
1283 1283          vq->vq_last_zone_id = zio->io_zoneid;
1284 1284  
1285 1285          /*
1286 1286           * Probe with 4 args; the number of IOs in the queue, the zone that
1287 1287           * was last scheduled off this queue, the zone that was associated
1288 1288           * with the next IO that is scheduled, and which queue (priority).
1289 1289           */
1290 1290          DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
1291 1291              uint_t, zio->io_zoneid, uint_t, p);
1292 1292  
1293 1293          return (zio);
1294 1294  }
1295 1295  
1296 1296  #endif
  