merge Wdiff usr/src/uts/i86pc/os/timestamp.c

Print this page

4764 Need a way to get tsc deltas
Reviewed by: Keith M Wesolowski <wesolows@foobazco.org>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Richard Lowe <richlowe@richlowe.net>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/os/timestamp.c
          +++ new/usr/src/uts/i86pc/os/timestamp.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   *
  26   26   * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27   27   */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/param.h>
  31   31  #include <sys/systm.h>
  32   32  #include <sys/disp.h>
  33   33  #include <sys/var.h>
  34   34  #include <sys/cmn_err.h>
  35   35  #include <sys/debug.h>
  36   36  #include <sys/x86_archext.h>
  37   37  #include <sys/archsystm.h>
  38   38  #include <sys/cpuvar.h>
  39   39  #include <sys/psm_defs.h>
  40   40  #include <sys/clock.h>
  41   41  #include <sys/atomic.h>
  42   42  #include <sys/lockstat.h>
  43   43  #include <sys/smp_impldefs.h>
  44   44  #include <sys/dtrace.h>
  45   45  #include <sys/time.h>
  46   46  #include <sys/panic.h>
  47   47  #include <sys/cpu.h>
  48   48  
  49   49  /*
  50   50   * Using the Pentium's TSC register for gethrtime()
  51   51   * ------------------------------------------------
  52   52   *
  53   53   * The Pentium family, like many chip architectures, has a high-resolution
  54   54   * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
  55   55   * of the timestamp counter are read with the RDTSC instruction.
  56   56   *
  57   57   * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
  58   58   * must be translated into nanoseconds in order to implement gethrtime().
  59   59   * We avoid inducing floating point operations in this conversion by
  60   60   * implementing the same nsec_scale algorithm as that found in the sun4u
  61   61   * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
  62   62   * a detailed description of the algorithm; the comment is not reproduced
  63   63   * here.  This implementation differs only in its value for NSEC_SHIFT:
  64   64   * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
  65   65   * 60 MHz Pentiums.
  66   66   *
  67   67   * While TSC and %tick are both cycle counting registers, TSC's functionality
  68   68   * falls short in several critical ways:
  69   69   *
  70   70   *  (a) TSCs on different CPUs are not guaranteed to be in sync.  While in
  71   71   *      practice they often _are_ in sync, this isn't guaranteed by the
  72   72   *      architecture.
  73   73   *
  74   74   *  (b) The TSC cannot be reliably set to an arbitrary value.  The architecture
  75   75   *      only supports writing the low 32-bits of TSC, making it impractical
  76   76   *      to rewrite.
  77   77   *
  78   78   *  (c) The architecture doesn't have the capacity to interrupt based on
  79   79   *      arbitrary values of TSC; there is no TICK_CMPR equivalent.
  80   80   *
  81   81   * Together, (a) and (b) imply that software must track the skew between
  82   82   * TSCs and account for it (it is assumed that while there may exist skew,
  83   83   * there does not exist drift).  To determine the skew between CPUs, we
  84   84   * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
  85   85   * the online operation calls tsc_sync_master().
  86   86   *
  87   87   * In the absence of time-of-day clock adjustments, gethrtime() must stay in
  88   88   * sync with gettimeofday().  This is problematic; given (c), the software
  89   89   * cannot drive its time-of-day source from TSC, and yet they must somehow be
  90   90   * kept in sync.  We implement this by having a routine, tsc_tick(), which
  91   91   * is called once per second from the interrupt which drives time-of-day.
  92   92   *
  93   93   * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
  94   94   * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
  95   95   * monotonically increases.
  96   96   */
  97   97  
   98   98  #define NSEC_SHIFT 5
   99   99  
            /*
             * Fixed-point factors computed in tsc_hrtimeinit() from the measured CPU
             * frequency: nsec_scale is used to convert TSC ticks to nanoseconds and
             * nsec_unscale to convert nanoseconds back to TSC ticks (see the
             * TSC_CONVERT macros below).
             */
  100  100  static uint_t nsec_scale;
  101  101  static uint_t nsec_unscale;
  102  102  
  103  103  /*
  104  104   * These two variables used to be grouped together inside of a structure that
  105  105   * lived on a single cache line. A regression (bug ID 4623398) caused the
  106  106   * compiler to emit code that "optimized" away the while-loops below. The
  107  107   * result was that no synchronization between the onlining and onlined CPUs
  108  108   * took place.
  109  109   */
  110  110  static volatile int tsc_ready;
  111  111  static volatile int tsc_sync_go;
  112  112  
  113  113  /*
  114  114   * Used as indices into the tsc_sync_snaps[] array.
             *
             * NOTE(review): neither tsc_sync_snaps[] nor these two indices are
             * referenced in the visible part of this file -- confirm they are still
             * needed.
  115  115   */
  116  116  #define TSC_MASTER              0
  117  117  #define TSC_SLAVE               1
  118  118  
  119  119  /*
  120  120   * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.  The first
             * three are the values taken by tsc_sync_go; SYNC_ITERATIONS is the number
             * of sample rounds each side performs.
  121  121   */
  122  122  #define TSC_SYNC_STOP           1
  123  123  #define TSC_SYNC_GO             2
  124  124  #define TSC_SYNC_DONE           3
  125  125  #define SYNC_ITERATIONS         10
  126  126  
            /*
             * Scale a 64-bit value by a 32-bit fixed-point factor.  The value is
             * accessed as two 32-bit words: _l[1] is used as the high word (its
             * product is shifted left by NSEC_SHIFT) and _l[0] as the low word (its
             * product is shifted right by 32 - NSEC_SHIFT), which matches x86's
             * little-endian layout.  mul32() is defined elsewhere; presumably it is a
             * 32x32->64 multiply -- confirm.
             *
             * The first argument must be an lvalue because its address is taken.
             * TSC_CONVERT_AND_ADD accumulates the result into hrt; TSC_CONVERT
             * overwrites hrt.  Both expand to a bare { } block, not do/while(0).
             */
  127  127  #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) {          \
  128  128          unsigned int *_l = (unsigned int *)&(tsc);      \
  129  129          (hrt) += mul32(_l[1], scale) << NSEC_SHIFT;     \
  130  130          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
  131  131  }
  132  132  
  133  133  #define TSC_CONVERT(tsc, hrt, scale) {                  \
  134  134          unsigned int *_l = (unsigned int *)&(tsc);      \
  135  135          (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;      \
  136  136          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
  137  137  }
  138  138  
            /* When zero, tsc_sync_master()/tsc_sync_slave() return immediately. */
  139  139  int tsc_master_slave_sync_needed = 1;
  140  140  
            /*
             * tsc_max_delta: TSC ticks measured across one tsc_gethrtime() call in
             * tsc_hrtimeinit(); used as the tolerance for readings slightly behind
             * tsc_last.
             * tsc_sync_tick_delta[]: per-CPU value added to tsc_read() by the
             * "delta" variants; filled in by tsc_sync_master() and shifted by
             * tsc_adjust_delta().
             */
  141  141  static int      tsc_max_delta;
  142  142  static hrtime_t tsc_sync_tick_delta[NCPU];
            /* Shared mailbox for the master/slave rendezvous; allocated in tsc_hrtimeinit(). */
  143  143  typedef struct tsc_sync {
  144  144          volatile hrtime_t master_tsc, slave_tsc;
  145  145  } tsc_sync_t;
  146  146  static tsc_sync_t *tscp;
            /* Running extremes over all tsc_sync_master() calls. */
  147  147  static hrtime_t largest_tsc_delta = 0;
  148  148  static ulong_t shortest_write_time = ~0UL;
  149  149  
            /*
             * tsc_last and tsc_hrtime_base are updated together by tsc_tick() under
             * CLOCK_LOCK.  tsc_last_jumped accumulates the old tsc_last each time
             * tsc_tick() sees the TSC go backwards; tsc_jumped is set to 1 then.
             */
  150  150  static hrtime_t tsc_last = 0;
  151  151  static hrtime_t tsc_last_jumped = 0;
  152  152  static hrtime_t tsc_hrtime_base = 0;
  153  153  static int      tsc_jumped = 0;
  154  154  
            /*
             * Copies of the values above taken by tsc_tick() before it updates them;
             * read by dtrace_gethrtime() when hres_lock is held.
             */
  155  155  static hrtime_t shadow_tsc_hrtime_base;
  156  156  static hrtime_t shadow_tsc_last;
  157  157  static uint_t   shadow_nsec_scale;
  158  158  static uint32_t shadow_hres_lock;
            /* NOTE(review): empty parameter list; the function takes no arguments. */
  159  159  int get_tsc_ready();
  160  160  
            /*
             * Return the current high-resolution time in nanoseconds:
             * tsc_hrtime_base plus the scaled number of TSC ticks since tsc_last.
             * No per-CPU skew correction is applied.
             *
             * hres_lock is sampled before the computation and compared afterwards;
             * the loop repeats if the lock bit was set in the sample or the value has
             * changed since.
             */
  161  161  hrtime_t
  162  162  tsc_gethrtime(void)
  163  163  {
  164  164          uint32_t old_hres_lock;
  165  165          hrtime_t tsc, hrt;
  166  166  
  167  167          do {
  168  168                  old_hres_lock = hres_lock;
  169  169  
  170  170                  if ((tsc = tsc_read()) >= tsc_last) {
  171  171                          /*
  172  172                           * It would seem to be obvious that this is true
  173  173                           * (that is, the past is less than the present),
  174  174                           * but it isn't true in the presence of suspend/resume
  175  175                           * cycles.  If we manage to call gethrtime()
  176  176                           * after a resume, but before the first call to
  177  177                           * tsc_tick(), we will see the jump.  In this case,
  178  178                           * we will simply use the value in TSC as the delta.
  179  179                           */
  180  180                          tsc -= tsc_last;
  181  181                  } else if (tsc >= tsc_last - 2*tsc_max_delta) {
  182  182                          /*
  183  183                           * There is a chance that tsc_tick() has just run on
  184  184                           * another CPU, and we have drifted just enough so that
  185  185                           * we appear behind tsc_last.  In this case, force the
  186  186                           * delta to be zero.
  187  187                           */
  188  188                          tsc = 0;
  189  189                  }
                            /*
                             * If neither branch was taken (the reading is far behind
                             * tsc_last), tsc is left unchanged and the raw TSC value
                             * is used as the delta.
                             */
  190  190  
  191  191                  hrt = tsc_hrtime_base;
  192  192  
  193  193                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
  194  194          } while ((old_hres_lock & ~1) != hres_lock);
  195  195  
  196  196          return (hrt);
  197  197  }
  198  198  
            /*
             * Variant of tsc_gethrtime() that adds the current CPU's
             * tsc_sync_tick_delta[] entry to the TSC reading before converting it.
             * Installed as gethrtimef by tsc_sync_master() and tsc_adjust_delta().
             */
  199  199  hrtime_t
  200  200  tsc_gethrtime_delta(void)
  201  201  {
  202  202          uint32_t old_hres_lock;
  203  203          hrtime_t tsc, hrt;
  204  204          ulong_t flags;
  205  205  
  206  206          do {
  207  207                  old_hres_lock = hres_lock;
  208  208  
  209  209                  /*
  210  210                   * We need to disable interrupts here to assure that we
  211  211                   * don't migrate between the call to tsc_read() and
  212  212                   * adding the CPU's TSC tick delta. Note that disabling
  213  213                   * and reenabling preemption is forbidden here because
  214  214                   * we may be in the middle of a fast trap. In the amd64
  215  215                   * kernel we cannot tolerate preemption during a fast
  216  216                   * trap. See _update_sregs().
  217  217                   */
  218  218  
  219  219                  flags = clear_int_flag();
  220  220                  tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
  221  221                  restore_int_flag(flags);
  222  222  
  223  223                  /* See comments in tsc_gethrtime() above */
  224  224  
  225  225                  if (tsc >= tsc_last) {
  226  226                          tsc -= tsc_last;
  227  227                  } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
  228  228                          tsc = 0;

↓ open down ↓

228 lines elided

↑ open up ↑

  229  229                  }
  230  230  
  231  231                  hrt = tsc_hrtime_base;
  232  232  
  233  233                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
  234  234          } while ((old_hres_lock & ~1) != hres_lock);
  235  235  
  236  236          return (hrt);
  237  237  }
  238  238  
          /*
           * Return the tsc_sync_tick_delta[] entry of the CPU this code is running
           * on.  The value is the raw offset that the "delta" functions add to
           * tsc_read(), i.e. it is in TSC ticks and is not scaled to nanoseconds.
           *
           * Interrupts are disabled around the read, as in tsc_gethrtime_delta(),
           * so that CPU->cpu_id and the array access refer to the same CPU.
           */
     239 +hrtime_t
     240 +tsc_gethrtime_tick_delta(void)
     241 +{
     242 +        hrtime_t hrt;
     243 +        ulong_t flags;
     244 +
     245 +        flags = clear_int_flag();
     246 +        hrt = tsc_sync_tick_delta[CPU->cpu_id];
     247 +        restore_int_flag(flags);
     248 +
     249 +        return (hrt);
     250 +}
      251 +
  239  252  /*
  240  253   * This is similar to tsc_gethrtime()/tsc_gethrtime_delta() above, but it
  241  254   * cannot actually spin on hres_lock.
  242  255   * As a result, it caches all of the variables it needs; if the variables
             * don't change, it's done.
             *
             * Each pass first computes the time from the live variables (tsc_last,
             * tsc_hrtime_base, nsec_scale).  If hres_lock was held when sampled and
             * is still unchanged, the pass is redone using the shadow_* copies that
             * tsc_tick() saves before it takes CLOCK_LOCK, validated against
             * shadow_hres_lock instead.  The per-CPU delta is added only when
             * gethrtimef is tsc_gethrtime_delta.
  243  256   */
  244  257  hrtime_t
  245  258  dtrace_gethrtime(void)
  246  259  {
  247  260          uint32_t old_hres_lock;
  248  261          hrtime_t tsc, hrt;

  249  262          ulong_t flags;
  250  263  
  251  264          do {
  252  265                  old_hres_lock = hres_lock;
  253  266  
  254  267                  /*
  255  268                   * Interrupts are disabled to ensure that the thread isn't
  256  269                   * migrated between the tsc_read() and adding the CPU's
  257  270                   * TSC tick delta.
  258  271                   */
  259  272                  flags = clear_int_flag();
  260  273  
  261  274                  tsc = tsc_read();
  262  275  
  263  276                  if (gethrtimef == tsc_gethrtime_delta)
  264  277                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
  265  278  
  266  279                  restore_int_flag(flags);
  267  280  
  268  281                  /*
  269  282                   * See the comments in tsc_gethrtime(), above.
  270  283                   */
  271  284                  if (tsc >= tsc_last)
  272  285                          tsc -= tsc_last;
  273  286                  else if (tsc >= tsc_last - 2*tsc_max_delta)
  274  287                          tsc = 0;
  275  288  
  276  289                  hrt = tsc_hrtime_base;
  277  290  
  278  291                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
  279  292  
                            /* Lock was free when sampled and is unchanged: done. */
  280  293                  if ((old_hres_lock & ~1) == hres_lock)
  281  294                          break;
  282  295  
  283  296                  /*
  284  297                   * If we're here, the clock lock is locked -- or it has been
  285  298                   * unlocked and locked since we looked.  This may be due to
  286  299                   * tsc_tick() running on another CPU -- or it may be because
  287  300                   * some code path has ended up in dtrace_probe() with
  288  301                   * CLOCK_LOCK held.  We'll try to determine that we're in
  289  302                   * the former case by taking another lap if the lock has
  290  303                   * changed since when we first looked at it.
  291  304                   */
  292  305                  if (old_hres_lock != hres_lock)
  293  306                          continue;
  294  307  
  295  308                  /*
  296  309                   * So the lock was and is locked.  We'll use the old data
  297  310                   * instead.
  298  311                   */
  299  312                  old_hres_lock = shadow_hres_lock;
  300  313  
  301  314                  /*
  302  315                   * Again, disable interrupts to ensure that the thread
  303  316                   * isn't migrated between the tsc_read() and adding
  304  317                   * the CPU's TSC tick delta.
  305  318                   */
  306  319                  flags = clear_int_flag();
  307  320  
  308  321                  tsc = tsc_read();
  309  322  
  310  323                  if (gethrtimef == tsc_gethrtime_delta)
  311  324                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
  312  325  
  313  326                  restore_int_flag(flags);
  314  327  
  315  328                  /*
  316  329                   * See the comments in tsc_gethrtime(), above.
  317  330                   */
  318  331                  if (tsc >= shadow_tsc_last)
  319  332                          tsc -= shadow_tsc_last;
  320  333                  else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
  321  334                          tsc = 0;
  322  335  
  323  336                  hrt = shadow_tsc_hrtime_base;
  324  337  
  325  338                  TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
  326  339          } while ((old_hres_lock & ~1) != shadow_hres_lock);
  327  340  
  328  341          return (hrt);
  329  342  }
  330  343  
            /*
             * Return the unscaled time: the raw TSC reading plus tsc_last_jumped,
             * in TSC ticks.  tsc_last_jumped is the sum of the tsc_last values that
             * tsc_tick() recorded each time it saw the TSC go backwards.  The read
             * is retried under the same hres_lock check as tsc_gethrtime().
             */
  331  344  hrtime_t
  332  345  tsc_gethrtimeunscaled(void)
  333  346  {
  334  347          uint32_t old_hres_lock;
  335  348          hrtime_t tsc;
  336  349  
  337  350          do {
  338  351                  old_hres_lock = hres_lock;
  339  352  
  340  353                  /* See tsc_tick(). */
  341  354                  tsc = tsc_read() + tsc_last_jumped;
  342  355          } while ((old_hres_lock & ~1) != hres_lock);
  343  356  
  344  357          return (tsc);
  345  358  }
 346  359  
  347  360  /*
  348  361   * Convert a nanosecond based timestamp to tsc ticks using nsec_unscale.
             * If tsc_gethrtime_enable is zero no conversion is done and nsec is
             * returned unchanged (cast to uint64_t).
  349  362   */
  350  363  uint64_t
  351  364  tsc_unscalehrtime(hrtime_t nsec)
  352  365  {
  353  366          hrtime_t tsc;
  354  367  
  355  368          if (tsc_gethrtime_enable) {
  356  369                  TSC_CONVERT(nsec, tsc, nsec_unscale);
  357  370                  return (tsc);
  358  371          }
  359  372          return ((uint64_t)nsec);
  360  373  }
 361  374  
  362  375  /* Convert a tsc timestamp to nanoseconds, in place; a NULL pointer is ignored. */
  363  376  void
  364  377  tsc_scalehrtime(hrtime_t *tsc)
  365  378  {
  366  379          hrtime_t hrt;
  367  380          hrtime_t mytsc;
  368  381  
  369  382          if (tsc == NULL)
  370  383                  return;
                    /* Work on a local copy: TSC_CONVERT takes its argument's address. */
  371  384          mytsc = *tsc;
  372  385  
  373  386          TSC_CONVERT(mytsc, hrt, nsec_scale);
  374  387          *tsc  = hrt;
  375  388  }
  376  389  
            /*
             * Variant of tsc_gethrtimeunscaled() that adds the current CPU's
             * tsc_sync_tick_delta[] entry to the result.  Installed as
             * gethrtimeunscaledf by tsc_sync_master() and tsc_adjust_delta().
             */
  377  390  hrtime_t
  378  391  tsc_gethrtimeunscaled_delta(void)
  379  392  {
  380  393          hrtime_t hrt;
  381  394          ulong_t flags;
  382  395  
  383  396          /*
  384  397           * Similarly to tsc_gethrtime_delta, we need to disable interrupts
  385  398           * to prevent migration between the call to tsc_gethrtimeunscaled
  386  399           * and adding the CPU's hrtime delta. Note that disabling and
  387  400           * reenabling preemption is forbidden here because we may be in the
  388  401           * middle of a fast trap. In the amd64 kernel we cannot tolerate
  389  402           * preemption during a fast trap. See _update_sregs().
  390  403           */
  391  404  
  392  405          flags = clear_int_flag();
  393  406          hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
  394  407          restore_int_flag(flags);
  395  408  
  396  409          return (hrt);
  397  410  }
 398  411  
  399  412  /*
  400  413   * Called by the master in the TSC sync operation (usually the boot CPU).
  401  414   * If the slave is discovered to have a skew, gethrtimef will be changed to
  402  415   * point to tsc_gethrtime_delta(). Calculating skews is precise only when
  403  416   * the master and slave TSCs are read simultaneously; however, there is no
  404  417   * algorithm that can read both CPUs in perfect simultaneity. The proposed
  405  418   * algorithm is an approximate method based on the behaviour of cache
  406  419   * management. The slave CPU continuously reads TSC and then reads a global
  407  420   * variable which the master CPU updates. The moment the master's update reaches
  408  421   * the slave's visibility (being forced by an mfence operation) we use the TSC
  409  422   * reading taken on the slave. A corresponding TSC read will be taken on the
  410  423   * master as soon as possible after finishing the mfence operation. But the
  411  424   * delay between causing the slave to notice the invalid cache line and the
  412  425   * completion of mfence is not repeatable. This error is heuristically assumed
  413  426   * to be 1/4th of the total write time as being measured by the two TSC reads
  414  427   * on the master sandwiching the mfence. Furthermore, due to the nature of
  415  428   * bus arbitration, contention on memory bus, etc., the time taken for the write
  416  429   * to reflect globally can vary a lot. So instead of taking a single reading,
  417  430   * a set of readings are taken and the one with least write time is chosen
  418  431   * to calculate the final skew.
  419  432   *
  420  433   * TSC sync is disabled in the context of virtualization because the CPUs
  421  434   * assigned to the guest are virtual CPUs which means the real CPUs on which
  422  435   * guest runs keep changing during life time of guest OS. So we would end up
  423  436   * calculating TSC skews for a set of CPUs during boot whereas the guest
  424  437   * might migrate to a different set of physical CPUs at a later point of
  425  438   * time.
             *
             * The whole exchange runs with interrupts disabled.  `slave' is the id of
             * the CPU being onlined; its tsc_sync_tick_delta[] entry is set relative
             * to the entry of the CPU running this function.
  426  439   */
  427  440  void
  428  441  tsc_sync_master(processorid_t slave)
  429  442  {
  430  443          ulong_t flags, source, min_write_time = ~0UL;
  431  444          hrtime_t write_time, x, mtsc_after, tdelta;
  432  445          tsc_sync_t *tsc = tscp;
  433  446          int cnt;
  434  447          int hwtype;
  435  448  
  436  449          hwtype = get_hwenv();
  437  450          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
  438  451                  return;
  439  452  
  440  453          flags = clear_int_flag();
  441  454          source = CPU->cpu_id;
  442  455  
  443  456          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
                            /* Wait for the slave to announce it is polling. */
  444  457                  while (tsc_sync_go != TSC_SYNC_GO)
  445  458                          SMT_PAUSE();
  446  459  
                            /*
                             * Publish a non-zero master_tsc (the slave spins until it
                             * sees one) and time the store with a second TSC read.
                             */
  447  460                  tsc->master_tsc = tsc_read();
  448  461                  membar_enter();
  449  462                  mtsc_after = tsc_read();
                            /* Wait for the slave to store its reading in slave_tsc. */
  450  463                  while (tsc_sync_go != TSC_SYNC_DONE)
  451  464                          SMT_PAUSE();
  452  465                  write_time =  mtsc_after - tsc->master_tsc;
                            /*
                             * NOTE(review): write_time is a signed hrtime_t while
                             * min_write_time is an unsigned ulong_t, so this and the
                             * x > (min_write_time/4) test below are mixed-sign
                             * comparisons -- confirm that is intended.
                             */
  453  466                  if (write_time <= min_write_time) {
  454  467                          min_write_time = write_time;
  455  468                          /*
  456  469                           * Apply heuristic adjustment only if the calculated
  457  470                           * delta is > 1/4th of the write time.
  458  471                           */
  459  472                          x = tsc->slave_tsc - mtsc_after;
  460  473                          if (x < 0)
  461  474                                  x = -x;
  462  475                          if (x > (min_write_time/4))
  463  476                                  /*
  464  477                                   * Subtract 1/4th of the measured write time
  465  478                                   * from the master's TSC value, as an estimate
  466  479                                   * of how late the mfence completion came
  467  480                                   * after the slave noticed the cache line
  468  481                                   * change.
  469  482                                   */
  470  483                                  tdelta = tsc->slave_tsc -
  471  484                                      (mtsc_after - (min_write_time/4));
  472  485                          else
  473  486                                  tdelta = tsc->slave_tsc - mtsc_after;
  474  487                          tsc_sync_tick_delta[slave] =
  475  488                              tsc_sync_tick_delta[source] - tdelta;
  476  489                  }
  477  490  
                            /* Reset the mailbox and release the slave for the next round. */
  478  491                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
  479  492                  membar_enter();
  480  493                  tsc_sync_go = TSC_SYNC_STOP;
  481  494          }
                    /*
                     * tdelta now holds the value from the last round that achieved a
                     * new minimum write time.  NOTE(review): it is assigned only
                     * inside the if above, so this relies on that branch having been
                     * taken at least once.
                     */
  482  495          if (tdelta < 0)
  483  496                  tdelta = -tdelta;
  484  497          if (tdelta > largest_tsc_delta)
  485  498                  largest_tsc_delta = tdelta;
  486  499          if (min_write_time < shortest_write_time)
  487  500                  shortest_write_time = min_write_time;
  488  501          /*
  489  502           * Enable delta variants of tsc functions if the largest of all chosen
  490  503           * deltas is > smallest of the write time.
  491  504           */
  492  505          if (largest_tsc_delta > shortest_write_time) {
  493  506                  gethrtimef = tsc_gethrtime_delta;
  494  507                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
  495  508          }
  496  509          restore_int_flag(flags);
  497  510  }
 498  511  
  499  512  /*
  500  513   * Called by a CPU which has just been onlined.  It is expected that the CPU
  501  514   * performing the online operation will call tsc_sync_master().
  502  515   *
  503  516   * TSC sync is disabled in the context of virtualization. See comments
  504  517   * above tsc_sync_master.
             *
             * Runs SYNC_ITERATIONS rounds with interrupts disabled.  In each round
             * the slave sets tsc_sync_go to TSC_SYNC_GO, reads the TSC in a tight loop
             * until master_tsc becomes non-zero, stores its last reading in slave_tsc,
             * sets TSC_SYNC_DONE and then waits for the master to set TSC_SYNC_STOP.
  505  518   */
  506  519  void
  507  520  tsc_sync_slave(void)
  508  521  {
  509  522          ulong_t flags;
  510  523          hrtime_t s1;
  511  524          tsc_sync_t *tsc = tscp;
  512  525          int cnt;
  513  526          int hwtype;
  514  527  
  515  528          hwtype = get_hwenv();
  516  529          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
  517  530                  return;
  518  531  
  519  532          flags = clear_int_flag();
  520  533  
  521  534          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
  522  535                  /* Re-fill the cache line */
  523  536                  s1 = tsc->master_tsc;
  524  537                  membar_enter();
  525  538                  tsc_sync_go = TSC_SYNC_GO;
  526  539                  do {
  527  540                          /*
  528  541                           * Do not put an SMT_PAUSE here. For instance,
  529  542                           * if the master and slave are really the same
  530  543                           * hyper-threaded CPU, then you want the master
  531  544                           * to yield to the slave as quickly as possible here,
  532  545                           * but not the other way.
  533  546                           */
  534  547                          s1 = tsc_read();
  535  548                  } while (tsc->master_tsc == 0);
                            /* s1 is the reading taken just before the write was seen. */
  536  549                  tsc->slave_tsc = s1;
  537  550                  membar_enter();
  538  551                  tsc_sync_go = TSC_SYNC_DONE;
  539  552  
  540  553                  while (tsc_sync_go != TSC_SYNC_STOP)
  541  554                          SMT_PAUSE();
  542  555          }
  543  556  
  544  557          restore_int_flag(flags);
  545  558  }
 546  559  
  547  560  /*
  548  561   * Called once per second on a CPU from the cyclic subsystem's
  549  562   * CY_HIGH_LEVEL interrupt.  (No longer just cpu0-only)
             *
             * Saves the current base values into the shadow_* variables, then, under
             * CLOCK_LOCK, folds the TSC ticks elapsed since tsc_last into
             * tsc_hrtime_base and records the new tsc_last.
  550  563   */
  551  564  void
  552  565  tsc_tick(void)
  553  566  {
  554  567          hrtime_t now, delta;
  555  568          ushort_t spl;
  556  569  
  557  570          /*
  558  571           * Before we set the new variables, we set the shadow values.  This
  559  572           * allows for lock free operation in dtrace_gethrtime().
  560  573           */
  561  574          lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
  562  575              ipltospl(CBE_HIGH_PIL), &spl);
  563  576  
  564  577          shadow_tsc_hrtime_base = tsc_hrtime_base;
  565  578          shadow_tsc_last = tsc_last;
  566  579          shadow_nsec_scale = nsec_scale;
  567  580  
                    /*
                     * Presumably the increment both releases the lock byte taken above
                     * and changes the value dtrace_gethrtime() compares against --
                     * confirm against CLOCK_UNLOCK.
                     */
  568  581          shadow_hres_lock++;
  569  582          splx(spl);
  570  583  
  571  584          CLOCK_LOCK(&spl);
  572  585  
  573  586          now = tsc_read();
  574  587  
                    /* Apply this CPU's skew only when the delta variants are in use. */
  575  588          if (gethrtimef == tsc_gethrtime_delta)
  576  589                  now += tsc_sync_tick_delta[CPU->cpu_id];
  577  590  
  578  591          if (now < tsc_last) {
  579  592                  /*
  580  593                   * The TSC has just jumped into the past.  We assume that
  581  594                   * this is due to a suspend/resume cycle, and we're going
  582  595                   * to use the _current_ value of TSC as the delta.  This
  583  596                   * will keep tsc_hrtime_base correct.  We're also going to
  584  597                   * assume that rate of tsc does not change after a suspend
  585  598                   * resume (i.e nsec_scale remains the same).
  586  599                   */
  587  600                  delta = now;
                            /* Keeps tsc_gethrtimeunscaled() from going backwards. */
  588  601                  tsc_last_jumped += tsc_last;
  589  602                  tsc_jumped = 1;
  590  603          } else {
  591  604                  /*
  592  605                   * Determine the number of TSC ticks since the last clock
  593  606                   * tick, and add that to the hrtime base.
  594  607                   */
  595  608                  delta = now - tsc_last;
  596  609          }
  597  610  
  598  611          TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
  599  612          tsc_last = now;
  600  613  
  601  614          CLOCK_UNLOCK(spl);
  602  615  }
  603  616  
            /*
             * Set up TSC-based high-resolution time: compute nsec_scale and
             * nsec_unscale from cpu_freq_hz, measure tsc_max_delta, install the tsc_*
             * functions in the gethrtimef, gethrtimeunscaledf, scalehrtimef,
             * unscalehrtimef and hrtime_tick hooks, and allocate the buffer used by
             * the master/slave sync.
             */
  604  617  void
  605  618  tsc_hrtimeinit(uint64_t cpu_freq_hz)
  606  619  {
  607  620          extern int gethrtime_hires;
  608  621          longlong_t tsc;
  609  622          ulong_t flags;
  610  623  
  611  624          /*
  612  625           * cpu_freq_hz is the measured cpu frequency in hertz
  613  626           */
  614  627  
  615  628          /*
  616  629           * We can't accommodate CPUs slower than 31.25 MHz.
  617  630           */
  618  631          ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
  619  632          nsec_scale =
  620  633              (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
  621  634          nsec_unscale =
  622  635              (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
  623  636  
                    /*
                     * tsc_max_delta is the number of TSC ticks that elapse across one
                     * tsc_gethrtime() call, measured with interrupts disabled.
                     */
  624  637          flags = clear_int_flag();
  625  638          tsc = tsc_read();
  626  639          (void) tsc_gethrtime();
  627  640          tsc_max_delta = tsc_read() - tsc;
  628  641          restore_int_flag(flags);
  629  642          gethrtimef = tsc_gethrtime;
  630  643          gethrtimeunscaledf = tsc_gethrtimeunscaled;
  631  644          scalehrtimef = tsc_scalehrtime;
  632  645          unscalehrtimef = tsc_unscalehrtime;
  633  646          hrtime_tick = tsc_tick;
  634  647          gethrtime_hires = 1;
  635  648          /*
  636  649           * Allocate memory for the structure used in the tsc sync logic.
  637  650           * This structure should be aligned on a multiple of cache line size.
                     * A whole zeroed page is allocated, presumably to obtain that
                     * alignment -- confirm.
  638  651           */
  639  652          tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
  640  653  }
  641  654  
            /* Return the current value of the file-local tsc_ready flag. */
  642  655  int
  643  656  get_tsc_ready()
  644  657  {
  645  658          return (tsc_ready);
  646  659  }
 647  660  
  648  661  /*
  649  662   * Adjust all the deltas by adding the passed value to the array.
  650  663   * Then use the "delta" versions of the gethrtime functions.
  651  664   * Note that 'tdelta' _could_ be a negative number, which should
  652  665   * reduce the values in the array (used, for example, if the Solaris
  653  666   * instance was moved by a virtual manager to a machine with a higher
  654  667   * value of tsc).
  655  668   */
  656  669  void
  657  670  tsc_adjust_delta(hrtime_t tdelta)
  658  671  {
  659  672          int             i;
  660  673  
                    /* Every slot is shifted, not only those of CPUs that were synced. */
  661  674          for (i = 0; i < NCPU; i++) {
  662  675                  tsc_sync_tick_delta[i] += tdelta;
  663  676          }
  664  677  
                    /* Switch unconditionally so that the adjusted deltas take effect. */
  665  678          gethrtimef = tsc_gethrtime_delta;
  666  679          gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
  667  680  }
 668  681  
 669  682  /*
 670  683   * Functions to manage TSC and high-res time on suspend and resume.
 671  684   */
 672  685  
  673  686  /*
  674  687   * declarations needed for time adjustment
             *
             * tsc_saved_tsc and tsc_saved_ts are the TSC and time-of-day snapshots
             * taken by tsc_suspend().  tsc_needs_resume is set to 1 by every
             * tsc_suspend() call and gates its once-only work.  tsc_delta_onsuspend
             * selects which value tsc_suspend() passes to tsc_adjust_delta(), and
             * tsc_suspend_count counts the suspends that performed that adjustment.
  675  688   */
  676  689  extern void     rtcsync(void);
  677  690  extern tod_ops_t *tod_ops;
  678  691  /* There must be a better way than exposing nsec_scale! */
  679  692  extern uint_t   nsec_scale;
  680  693  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
  681  694  static timestruc_t tsc_saved_ts;
  682  695  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
  683  696  int             tsc_delta_onsuspend = 0;
  684  697  int             tsc_adjust_seconds = 1;
  685  698  int             tsc_suspend_count = 0;
  686  699  int             tsc_resume_in_cyclic = 0;
 687  700  
 688  701  /*
 689  702   * Let timestamp.c know that we are suspending.  It needs to take
 690  703   * snapshots of the current time, and do any pre-suspend work.
 691  704   */
 692  705  void
 693  706  tsc_suspend(void)
 694  707  {
 695  708  /*
 696  709   * What we need to do here, is to get the time we suspended, so that we
 697  710   * know how much we should add to the resume.
 698  711   * This routine is called by each CPU, so we need to handle reentry.
 699  712   */
 700  713          if (tsc_gethrtime_enable) {
 701  714                  /*
 702  715                   * We put the tsc_read() inside the lock as it
 703  716                   * has no locking constraints, and it puts the
 704  717                   * acquired value closer to the time stamp (in
 705  718                   * case we delay getting the lock).
                            *
                            * Note that the snapshot is refreshed on every
                            * call; only the delta adjustment below is
                            * limited to the first caller.
 706  719                   */
 707  720                  mutex_enter(&tod_lock);
 708  721                  tsc_saved_tsc = tsc_read();
 709  722                  tsc_saved_ts = TODOP_GET(tod_ops);
 710  723                  mutex_exit(&tod_lock);
 711  724                  /* We only want to do this once. */
                           /*
                            * NOTE(review): tsc_needs_resume is tested here and
                            * set below without holding tod_lock; presumably the
                            * callers are serialized -- confirm.
                            */
 712  725                  if (tsc_needs_resume == 0) {
 713  726                          if (tsc_delta_onsuspend) {
 714  727                                  tsc_adjust_delta(tsc_saved_tsc);
 715  728                          } else {
 716  729                                  tsc_adjust_delta(nsec_scale);
 717  730                          }
 718  731                          tsc_suspend_count++;
 719  732                  }
 720  733          }
 721  734  
                   /*
                    * The following runs on every call, whether or not TSC-based
                    * hrtime is enabled.
                    */
 722  735          invalidate_cache();
 723  736          tsc_needs_resume = 1;
 724  737  }
 725  738  
 726  739  /*
 727  740   * Restore all timestamp state based on the snapshots taken at
 728  741   * suspend time.
 729  742   */
 730  743  void
 731  744  tsc_resume(void)
 732  745  {
 733  746          /*
 734  747           * We only need to (and want to) do this once.  So let the first
 735  748           * caller handle this (we are locked by the cpu lock), as it
 736  749           * is preferable that we get the earliest sync.
 737  750           */
 738  751          if (tsc_needs_resume) {
 739  752                  /*
 740  753                   * If using the TSC, adjust the delta based on how long
 741  754                   * we were sleeping (or away).  We also adjust for
 742  755                   * migration and a grown TSC.
 743  756                   */
 744  757                  if (tsc_saved_tsc != 0) {
 745  758                          timestruc_t     ts;
 746  759                          hrtime_t        now, sleep_tsc = 0;
 747  760                          int             sleep_sec;
 748  761                          extern void     tsc_tick(void);
 749  762                          extern uint64_t cpu_freq_hz;
 750  763  
 751  764                          /* tsc_read() MUST be before TODOP_GET() */
 752  765                          mutex_enter(&tod_lock);
 753  766                          now = tsc_read();
 754  767                          ts = TODOP_GET(tod_ops);
 755  768                          mutex_exit(&tod_lock);
 756  769  
 757  770                          /* Compute seconds of sleep time */
                                   /*
                                    * NOTE(review): sleep_sec is an int; confirm
                                    * the tv_sec difference always fits in it.
                                    */
 758  771                          sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;
 759  772  
 760  773                          /*
 761  774                           * If the current sec is less than or equal to
 762  775                           * the saved sec, then there is likely a
 763  776                           * problem with the clock.  Assume at least
 764  777                           * one second has passed, so that time goes forward.
 765  778                           */
 766  779                          if (sleep_sec <= 0) {
 767  780                                  sleep_sec = 1;
 768  781                          }
 769  782  
 770  783                          /* How many TSC's should have occurred while sleeping */
 771  784                          if (tsc_adjust_seconds)
 772  785                                  sleep_tsc = sleep_sec * cpu_freq_hz;
 773  786  
 774  787                          /*
 775  788                           * We also want to subtract from the "sleep_tsc"
 776  789                           * the current value of tsc_read(), so that our
 777  790                           * adjustment accounts for the amount of time we
 778  791                           * have been resumed _or_ an adjustment based on
 779  792                           * the fact that we didn't actually power off the
 780  793                           * CPU (migration is another issue, but _should_
 781  794                           * also comply with this calculation).  If the CPU
 782  795                           * never powered off, then:
 783  796                           *    'now == sleep_tsc + saved_tsc'
 784  797                           * and the delta will effectively be "0".
 785  798                           */
 786  799                          sleep_tsc -= now;
                                   /*
                                    * When tsc_delta_onsuspend is set, tsc_suspend()
                                    * already added the saved TSC, so only the
                                    * remainder is applied here.
                                    */
 787  800                          if (tsc_delta_onsuspend) {
 788  801                                  tsc_adjust_delta(sleep_tsc);
 789  802                          } else {
 790  803                                  tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
 791  804                          }
                                   /* Zero marks the snapshot as consumed. */
 792  805                          tsc_saved_tsc = 0;
 793  806  
 794  807                          tsc_tick();
 795  808                  }
 796  809                  tsc_needs_resume = 0;
 797  810          }
 798  811  
 799  812  }

↓ open down ↓

551 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX