OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
    
      
    
    
--- old/usr/src/uts/i86pc/os/timestamp.c
+++ new/usr/src/uts/i86pc/os/timestamp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  
  
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   *
  26   26   * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27   27   * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
       28 + * Copyright 2016 Joyent, Inc.
  28   29   */
  29   30  
  30   31  #include <sys/types.h>
  31   32  #include <sys/param.h>
  32   33  #include <sys/systm.h>
  33   34  #include <sys/disp.h>
  34   35  #include <sys/var.h>
  35   36  #include <sys/cmn_err.h>
  36   37  #include <sys/debug.h>
  37   38  #include <sys/x86_archext.h>
  38   39  #include <sys/archsystm.h>
  39   40  #include <sys/cpuvar.h>
  
  
  40   41  #include <sys/psm_defs.h>
  41   42  #include <sys/clock.h>
  42   43  #include <sys/atomic.h>
  43   44  #include <sys/lockstat.h>
  44   45  #include <sys/smp_impldefs.h>
  45   46  #include <sys/dtrace.h>
  46   47  #include <sys/time.h>
  47   48  #include <sys/panic.h>
  48   49  #include <sys/cpu.h>
  49   50  #include <sys/sdt.h>
       51 +#include <sys/comm_page.h>
  50   52  
  51   53  /*
  52   54   * Using the Pentium's TSC register for gethrtime()
  53   55   * ------------------------------------------------
  54   56   *
  55   57   * The Pentium family, like many chip architectures, has a high-resolution
  56   58   * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
  57   59   * of the timestamp counter are read with the RDTSC instruction.
  58   60   *
  59   61   * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
  60   62   * must be translated into nanoseconds in order to implement gethrtime().
  61   63   * We avoid inducing floating point operations in this conversion by
  62   64   * implementing the same nsec_scale algorithm as that found in the sun4u
  63   65   * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
  64   66   * a detailed description of the algorithm; the comment is not reproduced
  65   67   * here.  This implementation differs only in its value for NSEC_SHIFT:
  66   68   * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
  67   69   * 60 MHz Pentiums.
  68   70   *
  69   71   * While TSC and %tick are both cycle counting registers, TSC's functionality
  70   72   * falls short in several critical ways:
  71   73   *
  72   74   *  (a) TSCs on different CPUs are not guaranteed to be in sync.  While in
  73   75   *      practice they often _are_ in sync, this isn't guaranteed by the
  74   76   *      architecture.
  75   77   *
  76   78   *  (b) The TSC cannot be reliably set to an arbitrary value.  The architecture
  77   79   *      only supports writing the low 32-bits of TSC, making it impractical
  78   80   *      to rewrite.
  79   81   *
  80   82   *  (c) The architecture doesn't have the capacity to interrupt based on
  81   83   *      arbitrary values of TSC; there is no TICK_CMPR equivalent.
  82   84   *
  83   85   * Together, (a) and (b) imply that software must track the skew between
  84   86   * TSCs and account for it (it is assumed that while there may exist skew,
  85   87   * there does not exist drift).  To determine the skew between CPUs, we
  86   88   * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
  87   89   * the online operation calls tsc_sync_master().
  88   90   *
  89   91   * In the absence of time-of-day clock adjustments, gethrtime() must stay in
  90   92   * sync with gettimeofday().  This is problematic; given (c), the software
  91   93   * cannot drive its time-of-day source from TSC, and yet they must somehow be
  
  
  92   94   * kept in sync.  We implement this by having a routine, tsc_tick(), which
  93   95   * is called once per second from the interrupt which drives time-of-day.
  94   96   *
  95   97   * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
  96   98   * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
  97   99   * monotonically increases.
  98  100   */
  99  101  
 100  102  #define NSEC_SHIFT 5
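
As a worked illustration of the scaling described in the comment above (not
part of this change): with NSEC_SHIFT = 5, nsec_scale = NANOSEC * 2^27 /
cpu_freq_hz, and requiring nsec_scale to fit in 32 bits yields the 31.25 MHz
floor that tsc_hrtimeinit() asserts below. A minimal standalone rendition of
the TSC_CONVERT arithmetic, using 64-bit multiplies in place of mul32():

	#include <stdint.h>

	#define NSEC_SHIFT	5
	#define NANOSEC		1000000000ULL

	/* (tsc * scale) >> (32 - NSEC_SHIFT), split as in TSC_CONVERT. */
	static uint64_t
	tsc_to_nsec(uint64_t tsc, uint64_t cpu_freq_hz)
	{
		uint32_t scale =
		    (uint32_t)((NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
		uint64_t hi = (tsc >> 32) * scale;
		uint64_t lo = (tsc & 0xffffffffULL) * scale;

		return ((hi << NSEC_SHIFT) + (lo >> (32 - NSEC_SHIFT)));
	}

For cpu_freq_hz = 2 GHz, scale is 67108864 (0x4000000), and tsc = 2000000000
ticks converts to exactly 1000000000 ns.
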
 101  103  
 102      -static uint_t nsec_scale;
 103  104  static uint_t nsec_unscale;
 104  105  
 105  106  /*
 106  107   * These two variables used to be grouped together inside of a structure that
 107  108   * lived on a single cache line. A regression (bug ID 4623398) caused the
 108  109   * compiler to emit code that "optimized" away the while-loops below. The
 109  110   * result was that no synchronization between the onlining and onlined CPUs
 110  111   * took place.
 111  112   */
 112  113  static volatile int tsc_ready;
 113  114  static volatile int tsc_sync_go;
 114  115  
 115  116  /*
 116  117   * Used as indices into the tsc_sync_snaps[] array.
 117  118   */
 118  119  #define TSC_MASTER              0
 119  120  #define TSC_SLAVE               1
 120  121  
 121  122  /*
 122  123   * Used in the tsc_master_sync()/tsc_slave_sync() rendezvous.
 123  124   */
 124  125  #define TSC_SYNC_STOP           1
 125  126  #define TSC_SYNC_GO             2
 126  127  #define TSC_SYNC_DONE           3
 127  128  #define SYNC_ITERATIONS         10
 128  129  
 129  130  #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) {          \
 130  131          unsigned int *_l = (unsigned int *)&(tsc);      \
 131  132          (hrt) += mul32(_l[1], scale) << NSEC_SHIFT;     \
 132  133          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
  
  
 133  134  }
 134  135  
 135  136  #define TSC_CONVERT(tsc, hrt, scale) {                  \
 136  137          unsigned int *_l = (unsigned int *)&(tsc);      \
 137  138          (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;      \
 138  139          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 139  140  }
 140  141  
 141  142  int tsc_master_slave_sync_needed = 1;
 142  143  
 143      -static int      tsc_max_delta;
 144      -static hrtime_t tsc_sync_tick_delta[NCPU];
 145  144  typedef struct tsc_sync {
 146  145          volatile hrtime_t master_tsc, slave_tsc;
 147  146  } tsc_sync_t;
 148  147  static tsc_sync_t *tscp;
 149  148  static hrtime_t largest_tsc_delta = 0;
 150  149  static ulong_t shortest_write_time = ~0UL;
 151  150  
 152      -static hrtime_t tsc_last = 0;
 153  151  static hrtime_t tsc_last_jumped = 0;
 154      -static hrtime_t tsc_hrtime_base = 0;
 155  152  static int      tsc_jumped = 0;
 156  153  static uint32_t tsc_wayback = 0;
 157  154  /*
 158  155   * The cap of 1 second was chosen since it is the frequency at which the
 159  156   * tsc_tick() function runs which means that when gethrtime() is called it
 160  157   * should never be more than 1 second since tsc_last was updated.
 161  158   */
 162      -static hrtime_t tsc_resume_cap;
 163  159  static hrtime_t tsc_resume_cap_ns = NANOSEC;     /* 1s */
 164  160  
 165  161  static hrtime_t shadow_tsc_hrtime_base;
 166  162  static hrtime_t shadow_tsc_last;
 167  163  static uint_t   shadow_nsec_scale;
 168  164  static uint32_t shadow_hres_lock;
 169  165  int get_tsc_ready();
 170  166  
 171  167  static inline
 172  168  hrtime_t tsc_protect(hrtime_t a) {
 173  169          if (a > tsc_resume_cap) {
 174  170                  atomic_inc_32(&tsc_wayback);
  175  171                  DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
 176  172                      uint32_t, tsc_wayback);
 177  173                  return (tsc_resume_cap);
 178  174          }
 179  175          return (a);
 180  176  }
 181  177  
 182  178  hrtime_t
 183  179  tsc_gethrtime(void)
 184  180  {
 185  181          uint32_t old_hres_lock;
 186  182          hrtime_t tsc, hrt;
 187  183  
 188  184          do {
 189  185                  old_hres_lock = hres_lock;
 190  186  
 191  187                  if ((tsc = tsc_read()) >= tsc_last) {
 192  188                          /*
 193  189                           * It would seem to be obvious that this is true
 194  190                           * (that is, the past is less than the present),
 195  191                           * but it isn't true in the presence of suspend/resume
 196  192                           * cycles.  If we manage to call gethrtime()
 197  193                           * after a resume, but before the first call to
 198  194                           * tsc_tick(), we will see the jump.  In this case,
 199  195                           * we will simply use the value in TSC as the delta.
 200  196                           */
 201  197                          tsc -= tsc_last;
 202  198                  } else if (tsc >= tsc_last - 2*tsc_max_delta) {
 203  199                          /*
 204  200                           * There is a chance that tsc_tick() has just run on
 205  201                           * another CPU, and we have drifted just enough so that
 206  202                           * we appear behind tsc_last.  In this case, force the
 207  203                           * delta to be zero.
 208  204                           */
 209  205                          tsc = 0;
 210  206                  } else {
 211  207                          /*
 212  208                           * If we reach this else clause we assume that we have
 213  209                           * gone through a suspend/resume cycle and use the
 214  210                           * current tsc value as the delta.
 215  211                           *
 216  212                           * In rare cases we can reach this else clause due to
 217  213                           * a lack of monotonicity in the TSC value.  In such
 218  214                           * cases using the current TSC value as the delta would
 219  215                           * cause us to return a value ~2x of what it should
 220  216                           * be.  To protect against these cases we cap the
 221  217                           * suspend/resume delta at tsc_resume_cap.
 222  218                           */
 223  219                          tsc = tsc_protect(tsc);
 224  220                  }
 225  221  
 226  222                  hrt = tsc_hrtime_base;
 227  223  
 228  224                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 229  225          } while ((old_hres_lock & ~1) != hres_lock);
 230  226  
 231  227          return (hrt);
 232  228  }
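
The retry loop above is a sequence-lock read of hres_lock: on x86 the lock
byte is the low byte of the word, so hres_lock is odd while tsc_tick() is
updating tsc_hrtime_base and tsc_last, and the unlocking increment both
clears the lock byte and advances the version. A reader therefore loops until
its snapshot was taken entirely within one even generation. A hedged sketch of
the protocol, with compute_snapshot() as a hypothetical stand-in for the
conversion body above:

	uint32_t old;
	hrtime_t hrt;

	do {
		old = hres_lock;	/* odd => writer active => retry */
		hrt = compute_snapshot();	/* hypothetical stand-in */
	} while ((old & ~1) != hres_lock);	/* lock moved: take a lap */
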
 233  229  
 234  230  hrtime_t
 235  231  tsc_gethrtime_delta(void)
 236  232  {
 237  233          uint32_t old_hres_lock;
 238  234          hrtime_t tsc, hrt;
 239  235          ulong_t flags;
 240  236  
 241  237          do {
 242  238                  old_hres_lock = hres_lock;
 243  239  
 244  240                  /*
 245  241                   * We need to disable interrupts here to assure that we
 246  242                   * don't migrate between the call to tsc_read() and
 247  243                   * adding the CPU's TSC tick delta. Note that disabling
 248  244                   * and reenabling preemption is forbidden here because
 249  245                   * we may be in the middle of a fast trap. In the amd64
 250  246                   * kernel we cannot tolerate preemption during a fast
 251  247                   * trap. See _update_sregs().
 252  248                   */
 253  249  
 254  250                  flags = clear_int_flag();
 255  251                  tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
 256  252                  restore_int_flag(flags);
 257  253  
 258  254                  /* See comments in tsc_gethrtime() above */
 259  255  
 260  256                  if (tsc >= tsc_last) {
 261  257                          tsc -= tsc_last;
 262  258                  } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
 263  259                          tsc = 0;
 264  260                  } else {
 265  261                          tsc = tsc_protect(tsc);
 266  262                  }
 267  263  
 268  264                  hrt = tsc_hrtime_base;
 269  265  
 270  266                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 271  267          } while ((old_hres_lock & ~1) != hres_lock);
 272  268  
 273  269          return (hrt);
 274  270  }
 275  271  
 276  272  hrtime_t
 277  273  tsc_gethrtime_tick_delta(void)
 278  274  {
 279  275          hrtime_t hrt;
 280  276          ulong_t flags;
 281  277  
 282  278          flags = clear_int_flag();
 283  279          hrt = tsc_sync_tick_delta[CPU->cpu_id];
 284  280          restore_int_flag(flags);
 285  281  
 286  282          return (hrt);
 287  283  }
 288  284  
 289  285  /*
 290  286   * This is similar to the above, but it cannot actually spin on hres_lock.
 291  287   * As a result, it caches all of the variables it needs; if the variables
 292  288   * don't change, it's done.
 293  289   */
 294  290  hrtime_t
 295  291  dtrace_gethrtime(void)
 296  292  {
 297  293          uint32_t old_hres_lock;
 298  294          hrtime_t tsc, hrt;
 299  295          ulong_t flags;
 300  296  
 301  297          do {
 302  298                  old_hres_lock = hres_lock;
 303  299  
 304  300                  /*
 305  301                   * Interrupts are disabled to ensure that the thread isn't
 306  302                   * migrated between the tsc_read() and adding the CPU's
 307  303                   * TSC tick delta.
 308  304                   */
 309  305                  flags = clear_int_flag();
 310  306  
 311  307                  tsc = tsc_read();
 312  308  
 313  309                  if (gethrtimef == tsc_gethrtime_delta)
 314  310                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
 315  311  
 316  312                  restore_int_flag(flags);
 317  313  
 318  314                  /*
 319  315                   * See the comments in tsc_gethrtime(), above.
 320  316                   */
 321  317                  if (tsc >= tsc_last)
 322  318                          tsc -= tsc_last;
 323  319                  else if (tsc >= tsc_last - 2*tsc_max_delta)
 324  320                          tsc = 0;
 325  321                  else
 326  322                          tsc = tsc_protect(tsc);
 327  323  
 328  324                  hrt = tsc_hrtime_base;
 329  325  
 330  326                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 331  327  
 332  328                  if ((old_hres_lock & ~1) == hres_lock)
 333  329                          break;
 334  330  
 335  331                  /*
 336  332                   * If we're here, the clock lock is locked -- or it has been
 337  333                   * unlocked and locked since we looked.  This may be due to
 338  334                   * tsc_tick() running on another CPU -- or it may be because
 339  335                   * some code path has ended up in dtrace_probe() with
 340  336                   * CLOCK_LOCK held.  We'll try to determine that we're in
 341  337                   * the former case by taking another lap if the lock has
 342  338                   * changed since when we first looked at it.
 343  339                   */
 344  340                  if (old_hres_lock != hres_lock)
 345  341                          continue;
 346  342  
 347  343                  /*
 348  344                   * So the lock was and is locked.  We'll use the old data
 349  345                   * instead.
 350  346                   */
 351  347                  old_hres_lock = shadow_hres_lock;
 352  348  
 353  349                  /*
 354  350                   * Again, disable interrupts to ensure that the thread
 355  351                   * isn't migrated between the tsc_read() and adding
 356  352                   * the CPU's TSC tick delta.
 357  353                   */
 358  354                  flags = clear_int_flag();
 359  355  
 360  356                  tsc = tsc_read();
 361  357  
 362  358                  if (gethrtimef == tsc_gethrtime_delta)
 363  359                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
 364  360  
 365  361                  restore_int_flag(flags);
 366  362  
 367  363                  /*
 368  364                   * See the comments in tsc_gethrtime(), above.
 369  365                   */
 370  366                  if (tsc >= shadow_tsc_last)
 371  367                          tsc -= shadow_tsc_last;
 372  368                  else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
 373  369                          tsc = 0;
 374  370                  else
 375  371                          tsc = tsc_protect(tsc);
 376  372  
 377  373                  hrt = shadow_tsc_hrtime_base;
 378  374  
 379  375                  TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
 380  376          } while ((old_hres_lock & ~1) != shadow_hres_lock);
 381  377  
 382  378          return (hrt);
 383  379  }
 384  380  
 385  381  hrtime_t
 386  382  tsc_gethrtimeunscaled(void)
 387  383  {
 388  384          uint32_t old_hres_lock;
 389  385          hrtime_t tsc;
 390  386  
 391  387          do {
 392  388                  old_hres_lock = hres_lock;
 393  389  
 394  390                  /* See tsc_tick(). */
 395  391                  tsc = tsc_read() + tsc_last_jumped;
 396  392          } while ((old_hres_lock & ~1) != hres_lock);
 397  393  
 398  394          return (tsc);
 399  395  }
 400  396  
 401  397  /*
 402  398   * Convert a nanosecond based timestamp to tsc
 403  399   */
 404  400  uint64_t
 405  401  tsc_unscalehrtime(hrtime_t nsec)
 406  402  {
 407  403          hrtime_t tsc;
 408  404  
 409  405          if (tsc_gethrtime_enable) {
 410  406                  TSC_CONVERT(nsec, tsc, nsec_unscale);
 411  407                  return (tsc);
 412  408          }
 413  409          return ((uint64_t)nsec);
 414  410  }
 415  411  
 416  412  /* Convert a tsc timestamp to nanoseconds */
 417  413  void
 418  414  tsc_scalehrtime(hrtime_t *tsc)
 419  415  {
 420  416          hrtime_t hrt;
 421  417          hrtime_t mytsc;
 422  418  
 423  419          if (tsc == NULL)
 424  420                  return;
 425  421          mytsc = *tsc;
 426  422  
 427  423          TSC_CONVERT(mytsc, hrt, nsec_scale);
 428  424          *tsc  = hrt;
 429  425  }
 430  426  
 431  427  hrtime_t
 432  428  tsc_gethrtimeunscaled_delta(void)
 433  429  {
 434  430          hrtime_t hrt;
 435  431          ulong_t flags;
 436  432  
 437  433          /*
 438  434           * Similarly to tsc_gethrtime_delta, we need to disable preemption
 439  435           * to prevent migration between the call to tsc_gethrtimeunscaled
 440  436           * and adding the CPU's hrtime delta. Note that disabling and
 441  437           * reenabling preemption is forbidden here because we may be in the
 442  438           * middle of a fast trap. In the amd64 kernel we cannot tolerate
 443  439           * preemption during a fast trap. See _update_sregs().
 444  440           */
 445  441  
 446  442          flags = clear_int_flag();
 447  443          hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
 448  444          restore_int_flag(flags);
 449  445  
 450  446          return (hrt);
 451  447  }
 452  448  
 453  449  /*
 454  450   * Called by the master in the TSC sync operation (usually the boot CPU).
 455  451   * If the slave is discovered to have a skew, gethrtimef will be changed to
 456  452   * point to tsc_gethrtime_delta(). Calculating skews is precise only when
 457  453   * the master and slave TSCs are read simultaneously; however, there is no
 458  454   * algorithm that can read both CPUs in perfect simultaneity. The proposed
 459  455   * algorithm is an approximate method based on the behaviour of cache
 460  456   * management. The slave CPU continuously reads TSC and then reads a global
  461  457   * variable which the master CPU updates. The moment the master's update
  462  458   * becomes visible to the slave (forced by an mfence operation), we use the TSC
  463  459   * reading taken on the slave. A corresponding TSC read will be taken on the
 464  460   * master as soon as possible after finishing the mfence operation. But the
 465  461   * delay between causing the slave to notice the invalid cache line and the
  466  462   * completion of the mfence is not repeatable. This error is heuristically
  467  463   * assumed to be 1/4th of the total write time, as measured by the two TSC reads
 468  464   * on the master sandwiching the mfence. Furthermore, due to the nature of
  469  465   * bus arbitration, contention on the memory bus, etc., the time taken for the
  470  466   * write to reflect globally can vary a lot. So instead of taking a single
  471  467   * reading, a set of readings is taken and the one with the least write time is
  472  468   * chosen to calculate the final skew.
 473  469   *
 474  470   * TSC sync is disabled in the context of virtualization because the CPUs
  475  471   * assigned to the guest are virtual CPUs, which means the real CPUs on which
  476  472   * the guest runs keep changing during the lifetime of the guest OS. We would
  477  473   * end up calculating TSC skews for one set of CPUs during boot, whereas the
  478  474   * guest might migrate to a different set of physical CPUs at a later point
  479  475   * in time.
 480  476   */
 481  477  void
 482  478  tsc_sync_master(processorid_t slave)
 483  479  {
 484  480          ulong_t flags, source, min_write_time = ~0UL;
 485  481          hrtime_t write_time, x, mtsc_after, tdelta;
 486  482          tsc_sync_t *tsc = tscp;
 487  483          int cnt;
 488  484          int hwtype;
 489  485  
 490  486          hwtype = get_hwenv();
 491  487          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
 492  488                  return;
 493  489  
 494  490          flags = clear_int_flag();
 495  491          source = CPU->cpu_id;
 496  492  
 497  493          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
 498  494                  while (tsc_sync_go != TSC_SYNC_GO)
 499  495                          SMT_PAUSE();
 500  496  
 501  497                  tsc->master_tsc = tsc_read();
 502  498                  membar_enter();
 503  499                  mtsc_after = tsc_read();
 504  500                  while (tsc_sync_go != TSC_SYNC_DONE)
 505  501                          SMT_PAUSE();
 506  502                  write_time =  mtsc_after - tsc->master_tsc;
 507  503                  if (write_time <= min_write_time) {
 508  504                          min_write_time = write_time;
 509  505                          /*
 510  506                           * Apply heuristic adjustment only if the calculated
 511  507                           * delta is > 1/4th of the write time.
 512  508                           */
 513  509                          x = tsc->slave_tsc - mtsc_after;
 514  510                          if (x < 0)
 515  511                                  x = -x;
 516  512                          if (x > (min_write_time/4))
 517  513                                  /*
 518  514                                   * Subtract 1/4th of the measured write time
 519  515                                   * from the master's TSC value, as an estimate
 520  516                                   * of how late the mfence completion came
 521  517                                   * after the slave noticed the cache line
 522  518                                   * change.
 523  519                                   */
 524  520                                  tdelta = tsc->slave_tsc -
 525  521                                      (mtsc_after - (min_write_time/4));
 526  522                          else
 527  523                                  tdelta = tsc->slave_tsc - mtsc_after;
 528  524                          tsc_sync_tick_delta[slave] =
 529  525                              tsc_sync_tick_delta[source] - tdelta;
 530  526                  }
 531  527  
 532  528                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 533  529                  membar_enter();
 534  530                  tsc_sync_go = TSC_SYNC_STOP;
 535  531          }
 536  532          if (tdelta < 0)
 537  533                  tdelta = -tdelta;
 538  534          if (tdelta > largest_tsc_delta)
  
  
 539  535                  largest_tsc_delta = tdelta;
 540  536          if (min_write_time < shortest_write_time)
 541  537                  shortest_write_time = min_write_time;
 542  538          /*
 543  539           * Enable delta variants of tsc functions if the largest of all chosen
 544  540           * deltas is > smallest of the write time.
 545  541           */
 546  542          if (largest_tsc_delta > shortest_write_time) {
 547  543                  gethrtimef = tsc_gethrtime_delta;
 548  544                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
      545 +                tsc_ncpu = NCPU;
 549  546          }
 550  547          restore_int_flag(flags);
 551  548  }
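
To make the skew heuristic concrete, here is a standalone rendition of the
tdelta selection performed inside the loop above (a sketch; the numbers in the
comment are invented to illustrate both branches):

	/*
	 * Example: the master writes master_tsc at tick 1000 and reads
	 * mtsc_after = 1080, so the write took 80 ticks.  A slave snapshot
	 * of 1090 is within 80 / 4 = 20 of mtsc_after, so tdelta = 10.
	 * A snapshot of 1200 is not, so a quarter of the write time is
	 * charged to mfence-completion latency:
	 * tdelta = 1200 - (1080 - 20) = 140.
	 */
	static hrtime_t
	estimate_tdelta(hrtime_t slave_tsc, hrtime_t mtsc_after,
	    hrtime_t min_write_time)
	{
		hrtime_t x = slave_tsc - mtsc_after;

		if (x < 0)
			x = -x;
		if (x > min_write_time / 4)
			return (slave_tsc - (mtsc_after - min_write_time / 4));
		return (slave_tsc - mtsc_after);
	}
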
 552  549  
 553  550  /*
 554  551   * Called by a CPU which has just been onlined.  It is expected that the CPU
 555  552   * performing the online operation will call tsc_sync_master().
 556  553   *
 557  554   * TSC sync is disabled in the context of virtualization. See comments
 558  555   * above tsc_sync_master.
 559  556   */
 560  557  void
 561  558  tsc_sync_slave(void)
 562  559  {
 563  560          ulong_t flags;
 564  561          hrtime_t s1;
 565  562          tsc_sync_t *tsc = tscp;
 566  563          int cnt;
 567  564          int hwtype;
 568  565  
 569  566          hwtype = get_hwenv();
 570  567          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
 571  568                  return;
 572  569  
 573  570          flags = clear_int_flag();
 574  571  
 575  572          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
 576  573                  /* Re-fill the cache line */
 577  574                  s1 = tsc->master_tsc;
 578  575                  membar_enter();
 579  576                  tsc_sync_go = TSC_SYNC_GO;
 580  577                  do {
 581  578                          /*
 582  579                           * Do not put an SMT_PAUSE here. For instance,
 583  580                           * if the master and slave are really the same
 584  581                           * hyper-threaded CPU, then you want the master
 585  582                           * to yield to the slave as quickly as possible here,
 586  583                           * but not the other way.
 587  584                           */
 588  585                          s1 = tsc_read();
 589  586                  } while (tsc->master_tsc == 0);
 590  587                  tsc->slave_tsc = s1;
 591  588                  membar_enter();
 592  589                  tsc_sync_go = TSC_SYNC_DONE;
 593  590  
 594  591                  while (tsc_sync_go != TSC_SYNC_STOP)
 595  592                          SMT_PAUSE();
 596  593          }
 597  594  
 598  595          restore_int_flag(flags);
 599  596  }
 600  597  
 601  598  /*
 602  599   * Called once per second on a CPU from the cyclic subsystem's
  603  600   * CY_HIGH_LEVEL interrupt.  (No longer cpu0-only.)
 604  601   */
 605  602  void
 606  603  tsc_tick(void)
 607  604  {
 608  605          hrtime_t now, delta;
 609  606          ushort_t spl;
 610  607  
 611  608          /*
 612  609           * Before we set the new variables, we set the shadow values.  This
 613  610           * allows for lock free operation in dtrace_gethrtime().
 614  611           */
 615  612          lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
 616  613              ipltospl(CBE_HIGH_PIL), &spl);
 617  614  
 618  615          shadow_tsc_hrtime_base = tsc_hrtime_base;
 619  616          shadow_tsc_last = tsc_last;
 620  617          shadow_nsec_scale = nsec_scale;
 621  618  
 622  619          shadow_hres_lock++;
 623  620          splx(spl);
 624  621  
 625  622          CLOCK_LOCK(&spl);
 626  623  
 627  624          now = tsc_read();
 628  625  
 629  626          if (gethrtimef == tsc_gethrtime_delta)
 630  627                  now += tsc_sync_tick_delta[CPU->cpu_id];
 631  628  
 632  629          if (now < tsc_last) {
 633  630                  /*
 634  631                   * The TSC has just jumped into the past.  We assume that
 635  632                   * this is due to a suspend/resume cycle, and we're going
 636  633                   * to use the _current_ value of TSC as the delta.  This
 637  634                   * will keep tsc_hrtime_base correct.  We're also going to
  638  635                   * assume that the rate of the TSC does not change after a
  639  636                   * suspend/resume (i.e., nsec_scale remains the same).
 640  637                   */
 641  638                  delta = now;
 642  639                  delta = tsc_protect(delta);
 643  640                  tsc_last_jumped += tsc_last;
 644  641                  tsc_jumped = 1;
 645  642          } else {
 646  643                  /*
 647  644                   * Determine the number of TSC ticks since the last clock
 648  645                   * tick, and add that to the hrtime base.
 649  646                   */
 650  647                  delta = now - tsc_last;
 651  648          }
 652  649  
 653  650          TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
 654  651          tsc_last = now;
 655  652  
 656  653          CLOCK_UNLOCK(spl);
 657  654  }
 658  655  
 659  656  void
 660  657  tsc_hrtimeinit(uint64_t cpu_freq_hz)
 661  658  {
 662  659          extern int gethrtime_hires;
 663  660          longlong_t tsc;
 664  661          ulong_t flags;
 665  662  
 666  663          /*
 667  664           * cpu_freq_hz is the measured cpu frequency in hertz
 668  665           */
 669  666  
 670  667          /*
 671  668           * We can't accommodate CPUs slower than 31.25 MHz.
 672  669           */
 673  670          ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
 674  671          nsec_scale =
 675  672              (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
 676  673          nsec_unscale =
 677  674              (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
 678  675  
 679  676          flags = clear_int_flag();
 680  677          tsc = tsc_read();
  
  
 681  678          (void) tsc_gethrtime();
 682  679          tsc_max_delta = tsc_read() - tsc;
 683  680          restore_int_flag(flags);
 684  681          gethrtimef = tsc_gethrtime;
 685  682          gethrtimeunscaledf = tsc_gethrtimeunscaled;
 686  683          scalehrtimef = tsc_scalehrtime;
 687  684          unscalehrtimef = tsc_unscalehrtime;
 688  685          hrtime_tick = tsc_tick;
 689  686          gethrtime_hires = 1;
 690  687          /*
       688 +         * Being part of the comm page, tsc_ncpu communicates the published
       689 +         * length of the tsc_sync_tick_delta array.  It is kept at zero so that
       690 +         * the not-yet-present delta data is ignored until the TSCs are synced.
      691 +         */
      692 +        tsc_ncpu = 0;
      693 +        /*
 691  694           * Allocate memory for the structure used in the tsc sync logic.
 692  695           * This structure should be aligned on a multiple of cache line size.
 693  696           */
 694  697          tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
 695  698  
 696  699          /*
 697  700           * Convert the TSC resume cap ns value into its unscaled TSC value.
 698  701           * See tsc_gethrtime().
 699  702           */
 700  703          if (tsc_resume_cap == 0)
 701  704                  TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
 702  705  }
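
The tsc_ncpu initialization above is the comm-page interlock added by this
change: consumers treat tsc_ncpu as the published length of
tsc_sync_tick_delta[], so zero means "no valid per-CPU deltas yet", and
tsc_sync_master()/tsc_adjust_delta() raise it to NCPU only once the delta
variants are enabled. A hedged sketch of how a reader might honor it (cpu_id
stands for however the consumer identifies its CPU; illustrative only):

	/* Apply a per-CPU delta only after the kernel has published them. */
	hrtime_t delta = (cpu_id < tsc_ncpu) ? tsc_sync_tick_delta[cpu_id] : 0;
	hrtime_t tsc = tsc_read() + delta;
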
 703  706  
 704  707  int
 705  708  get_tsc_ready()
 706  709  {
 707  710          return (tsc_ready);
 708  711  }
 709  712  
 710  713  /*
 711  714   * Adjust all the deltas by adding the passed value to the array.
  712  715   * Then use the "delta" versions of the gethrtime functions.
 713  716   * Note that 'tdelta' _could_ be a negative number, which should
 714  717   * reduce the values in the array (used, for example, if the Solaris
  715  718   * instance was moved by a virtual machine manager to a machine with a
  716  719   * higher TSC value).
 717  720   */
 718  721  void
  
  
 719  722  tsc_adjust_delta(hrtime_t tdelta)
 720  723  {
 721  724          int             i;
 722  725  
 723  726          for (i = 0; i < NCPU; i++) {
 724  727                  tsc_sync_tick_delta[i] += tdelta;
 725  728          }
 726  729  
 727  730          gethrtimef = tsc_gethrtime_delta;
 728  731          gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
      732 +        tsc_ncpu = NCPU;
 729  733  }
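
For example (invented values): a guest migrated to a host whose TSC currently
reads higher than the old host's would shrink every delta by the difference,
keeping gethrtime() continuous across the move:

	/* t_old_host/t_new_host are hypothetical per-host TSC readings. */
	tsc_adjust_delta(t_old_host - t_new_host);	/* negative tdelta */
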
 730  734  
 731  735  /*
 732  736   * Functions to manage TSC and high-res time on suspend and resume.
 733  737   */
 734  738  
 735  739  /*
 736  740   * declarations needed for time adjustment
 737  741   */
 738  742  extern void     rtcsync(void);
 739  743  extern tod_ops_t *tod_ops;
 740  744  /* There must be a better way than exposing nsec_scale! */
 741  745  extern uint_t   nsec_scale;
 742  746  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 743  747  static timestruc_t tsc_saved_ts;
 744  748  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 745  749  int             tsc_delta_onsuspend = 0;
 746  750  int             tsc_adjust_seconds = 1;
 747  751  int             tsc_suspend_count = 0;
 748  752  int             tsc_resume_in_cyclic = 0;
 749  753  
 750  754  /*
 751  755   * Let timestamp.c know that we are suspending.  It needs to take
 752  756   * snapshots of the current time, and do any pre-suspend work.
 753  757   */
 754  758  void
 755  759  tsc_suspend(void)
 756  760  {
 757  761  /*
  758  762   * What we need to do here is record the time at which we suspended, so that
  759  763   * we know how much to add at resume.
 760  764   * This routine is called by each CPU, so we need to handle reentry.
 761  765   */
 762  766          if (tsc_gethrtime_enable) {
 763  767                  /*
 764  768                   * We put the tsc_read() inside the lock as it
  765  769                   * has no locking constraints, and it puts the
  766  770                   * acquired value closer to the time stamp (in
 767  771                   * case we delay getting the lock).
 768  772                   */
 769  773                  mutex_enter(&tod_lock);
 770  774                  tsc_saved_tsc = tsc_read();
 771  775                  tsc_saved_ts = TODOP_GET(tod_ops);
 772  776                  mutex_exit(&tod_lock);
 773  777                  /* We only want to do this once. */
 774  778                  if (tsc_needs_resume == 0) {
 775  779                          if (tsc_delta_onsuspend) {
 776  780                                  tsc_adjust_delta(tsc_saved_tsc);
 777  781                          } else {
 778  782                                  tsc_adjust_delta(nsec_scale);
 779  783                          }
 780  784                          tsc_suspend_count++;
 781  785                  }
 782  786          }
 783  787  
 784  788          invalidate_cache();
 785  789          tsc_needs_resume = 1;
 786  790  }
 787  791  
 788  792  /*
 789  793   * Restore all timestamp state based on the snapshots taken at
 790  794   * suspend time.
 791  795   */
 792  796  void
 793  797  tsc_resume(void)
 794  798  {
 795  799          /*
 796  800           * We only need to (and want to) do this once.  So let the first
 797  801           * caller handle this (we are locked by the cpu lock), as it
 798  802           * is preferential that we get the earliest sync.
 799  803           */
 800  804          if (tsc_needs_resume) {
 801  805                  /*
 802  806                   * If using the TSC, adjust the delta based on how long
 803  807                   * we were sleeping (or away).  We also adjust for
 804  808                   * migration and a grown TSC.
 805  809                   */
 806  810                  if (tsc_saved_tsc != 0) {
 807  811                          timestruc_t     ts;
 808  812                          hrtime_t        now, sleep_tsc = 0;
 809  813                          int             sleep_sec;
 810  814                          extern void     tsc_tick(void);
 811  815                          extern uint64_t cpu_freq_hz;
 812  816  
 813  817                          /* tsc_read() MUST be before TODOP_GET() */
 814  818                          mutex_enter(&tod_lock);
 815  819                          now = tsc_read();
 816  820                          ts = TODOP_GET(tod_ops);
 817  821                          mutex_exit(&tod_lock);
 818  822  
 819  823                          /* Compute seconds of sleep time */
 820  824                          sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;
 821  825  
 822  826                          /*
  823  827                           * If the current ts is less than or equal to
  824  828                           * the saved ts, then there is likely a
 825  829                           * problem with the clock.  Assume at least
 826  830                           * one second has passed, so that time goes forward.
 827  831                           */
 828  832                          if (sleep_sec <= 0) {
 829  833                                  sleep_sec = 1;
 830  834                          }
 831  835  
  832  836                          /* How many TSC ticks should have occurred while sleeping */
 833  837                          if (tsc_adjust_seconds)
 834  838                                  sleep_tsc = sleep_sec * cpu_freq_hz;
 835  839  
 836  840                          /*
 837  841                           * We also want to subtract from the "sleep_tsc"
 838  842                           * the current value of tsc_read(), so that our
 839  843                           * adjustment accounts for the amount of time we
 840  844                           * have been resumed _or_ an adjustment based on
 841  845                           * the fact that we didn't actually power off the
 842  846                           * CPU (migration is another issue, but _should_
 843  847                           * also comply with this calculation).  If the CPU
 844  848                           * never powered off, then:
 845  849                           *    'now == sleep_tsc + saved_tsc'
 846  850                           * and the delta will effectively be "0".
 847  851                           */
 848  852                          sleep_tsc -= now;
 849  853                          if (tsc_delta_onsuspend) {
 850  854                                  tsc_adjust_delta(sleep_tsc);
 851  855                          } else {
 852  856                                  tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
 853  857                          }
 854  858                          tsc_saved_tsc = 0;
 855  859  
 856  860                          tsc_tick();
 857  861                  }
 858  862                  tsc_needs_resume = 0;
 859  863          }
 860  864  
 861  865  }
  
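
Taken together, the deletions above (nsec_scale, tsc_last, tsc_hrtime_base,
tsc_max_delta, tsc_resume_cap and tsc_sync_tick_delta losing their static,
file-local definitions) and the new #include <sys/comm_page.h> are what make
OS-5192's faster clock_gettime possible: the variables move into the comm
page, where a userland mapping can read them without a system call. A hedged
sketch of such a consumer, assuming a mapped structure exposing those
variables plus hres_lock (member names are illustrative, not the committed
comm-page layout, and the tsc_max_delta and resume-cap branches are omitted
for brevity):

	#include <stdint.h>

	typedef long long hrtime_t;

	/* Assumed comm-page view; illustrative member names only. */
	typedef struct comm_page {
		volatile uint32_t cp_hres_lock;
		uint32_t cp_nsec_scale;
		volatile uint64_t cp_tsc_last;
		volatile hrtime_t cp_tsc_hrtime_base;
	} comm_page_t;

	#define NSEC_SHIFT	5

	static inline uint64_t
	rdtsc(void)
	{
		uint32_t lo, hi;
		__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
		return (((uint64_t)hi << 32) | lo);
	}

	hrtime_t
	fast_gethrtime(const comm_page_t *cp)
	{
		uint32_t old;
		uint64_t tsc;
		hrtime_t hrt;

		do {
			old = cp->cp_hres_lock;
			tsc = rdtsc();
			/* Simplified monotonicity check; see tsc_gethrtime(). */
			tsc = (tsc >= cp->cp_tsc_last) ?
			    tsc - cp->cp_tsc_last : 0;
			hrt = cp->cp_tsc_hrtime_base;
			/* The TSC_CONVERT_AND_ADD arithmetic, in plain C. */
			hrt += ((tsc >> 32) * cp->cp_nsec_scale) << NSEC_SHIFT;
			hrt += ((tsc & 0xffffffffULL) * cp->cp_nsec_scale) >>
			    (32 - NSEC_SHIFT);
		} while ((old & ~1) != cp->cp_hres_lock);

		return (hrt);
	}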