          --- old/usr/src/uts/i86pc/os/timestamp.c
          +++ new/usr/src/uts/i86pc/os/timestamp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   *
  26   26   * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27   27   * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  28   28   * Copyright 2016 Joyent, Inc.
  29   29   */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/param.h>
  33   33  #include <sys/systm.h>
  34   34  #include <sys/disp.h>
  35   35  #include <sys/var.h>
  36   36  #include <sys/cmn_err.h>
  37   37  #include <sys/debug.h>
  38   38  #include <sys/x86_archext.h>
  39   39  #include <sys/archsystm.h>
  40   40  #include <sys/cpuvar.h>
  41   41  #include <sys/psm_defs.h>
  42   42  #include <sys/clock.h>
  43   43  #include <sys/atomic.h>
  44   44  #include <sys/lockstat.h>
  45   45  #include <sys/smp_impldefs.h>
  46   46  #include <sys/dtrace.h>
  47   47  #include <sys/time.h>
  48   48  #include <sys/panic.h>
  49   49  #include <sys/cpu.h>
  50   50  #include <sys/sdt.h>
  51   51  #include <sys/comm_page.h>
  52   52  
  53   53  /*
  54   54   * Using the Pentium's TSC register for gethrtime()
  55   55   * ------------------------------------------------
  56   56   *
  57   57   * The Pentium family, like many chip architectures, has a high-resolution
  58   58   * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
  59   59   * of the timestamp counter are read with the RDTSC instruction.
  60   60   *
  61   61   * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
  62   62   * must be translated into nanoseconds in order to implement gethrtime().
  63   63   * We avoid inducing floating point operations in this conversion by
  64   64   * implementing the same nsec_scale algorithm as that found in the sun4u
  65   65   * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
  66   66   * a detailed description of the algorithm; the comment is not reproduced
  67   67   * here.  This implementation differs only in its value for NSEC_SHIFT:
  68   68   * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
  69   69   * 60 MHz Pentiums.
  70   70   *
  71   71   * While TSC and %tick are both cycle counting registers, TSC's functionality
  72   72   * falls short in several critical ways:
  73   73   *
  74   74   *  (a) TSCs on different CPUs are not guaranteed to be in sync.  While in
  75   75   *      practice they often _are_ in sync, this isn't guaranteed by the
  76   76   *      architecture.
  77   77   *
  78   78   *  (b) The TSC cannot be reliably set to an arbitrary value.  The architecture
  79   79   *      only supports writing the low 32-bits of TSC, making it impractical
  80   80   *      to rewrite.
  81   81   *
  82   82   *  (c) The architecture doesn't have the capacity to interrupt based on
  83   83   *      arbitrary values of TSC; there is no TICK_CMPR equivalent.
  84   84   *
  85   85   * Together, (a) and (b) imply that software must track the skew between
  86   86   * TSCs and account for it (it is assumed that while there may exist skew,
  87   87   * there does not exist drift).  To determine the skew between CPUs, we
  88   88   * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
  89   89   * the online operation calls tsc_sync_master().
  90   90   *
  91   91   * In the absence of time-of-day clock adjustments, gethrtime() must stay in
  92   92   * sync with gettimeofday().  This is problematic; given (c), the software
  93   93   * cannot drive its time-of-day source from TSC, and yet they must somehow be
  94   94   * kept in sync.  We implement this by having a routine, tsc_tick(), which
  95   95   * is called once per second from the interrupt which drives time-of-day.
  96   96   *
  97   97   * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
  98   98   * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
  99   99   * monotonically increases.
 100  100   */
 101  101  
 102  102  #define NSEC_SHIFT 5
 103  103  
 104  104  static uint_t nsec_unscale;
 105  105  
 106  106  /*
  107  107   * These two variables used to be grouped together inside a structure that
 108  108   * lived on a single cache line. A regression (bug ID 4623398) caused the
 109  109   * compiler to emit code that "optimized" away the while-loops below. The
 110  110   * result was that no synchronization between the onlining and onlined CPUs
 111  111   * took place.
 112  112   */
 113  113  static volatile int tsc_ready;
 114  114  static volatile int tsc_sync_go;
 115  115  
 116  116  /*
 117  117   * Used as indices into the tsc_sync_snaps[] array.
 118  118   */
 119  119  #define TSC_MASTER              0
 120  120  #define TSC_SLAVE               1
 121  121  
 122  122  /*
  123  123   * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 124  124   */
 125  125  #define TSC_SYNC_STOP           1
 126  126  #define TSC_SYNC_GO             2
 127  127  #define TSC_SYNC_DONE           3
 128  128  #define SYNC_ITERATIONS         10
 129  129  
 130  130  #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) {          \
 131  131          unsigned int *_l = (unsigned int *)&(tsc);      \
 132  132          (hrt) += mul32(_l[1], scale) << NSEC_SHIFT;     \
 133  133          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 134  134  }
 135  135  
 136  136  #define TSC_CONVERT(tsc, hrt, scale) {                  \
  137  137          unsigned int *_l = (unsigned int *)&(tsc);      \
  138  138          (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;      \
 139  139          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 140  140  }
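
To make the fixed-point conversion concrete, here is a minimal user-space
sketch (not from this file) of what TSC_CONVERT_AND_ADD computes:
hrt = tsc * nsec_scale / 2^(32 - NSEC_SHIFT), pieced together from two
32x32->64-bit multiplies. The clock rate and TSC value are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define NANOSEC         1000000000ULL
#define NSEC_SHIFT      5

static uint64_t
mul32(uint32_t a, uint32_t b)
{
        return ((uint64_t)a * b);
}

int
main(void)
{
        uint64_t cpu_freq_hz = 2400000000ULL;   /* assumed 2.4 GHz clock */
        uint64_t tsc = 48000000000ULL;          /* 20 seconds of cycles */
        uint32_t scale = (uint32_t)((NANOSEC << (32 - NSEC_SHIFT)) /
            cpu_freq_hz);                       /* nsec_scale analogue */
        uint32_t lo = (uint32_t)tsc;
        uint32_t hi = (uint32_t)(tsc >> 32);
        uint64_t hrt;

        /* Same shape as the TSC_CONVERT_AND_ADD macro body above. */
        hrt = mul32(hi, scale) << NSEC_SHIFT;
        hrt += mul32(lo, scale) >> (32 - NSEC_SHIFT);

        /* Prints roughly 20000000000 ns for 4.8e10 cycles at 2.4 GHz. */
        printf("%llu cycles -> %llu ns\n",
            (unsigned long long)tsc, (unsigned long long)hrt);
        return (0);
}
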
 141  141  
 142  142  int tsc_master_slave_sync_needed = 1;
 143  143  
 144  144  typedef struct tsc_sync {
 145  145          volatile hrtime_t master_tsc, slave_tsc;
 146  146  } tsc_sync_t;
 147  147  static tsc_sync_t *tscp;
      148 +static hrtime_t largest_tsc_delta = 0;
      149 +static ulong_t shortest_write_time = ~0UL;
 148  150  
 149  151  static hrtime_t tsc_last_jumped = 0;
 150  152  static int      tsc_jumped = 0;
 151  153  static uint32_t tsc_wayback = 0;
 152  154  /*
  153  155   * The cap of 1 second was chosen since that is the period at which the
  154  156   * tsc_tick() function runs, which means that when gethrtime() is called
  155  157   * it should never have been more than 1 second since tsc_last was updated.
 156  158   */
 157  159  static hrtime_t tsc_resume_cap_ns = NANOSEC;     /* 1s */
 158  160  
 159  161  static hrtime_t shadow_tsc_hrtime_base;
 160  162  static hrtime_t shadow_tsc_last;
 161  163  static uint_t   shadow_nsec_scale;
 162  164  static uint32_t shadow_hres_lock;
 163  165  int get_tsc_ready();
 164  166  
 165  167  static inline
 166  168  hrtime_t tsc_protect(hrtime_t a) {
 167  169          if (a > tsc_resume_cap) {
 168  170                  atomic_inc_32(&tsc_wayback);
  169  171                  DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
 170  172                      uint32_t, tsc_wayback);
 171  173                  return (tsc_resume_cap);
 172  174          }
 173  175          return (a);
 174  176  }
 175  177  
 176  178  hrtime_t
 177  179  tsc_gethrtime(void)
 178  180  {
 179  181          uint32_t old_hres_lock;
 180  182          hrtime_t tsc, hrt;
 181  183  
 182  184          do {
 183  185                  old_hres_lock = hres_lock;
 184  186  
 185  187                  if ((tsc = tsc_read()) >= tsc_last) {
 186  188                          /*
 187  189                           * It would seem to be obvious that this is true
 188  190                           * (that is, the past is less than the present),
 189  191                           * but it isn't true in the presence of suspend/resume
 190  192                           * cycles.  If we manage to call gethrtime()
 191  193                           * after a resume, but before the first call to
 192  194                           * tsc_tick(), we will see the jump.  In this case,
 193  195                           * we will simply use the value in TSC as the delta.
 194  196                           */
 195  197                          tsc -= tsc_last;
 196  198                  } else if (tsc >= tsc_last - 2*tsc_max_delta) {
 197  199                          /*
 198  200                           * There is a chance that tsc_tick() has just run on
 199  201                           * another CPU, and we have drifted just enough so that
 200  202                           * we appear behind tsc_last.  In this case, force the
 201  203                           * delta to be zero.
 202  204                           */
 203  205                          tsc = 0;
 204  206                  } else {
 205  207                          /*
 206  208                           * If we reach this else clause we assume that we have
 207  209                           * gone through a suspend/resume cycle and use the
 208  210                           * current tsc value as the delta.
 209  211                           *
 210  212                           * In rare cases we can reach this else clause due to
 211  213                           * a lack of monotonicity in the TSC value.  In such
 212  214                           * cases using the current TSC value as the delta would
 213  215                           * cause us to return a value ~2x of what it should
 214  216                           * be.  To protect against these cases we cap the
 215  217                           * suspend/resume delta at tsc_resume_cap.
 216  218                           */
 217  219                          tsc = tsc_protect(tsc);
 218  220                  }
 219  221  
 220  222                  hrt = tsc_hrtime_base;
 221  223  
 222  224                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 223  225          } while ((old_hres_lock & ~1) != hres_lock);
 224  226  
 225  227          return (hrt);
 226  228  }
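
The do/while retry above is a lock-free reader: hres_lock's low bit is set
while the clock variables are being updated and the word is incremented past
it on release, so clearing the low bit of the snapshot forces a retry whenever
a writer was active at the start or the lock word changed mid-read. A minimal
sketch of the pattern (not from this file; demo_lock and demo_read_data() are
illustrative stand-ins for hres_lock and the reads of tsc_last,
tsc_hrtime_base, and nsec_scale):

#include <stdint.h>

extern volatile uint32_t demo_lock;
extern uint64_t demo_read_data(void);

uint64_t
demo_read(void)
{
        uint32_t old;
        uint64_t val;

        do {
                /*
                 * Snapshot the lock word, read the protected data, and
                 * retry if a writer was active when we started (low bit
                 * set) or the lock word changed while we were reading.
                 */
                old = demo_lock;
                val = demo_read_data();
        } while ((old & ~1) != demo_lock);

        return (val);
}
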
 227  229  
 228  230  hrtime_t
 229  231  tsc_gethrtime_delta(void)
 230  232  {
 231  233          uint32_t old_hres_lock;
 232  234          hrtime_t tsc, hrt;
 233  235          ulong_t flags;
 234  236  
 235  237          do {
 236  238                  old_hres_lock = hres_lock;
 237  239  
 238  240                  /*
 239  241                   * We need to disable interrupts here to assure that we
 240  242                   * don't migrate between the call to tsc_read() and
 241  243                   * adding the CPU's TSC tick delta. Note that disabling
 242  244                   * and reenabling preemption is forbidden here because
 243  245                   * we may be in the middle of a fast trap. In the amd64
 244  246                   * kernel we cannot tolerate preemption during a fast
 245  247                   * trap. See _update_sregs().
 246  248                   */
 247  249  
 248  250                  flags = clear_int_flag();
 249  251                  tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
 250  252                  restore_int_flag(flags);
 251  253  
 252  254                  /* See comments in tsc_gethrtime() above */
 253  255  
 254  256                  if (tsc >= tsc_last) {
 255  257                          tsc -= tsc_last;
 256  258                  } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
 257  259                          tsc = 0;
 258  260                  } else {
 259  261                          tsc = tsc_protect(tsc);
 260  262                  }
 261  263  
 262  264                  hrt = tsc_hrtime_base;
 263  265  
 264  266                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 265  267          } while ((old_hres_lock & ~1) != hres_lock);
 266  268  
 267  269          return (hrt);
 268  270  }
 269  271  
 270  272  hrtime_t
 271  273  tsc_gethrtime_tick_delta(void)
 272  274  {
 273  275          hrtime_t hrt;
 274  276          ulong_t flags;
 275  277  
 276  278          flags = clear_int_flag();
 277  279          hrt = tsc_sync_tick_delta[CPU->cpu_id];
 278  280          restore_int_flag(flags);
 279  281  
 280  282          return (hrt);
 281  283  }
 282  284  
 283  285  /*
 284  286   * This is similar to the above, but it cannot actually spin on hres_lock.
 285  287   * As a result, it caches all of the variables it needs; if the variables
 286  288   * don't change, it's done.
 287  289   */
 288  290  hrtime_t
 289  291  dtrace_gethrtime(void)
 290  292  {
 291  293          uint32_t old_hres_lock;
 292  294          hrtime_t tsc, hrt;
 293  295          ulong_t flags;
 294  296  
 295  297          do {
 296  298                  old_hres_lock = hres_lock;
 297  299  
 298  300                  /*
 299  301                   * Interrupts are disabled to ensure that the thread isn't
 300  302                   * migrated between the tsc_read() and adding the CPU's
 301  303                   * TSC tick delta.
 302  304                   */
 303  305                  flags = clear_int_flag();
 304  306  
 305  307                  tsc = tsc_read();
 306  308  
 307  309                  if (gethrtimef == tsc_gethrtime_delta)
 308  310                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
 309  311  
 310  312                  restore_int_flag(flags);
 311  313  
 312  314                  /*
 313  315                   * See the comments in tsc_gethrtime(), above.
 314  316                   */
 315  317                  if (tsc >= tsc_last)
 316  318                          tsc -= tsc_last;
 317  319                  else if (tsc >= tsc_last - 2*tsc_max_delta)
 318  320                          tsc = 0;
 319  321                  else
 320  322                          tsc = tsc_protect(tsc);
 321  323  
 322  324                  hrt = tsc_hrtime_base;
 323  325  
 324  326                  TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
 325  327  
 326  328                  if ((old_hres_lock & ~1) == hres_lock)
 327  329                          break;
 328  330  
 329  331                  /*
 330  332                   * If we're here, the clock lock is locked -- or it has been
 331  333                   * unlocked and locked since we looked.  This may be due to
 332  334                   * tsc_tick() running on another CPU -- or it may be because
 333  335                   * some code path has ended up in dtrace_probe() with
 334  336                   * CLOCK_LOCK held.  We'll try to determine that we're in
 335  337                   * the former case by taking another lap if the lock has
 336  338                   * changed since when we first looked at it.
 337  339                   */
 338  340                  if (old_hres_lock != hres_lock)
 339  341                          continue;
 340  342  
 341  343                  /*
 342  344                   * So the lock was and is locked.  We'll use the old data
 343  345                   * instead.
 344  346                   */
 345  347                  old_hres_lock = shadow_hres_lock;
 346  348  
 347  349                  /*
 348  350                   * Again, disable interrupts to ensure that the thread
 349  351                   * isn't migrated between the tsc_read() and adding
 350  352                   * the CPU's TSC tick delta.
 351  353                   */
 352  354                  flags = clear_int_flag();
 353  355  
 354  356                  tsc = tsc_read();
 355  357  
 356  358                  if (gethrtimef == tsc_gethrtime_delta)
 357  359                          tsc += tsc_sync_tick_delta[CPU->cpu_id];
 358  360  
 359  361                  restore_int_flag(flags);
 360  362  
 361  363                  /*
 362  364                   * See the comments in tsc_gethrtime(), above.
 363  365                   */
 364  366                  if (tsc >= shadow_tsc_last)
 365  367                          tsc -= shadow_tsc_last;
 366  368                  else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
 367  369                          tsc = 0;
 368  370                  else
 369  371                          tsc = tsc_protect(tsc);
 370  372  
 371  373                  hrt = shadow_tsc_hrtime_base;
 372  374  
 373  375                  TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
 374  376          } while ((old_hres_lock & ~1) != shadow_hres_lock);
 375  377  
 376  378          return (hrt);
 377  379  }
 378  380  
 379  381  hrtime_t
 380  382  tsc_gethrtimeunscaled(void)
 381  383  {
 382  384          uint32_t old_hres_lock;
 383  385          hrtime_t tsc;
 384  386  
 385  387          do {
 386  388                  old_hres_lock = hres_lock;
 387  389  
 388  390                  /* See tsc_tick(). */
 389  391                  tsc = tsc_read() + tsc_last_jumped;
 390  392          } while ((old_hres_lock & ~1) != hres_lock);
 391  393  
 392  394          return (tsc);
 393  395  }
 394  396  
 395  397  /*
 396  398   * Convert a nanosecond based timestamp to tsc
 397  399   */
 398  400  uint64_t
 399  401  tsc_unscalehrtime(hrtime_t nsec)
 400  402  {
 401  403          hrtime_t tsc;
 402  404  
 403  405          if (tsc_gethrtime_enable) {
 404  406                  TSC_CONVERT(nsec, tsc, nsec_unscale);
 405  407                  return (tsc);
 406  408          }
 407  409          return ((uint64_t)nsec);
 408  410  }
 409  411  
 410  412  /* Convert a tsc timestamp to nanoseconds */
 411  413  void
 412  414  tsc_scalehrtime(hrtime_t *tsc)
 413  415  {
 414  416          hrtime_t hrt;
 415  417          hrtime_t mytsc;
 416  418  
 417  419          if (tsc == NULL)
 418  420                  return;
 419  421          mytsc = *tsc;
 420  422  
 421  423          TSC_CONVERT(mytsc, hrt, nsec_scale);
 422  424          *tsc  = hrt;
 423  425  }
 424  426  
 425  427  hrtime_t
 426  428  tsc_gethrtimeunscaled_delta(void)
 427  429  {
 428  430          hrtime_t hrt;
 429  431          ulong_t flags;
 430  432  
 431  433          /*
 432  434           * Similarly to tsc_gethrtime_delta, we need to disable preemption
 433  435           * to prevent migration between the call to tsc_gethrtimeunscaled
 434  436           * and adding the CPU's hrtime delta. Note that disabling and
 435  437           * reenabling preemption is forbidden here because we may be in the
 436  438           * middle of a fast trap. In the amd64 kernel we cannot tolerate
  437  439           * preemption during a fast trap. See _update_sregs().
  438  440           */
 439  441  
 440  442          flags = clear_int_flag();
 441  443          hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
 442  444          restore_int_flag(flags);
 443  445  
 444  446          return (hrt);
 445  447  }
 446  448  
 447  449  /*
 448      - * TSC Sync Master
      450 + * Called by the master in the TSC sync operation (usually the boot CPU).
      451 + * If the slave is discovered to have a skew, gethrtimef will be changed to
      452 + * point to tsc_gethrtime_delta(). Calculating skews is precise only when
      453 + * the master and slave TSCs are read simultaneously; however, there is no
      454 + * algorithm that can read both CPUs in perfect simultaneity. The proposed
      455 + * algorithm is an approximate method based on the behaviour of cache
      456 + * management. The slave CPU continuously reads TSC and then reads a global
       457 + * variable which the master CPU updates. The moment the master's update becomes
       458 + * visible to the slave (forced by an mfence operation), we use the TSC
       459 + * reading taken on the slave. A corresponding TSC read will be taken on the
       460 + * master as soon as possible after finishing the mfence operation. But the
       461 + * delay between causing the slave to notice the invalid cache line and the
       462 + * completion of the mfence is not repeatable. This error is heuristically
       463 + * assumed to be 1/4th of the total write time as measured by the two TSC
       464 + * reads on the master sandwiching the mfence. Furthermore, due to the nature
       465 + * of bus arbitration, contention on the memory bus, etc., the time taken for
       466 + * the write to reflect globally can vary a lot. So instead of taking a single
       467 + * reading, a set of readings is taken and the one with the least write time
       468 + * is chosen to calculate the final skew.
 449  469   *
 450      - * Typically called on the boot CPU, this attempts to quantify TSC skew between
 451      - * different CPUs.  If an appreciable difference is found, gethrtimef will be
 452      - * changed to point to tsc_gethrtime_delta().
 453      - *
 454      - * Calculating skews is precise only when the master and slave TSCs are read
 455      - * simultaneously; however, there is no algorithm that can read both CPUs in
 456      - * perfect simultaneity.  The proposed algorithm is an approximate method based
 457      - * on the behaviour of cache management.  The slave CPU continuously polls the
 458      - * TSC while reading a global variable updated by the master CPU.  The latest
 459      - * TSC reading is saved when the master's update (forced via mfence) reaches
 460      - * visibility on the slave.  The master will also take a TSC reading
 461      - * immediately following the mfence.
 462      - *
 463      - * While the delay between cache line invalidation on the slave and mfence
 464      - * completion on the master is not repeatable, the error is heuristically
 465      - * assumed to be 1/4th of the write time recorded by the master.  Multiple
 466      - * samples are taken to control for the variance caused by external factors
 467      - * such as bus contention.  Each sample set is independent per-CPU to control
 468      - * for differing memory latency on NUMA systems.
 469      - *
 470  470   * TSC sync is disabled in the context of virtualization because the CPUs
  471  471   * assigned to the guest are virtual CPUs, which means the real CPUs on
  472  472   * which the guest runs keep changing during the lifetime of the guest OS.
  473  473   * We would end up calculating TSC skews for a set of CPUs during boot,
  474  474   * whereas the guest might migrate to a different set of physical CPUs at
  475  475   * a later point in time.
 476  476   */
 477  477  void
 478  478  tsc_sync_master(processorid_t slave)
 479  479  {
 480  480          ulong_t flags, source, min_write_time = ~0UL;
 481      -        hrtime_t write_time, mtsc_after, last_delta = 0;
      481 +        hrtime_t write_time, x, mtsc_after, tdelta;
 482  482          tsc_sync_t *tsc = tscp;
 483  483          int cnt;
 484  484          int hwtype;
 485  485  
 486  486          hwtype = get_hwenv();
 487  487          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
 488  488                  return;
 489  489  
 490  490          flags = clear_int_flag();
 491  491          source = CPU->cpu_id;
 492  492  
  493  493          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
  494  494                  while (tsc_sync_go != TSC_SYNC_GO)
 495  495                          SMT_PAUSE();
 496  496  
 497  497                  tsc->master_tsc = tsc_read();
 498  498                  membar_enter();
 499  499                  mtsc_after = tsc_read();
 500  500                  while (tsc_sync_go != TSC_SYNC_DONE)
 501  501                          SMT_PAUSE();
 502  502                  write_time =  mtsc_after - tsc->master_tsc;
 503  503                  if (write_time <= min_write_time) {
 504      -                        hrtime_t tdelta;
 505      -
 506      -                        tdelta = tsc->slave_tsc - mtsc_after;
 507      -                        if (tdelta < 0)
 508      -                                tdelta = -tdelta;
      504 +                        min_write_time = write_time;
 509  505                          /*
 510      -                         * If the margin exists, subtract 1/4th of the measured
 511      -                         * write time from the master's TSC value.  This is an
 512      -                         * estimate of how late the mfence completion came
 513      -                         * after the slave noticed the cache line change.
      506 +                         * Apply heuristic adjustment only if the calculated
      507 +                         * delta is > 1/4th of the write time.
 514  508                           */
 515      -                        if (tdelta > (write_time/4)) {
      509 +                        x = tsc->slave_tsc - mtsc_after;
      510 +                        if (x < 0)
      511 +                                x = -x;
      512 +                        if (x > (min_write_time/4))
      513 +                                /*
      514 +                                 * Subtract 1/4th of the measured write time
      515 +                                 * from the master's TSC value, as an estimate
      516 +                                 * of how late the mfence completion came
      517 +                                 * after the slave noticed the cache line
      518 +                                 * change.
      519 +                                 */
 516  520                                  tdelta = tsc->slave_tsc -
 517      -                                    (mtsc_after - (write_time/4));
 518      -                        } else {
      521 +                                    (mtsc_after - (min_write_time/4));
      522 +                        else
 519  523                                  tdelta = tsc->slave_tsc - mtsc_after;
 520      -                        }
 521      -                        last_delta = tsc_sync_tick_delta[source] - tdelta;
 522      -                        tsc_sync_tick_delta[slave] = last_delta;
 523      -                        min_write_time = write_time;
      524 +                        tsc_sync_tick_delta[slave] =
      525 +                            tsc_sync_tick_delta[source] - tdelta;
 524  526                  }
 525  527  
 526  528                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 527  529                  membar_enter();
 528  530                  tsc_sync_go = TSC_SYNC_STOP;
 529  531          }
 530      -
      532 +        if (tdelta < 0)
      533 +                tdelta = -tdelta;
      534 +        if (tdelta > largest_tsc_delta)
      535 +                largest_tsc_delta = tdelta;
      536 +        if (min_write_time < shortest_write_time)
      537 +                shortest_write_time = min_write_time;
 531  538          /*
 532      -         * Only enable the delta variants of the TSC functions if the measured
 533      -         * skew is greater than the fastest write time.
       539 +         * Enable the delta variants of the tsc functions if the largest of
       540 +         * all chosen deltas is greater than the shortest write time.
 534  541           */
 535      -        last_delta = (last_delta < 0) ? -last_delta : last_delta;
 536      -        if (last_delta > min_write_time) {
      542 +        if (largest_tsc_delta > shortest_write_time) {
 537  543                  gethrtimef = tsc_gethrtime_delta;
 538  544                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 539  545                  tsc_ncpu = NCPU;
 540  546          }
 541  547          restore_int_flag(flags);
 542  548  }
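
As a worked example of the heuristic above (not from this file; the numbers
are made up, and the loop over samples is reduced to a single reading):
if the master reads 1000 just before the mfence and 1040 just after it
(write_time = 40), and the slave's snapshot is 1100, then
|slave - mtsc_after| = 60 > write_time / 4 = 10, so the master's post-mfence
read is back-dated by 10 and the estimated skew is 1100 - 1030 = 70.

#include <stdint.h>
#include <stdio.h>

typedef int64_t hrtime_t;

static hrtime_t
estimate_skew(hrtime_t master_tsc, hrtime_t mtsc_after, hrtime_t slave_tsc)
{
        hrtime_t write_time = mtsc_after - master_tsc;
        hrtime_t x = slave_tsc - mtsc_after;

        if (x < 0)
                x = -x;
        /* Apply the back-dating heuristic only when the delta is appreciable. */
        if (x > write_time / 4)
                return (slave_tsc - (mtsc_after - write_time / 4));
        return (slave_tsc - mtsc_after);
}

int
main(void)
{
        printf("skew = %lld\n", (long long)estimate_skew(1000, 1040, 1100));
        return (0);
}
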
 543  549  
 544  550  /*
 545      - * TSC Sync Slave
 546      - *
 547  551   * Called by a CPU which has just been onlined.  It is expected that the CPU
 548  552   * performing the online operation will call tsc_sync_master().
 549  553   *
 550      - * Like tsc_sync_master, this logic is skipped on virtualized platforms.
      554 + * TSC sync is disabled in the context of virtualization. See comments
      555 + * above tsc_sync_master.
 551  556   */
 552  557  void
 553  558  tsc_sync_slave(void)
 554  559  {
 555  560          ulong_t flags;
 556  561          hrtime_t s1;
 557  562          tsc_sync_t *tsc = tscp;
 558  563          int cnt;
 559  564          int hwtype;
 560  565  
 561  566          hwtype = get_hwenv();
 562  567          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
  563  568                  return;
  564  569  
 565  570          flags = clear_int_flag();
 566  571  
 567  572          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
 568  573                  /* Re-fill the cache line */
 569  574                  s1 = tsc->master_tsc;
 570  575                  membar_enter();
 571  576                  tsc_sync_go = TSC_SYNC_GO;
 572  577                  do {
 573  578                          /*
 574      -                         * Do not put an SMT_PAUSE here.  If the master and
 575      -                         * slave are the same hyper-threaded CPU, we want the
 576      -                         * master to yield as quickly as possible to the slave.
       579 +                         * Do not put an SMT_PAUSE here. If the master and
       580 +                         * slave are two threads on the same hyper-threaded
       581 +                         * CPU, we want the master to yield to the slave as
       582 +                         * quickly as possible here, but not the other way
       583 +                         * around.
 577  584                           */
 578  585                          s1 = tsc_read();
 579  586                  } while (tsc->master_tsc == 0);
 580  587                  tsc->slave_tsc = s1;
 581  588                  membar_enter();
 582  589                  tsc_sync_go = TSC_SYNC_DONE;
 583  590  
 584  591                  while (tsc_sync_go != TSC_SYNC_STOP)
 585  592                          SMT_PAUSE();
 586  593          }
 587  594  
 588  595          restore_int_flag(flags);
 589  596  }
 590  597  
 591  598  /*
 592  599   * Called once per second on a CPU from the cyclic subsystem's
  593  600   * CY_HIGH_LEVEL interrupt.  (No longer cpu0-only.)
 594  601   */
 595  602  void
 596  603  tsc_tick(void)
 597  604  {
 598  605          hrtime_t now, delta;
 599  606          ushort_t spl;
 600  607  
 601  608          /*
 602  609           * Before we set the new variables, we set the shadow values.  This
 603  610           * allows for lock free operation in dtrace_gethrtime().
 604  611           */
 605  612          lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
 606  613              ipltospl(CBE_HIGH_PIL), &spl);
 607  614  
 608  615          shadow_tsc_hrtime_base = tsc_hrtime_base;
 609  616          shadow_tsc_last = tsc_last;
 610  617          shadow_nsec_scale = nsec_scale;
 611  618  
 612  619          shadow_hres_lock++;
 613  620          splx(spl);
 614  621  
 615  622          CLOCK_LOCK(&spl);
 616  623  
 617  624          now = tsc_read();
 618  625  
 619  626          if (gethrtimef == tsc_gethrtime_delta)
 620  627                  now += tsc_sync_tick_delta[CPU->cpu_id];
 621  628  
 622  629          if (now < tsc_last) {
 623  630                  /*
 624  631                   * The TSC has just jumped into the past.  We assume that
 625  632                   * this is due to a suspend/resume cycle, and we're going
 626  633                   * to use the _current_ value of TSC as the delta.  This
 627  634                   * will keep tsc_hrtime_base correct.  We're also going to
 628  635                   * assume that rate of tsc does not change after a suspend
  629  636                   * assume that the rate of the TSC does not change after a
  630  637                   * suspend/resume (i.e., nsec_scale remains the same).
 631  638                  delta = now;
 632  639                  delta = tsc_protect(delta);
 633  640                  tsc_last_jumped += tsc_last;
 634  641                  tsc_jumped = 1;
 635  642          } else {
 636  643                  /*
 637  644                   * Determine the number of TSC ticks since the last clock
 638  645                   * tick, and add that to the hrtime base.
 639  646                   */
 640  647                  delta = now - tsc_last;
 641  648          }
 642  649  
 643  650          TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
 644  651          tsc_last = now;
 645  652  
 646  653          CLOCK_UNLOCK(spl);
 647  654  }
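
tsc_tick() is also the writer half of the retry pattern sketched earlier: it
first publishes shadow copies under shadow_hres_lock (so dtrace_gethrtime()
always has a consistent fallback), then updates the live variables under
CLOCK_LOCK. Stripped of the PIL and shadow details, the writer side looks
like this (illustrative names, pairing with the demo_read() sketch above;
real code also needs the memory barriers and spl handling that
lock_set_spl() provides):

extern volatile uint32_t demo_lock;
extern void demo_write_data(uint64_t);

void
demo_write(uint64_t val)
{
        demo_lock++;            /* now odd: readers will retry */
        demo_write_data(val);
        demo_lock++;            /* even again: readers can complete */
}
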
 648  655  
 649  656  void
 650  657  tsc_hrtimeinit(uint64_t cpu_freq_hz)
 651  658  {
 652  659          extern int gethrtime_hires;
 653  660          longlong_t tsc;
 654  661          ulong_t flags;
 655  662  
 656  663          /*
 657  664           * cpu_freq_hz is the measured cpu frequency in hertz
 658  665           */
 659  666  
 660  667          /*
 661  668           * We can't accommodate CPUs slower than 31.25 MHz.
 662  669           */
 663  670          ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
 664  671          nsec_scale =
 665  672              (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
 666  673          nsec_unscale =
 667  674              (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
 668  675  
 669  676          flags = clear_int_flag();
 670  677          tsc = tsc_read();
 671  678          (void) tsc_gethrtime();
 672  679          tsc_max_delta = tsc_read() - tsc;
 673  680          restore_int_flag(flags);
 674  681          gethrtimef = tsc_gethrtime;
 675  682          gethrtimeunscaledf = tsc_gethrtimeunscaled;
 676  683          scalehrtimef = tsc_scalehrtime;
 677  684          unscalehrtimef = tsc_unscalehrtime;
 678  685          hrtime_tick = tsc_tick;
 679  686          gethrtime_hires = 1;
 680  687          /*
 681  688           * Being part of the comm page, tsc_ncpu communicates the published
 682  689           * length of the tsc_sync_tick_delta array.  This is kept zeroed to
 683  690           * ignore the absent delta data while the TSCs are synced.
 684  691           */
 685  692          tsc_ncpu = 0;
 686  693          /*
 687  694           * Allocate memory for the structure used in the tsc sync logic.
 688  695           * This structure should be aligned on a multiple of cache line size.
 689  696           */
 690  697          tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
 691  698  
 692  699          /*
 693  700           * Convert the TSC resume cap ns value into its unscaled TSC value.
 694  701           * See tsc_gethrtime().
 695  702           */
  696  703          if (tsc_resume_cap == 0)
  697  704                  TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
 698  705  }
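
The 31.25 MHz floor asserted above follows from requiring nsec_scale to fit
in 32 bits: NANOSEC * 2^(32 - NSEC_SHIFT) / cpu_freq_hz < 2^32 rearranges to
cpu_freq_hz > NANOSEC / 2^NSEC_SHIFT = 10^9 / 32 = 31.25 MHz. A
self-contained check of that boundary (a sketch, not from the source):

#include <assert.h>
#include <stdint.h>

#define NANOSEC         1000000000ULL
#define NSEC_SHIFT      5

int
main(void)
{
        uint64_t floor_hz = NANOSEC >> NSEC_SHIFT;      /* 31,250,000 Hz */

        /* At the floor itself the scale is exactly 2^32 and overflows... */
        assert((NANOSEC << (32 - NSEC_SHIFT)) / floor_hz > UINT32_MAX);
        /* ...while any faster clock yields a scale that fits in 32 bits. */
        assert((NANOSEC << (32 - NSEC_SHIFT)) / (floor_hz + 1) <= UINT32_MAX);
        return (0);
}
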
 699  706  
 700  707  int
 701  708  get_tsc_ready()
 702  709  {
 703  710          return (tsc_ready);
 704  711  }
 705  712  
 706  713  /*
 707      - * Adjust all the deltas by adding the passed value to the array and activate
 708      - * the "delta" versions of the gethrtime functions.  It is possible that the
 709      - * adjustment could be negative.  Such may occur if the SunOS instance was
 710      - * moved by a virtual manager to a machine with a higher value of TSC.
      714 + * Adjust all the deltas by adding the passed value to the array.
       715 + * Then use the "delta" versions of the gethrtime functions.
      716 + * Note that 'tdelta' _could_ be a negative number, which should
      717 + * reduce the values in the array (used, for example, if the Solaris
      718 + * instance was moved by a virtual manager to a machine with a higher
      719 + * value of tsc).
 711  720   */
 712  721  void
 713  722  tsc_adjust_delta(hrtime_t tdelta)
 714  723  {
 715  724          int             i;
 716  725  
 717  726          for (i = 0; i < NCPU; i++) {
 718  727                  tsc_sync_tick_delta[i] += tdelta;
 719  728          }
 720  729  
 721  730          gethrtimef = tsc_gethrtime_delta;
 722  731          gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 723  732          tsc_ncpu = NCPU;
 724  733  }
 725  734  
 726  735  /*
 727  736   * Functions to manage TSC and high-res time on suspend and resume.
 728  737   */
 729  738  
 730      -/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
      739 +/*
      740 + * declarations needed for time adjustment
      741 + */
      742 +extern void     rtcsync(void);
 731  743  extern tod_ops_t *tod_ops;
 732      -
      744 +/* There must be a better way than exposing nsec_scale! */
      745 +extern uint_t   nsec_scale;
 733  746  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 734  747  static timestruc_t tsc_saved_ts;
 735  748  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 736  749  int             tsc_delta_onsuspend = 0;
 737  750  int             tsc_adjust_seconds = 1;
 738  751  int             tsc_suspend_count = 0;
 739  752  int             tsc_resume_in_cyclic = 0;
 740  753  
 741  754  /*
 742      - * Take snapshots of the current time and do any other pre-suspend work.
      755 + * Let timestamp.c know that we are suspending.  It needs to take
      756 + * snapshots of the current time, and do any pre-suspend work.
 743  757   */
 744  758  void
 745  759  tsc_suspend(void)
 746  760  {
 747      -        /*
 748      -         * We need to collect the time at which we suspended here so we know
 749      -         * now much should be added during the resume.  This is called by each
 750      -         * CPU, so reentry must be properly handled.
 751      -         */
      761 +/*
       762 + * What we need to do here is record the time at which we suspended, so
       763 + * that we know how much to add back at resume.
      764 + * This routine is called by each CPU, so we need to handle reentry.
      765 + */
 752  766          if (tsc_gethrtime_enable) {
 753  767                  /*
 754      -                 * Perform the tsc_read after acquiring the lock to make it as
 755      -                 * accurate as possible in the face of contention.
       768 +                 * We put the tsc_read() inside the lock as it
       769 +                 * has no locking constraints, and it puts the
       770 +                 * acquired value closer to the time stamp (in
       771 +                 * case we delay getting the lock).
 756  772                   */
 757  773                  mutex_enter(&tod_lock);
 758  774                  tsc_saved_tsc = tsc_read();
 759  775                  tsc_saved_ts = TODOP_GET(tod_ops);
 760  776                  mutex_exit(&tod_lock);
 761  777                  /* We only want to do this once. */
 762  778                  if (tsc_needs_resume == 0) {
 763  779                          if (tsc_delta_onsuspend) {
 764  780                                  tsc_adjust_delta(tsc_saved_tsc);
 765  781                          } else {
  766  782                                  tsc_adjust_delta(nsec_scale);
  767  783                          }
 768  784                          tsc_suspend_count++;
 769  785                  }
 770  786          }
 771  787  
 772  788          invalidate_cache();
 773  789          tsc_needs_resume = 1;
 774  790  }
 775  791  
 776  792  /*
 777      - * Restore all timestamp state based on the snapshots taken at suspend time.
      793 + * Restore all timestamp state based on the snapshots taken at
      794 + * suspend time.
 778  795   */
 779  796  void
 780  797  tsc_resume(void)
 781  798  {
 782  799          /*
 783  800           * We only need to (and want to) do this once.  So let the first
 784  801           * caller handle this (we are locked by the cpu lock), as it
 785  802           * is preferential that we get the earliest sync.
 786  803           */
 787  804          if (tsc_needs_resume) {
 788  805                  /*
 789  806                   * If using the TSC, adjust the delta based on how long
 790  807                   * we were sleeping (or away).  We also adjust for
 791  808                   * migration and a grown TSC.
 792  809                   */
 793  810                  if (tsc_saved_tsc != 0) {
 794  811                          timestruc_t     ts;
 795  812                          hrtime_t        now, sleep_tsc = 0;
 796  813                          int             sleep_sec;
 797  814                          extern void     tsc_tick(void);
 798  815                          extern uint64_t cpu_freq_hz;
 799  816  
 800  817                          /* tsc_read() MUST be before TODOP_GET() */
 801  818                          mutex_enter(&tod_lock);
 802  819                          now = tsc_read();
 803  820                          ts = TODOP_GET(tod_ops);
 804  821                          mutex_exit(&tod_lock);
 805  822  
 806  823                          /* Compute seconds of sleep time */
 807  824                          sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;
 808  825  
 809  826                          /*
  810  827                           * If the current ts is less than or equal to
  811  828                           * the saved ts, then there is likely a
 812  829                           * problem with the clock.  Assume at least
 813  830                           * one second has passed, so that time goes forward.
 814  831                           */
 815  832                          if (sleep_sec <= 0) {
 816  833                                  sleep_sec = 1;
 817  834                          }
 818  835  
  819  836                          /* How many TSC ticks should have occurred while sleeping */
 820  837                          if (tsc_adjust_seconds)
 821  838                                  sleep_tsc = sleep_sec * cpu_freq_hz;
 822  839  
 823  840                          /*
 824  841                           * We also want to subtract from the "sleep_tsc"
 825  842                           * the current value of tsc_read(), so that our
 826  843                           * adjustment accounts for the amount of time we
 827  844                           * have been resumed _or_ an adjustment based on
 828  845                           * the fact that we didn't actually power off the
 829  846                           * CPU (migration is another issue, but _should_
 830  847                           * also comply with this calculation).  If the CPU
 831  848                           * never powered off, then:
 832  849                           *    'now == sleep_tsc + saved_tsc'
 833  850                           * and the delta will effectively be "0".
 834  851                           */
 835  852                          sleep_tsc -= now;
 836  853                          if (tsc_delta_onsuspend) {
 837  854                                  tsc_adjust_delta(sleep_tsc);
 838  855                          } else {
 839  856                                  tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
 840  857                          }
 841  858                          tsc_saved_tsc = 0;
 842  859  
 843  860                          tsc_tick();
 844  861                  }
 845  862                  tsc_needs_resume = 0;
 846  863          }
 847  864  
 848  865  }
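
A worked example (not from this file; all numbers hypothetical) of the
adjustment tsc_resume() computes when tsc_adjust_seconds is set:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        int64_t cpu_freq_hz = 2400000000LL;     /* assumed 2.4 GHz clock */
        int64_t tsc_saved = 9000000000LL;       /* TSC at suspend time */
        int64_t sleep_sec = 10;                 /* TOD delta across sleep */
        int64_t now = 500000000LL;              /* TSC restarted near zero */

        /*
         * Ticks that should have elapsed while asleep, less the ticks
         * already accumulated since the CPU came back up.
         */
        int64_t sleep_tsc = sleep_sec * cpu_freq_hz - now;

        /* The delta passed to tsc_adjust_delta(tsc_saved_tsc + sleep_tsc). */
        int64_t delta = tsc_saved + sleep_tsc;

        /* tsc_read() + delta lands where the TSC "should" be: 33000000000. */
        printf("adjusted tsc = %lld\n", (long long)(now + delta));
        return (0);
}
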
  