Print this page
        
*** 143,152 ****
--- 143,154 ----
  
  typedef struct tsc_sync {
          volatile hrtime_t master_tsc, slave_tsc;
  } tsc_sync_t;
  static tsc_sync_t *tscp;
+ static hrtime_t largest_tsc_delta = 0;
+ static ulong_t shortest_write_time = ~0UL;
  
  static hrtime_t tsc_last_jumped = 0;
  static int      tsc_jumped = 0;
  static uint32_t tsc_wayback = 0;
  /*
*** 443,474 ****
  
          return (hrt);
  }
  
  /*
!  * TSC Sync Master
   *
-  * Typically called on the boot CPU, this attempts to quantify TSC skew between
-  * different CPUs.  If an appreciable difference is found, gethrtimef will be
-  * changed to point to tsc_gethrtime_delta().
-  *
-  * Calculating skews is precise only when the master and slave TSCs are read
-  * simultaneously; however, there is no algorithm that can read both CPUs in
-  * perfect simultaneity.  The proposed algorithm is an approximate method based
-  * on the behaviour of cache management.  The slave CPU continuously polls the
-  * TSC while reading a global variable updated by the master CPU.  The latest
-  * TSC reading is saved when the master's update (forced via mfence) reaches
-  * visibility on the slave.  The master will also take a TSC reading
-  * immediately following the mfence.
-  *
-  * While the delay between cache line invalidation on the slave and mfence
-  * completion on the master is not repeatable, the error is heuristically
-  * assumed to be 1/4th of the write time recorded by the master.  Multiple
-  * samples are taken to control for the variance caused by external factors
-  * such as bus contention.  Each sample set is independent per-CPU to control
-  * for differing memory latency on NUMA systems.
-  *
   * TSC sync is disabled in the context of virtualization because the CPUs
   * assigned to the guest are virtual CPUs which means the real CPUs on which
   * guest runs keep changing during life time of guest OS. So we would end up
   * calculating TSC skews for a set of CPUs during boot whereas the guest
   * might migrate to a different set of physical CPUs at a later point of
--- 445,474 ----
  
          return (hrt);
  }
  
  /*
!  * Called by the master in the TSC sync operation (usually the boot CPU).
!  * If the slave is discovered to have a skew, gethrtimef will be changed to
!  * point to tsc_gethrtime_delta(). Calculating skews is precise only when
!  * the master and slave TSCs are read simultaneously; however, there is no
!  * algorithm that can read both CPUs in perfect simultaneity. The proposed
!  * algorithm is an approximate method based on the behaviour of cache
!  * management. The slave CPU continuously reads TSC and then reads a global
!  * variable which the master CPU updates. The moment the master's update reaches
!  * the slave's visibility (being forced by an mfence operation) we use the TSC
!  * reading taken on the slave. A corresponding TSC read will be taken on the
!  * master as soon as possible after finishing the mfence operation. But the
!  * delay between causing the slave to notice the invalid cache line and the
!  * completion of mfence is not repeatable. This error is heuristically assumed
!  * to be 1/4th of the total write time as being measured by the two TSC reads
!  * on the master sandwiching the mfence. Furthermore, due to the nature of
!  * bus arbitration, contention on memory bus, etc., the time taken for the write
!  * to reflect globally can vary a lot. So instead of taking a single reading,
!  * a set of readings are taken and the one with least write time is chosen
!  * to calculate the final skew.
   *
   * TSC sync is disabled in the context of virtualization because the CPUs
   * assigned to the guest are virtual CPUs which means the real CPUs on which
   * guest runs keep changing during life time of guest OS. So we would end up
   * calculating TSC skews for a set of CPUs during boot whereas the guest
   * might migrate to a different set of physical CPUs at a later point of
*** 476,486 ****
   */
  void
  tsc_sync_master(processorid_t slave)
  {
          ulong_t flags, source, min_write_time = ~0UL;
!         hrtime_t write_time, mtsc_after, last_delta = 0;
          tsc_sync_t *tsc = tscp;
          int cnt;
          int hwtype;
  
          hwtype = get_hwenv();
--- 476,486 ----
   */
  void
  tsc_sync_master(processorid_t slave)
  {
          ulong_t flags, source, min_write_time = ~0UL;
!         hrtime_t write_time, x, mtsc_after, tdelta;
          tsc_sync_t *tsc = tscp;
          int cnt;
          int hwtype;
  
          hwtype = get_hwenv();
*** 499,555 ****
                  mtsc_after = tsc_read();
                  while (tsc_sync_go != TSC_SYNC_DONE)
                          SMT_PAUSE();
                  write_time =  mtsc_after - tsc->master_tsc;
                  if (write_time <= min_write_time) {
!                         hrtime_t tdelta;
! 
!                         tdelta = tsc->slave_tsc - mtsc_after;
!                         if (tdelta < 0)
!                                 tdelta = -tdelta;
                          /*
!                          * If the margin exists, subtract 1/4th of the measured
!                          * write time from the master's TSC value.  This is an
!                          * estimate of how late the mfence completion came
!                          * after the slave noticed the cache line change.
                           */
!                         if (tdelta > (write_time/4)) {
                                  tdelta = tsc->slave_tsc -
!                                     (mtsc_after - (write_time/4));
!                         } else {
                                  tdelta = tsc->slave_tsc - mtsc_after;
                          }
-                         last_delta = tsc_sync_tick_delta[source] - tdelta;
-                         tsc_sync_tick_delta[slave] = last_delta;
-                         min_write_time = write_time;
-                 }
  
                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
                  membar_enter();
                  tsc_sync_go = TSC_SYNC_STOP;
          }
! 
          /*
!          * Only enable the delta variants of the TSC functions if the measured
!          * skew is greater than the fastest write time.
           */
!         last_delta = (last_delta < 0) ? -last_delta : last_delta;
!         if (last_delta > min_write_time) {
                  gethrtimef = tsc_gethrtime_delta;
                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
                  tsc_ncpu = NCPU;
          }
          restore_int_flag(flags);
  }
  
  /*
-  * TSC Sync Slave
-  *
   * Called by a CPU which has just been onlined.  It is expected that the CPU
   * performing the online operation will call tsc_sync_master().
   *
!  * Like tsc_sync_master, this logic is skipped on virtualized platforms.
   */
  void
  tsc_sync_slave(void)
  {
          ulong_t flags;
--- 499,560 ----
                  mtsc_after = tsc_read();
                  while (tsc_sync_go != TSC_SYNC_DONE)
                          SMT_PAUSE();
                  write_time =  mtsc_after - tsc->master_tsc;
                  if (write_time <= min_write_time) {
!                         min_write_time = write_time;
                          /*
!                          * Apply heuristic adjustment only if the calculated
!                          * delta is > 1/4th of the write time.
                           */
!                         x = tsc->slave_tsc - mtsc_after;
!                         if (x < 0)
!                                 x = -x;
!                         if (x > (min_write_time/4))
!                                 /*
!                                  * Subtract 1/4th of the measured write time
!                                  * from the master's TSC value, as an estimate
!                                  * of how late the mfence completion came
!                                  * after the slave noticed the cache line
!                                  * change.
!                                  */
                                  tdelta = tsc->slave_tsc -
!                                     (mtsc_after - (min_write_time/4));
!                         else
                                  tdelta = tsc->slave_tsc - mtsc_after;
+                         tsc_sync_tick_delta[slave] =
+                             tsc_sync_tick_delta[source] - tdelta;
                  }
  
                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
                  membar_enter();
                  tsc_sync_go = TSC_SYNC_STOP;
          }
!         if (tdelta < 0)
!                 tdelta = -tdelta;
!         if (tdelta > largest_tsc_delta)
!                 largest_tsc_delta = tdelta;
!         if (min_write_time < shortest_write_time)
!                 shortest_write_time = min_write_time;
          /*
!          * Enable delta variants of tsc functions if the largest of all chosen
!          * deltas is > the smallest of the write times.
           */
!         if (largest_tsc_delta > shortest_write_time) {
                  gethrtimef = tsc_gethrtime_delta;
                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
                  tsc_ncpu = NCPU;
          }
          restore_int_flag(flags);
  }
  
  /*
   * Called by a CPU which has just been onlined.  It is expected that the CPU
   * performing the online operation will call tsc_sync_master().
   *
!  * TSC sync is disabled in the context of virtualization. See comments
!  * above tsc_sync_master.
   */
  void
  tsc_sync_slave(void)
  {
          ulong_t flags;
*** 569,581 ****
                  s1 = tsc->master_tsc;
                  membar_enter();
                  tsc_sync_go = TSC_SYNC_GO;
                  do {
                          /*
!                          * Do not put an SMT_PAUSE here.  If the master and
!                          * slave are the same hyper-threaded CPU, we want the
!                          * master to yield as quickly as possible to the slave.
                           */
                          s1 = tsc_read();
                  } while (tsc->master_tsc == 0);
                  tsc->slave_tsc = s1;
                  membar_enter();
--- 574,588 ----
                  s1 = tsc->master_tsc;
                  membar_enter();
                  tsc_sync_go = TSC_SYNC_GO;
                  do {
                          /*
!                          * Do not put an SMT_PAUSE here. For instance,
!                          * if the master and slave are really the same
!                          * hyper-threaded CPU, then you want the master
!                          * to yield to the slave as quickly as possible here,
!                          * but not the other way.
                           */
                          s1 = tsc_read();
                  } while (tsc->master_tsc == 0);
                  tsc->slave_tsc = s1;
                  membar_enter();
*** 702,715 ****
  {
          return (tsc_ready);
  }
  
  /*
!  * Adjust all the deltas by adding the passed value to the array and activate
!  * the "delta" versions of the gethrtime functions.  It is possible that the
!  * adjustment could be negative.  Such may occur if the SunOS instance was
!  * moved by a virtual manager to a machine with a higher value of TSC.
   */
  void
  tsc_adjust_delta(hrtime_t tdelta)
  {
          int             i;
--- 709,724 ----
  {
          return (tsc_ready);
  }
  
  /*
!  * Adjust all the deltas by adding the passed value to the array.
!  * Then use the "delta" versions of the gethrtime functions.
!  * Note that 'tdelta' _could_ be a negative number, which should
!  * reduce the values in the array (used, for example, if the Solaris
!  * instance was moved by a virtual manager to a machine with a higher
!  * value of tsc).
   */
  void
  tsc_adjust_delta(hrtime_t tdelta)
  {
          int             i;
*** 725,760 ****
  
  /*
   * Functions to manage TSC and high-res time on suspend and resume.
   */
  
! /* tod_ops from "uts/i86pc/io/todpc_subr.c" */
  extern tod_ops_t *tod_ops;
! 
  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
  static timestruc_t tsc_saved_ts;
  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
  int             tsc_delta_onsuspend = 0;
  int             tsc_adjust_seconds = 1;
  int             tsc_suspend_count = 0;
  int             tsc_resume_in_cyclic = 0;
  
  /*
!  * Take snapshots of the current time and do any other pre-suspend work.
   */
  void
  tsc_suspend(void)
  {
!         /*
!          * We need to collect the time at which we suspended here so we know
!          * now much should be added during the resume.  This is called by each
!          * CPU, so reentry must be properly handled.
           */
          if (tsc_gethrtime_enable) {
                  /*
!                  * Perform the tsc_read after acquiring the lock to make it as
!                  * accurate as possible in the face of contention.
                   */
                  mutex_enter(&tod_lock);
                  tsc_saved_tsc = tsc_read();
                  tsc_saved_ts = TODOP_GET(tod_ops);
                  mutex_exit(&tod_lock);
--- 734,776 ----
  
  /*
   * Functions to manage TSC and high-res time on suspend and resume.
   */
  
! /*
!  * declarations needed for time adjustment
!  */
! extern void     rtcsync(void);
  extern tod_ops_t *tod_ops;
! /* There must be a better way than exposing nsec_scale! */
! extern uint_t   nsec_scale;
  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
  static timestruc_t tsc_saved_ts;
  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
  int             tsc_delta_onsuspend = 0;
  int             tsc_adjust_seconds = 1;
  int             tsc_suspend_count = 0;
  int             tsc_resume_in_cyclic = 0;
  
  /*
!  * Let timestamp.c know that we are suspending.  It needs to take
!  * snapshots of the current time, and do any pre-suspend work.
   */
  void
  tsc_suspend(void)
  {
! /*
!  * What we need to do here, is to get the time we suspended, so that we
!  * know how much we should add to the resume.
!  * This routine is called by each CPU, so we need to handle reentry.
   */
          if (tsc_gethrtime_enable) {
                  /*
!                  * We put the tsc_read() inside the lock as it
!                  * has no locking constraints, and it puts the
!                  * acquired value closer to the time stamp (in
!                  * case we delay getting the lock).
                   */
                  mutex_enter(&tod_lock);
                  tsc_saved_tsc = tsc_read();
                  tsc_saved_ts = TODOP_GET(tod_ops);
                  mutex_exit(&tod_lock);
*** 772,782 ****
          invalidate_cache();
          tsc_needs_resume = 1;
  }
  
  /*
!  * Restore all timestamp state based on the snapshots taken at suspend time.
   */
  void
  tsc_resume(void)
  {
          /*
--- 788,799 ----
          invalidate_cache();
          tsc_needs_resume = 1;
  }
  
  /*
!  * Restore all timestamp state based on the snapshots taken at
!  * suspend time.
   */
  void
  tsc_resume(void)
  {
          /*