@@ -143,10 +143,12 @@
 
 typedef struct tsc_sync {
         volatile hrtime_t master_tsc, slave_tsc;
 } tsc_sync_t;
 static tsc_sync_t *tscp;
+static hrtime_t largest_tsc_delta = 0;
+static ulong_t shortest_write_time = ~0UL;
 
 static hrtime_t tsc_last_jumped = 0;
 static int      tsc_jumped = 0;
 static uint32_t tsc_wayback = 0;
 /*
@@ -443,32 +445,30 @@
 
         return (hrt);
 }
 
 /*
- * TSC Sync Master
+ * Called by the master in the TSC sync operation (usually the boot CPU).
+ * If the slave is discovered to have a skew, gethrtimef will be changed to
+ * point to tsc_gethrtime_delta(). Calculating skews is precise only when
+ * the master and slave TSCs are read simultaneously; however, there is no
+ * algorithm that can read both CPUs in perfect simultaneity. The proposed
+ * algorithm is an approximate method based on the behaviour of cache
+ * management. The slave CPU continuously reads its TSC and then reads a
+ * global variable which the master CPU updates. The TSC reading taken on
+ * the slave at the moment the master's update becomes visible to it
+ * (visibility being forced by an mfence operation) is the one used. A
+ * corresponding TSC read is taken on the master as soon as possible after
+ * the mfence completes. However, the delay between the slave noticing the
+ * invalidated cache line and the completion of the mfence is not
+ * repeatable. This error is heuristically assumed to be 1/4th of the total
+ * write time, as measured by the two TSC reads on the master sandwiching
+ * the mfence. Furthermore, due to the nature of bus arbitration, contention
+ * on the memory bus, etc., the time taken for the write to become globally
+ * visible can vary considerably. So instead of taking a single reading, a
+ * set of readings is taken and the one with the least write time is chosen
+ * to calculate the final skew.
  *
- * Typically called on the boot CPU, this attempts to quantify TSC skew between
- * different CPUs.  If an appreciable difference is found, gethrtimef will be
- * changed to point to tsc_gethrtime_delta().
- *
- * Calculating skews is precise only when the master and slave TSCs are read
- * simultaneously; however, there is no algorithm that can read both CPUs in
- * perfect simultaneity.  The proposed algorithm is an approximate method based
- * on the behaviour of cache management.  The slave CPU continuously polls the
- * TSC while reading a global variable updated by the master CPU.  The latest
- * TSC reading is saved when the master's update (forced via mfence) reaches
- * visibility on the slave.  The master will also take a TSC reading
- * immediately following the mfence.
- *
- * While the delay between cache line invalidation on the slave and mfence
- * completion on the master is not repeatable, the error is heuristically
- * assumed to be 1/4th of the write time recorded by the master.  Multiple
- * samples are taken to control for the variance caused by external factors
- * such as bus contention.  Each sample set is independent per-CPU to control
- * for differing memory latency on NUMA systems.
- *
  * TSC sync is disabled in the context of virtualization because the CPUs
  * assigned to the guest are virtual CPUs which means the real CPUs on which
  * guest runs keep changing during life time of guest OS. So we would end up
  * calculating TSC skews for a set of CPUs during boot whereas the guest
  * might migrate to a different set of physical CPUs at a later point of
@@ -476,11 +476,11 @@
  */
 void
 tsc_sync_master(processorid_t slave)
 {
         ulong_t flags, source, min_write_time = ~0UL;
-        hrtime_t write_time, mtsc_after, last_delta = 0;
+        hrtime_t write_time, x, mtsc_after, tdelta;
         tsc_sync_t *tsc = tscp;
         int cnt;
         int hwtype;
 
         hwtype = get_hwenv();
@@ -499,57 +499,62 @@
                 mtsc_after = tsc_read();
                 while (tsc_sync_go != TSC_SYNC_DONE)
                         SMT_PAUSE();
                 write_time =  mtsc_after - tsc->master_tsc;
                 if (write_time <= min_write_time) {
-                        hrtime_t tdelta;
-
-                        tdelta = tsc->slave_tsc - mtsc_after;
-                        if (tdelta < 0)
-                                tdelta = -tdelta;
+                        min_write_time = write_time;
                         /*
-                         * If the margin exists, subtract 1/4th of the measured
-                         * write time from the master's TSC value.  This is an
-                         * estimate of how late the mfence completion came
-                         * after the slave noticed the cache line change.
+                         * Apply heuristic adjustment only if the calculated
+                         * delta is > 1/4th of the write time.
                          */
-                        if (tdelta > (write_time/4)) {
+                        x = tsc->slave_tsc - mtsc_after;
+                        if (x < 0)
+                                x = -x;
+                        if (x > (min_write_time/4))
+                                /*
+                                 * Subtract 1/4th of the measured write time
+                                 * from the master's TSC value, as an estimate
+                                 * of how late the mfence completion came
+                                 * after the slave noticed the cache line
+                                 * change.
+                                 */
                                 tdelta = tsc->slave_tsc -
-                                    (mtsc_after - (write_time/4));
-                        } else {
+                                    (mtsc_after - (min_write_time/4));
+                        else
                                 tdelta = tsc->slave_tsc - mtsc_after;
+                        tsc_sync_tick_delta[slave] =
+                            tsc_sync_tick_delta[source] - tdelta;
                         }
-                        last_delta = tsc_sync_tick_delta[source] - tdelta;
-                        tsc_sync_tick_delta[slave] = last_delta;
-                        min_write_time = write_time;
-                }
 
                 tsc->master_tsc = tsc->slave_tsc = write_time = 0;
                 membar_enter();
                 tsc_sync_go = TSC_SYNC_STOP;
         }
-
+        if (tdelta < 0)
+                tdelta = -tdelta;
+        if (tdelta > largest_tsc_delta)
+                largest_tsc_delta = tdelta;
+        if (min_write_time < shortest_write_time)
+                shortest_write_time = min_write_time;
         /*
-         * Only enable the delta variants of the TSC functions if the measured
-         * skew is greater than the fastest write time.
+         * Enable the delta variants of the tsc functions if the largest of
+         * all the chosen deltas is greater than the shortest write time.
          */
-        last_delta = (last_delta < 0) ? -last_delta : last_delta;
-        if (last_delta > min_write_time) {
+        if (largest_tsc_delta > shortest_write_time) {
                 gethrtimef = tsc_gethrtime_delta;
                 gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
                 tsc_ncpu = NCPU;
         }
         restore_int_flag(flags);
 }
 
 /*
- * TSC Sync Slave
- *
  * Called by a CPU which has just been onlined.  It is expected that the CPU
  * performing the online operation will call tsc_sync_master().
  *
- * Like tsc_sync_master, this logic is skipped on virtualized platforms.
+ * TSC sync is disabled in the context of virtualization.  See the comment
+ * above tsc_sync_master().
  */
 void
 tsc_sync_slave(void)
 {
         ulong_t flags;
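
For readers following the hunk above, the master/slave handshake can be exercised in a stand-alone user-space program. The sketch below is illustrative only and makes assumptions beyond the kernel code: GCC/Clang on x86 with the __rdtsc and _mm_mfence intrinsics, pthreads standing in for the CPU-online coordination the kernel uses, no CPU binding, and a single sample rather than a set of them; all names in it are hypothetical.

/*
 * Illustrative user-space sketch of the TSC sync handshake; not kernel code.
 * It uses volatile plus mfence in the same spirit as the kernel.  A real
 * measurement would bind the two threads to specific CPUs and repeat the
 * handshake, keeping the sample with the smallest write time.
 */
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <x86intrin.h>

static volatile uint64_t flag;		/* cache line the master writes */
static volatile uint64_t slave_tsc;	/* TSC the slave saw at that moment */

static void *
slave(void *arg)
{
	uint64_t t;

	(void) arg;
	/* Keep re-reading the TSC until the master's store becomes visible. */
	do {
		t = __rdtsc();
	} while (flag == 0);
	slave_tsc = t;
	_mm_mfence();
	return (NULL);
}

int
main(void)
{
	pthread_t tid;
	uint64_t before, after, write_time;
	int64_t skew, mag;

	(void) pthread_create(&tid, NULL, slave, NULL);

	before = __rdtsc();
	flag = 1;		/* invalidate the slave's copy of the line */
	_mm_mfence();		/* force the store to global visibility */
	after = __rdtsc();

	(void) pthread_join(tid, NULL);

	write_time = after - before;
	skew = (int64_t)(slave_tsc - after);
	mag = (skew < 0) ? -skew : skew;
	/*
	 * Heuristic from the block comment above tsc_sync_master(): assume
	 * the mfence completed about 1/4 of the write time after the slave
	 * saw the store, and only adjust when the skew exceeds that margin.
	 */
	if ((uint64_t)mag > write_time / 4)
		skew = (int64_t)(slave_tsc - (after - write_time / 4));
	(void) printf("write time %llu ticks, estimated skew %lld ticks\n",
	    (unsigned long long)write_time, (long long)skew);
	return (0);
}

Compiled with something like "cc -O2 -pthread", a run on a machine with synchronized TSCs should report a skew well below the write time, which is the condition under which the kernel code above leaves the plain gethrtime functions in place.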
@@ -569,13 +574,15 @@
                 s1 = tsc->master_tsc;
                 membar_enter();
                 tsc_sync_go = TSC_SYNC_GO;
                 do {
                         /*
-                         * Do not put an SMT_PAUSE here.  If the master and
-                         * slave are the same hyper-threaded CPU, we want the
-                         * master to yield as quickly as possible to the slave.
+                         * Do not put an SMT_PAUSE here.  For instance,
+                         * if the master and slave are really the same
+                         * hyper-threaded CPU, then you want the master
+                         * to yield to the slave as quickly as possible
+                         * here, but not the other way around.
                          */
                         s1 = tsc_read();
                 } while (tsc->master_tsc == 0);
                 tsc->slave_tsc = s1;
                 membar_enter();
@@ -702,14 +709,16 @@
 {
         return (tsc_ready);
 }
 
 /*
- * Adjust all the deltas by adding the passed value to the array and activate
- * the "delta" versions of the gethrtime functions.  It is possible that the
- * adjustment could be negative.  Such may occur if the SunOS instance was
- * moved by a virtual manager to a machine with a higher value of TSC.
+ * Adjust all the deltas by adding the passed value to the array.
+ * Then use the "delta" versions of the gethrtime functions.
+ * Note that 'tdelta' _could_ be a negative number, which should
+ * reduce the values in the array (used, for example, if the Solaris
+ * instance was moved by a virtual manager to a machine with a higher
+ * value of tsc).
  */
 void
 tsc_adjust_delta(hrtime_t tdelta)
 {
         int             i;
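
The adjustment described in the comment above is a uniform shift: adding one value to every entry moves the common time base while leaving the relative skews established by tsc_sync_master() untouched. A minimal stand-alone sketch of that property follows; the array contents and the adjustment value are hypothetical, not taken from the kernel.

/*
 * Hypothetical illustration of tsc_adjust_delta()'s effect: one constant is
 * added to every per-CPU delta, so the differences between CPUs are
 * preserved.  A negative adjustment lowers every entry, as when the OS
 * instance is migrated to a host whose TSC reads higher.
 */
#include <stdio.h>

#define	NCPU	4
typedef long long hrtime_t;

static hrtime_t tick_delta[NCPU] = { 0, -12, 5, 3 };

static void
adjust_delta(hrtime_t tdelta)
{
	for (int i = 0; i < NCPU; i++)
		tick_delta[i] += tdelta;
}

int
main(void)
{
	adjust_delta(-1000);	/* new host's TSC is 1000 ticks ahead */
	for (int i = 0; i < NCPU; i++)
		(void) printf("cpu%d delta %lld\n", i, tick_delta[i]);
	return (0);
}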
@@ -725,36 +734,43 @@
 
 /*
  * Functions to manage TSC and high-res time on suspend and resume.
  */
 
-/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
+/*
+ * declarations needed for time adjustment
+ */
+extern void     rtcsync(void);
 extern tod_ops_t *tod_ops;
-
+/* There must be a better way than exposing nsec_scale! */
+extern uint_t   nsec_scale;
 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 static timestruc_t tsc_saved_ts;
 static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 int             tsc_delta_onsuspend = 0;
 int             tsc_adjust_seconds = 1;
 int             tsc_suspend_count = 0;
 int             tsc_resume_in_cyclic = 0;
 
 /*
- * Take snapshots of the current time and do any other pre-suspend work.
+ * Let timestamp.c know that we are suspending.  It needs to take
+ * snapshots of the current time, and do any pre-suspend work.
  */
 void
 tsc_suspend(void)
 {
-        /*
-         * We need to collect the time at which we suspended here so we know
-         * now much should be added during the resume.  This is called by each
-         * CPU, so reentry must be properly handled.
+        /*
+         * What we need to do here is to get the time we suspended, so that
+         * we know how much we should add at resume time.  This routine is
+         * called by each CPU, so we need to handle reentry.
          */
         if (tsc_gethrtime_enable) {
                 /*
-                 * Perform the tsc_read after acquiring the lock to make it as
-                 * accurate as possible in the face of contention.
+                 * We put the tsc_read() inside the lock as it
+                 * has no locking constraints, and it puts the
+                 * acquired value closer to the time stamp (in
+                 * case we delay getting the lock).
                  */
                 mutex_enter(&tod_lock);
                 tsc_saved_tsc = tsc_read();
                 tsc_saved_ts = TODOP_GET(tod_ops);
                 mutex_exit(&tod_lock);
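
The snapshots saved above are consumed on the resume side (only partly visible in the next hunk): the wall-clock time that passed while suspended, taken from the TOD clock, is converted back into TSC ticks so the deltas can be adjusted. The sketch below is a simplified model of that conversion, not the code in tsc_resume(); tsc_freq, the local timestruc_t, and the helper name are all assumptions made for the example.

/*
 * Simplified model of the resume-time conversion: seconds and the
 * sub-second remainder are converted separately to avoid overflowing the
 * intermediate product.  The result is the kind of tick count that can be
 * fed to tsc_adjust_delta().
 */
#include <stdio.h>

typedef long long hrtime_t;

typedef struct {
	long	tv_sec;		/* seconds */
	long	tv_nsec;	/* nanoseconds */
} timestruc_t;			/* stand-in for the kernel's timestruc_t */

#define	NANOSEC		1000000000LL

static unsigned long long tsc_freq = 2400000000ULL;	/* assumed 2.4 GHz */

static hrtime_t
suspend_ticks(timestruc_t saved, timestruc_t now)
{
	hrtime_t sec = now.tv_sec - saved.tv_sec;
	hrtime_t nsec = now.tv_nsec - saved.tv_nsec;

	return (sec * (hrtime_t)tsc_freq +
	    nsec * (hrtime_t)tsc_freq / NANOSEC);
}

int
main(void)
{
	timestruc_t saved = { 1000, 250000000 };	/* hypothetical TOD values */
	timestruc_t now = { 1090, 750000000 };		/* 90.5 s of suspend */

	(void) printf("%lld ticks elapsed across suspend\n",
	    (long long)suspend_ticks(saved, now));
	return (0);
}

The kernel's own conversion goes through nsec_scale rather than a plain frequency division, as the declarations added earlier in this hunk suggest; the sketch only conveys the shape of the calculation.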
@@ -772,11 +788,12 @@
         invalidate_cache();
         tsc_needs_resume = 1;
 }
 
 /*
- * Restore all timestamp state based on the snapshots taken at suspend time.
+ * Restore all timestamp state based on the snapshots taken at
+ * suspend time.
  */
 void
 tsc_resume(void)
 {
         /*