*** 143,152 ****
--- 143,154 ----
typedef struct tsc_sync {
volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;
+ static hrtime_t largest_tsc_delta = 0;
+ static ulong_t shortest_write_time = ~0UL;
static hrtime_t tsc_last_jumped = 0;
static int tsc_jumped = 0;
static uint32_t tsc_wayback = 0;
/*
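The two statics added here seed their running extremes deliberately: shortest_write_time starts at ~0UL, the largest possible ulong_t, so the first measured write time always replaces it, and largest_tsc_delta starts at 0 so any observed skew raises it. A minimal standalone sketch of the sentinel pattern, with hypothetical sample values:

    #include <stdio.h>

    int
    main(void)
    {
        /* Hypothetical write-time samples, in TSC ticks. */
        unsigned long samples[] = { 420UL, 380UL, 395UL };
        unsigned long shortest = ~0UL;  /* max ulong_t; any sample beats it */
        unsigned long largest = 0;      /* any nonzero sample raises it */
        size_t i;

        for (i = 0; i < sizeof (samples) / sizeof (samples[0]); i++) {
            if (samples[i] < shortest)
                shortest = samples[i];
            if (samples[i] > largest)
                largest = samples[i];
        }
        printf("shortest %lu, largest %lu\n", shortest, largest);
        return (0);
    }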
*** 443,474 ****
return (hrt);
}
/*
! * TSC Sync Master
*
- * Typically called on the boot CPU, this attempts to quantify TSC skew between
- * different CPUs. If an appreciable difference is found, gethrtimef will be
- * changed to point to tsc_gethrtime_delta().
- *
- * Calculating skews is precise only when the master and slave TSCs are read
- * simultaneously; however, there is no algorithm that can read both CPUs in
- * perfect simultaneity. The proposed algorithm is an approximate method based
- * on the behaviour of cache management. The slave CPU continuously polls the
- * TSC while reading a global variable updated by the master CPU. The latest
- * TSC reading is saved when the master's update (forced via mfence) reaches
- * visibility on the slave. The master will also take a TSC reading
- * immediately following the mfence.
- *
- * While the delay between cache line invalidation on the slave and mfence
- * completion on the master is not repeatable, the error is heuristically
- * assumed to be 1/4th of the write time recorded by the master. Multiple
- * samples are taken to control for the variance caused by external factors
- * such as bus contention. Each sample set is independent per-CPU to control
- * for differing memory latency on NUMA systems.
- *
* TSC sync is disabled in the context of virtualization because the CPUs
* assigned to the guest are virtual CPUs, so the real CPUs on which the
* guest runs keep changing during the lifetime of the guest OS. We would
* end up calculating TSC skews for a set of CPUs during boot, whereas the
* guest might migrate to a different set of physical CPUs at a later point of
--- 445,474 ----
return (hrt);
}
/*
! * Called by the master in the TSC sync operation (usually the boot CPU).
! * If the slave is discovered to have a skew, gethrtimef will be changed to
! * point to tsc_gethrtime_delta(). Calculating skews is precise only when
! * the master and slave TSCs are read simultaneously; however, there is no
! * algorithm that can read both CPUs in perfect simultaneity. The proposed
! * algorithm is an approximate method based on the behaviour of cache
! * management. The slave CPU continuously reads its TSC and then reads a
! * global variable which the master CPU updates. The moment the master's
! * update becomes visible to the slave (visibility being forced by an mfence
! * operation), we use the TSC reading taken on the slave. A corresponding
! * TSC read will be taken on the master as soon as possible after finishing
! * the mfence operation. However, the delay between the slave noticing the
! * invalidated cache line and the completion of the mfence is not
! * repeatable. This error is heuristically assumed to be 1/4th of the total
! * write time as measured by the two TSC reads on the master sandwiching the
! * mfence. Furthermore, due to the nature of bus arbitration, contention on
! * the memory bus, etc., the time taken for the write to become globally
! * visible can vary a lot. So instead of taking a single reading, a set of
! * readings is taken and the one with the least write time is chosen to
! * calculate the final skew.
*
* TSC sync is disabled in the context of virtualization because the CPUs
* assigned to the guest are virtual CPUs, so the real CPUs on which the
* guest runs keep changing during the lifetime of the guest OS. We would
* end up calculating TSC skews for a set of CPUs during boot, whereas the
* guest might migrate to a different set of physical CPUs at a later point of
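The sampling round described in the comment above can be sketched in user space. This is an illustration only, not the kernel code: __rdtsc() from <x86intrin.h> stands in for tsc_read(), C11 sequentially-consistent atomics stand in for the membar_enter()/mfence pairing, and all names are hypothetical. The real routine repeats the round many times and keeps the sample with the least write time.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <x86intrin.h>

    static _Atomic uint64_t master_val;   /* master's store, polled by slave */
    static uint64_t slave_tsc;            /* slave TSC just before the store hit */
    static _Atomic int round_done;

    static void *
    slave(void *arg)
    {
        uint64_t t;

        /* Read TSC, then look for the master's store (a TSC value, never 0). */
        do {
            t = __rdtsc();
        } while (atomic_load(&master_val) == 0);
        slave_tsc = t;
        atomic_store(&round_done, 1);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t tid;
        uint64_t before, after, write_time;
        int64_t tdelta, ad;

        pthread_create(&tid, NULL, slave, NULL);

        before = __rdtsc();
        atomic_store(&master_val, before);  /* seq_cst store: store + fence */
        after = __rdtsc();                  /* master's read right after the fence */
        while (atomic_load(&round_done) == 0)
            ;
        pthread_join(tid, NULL);

        write_time = after - before;
        tdelta = (int64_t)(slave_tsc - after);
        ad = (tdelta < 0) ? -tdelta : tdelta;
        /* Assume the store became visible about write_time/4 before 'after'. */
        if (ad > (int64_t)(write_time / 4))
            tdelta = (int64_t)(slave_tsc - (after - write_time / 4));
        printf("write_time %llu ticks, skew estimate %lld ticks\n",
            (unsigned long long)write_time, (long long)tdelta);
        return (0);
    }

Built with cc -O2 -pthread, on a machine whose TSCs are already synchronized the printed estimate should come out near zero.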
*** 476,486 ****
*/
void
tsc_sync_master(processorid_t slave)
{
ulong_t flags, source, min_write_time = ~0UL;
! hrtime_t write_time, mtsc_after, last_delta = 0;
tsc_sync_t *tsc = tscp;
int cnt;
int hwtype;
hwtype = get_hwenv();
--- 476,486 ----
*/
void
tsc_sync_master(processorid_t slave)
{
ulong_t flags, source, min_write_time = ~0UL;
! hrtime_t write_time, x, mtsc_after, tdelta;
tsc_sync_t *tsc = tscp;
int cnt;
int hwtype;
hwtype = get_hwenv();
*** 499,555 ****
mtsc_after = tsc_read();
while (tsc_sync_go != TSC_SYNC_DONE)
SMT_PAUSE();
write_time = mtsc_after - tsc->master_tsc;
if (write_time <= min_write_time) {
! hrtime_t tdelta;
!
! tdelta = tsc->slave_tsc - mtsc_after;
! if (tdelta < 0)
! tdelta = -tdelta;
/*
! * If the margin exists, subtract 1/4th of the measured
! * write time from the master's TSC value. This is an
! * estimate of how late the mfence completion came
! * after the slave noticed the cache line change.
*/
! if (tdelta > (write_time/4)) {
tdelta = tsc->slave_tsc -
! (mtsc_after - (write_time/4));
! } else {
tdelta = tsc->slave_tsc - mtsc_after;
}
- last_delta = tsc_sync_tick_delta[source] - tdelta;
- tsc_sync_tick_delta[slave] = last_delta;
- min_write_time = write_time;
- }
tsc->master_tsc = tsc->slave_tsc = write_time = 0;
membar_enter();
tsc_sync_go = TSC_SYNC_STOP;
}
!
/*
! * Only enable the delta variants of the TSC functions if the measured
! * skew is greater than the fastest write time.
*/
! last_delta = (last_delta < 0) ? -last_delta : last_delta;
! if (last_delta > min_write_time) {
gethrtimef = tsc_gethrtime_delta;
gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
tsc_ncpu = NCPU;
}
restore_int_flag(flags);
}
/*
- * TSC Sync Slave
- *
* Called by a CPU which has just been onlined. It is expected that the CPU
* performing the online operation will call tsc_sync_master().
*
! * Like tsc_sync_master, this logic is skipped on virtualized platforms.
*/
void
tsc_sync_slave(void)
{
ulong_t flags;
--- 499,560 ----
mtsc_after = tsc_read();
while (tsc_sync_go != TSC_SYNC_DONE)
SMT_PAUSE();
write_time = mtsc_after - tsc->master_tsc;
if (write_time <= min_write_time) {
! min_write_time = write_time;
/*
! * Apply heuristic adjustment only if the calculated
! * delta is > 1/4th of the write time.
*/
! x = tsc->slave_tsc - mtsc_after;
! if (x < 0)
! x = -x;
! if (x > (min_write_time/4))
! /*
! * Subtract 1/4th of the measured write time
! * from the master's TSC value, as an estimate
! * of how late the mfence completion came
! * after the slave noticed the cache line
! * change.
! */
tdelta = tsc->slave_tsc -
! (mtsc_after - (min_write_time/4));
! else
tdelta = tsc->slave_tsc - mtsc_after;
+ tsc_sync_tick_delta[slave] =
+ tsc_sync_tick_delta[source] - tdelta;
}
tsc->master_tsc = tsc->slave_tsc = write_time = 0;
membar_enter();
tsc_sync_go = TSC_SYNC_STOP;
}
! if (tdelta < 0)
! tdelta = -tdelta;
! if (tdelta > largest_tsc_delta)
! largest_tsc_delta = tdelta;
! if (min_write_time < shortest_write_time)
! shortest_write_time = min_write_time;
/*
! * Enable the delta variants of the tsc functions if the largest of all
! * chosen deltas is greater than the shortest write time.
*/
! if (largest_tsc_delta > shortest_write_time) {
gethrtimef = tsc_gethrtime_delta;
gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
tsc_ncpu = NCPU;
}
restore_int_flag(flags);
}
/*
* Called by a CPU which has just been onlined. It is expected that the CPU
* performing the online operation will call tsc_sync_master().
*
! * TSC sync is disabled in the context of virtualization. See comments
! * above tsc_sync_master.
*/
void
tsc_sync_slave(void)
{
ulong_t flags;
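One detail worth spelling out in the hunk above: the slave's entry is chained off the master's own entry (tsc_sync_tick_delta[source] - tdelta), so every CPU's correction ends up relative to the boot CPU no matter which CPU acted as master for its sync. A worked sketch with made-up numbers, assuming the delta is added to the raw reading the way tsc_gethrtime_delta() applies it:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        int64_t delta[2];
        int64_t tdelta = 150;          /* slave measured 150 ticks ahead */
        uint64_t master_raw = 1000000; /* hypothetical simultaneous readings */
        uint64_t slave_raw = 1000150;

        delta[0] = 0;                  /* boot CPU is the reference */
        delta[1] = delta[0] - tdelta;  /* slave correction: -150 */

        /* Raw readings disagree by tdelta; corrected readings agree. */
        printf("master %lld, slave %lld\n",
            (long long)(master_raw + delta[0]),
            (long long)(slave_raw + delta[1]));
        return (0);
    }

Both corrected values print as 1000000.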
*** 569,581 ****
s1 = tsc->master_tsc;
membar_enter();
tsc_sync_go = TSC_SYNC_GO;
do {
/*
! * Do not put an SMT_PAUSE here. If the master and
! * slave are the same hyper-threaded CPU, we want the
! * master to yield as quickly as possible to the slave.
*/
s1 = tsc_read();
} while (tsc->master_tsc == 0);
tsc->slave_tsc = s1;
membar_enter();
--- 574,588 ----
s1 = tsc->master_tsc;
membar_enter();
tsc_sync_go = TSC_SYNC_GO;
do {
/*
! * Do not put an SMT_PAUSE here. If the master and
! * slave are really the same hyper-threaded CPU, we
! * want the master to yield to the slave as quickly
! * as possible here, but not the other way around.
*/
s1 = tsc_read();
} while (tsc->master_tsc == 0);
tsc->slave_tsc = s1;
membar_enter();
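The membar_enter() after storing slave_tsc is what lets the master trust the value once it sees the handshake advance: the master cannot observe the flag transition without also observing the timestamp written before it. In C11 terms this is a release store paired with an acquire load; a sketch of the idiom with hypothetical names, not the kernel API:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t slave_val;      /* plain data */
    static _Atomic int sync_go;     /* 0 -> 1 once the data is published */

    /* Slave side: write the timestamp, then raise the flag. */
    static void *
    publish(void *arg)
    {
        slave_val = 12345;
        atomic_store_explicit(&sync_go, 1, memory_order_release);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, publish, NULL);
        /* Master side: once the flag is seen, slave_val is safe to read. */
        while (atomic_load_explicit(&sync_go, memory_order_acquire) == 0)
            ;
        printf("slave_val %llu\n", (unsigned long long)slave_val);
        pthread_join(tid, NULL);
        return (0);
    }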
*** 702,715 ****
{
return (tsc_ready);
}
/*
! * Adjust all the deltas by adding the passed value to the array and activate
! * the "delta" versions of the gethrtime functions. It is possible that the
! * adjustment could be negative. Such may occur if the SunOS instance was
! * moved by a virtual manager to a machine with a higher value of TSC.
*/
void
tsc_adjust_delta(hrtime_t tdelta)
{
int i;
--- 709,724 ----
{
return (tsc_ready);
}
/*
! * Adjust all the deltas by adding the passed value to the array.
! * Then use the "delta" versions of the gethrtime functions.
! * Note that 'tdelta' _could_ be a negative number, which should
! * reduce the values in the array (used, for example, if the Solaris
! * instance was moved by a virtual manager to a machine with a higher
! * value of tsc).
*/
void
tsc_adjust_delta(hrtime_t tdelta)
{
int i;
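A standalone sketch of the adjustment the comment above describes, with a stand-in for the kernel's NCPU-sized array: adding one signed value to every entry shifts the shared timebase while leaving the relative per-CPU skews untouched.

    #include <stdint.h>
    #include <stdio.h>

    #define NCPU 4

    static int64_t tick_delta[NCPU] = { 0, -150, 80, -20 };

    /*
     * 'tdelta' may be negative, e.g. after migration to a host whose
     * TSC is further along than the source host's.
     */
    static void
    adjust_delta(int64_t tdelta)
    {
        int i;

        for (i = 0; i < NCPU; i++)
            tick_delta[i] += tdelta;
    }

    int
    main(void)
    {
        int i;

        adjust_delta(-1000);
        for (i = 0; i < NCPU; i++)
            printf("cpu%d delta %lld\n", i, (long long)tick_delta[i]);
        return (0);
    }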
*** 725,760 ****
/*
* Functions to manage TSC and high-res time on suspend and resume.
*/
! /* tod_ops from "uts/i86pc/io/todpc_subr.c" */
extern tod_ops_t *tod_ops;
!
static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int tsc_needs_resume = 0; /* We only want to do this once. */
int tsc_delta_onsuspend = 0;
int tsc_adjust_seconds = 1;
int tsc_suspend_count = 0;
int tsc_resume_in_cyclic = 0;
/*
! * Take snapshots of the current time and do any other pre-suspend work.
*/
void
tsc_suspend(void)
{
! /*
! * We need to collect the time at which we suspended here so we know
! * how much should be added during the resume. This is called by each
! * CPU, so reentry must be properly handled.
*/
if (tsc_gethrtime_enable) {
/*
! * Perform the tsc_read after acquiring the lock to make it as
! * accurate as possible in the face of contention.
*/
mutex_enter(&tod_lock);
tsc_saved_tsc = tsc_read();
tsc_saved_ts = TODOP_GET(tod_ops);
mutex_exit(&tod_lock);
--- 734,776 ----
/*
* Functions to manage TSC and high-res time on suspend and resume.
*/
! /*
! * declarations needed for time adjustment
! */
! extern void rtcsync(void);
extern tod_ops_t *tod_ops;
! /* There must be a better way than exposing nsec_scale! */
! extern uint_t nsec_scale;
static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int tsc_needs_resume = 0; /* We only want to do this once. */
int tsc_delta_onsuspend = 0;
int tsc_adjust_seconds = 1;
int tsc_suspend_count = 0;
int tsc_resume_in_cyclic = 0;
/*
! * Let timestamp.c know that we are suspending. It needs to take
! * snapshots of the current time, and do any pre-suspend work.
*/
void
tsc_suspend(void)
{
! /*
! * What we need to do here is get the time at which we suspended, so
! * that we know how much we should add on resume.
! * This routine is called by each CPU, so we need to handle reentry.
*/
if (tsc_gethrtime_enable) {
/*
! * We put the tsc_read() inside the lock as it
! * has no locking constraints, and it puts the
! * acquired value closer to the time stamp (in
! * case we delay getting the lock).
*/
mutex_enter(&tod_lock);
tsc_saved_tsc = tsc_read();
tsc_saved_ts = TODOP_GET(tod_ops);
mutex_exit(&tod_lock);
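The shape of tsc_suspend() is a paired snapshot: the raw counter and the TOD clock are read back to back under tod_lock so the two values describe the same instant as nearly as possible. A user-space analogue of the pattern, with clock_gettime(CLOCK_REALTIME) standing in for TODOP_GET(tod_ops) and __rdtsc() for tsc_read():

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <x86intrin.h>

    static pthread_mutex_t tod_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t saved_tsc;
    static struct timespec saved_ts;

    static void
    snapshot(void)
    {
        /*
         * Read the counter inside the lock: it has no locking
         * constraints of its own, and doing it here keeps the two
         * readings adjacent even if we stalled acquiring the lock.
         */
        pthread_mutex_lock(&tod_lock);
        saved_tsc = __rdtsc();
        clock_gettime(CLOCK_REALTIME, &saved_ts);
        pthread_mutex_unlock(&tod_lock);
    }

    int
    main(void)
    {
        snapshot();
        printf("tsc %llu, tod %lld.%09ld\n", (unsigned long long)saved_tsc,
            (long long)saved_ts.tv_sec, saved_ts.tv_nsec);
        return (0);
    }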
*** 772,782 ****
invalidate_cache();
tsc_needs_resume = 1;
}
/*
! * Restore all timestamp state based on the snapshots taken at suspend time.
*/
void
tsc_resume(void)
{
/*
--- 788,799 ----
invalidate_cache();
tsc_needs_resume = 1;
}
/*
! * Restore all timestamp state based on the snapshots taken at
! * suspend time.
*/
void
tsc_resume(void)
{
/*
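The reason nsec_scale had to be exposed above is the conversion tsc_resume() must perform: the elapsed raw ticks since tsc_saved_tsc have to be scaled to nanoseconds in fixed point. A sketch under the assumption that the scale factor is precomputed as (NANOSEC << shift) / cpu_hz; the kernel's actual shift value and its overflow handling (splitting the multiply into high and low 32-bit halves) differ in detail.

    #include <stdint.h>
    #include <stdio.h>

    #define NANOSEC     1000000000ULL
    #define SCALE_SHIFT 27  /* assumed fixed-point shift */

    int
    main(void)
    {
        uint64_t cpu_hz = 2000000000ULL;  /* say, a 2 GHz TSC */
        uint64_t scale = (NANOSEC << SCALE_SHIFT) / cpu_hz;
        uint64_t ticks = 6000000000ULL;   /* 3 seconds' worth of ticks */

        /*
         * nsec = ticks * (NANOSEC / cpu_hz), in fixed point. The single
         * 64-bit multiply overflows for very large tick counts, which
         * is why the kernel does it in two halves.
         */
        uint64_t nsec = (ticks * scale) >> SCALE_SHIFT;

        printf("elapsed %llu ns\n", (unsigned long long)nsec);
        return (0);
    }

With these numbers scale is 67108864 and the program prints elapsed 3000000000 ns.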