Print this page

        

*** 143,152 **** --- 143,154 ---- typedef struct tsc_sync { volatile hrtime_t master_tsc, slave_tsc; } tsc_sync_t; static tsc_sync_t *tscp; + static hrtime_t largest_tsc_delta = 0; + static ulong_t shortest_write_time = ~0UL; static hrtime_t tsc_last_jumped = 0; static int tsc_jumped = 0; static uint32_t tsc_wayback = 0; /*
*** 443,474 **** return (hrt); } /* ! * TSC Sync Master * - * Typically called on the boot CPU, this attempts to quantify TSC skew between - * different CPUs. If an appreciable difference is found, gethrtimef will be - * changed to point to tsc_gethrtime_delta(). - * - * Calculating skews is precise only when the master and slave TSCs are read - * simultaneously; however, there is no algorithm that can read both CPUs in - * perfect simultaneity. The proposed algorithm is an approximate method based - * on the behaviour of cache management. The slave CPU continuously polls the - * TSC while reading a global variable updated by the master CPU. The latest - * TSC reading is saved when the master's update (forced via mfence) reaches - * visibility on the slave. The master will also take a TSC reading - * immediately following the mfence. - * - * While the delay between cache line invalidation on the slave and mfence - * completion on the master is not repeatable, the error is heuristically - * assumed to be 1/4th of the write time recorded by the master. Multiple - * samples are taken to control for the variance caused by external factors - * such as bus contention. Each sample set is independent per-CPU to control - * for differing memory latency on NUMA systems. - * * TSC sync is disabled in the context of virtualization because the CPUs * assigned to the guest are virtual CPUs which means the real CPUs on which * guest runs keep changing during life time of guest OS. So we would end up * calculating TSC skews for a set of CPUs during boot whereas the guest * might migrate to a different set of physical CPUs at a later point of --- 445,474 ---- return (hrt); } /* ! * Called by the master in the TSC sync operation (usually the boot CPU). ! * If the slave is discovered to have a skew, gethrtimef will be changed to ! * point to tsc_gethrtime_delta(). Calculating skews is precise only when ! * the master and slave TSCs are read simultaneously; however, there is no ! 
* algorithm that can read both CPUs in perfect simultaneity. The proposed ! * algorithm is an approximate method based on the behaviour of cache ! * management. The slave CPU continuously reads TSC and then reads a global ! * variable which the master CPU updates. The moment the master's update reaches ! * the slave's visibility (being forced by an mfence operation) we use the TSC ! * reading taken on the slave. A corresponding TSC read will be taken on the ! * master as soon as possible after finishing the mfence operation. But the ! * delay between causing the slave to notice the invalid cache line and the ! * completion of mfence is not repeatable. This error is heuristically assumed ! * to be 1/4th of the total write time as being measured by the two TSC reads ! * on the master sandwiching the mfence. Furthermore, due to the nature of ! * bus arbitration, contention on memory bus, etc., the time taken for the write ! * to reflect globally can vary a lot. So instead of taking a single reading, ! * a set of readings are taken and the one with least write time is chosen ! * to calculate the final skew. * * TSC sync is disabled in the context of virtualization because the CPUs * assigned to the guest are virtual CPUs which means the real CPUs on which * guest runs keep changing during life time of guest OS. So we would end up * calculating TSC skews for a set of CPUs during boot whereas the guest * might migrate to a different set of physical CPUs at a later point of
*** 476,486 **** */ void tsc_sync_master(processorid_t slave) { ulong_t flags, source, min_write_time = ~0UL; ! hrtime_t write_time, mtsc_after, last_delta = 0; tsc_sync_t *tsc = tscp; int cnt; int hwtype; hwtype = get_hwenv(); --- 476,486 ---- */ void tsc_sync_master(processorid_t slave) { ulong_t flags, source, min_write_time = ~0UL; ! hrtime_t write_time, x, mtsc_after, tdelta; tsc_sync_t *tsc = tscp; int cnt; int hwtype; hwtype = get_hwenv();
*** 499,555 **** mtsc_after = tsc_read(); while (tsc_sync_go != TSC_SYNC_DONE) SMT_PAUSE(); write_time = mtsc_after - tsc->master_tsc; if (write_time <= min_write_time) { ! hrtime_t tdelta; ! ! tdelta = tsc->slave_tsc - mtsc_after; ! if (tdelta < 0) ! tdelta = -tdelta; /* ! * If the margin exists, subtract 1/4th of the measured ! * write time from the master's TSC value. This is an ! * estimate of how late the mfence completion came ! * after the slave noticed the cache line change. */ ! if (tdelta > (write_time/4)) { tdelta = tsc->slave_tsc - ! (mtsc_after - (write_time/4)); ! } else { tdelta = tsc->slave_tsc - mtsc_after; } - last_delta = tsc_sync_tick_delta[source] - tdelta; - tsc_sync_tick_delta[slave] = last_delta; - min_write_time = write_time; - } tsc->master_tsc = tsc->slave_tsc = write_time = 0; membar_enter(); tsc_sync_go = TSC_SYNC_STOP; } ! /* ! * Only enable the delta variants of the TSC functions if the measured ! * skew is greater than the fastest write time. */ ! last_delta = (last_delta < 0) ? -last_delta : last_delta; ! if (last_delta > min_write_time) { gethrtimef = tsc_gethrtime_delta; gethrtimeunscaledf = tsc_gethrtimeunscaled_delta; tsc_ncpu = NCPU; } restore_int_flag(flags); } /* - * TSC Sync Slave - * * Called by a CPU which has just been onlined. It is expected that the CPU * performing the online operation will call tsc_sync_master(). * ! * Like tsc_sync_master, this logic is skipped on virtualized platforms. */ void tsc_sync_slave(void) { ulong_t flags; --- 499,560 ---- mtsc_after = tsc_read(); while (tsc_sync_go != TSC_SYNC_DONE) SMT_PAUSE(); write_time = mtsc_after - tsc->master_tsc; if (write_time <= min_write_time) { ! min_write_time = write_time; /* ! * Apply heuristic adjustment only if the calculated ! * delta is > 1/4th of the write time. */ ! x = tsc->slave_tsc - mtsc_after; ! if (x < 0) ! x = -x; ! if (x > (min_write_time/4)) ! /* ! * Subtract 1/4th of the measured write time ! * from the master's TSC value, as an estimate ! 
* of how late the mfence completion came ! * after the slave noticed the cache line ! * change. ! */ tdelta = tsc->slave_tsc - ! (mtsc_after - (min_write_time/4)); ! else tdelta = tsc->slave_tsc - mtsc_after; + tsc_sync_tick_delta[slave] = + tsc_sync_tick_delta[source] - tdelta; } tsc->master_tsc = tsc->slave_tsc = write_time = 0; membar_enter(); tsc_sync_go = TSC_SYNC_STOP; } ! if (tdelta < 0) ! tdelta = -tdelta; ! if (tdelta > largest_tsc_delta) ! largest_tsc_delta = tdelta; ! if (min_write_time < shortest_write_time) ! shortest_write_time = min_write_time; /* ! * Enable delta variants of tsc functions if the largest of all chosen ! * deltas is > smallest of the write time. */ ! if (largest_tsc_delta > shortest_write_time) { gethrtimef = tsc_gethrtime_delta; gethrtimeunscaledf = tsc_gethrtimeunscaled_delta; tsc_ncpu = NCPU; } restore_int_flag(flags); } /* * Called by a CPU which has just been onlined. It is expected that the CPU * performing the online operation will call tsc_sync_master(). * ! * TSC sync is disabled in the context of virtualization. See comments ! * above tsc_sync_master. */ void tsc_sync_slave(void) { ulong_t flags;
*** 569,581 **** s1 = tsc->master_tsc; membar_enter(); tsc_sync_go = TSC_SYNC_GO; do { /* ! * Do not put an SMT_PAUSE here. If the master and ! * slave are the same hyper-threaded CPU, we want the ! * master to yield as quickly as possible to the slave. */ s1 = tsc_read(); } while (tsc->master_tsc == 0); tsc->slave_tsc = s1; membar_enter(); --- 574,588 ---- s1 = tsc->master_tsc; membar_enter(); tsc_sync_go = TSC_SYNC_GO; do { /* ! * Do not put an SMT_PAUSE here. For instance, ! * if the master and slave are really the same ! * hyper-threaded CPU, then you want the master ! * to yield to the slave as quickly as possible here, ! * but not the other way. */ s1 = tsc_read(); } while (tsc->master_tsc == 0); tsc->slave_tsc = s1; membar_enter();
*** 702,715 **** { return (tsc_ready); } /* ! * Adjust all the deltas by adding the passed value to the array and activate ! * the "delta" versions of the gethrtime functions. It is possible that the ! * adjustment could be negative. Such may occur if the SunOS instance was ! * moved by a virtual manager to a machine with a higher value of TSC. */ void tsc_adjust_delta(hrtime_t tdelta) { int i; --- 709,724 ---- { return (tsc_ready); } /* ! * Adjust all the deltas by adding the passed value to the array. ! * Then use the "delt" versions of the gethrtime functions. ! * Note that 'tdelta' _could_ be a negative number, which should ! * reduce the values in the array (used, for example, if the Solaris ! * instance was moved by a virtual manager to a machine with a higher ! * value of tsc). */ void tsc_adjust_delta(hrtime_t tdelta) { int i;
*** 725,760 **** /* * Functions to manage TSC and high-res time on suspend and resume. */ ! /* tod_ops from "uts/i86pc/io/todpc_subr.c" */ extern tod_ops_t *tod_ops; ! static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */ static timestruc_t tsc_saved_ts; static int tsc_needs_resume = 0; /* We only want to do this once. */ int tsc_delta_onsuspend = 0; int tsc_adjust_seconds = 1; int tsc_suspend_count = 0; int tsc_resume_in_cyclic = 0; /* ! * Take snapshots of the current time and do any other pre-suspend work. */ void tsc_suspend(void) { ! /* ! * We need to collect the time at which we suspended here so we know ! * now much should be added during the resume. This is called by each ! * CPU, so reentry must be properly handled. */ if (tsc_gethrtime_enable) { /* ! * Perform the tsc_read after acquiring the lock to make it as ! * accurate as possible in the face of contention. */ mutex_enter(&tod_lock); tsc_saved_tsc = tsc_read(); tsc_saved_ts = TODOP_GET(tod_ops); mutex_exit(&tod_lock); --- 734,776 ---- /* * Functions to manage TSC and high-res time on suspend and resume. */ ! /* ! * declarations needed for time adjustment ! */ ! extern void rtcsync(void); extern tod_ops_t *tod_ops; ! /* There must be a better way than exposing nsec_scale! */ ! extern uint_t nsec_scale; static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */ static timestruc_t tsc_saved_ts; static int tsc_needs_resume = 0; /* We only want to do this once. */ int tsc_delta_onsuspend = 0; int tsc_adjust_seconds = 1; int tsc_suspend_count = 0; int tsc_resume_in_cyclic = 0; /* ! * Let timestamp.c know that we are suspending. It needs to take ! * snapshots of the current time, and do any pre-suspend work. */ void tsc_suspend(void) { ! /* ! * What we need to do here, is to get the time we suspended, so that we ! * know how much we should add to the resume. ! * This routine is called by each CPU, so we need to handle reentry. */ if (tsc_gethrtime_enable) { /* ! 
* We put the tsc_read() inside the lock as it ! * has no locking constraints, and it puts the ! * acquired value closer to the time stamp (in ! * case we delay getting the lock). */ mutex_enter(&tod_lock); tsc_saved_tsc = tsc_read(); tsc_saved_ts = TODOP_GET(tod_ops); mutex_exit(&tod_lock);
*** 772,782 **** invalidate_cache(); tsc_needs_resume = 1; } /* ! * Restore all timestamp state based on the snapshots taken at suspend time. */ void tsc_resume(void) { /* --- 788,799 ---- invalidate_cache(); tsc_needs_resume = 1; } /* ! * Restore all timestamp state based on the snapshots taken at ! * suspend time. */ void tsc_resume(void) { /*