Print this page
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>


   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.

  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/disp.h>
  34 #include <sys/var.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/x86_archext.h>
  38 #include <sys/archsystm.h>
  39 #include <sys/cpuvar.h>
  40 #include <sys/psm_defs.h>
  41 #include <sys/clock.h>
  42 #include <sys/atomic.h>
  43 #include <sys/lockstat.h>
  44 #include <sys/smp_impldefs.h>
  45 #include <sys/dtrace.h>
  46 #include <sys/time.h>
  47 #include <sys/panic.h>
  48 #include <sys/cpu.h>
  49 #include <sys/sdt.h>

  50 
  51 /*
  52  * Using the Pentium's TSC register for gethrtime()
  53  * ------------------------------------------------
  54  *
  55  * The Pentium family, like many chip architectures, has a high-resolution
  56  * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
  57  * of the timestamp counter are read with the RDTSC instruction.
  58  *
  59  * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
  60  * must be translated into nanoseconds in order to implement gethrtime().
  61  * We avoid inducing floating point operations in this conversion by
  62  * implementing the same nsec_scale algorithm as that found in the sun4u
  63  * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
  64  * a detailed description of the algorithm; the comment is not reproduced
  65  * here.  This implementation differs only in its value for NSEC_SHIFT:
  66  * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
  67  * 60 MHz Pentiums.
  68  *
  69  * While TSC and %tick are both cycle counting registers, TSC's functionality


  82  *
  83  * Together, (a) and (b) imply that software must track the skew between
  84  * TSCs and account for it (it is assumed that while there may exist skew,
  85  * there does not exist drift).  To determine the skew between CPUs, we
  86  * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
  87  * the online operation calls tsc_sync_master().
  88  *
  89  * In the absence of time-of-day clock adjustments, gethrtime() must stay in
  90  * sync with gettimeofday().  This is problematic; given (c), the software
  91  * cannot drive its time-of-day source from TSC, and yet they must somehow be
  92  * kept in sync.  We implement this by having a routine, tsc_tick(), which
  93  * is called once per second from the interrupt which drives time-of-day.
  94  *
  95  * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
  96  * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
  97  * monotonically increases.
  98  */
  99 
 100 #define NSEC_SHIFT 5
 101 
 102 static uint_t nsec_scale;
 103 static uint_t nsec_unscale;
 104 
 105 /*
 106  * These two variables used to be grouped together inside of a structure that
 107  * lived on a single cache line. A regression (bug ID 4623398) caused the
 108  * compiler to emit code that "optimized" away the while-loops below. The
 109  * result was that no synchronization between the onlining and onlined CPUs
 110  * took place.
 111  */
 112 static volatile int tsc_ready;
 113 static volatile int tsc_sync_go;
 114 
 115 /*
 116  * Used as indices into the tsc_sync_snaps[] array.
 117  */
 118 #define TSC_MASTER              0
 119 #define TSC_SLAVE               1
 120 
 121 /*
 122  * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 123  */
 124 #define TSC_SYNC_STOP           1
 125 #define TSC_SYNC_GO             2
 126 #define TSC_SYNC_DONE           3
 127 #define SYNC_ITERATIONS         10
 128 
     /*
      * Scale the 64-bit value 'tsc' by 'scale' and add the result to 'hrt'.
      * 'tsc' is accessed as two 32-bit words (so it must be an lvalue); each
      * word is multiplied by 'scale' with mul32().  The _l[1] product is
      * shifted left by NSEC_SHIFT and the _l[0] product is shifted right by
      * (32 - NSEC_SHIFT).  With _l[0] as the low word (little-endian) this
      * amounts to hrt += (tsc * scale) >> (32 - NSEC_SHIFT).
      * NOTE(review): mul32() is defined outside this excerpt; presumably it
      * returns the full 64-bit product of two 32-bit operands -- confirm.
      */
 129 #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) {          \
 130         unsigned int *_l = (unsigned int *)&(tsc);  \
 131         (hrt) += mul32(_l[1], scale) << NSEC_SHIFT;       \
 132         (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 133 }
 134 
     /*
      * Scale the 64-bit value 'tsc' by 'scale' and store the result in 'hrt',
      * overwriting its previous contents.  'tsc' is accessed as two 32-bit
      * words (so it must be an lvalue); each word is multiplied by 'scale'
      * with mul32().  The _l[1] product is shifted left by NSEC_SHIFT and the
      * _l[0] product is shifted right by (32 - NSEC_SHIFT).  With _l[0] as the
      * low word (little-endian) this amounts to
      * hrt = (tsc * scale) >> (32 - NSEC_SHIFT).
      * NOTE(review): mul32() is defined outside this excerpt; presumably it
      * returns the full 64-bit product of two 32-bit operands -- confirm.
      */
 135 #define TSC_CONVERT(tsc, hrt, scale) {                  \
 136         unsigned int *_l = (unsigned int *)&(tsc);  \
 137         (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;        \
 138         (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 139 }
 140 
 141 int tsc_master_slave_sync_needed = 1;
 142 
 143 static int      tsc_max_delta;
 144 static hrtime_t tsc_sync_tick_delta[NCPU];
 145 typedef struct tsc_sync {
             /*
              * Both fields are volatile and are reached through the tscp
              * pointer.  NOTE(review): presumably these hold the TSC samples
              * exchanged by the master and slave CPUs during the sync
              * rendezvous -- confirm against tsc_sync_master() and
              * tsc_sync_slave().
              */
 146         volatile hrtime_t master_tsc, slave_tsc;
 147 } tsc_sync_t;
 148 static tsc_sync_t *tscp;
 149 static hrtime_t largest_tsc_delta = 0;
 150 static ulong_t shortest_write_time = ~0UL;
 151 
 152 static hrtime_t tsc_last = 0;
 153 static hrtime_t tsc_last_jumped = 0;
 154 static hrtime_t tsc_hrtime_base = 0;
 155 static int      tsc_jumped = 0;
 156 static uint32_t tsc_wayback = 0;
 157 /*
 158  * The cap of 1 second was chosen since it is the frequency at which the
 159  * tsc_tick() function runs which means that when gethrtime() is called it
 160  * should never be more than 1 second since tsc_last was updated.
 161  */
 162 static hrtime_t tsc_resume_cap;
 163 static hrtime_t tsc_resume_cap_ns = NANOSEC;     /* 1s */
 164 
 165 static hrtime_t shadow_tsc_hrtime_base;
 166 static hrtime_t shadow_tsc_last;
 167 static uint_t   shadow_nsec_scale;
 168 static uint32_t shadow_hres_lock;
 169 int get_tsc_ready();
 170 
 171 static inline
 172 hrtime_t tsc_protect(hrtime_t a) {
             /*
              * Limit 'a' to tsc_resume_cap.  A value above the cap bumps the
              * tsc_wayback counter, fires the tsc__wayback SDT probe with the
              * offending value, tsc_last and the counter, and is replaced by
              * the cap.  Any other value is returned unchanged.
              */
 173         if (a > tsc_resume_cap) {
 174                 atomic_inc_32(&tsc_wayback);
 175                 DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
 176                     uint32_t, tsc_wayback);
 177                 return (tsc_resume_cap);
 178         }
 179         return (a);
 180 }
 181 
 182 hrtime_t


 529                             tsc_sync_tick_delta[source] - tdelta;
 530                 }
 531 
 532                 tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 533                 membar_enter();
 534                 tsc_sync_go = TSC_SYNC_STOP;
 535         }
 536         if (tdelta < 0)
 537                 tdelta = -tdelta;
 538         if (tdelta > largest_tsc_delta)
 539                 largest_tsc_delta = tdelta;
 540         if (min_write_time < shortest_write_time)
 541                 shortest_write_time = min_write_time;
 542         /*
 543          * Enable delta variants of tsc functions if the largest of all chosen
 544          * deltas is > smallest of the write time.
 545          */
 546         if (largest_tsc_delta > shortest_write_time) {
 547                 gethrtimef = tsc_gethrtime_delta;
 548                 gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;

 549         }
 550         restore_int_flag(flags);
 551 }
 552 
 553 /*
 554  * Called by a CPU which has just been onlined.  It is expected that the CPU
 555  * performing the online operation will call tsc_sync_master().
 556  *
 557  * TSC sync is disabled in the context of virtualization. See comments
 558  * above tsc_sync_master.
 559  */
 560 void
 561 tsc_sync_slave(void)
 562 {
 563         ulong_t flags;
 564         hrtime_t s1;
 565         tsc_sync_t *tsc = tscp;
 566         int cnt;
 567         int hwtype;
 568 


 671          * We can't accommodate CPUs slower than 31.25 MHz.
 672          */
 673         ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
 674         nsec_scale =
 675             (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
 676         nsec_unscale =
 677             (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
 678 
 679         flags = clear_int_flag();
 680         tsc = tsc_read();
 681         (void) tsc_gethrtime();
 682         tsc_max_delta = tsc_read() - tsc;
 683         restore_int_flag(flags);
 684         gethrtimef = tsc_gethrtime;
 685         gethrtimeunscaledf = tsc_gethrtimeunscaled;
 686         scalehrtimef = tsc_scalehrtime;
 687         unscalehrtimef = tsc_unscalehrtime;
 688         hrtime_tick = tsc_tick;
 689         gethrtime_hires = 1;
 690         /*






 691          * Allocate memory for the structure used in the tsc sync logic.
 692          * This structure should be aligned on a multiple of cache line size.
 693          */
 694         tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
 695 
 696         /*
 697          * Convert the TSC resume cap ns value into its unscaled TSC value.
 698          * See tsc_gethrtime().
 699          */
 700         if (tsc_resume_cap == 0)
 701                 TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
 702 }
 703 
 704 int
 705 get_tsc_ready(void)
 706 {
             /* Report the current value of the file-scope tsc_ready flag. */
 707         return (tsc_ready);
 708 }
 709 
 710 /*
 711  * Adjust all the deltas by adding the passed value to the array.
 712  * Then use the "delta" versions of the gethrtime functions.
 713  * Note that 'tdelta' _could_ be a negative number, which should
 714  * reduce the values in the array (used, for example, if the Solaris
 715  * instance was moved by a virtual manager to a machine with a higher
 716  * value of tsc).
 717  */
 718 void
 719 tsc_adjust_delta(hrtime_t tdelta)
 720 {
 721         int             cpu = 0;
 722 
             /* Apply the same offset to each of the NCPU delta entries. */
 723         while (cpu < NCPU) {
 724                 tsc_sync_tick_delta[cpu] += tdelta;
                     cpu++;
 725         }
 726 
             /* Point the hrtime hooks at the delta-aware implementations. */
 727         gethrtimef = tsc_gethrtime_delta;
 728         gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;

 729 }
 730 
 731 /*
 732  * Functions to manage TSC and high-res time on suspend and resume.
 733  */
 734 
 735 /*
 736  * declarations needed for time adjustment
 737  */
 738 extern void     rtcsync(void);
 739 extern tod_ops_t *tod_ops;
 740 /* There must be a better way than exposing nsec_scale! */
 741 extern uint_t   nsec_scale;
 742 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 743 static timestruc_t tsc_saved_ts;
 744 static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 745 int             tsc_delta_onsuspend = 0;
 746 int             tsc_adjust_seconds = 1;
 747 int             tsc_suspend_count = 0;
 748 int             tsc_resume_in_cyclic = 0;




   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  28  * Copyright 2016 Joyent, Inc.
  29  */
  30 
  31 #include <sys/types.h>
  32 #include <sys/param.h>
  33 #include <sys/systm.h>
  34 #include <sys/disp.h>
  35 #include <sys/var.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/debug.h>
  38 #include <sys/x86_archext.h>
  39 #include <sys/archsystm.h>
  40 #include <sys/cpuvar.h>
  41 #include <sys/psm_defs.h>
  42 #include <sys/clock.h>
  43 #include <sys/atomic.h>
  44 #include <sys/lockstat.h>
  45 #include <sys/smp_impldefs.h>
  46 #include <sys/dtrace.h>
  47 #include <sys/time.h>
  48 #include <sys/panic.h>
  49 #include <sys/cpu.h>
  50 #include <sys/sdt.h>
  51 #include <sys/comm_page.h>
  52 
  53 /*
  54  * Using the Pentium's TSC register for gethrtime()
  55  * ------------------------------------------------
  56  *
  57  * The Pentium family, like many chip architectures, has a high-resolution
  58  * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
  59  * of the timestamp counter are read with the RDTSC instruction.
  60  *
  61  * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
  62  * must be translated into nanoseconds in order to implement gethrtime().
  63  * We avoid inducing floating point operations in this conversion by
  64  * implementing the same nsec_scale algorithm as that found in the sun4u
  65  * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
  66  * a detailed description of the algorithm; the comment is not reproduced
  67  * here.  This implementation differs only in its value for NSEC_SHIFT:
  68  * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
  69  * 60 MHz Pentiums.
  70  *
  71  * While TSC and %tick are both cycle counting registers, TSC's functionality


  84  *
  85  * Together, (a) and (b) imply that software must track the skew between
  86  * TSCs and account for it (it is assumed that while there may exist skew,
  87  * there does not exist drift).  To determine the skew between CPUs, we
  88  * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
  89  * the online operation calls tsc_sync_master().
  90  *
  91  * In the absence of time-of-day clock adjustments, gethrtime() must stay in
  92  * sync with gettimeofday().  This is problematic; given (c), the software
  93  * cannot drive its time-of-day source from TSC, and yet they must somehow be
  94  * kept in sync.  We implement this by having a routine, tsc_tick(), which
  95  * is called once per second from the interrupt which drives time-of-day.
  96  *
  97  * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
  98  * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
  99  * monotonically increases.
 100  */
 101 
 102 #define NSEC_SHIFT 5
 103 

 104 static uint_t nsec_unscale;
 105 
 106 /*
 107  * These two variables used to be grouped together inside of a structure that
 108  * lived on a single cache line. A regression (bug ID 4623398) caused the
 109  * compiler to emit code that "optimized" away the while-loops below. The
 110  * result was that no synchronization between the onlining and onlined CPUs
 111  * took place.
 112  */
 113 static volatile int tsc_ready;
 114 static volatile int tsc_sync_go;
 115 
 116 /*
 117  * Used as indices into the tsc_sync_snaps[] array.
 118  */
 119 #define TSC_MASTER              0
 120 #define TSC_SLAVE               1
 121 
 122 /*
 123  * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 124  */
 125 #define TSC_SYNC_STOP           1
 126 #define TSC_SYNC_GO             2
 127 #define TSC_SYNC_DONE           3
 128 #define SYNC_ITERATIONS         10
 129 
     /*
      * Scale the 64-bit value 'tsc' by 'scale' and add the result to 'hrt'.
      * 'tsc' is accessed as two 32-bit words (so it must be an lvalue); each
      * word is multiplied by 'scale' with mul32().  The _l[1] product is
      * shifted left by NSEC_SHIFT and the _l[0] product is shifted right by
      * (32 - NSEC_SHIFT).  With _l[0] as the low word (little-endian) this
      * amounts to hrt += (tsc * scale) >> (32 - NSEC_SHIFT).
      * NOTE(review): mul32() is defined outside this excerpt; presumably it
      * returns the full 64-bit product of two 32-bit operands -- confirm.
      */
 130 #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) {          \
 131         unsigned int *_l = (unsigned int *)&(tsc);  \
 132         (hrt) += mul32(_l[1], scale) << NSEC_SHIFT;       \
 133         (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 134 }
 135 
     /*
      * Scale the 64-bit value 'tsc' by 'scale' and store the result in 'hrt',
      * overwriting its previous contents.  'tsc' is accessed as two 32-bit
      * words (so it must be an lvalue); each word is multiplied by 'scale'
      * with mul32().  The _l[1] product is shifted left by NSEC_SHIFT and the
      * _l[0] product is shifted right by (32 - NSEC_SHIFT).  With _l[0] as the
      * low word (little-endian) this amounts to
      * hrt = (tsc * scale) >> (32 - NSEC_SHIFT).
      * NOTE(review): mul32() is defined outside this excerpt; presumably it
      * returns the full 64-bit product of two 32-bit operands -- confirm.
      */
 136 #define TSC_CONVERT(tsc, hrt, scale) {                  \
 137         unsigned int *_l = (unsigned int *)&(tsc);  \
 138         (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;        \
 139         (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 140 }
 141 
 142 int tsc_master_slave_sync_needed = 1;
 143 


 144 typedef struct tsc_sync {
             /*
              * Both fields are volatile and are reached through the tscp
              * pointer.  NOTE(review): presumably these hold the TSC samples
              * exchanged by the master and slave CPUs during the sync
              * rendezvous -- confirm against tsc_sync_master() and
              * tsc_sync_slave().
              */
 145         volatile hrtime_t master_tsc, slave_tsc;
 146 } tsc_sync_t;
 147 static tsc_sync_t *tscp;
 148 static hrtime_t largest_tsc_delta = 0;
 149 static ulong_t shortest_write_time = ~0UL;
 150 

 151 static hrtime_t tsc_last_jumped = 0;

 152 static int      tsc_jumped = 0;
 153 static uint32_t tsc_wayback = 0;
 154 /*
 155  * The cap of 1 second was chosen since it is the frequency at which the
 156  * tsc_tick() function runs which means that when gethrtime() is called it
 157  * should never be more than 1 second since tsc_last was updated.
 158  */

 159 static hrtime_t tsc_resume_cap_ns = NANOSEC;     /* 1s */
 160 
 161 static hrtime_t shadow_tsc_hrtime_base;
 162 static hrtime_t shadow_tsc_last;
 163 static uint_t   shadow_nsec_scale;
 164 static uint32_t shadow_hres_lock;
 165 int get_tsc_ready();
 166 
 167 static inline
 168 hrtime_t tsc_protect(hrtime_t a) {
             /*
              * Limit 'a' to tsc_resume_cap.  A value above the cap bumps the
              * tsc_wayback counter, fires the tsc__wayback SDT probe with the
              * offending value, tsc_last and the counter, and is replaced by
              * the cap.  Any other value is returned unchanged.
              */
 169         if (a > tsc_resume_cap) {
 170                 atomic_inc_32(&tsc_wayback);
 171                 DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
 172                     uint32_t, tsc_wayback);
 173                 return (tsc_resume_cap);
 174         }
 175         return (a);
 176 }
 177 
 178 hrtime_t


 525                             tsc_sync_tick_delta[source] - tdelta;
 526                 }
 527 
 528                 tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 529                 membar_enter();
 530                 tsc_sync_go = TSC_SYNC_STOP;
 531         }
 532         if (tdelta < 0)
 533                 tdelta = -tdelta;
 534         if (tdelta > largest_tsc_delta)
 535                 largest_tsc_delta = tdelta;
 536         if (min_write_time < shortest_write_time)
 537                 shortest_write_time = min_write_time;
 538         /*
 539          * Enable delta variants of tsc functions if the largest of all chosen
 540          * deltas is > smallest of the write time.
 541          */
 542         if (largest_tsc_delta > shortest_write_time) {
 543                 gethrtimef = tsc_gethrtime_delta;
 544                 gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 545                 tsc_ncpu = NCPU;
 546         }
 547         restore_int_flag(flags);
 548 }
 549 
 550 /*
 551  * Called by a CPU which has just been onlined.  It is expected that the CPU
 552  * performing the online operation will call tsc_sync_master().
 553  *
 554  * TSC sync is disabled in the context of virtualization. See comments
 555  * above tsc_sync_master.
 556  */
 557 void
 558 tsc_sync_slave(void)
 559 {
 560         ulong_t flags;
 561         hrtime_t s1;
 562         tsc_sync_t *tsc = tscp;
 563         int cnt;
 564         int hwtype;
 565 


 668          * We can't accommodate CPUs slower than 31.25 MHz.
 669          */
 670         ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
 671         nsec_scale =
 672             (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
 673         nsec_unscale =
 674             (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
 675 
 676         flags = clear_int_flag();
 677         tsc = tsc_read();
 678         (void) tsc_gethrtime();
 679         tsc_max_delta = tsc_read() - tsc;
 680         restore_int_flag(flags);
 681         gethrtimef = tsc_gethrtime;
 682         gethrtimeunscaledf = tsc_gethrtimeunscaled;
 683         scalehrtimef = tsc_scalehrtime;
 684         unscalehrtimef = tsc_unscalehrtime;
 685         hrtime_tick = tsc_tick;
 686         gethrtime_hires = 1;
 687         /*
 688          * Being part of the comm page, tsc_ncpu communicates the published
 689          * length of the tsc_sync_tick_delta array.  This is kept zeroed to
 690          * ignore the absent delta data while the TSCs are synced.
 691          */
 692         tsc_ncpu = 0;
 693         /*
 694          * Allocate memory for the structure used in the tsc sync logic.
 695          * This structure should be aligned on a multiple of cache line size.
 696          */
 697         tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
 698 
 699         /*
 700          * Convert the TSC resume cap ns value into its unscaled TSC value.
 701          * See tsc_gethrtime().
 702          */
 703         if (tsc_resume_cap == 0)
 704                 TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
 705 }
 706 
 707 int
 708 get_tsc_ready(void)
 709 {
             /* Report the current value of the file-scope tsc_ready flag. */
 710         return (tsc_ready);
 711 }
 712 
 713 /*
 714  * Adjust all the deltas by adding the passed value to the array.
 715  * Then use the "delta" versions of the gethrtime functions.
 716  * Note that 'tdelta' _could_ be a negative number, which should
 717  * reduce the values in the array (used, for example, if the Solaris
 718  * instance was moved by a virtual manager to a machine with a higher
 719  * value of tsc).
 720  */
 721 void
 722 tsc_adjust_delta(hrtime_t tdelta)
 723 {
 724         int             cpu = 0;
 725 
             /* Apply the same offset to each of the NCPU delta entries. */
 726         while (cpu < NCPU) {
 727                 tsc_sync_tick_delta[cpu] += tdelta;
                     cpu++;
 728         }
 729 
             /* Point the hrtime hooks at the delta-aware implementations. */
 730         gethrtimef = tsc_gethrtime_delta;
 731         gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
             /* Publish the full length of the tsc_sync_tick_delta array. */
 732         tsc_ncpu = NCPU;
 733 }
 734 
 735 /*
 736  * Functions to manage TSC and high-res time on suspend and resume.
 737  */
 738 
 739 /*
 740  * declarations needed for time adjustment
 741  */
 742 extern void     rtcsync(void);
 743 extern tod_ops_t *tod_ops;
 744 /* There must be a better way than exposing nsec_scale! */
 745 extern uint_t   nsec_scale;
 746 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 747 static timestruc_t tsc_saved_ts;
 748 static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 749 int             tsc_delta_onsuspend = 0;
 750 int             tsc_adjust_seconds = 1;
 751 int             tsc_suspend_count = 0;
 752 int             tsc_resume_in_cyclic = 0;