OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
--- old/usr/src/uts/i86pc/os/timestamp.c
+++ new/usr/src/uts/i86pc/os/timestamp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 *
26 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 27 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
28 + * Copyright 2016 Joyent, Inc.
28 29 */
29 30
30 31 #include <sys/types.h>
31 32 #include <sys/param.h>
32 33 #include <sys/systm.h>
33 34 #include <sys/disp.h>
34 35 #include <sys/var.h>
35 36 #include <sys/cmn_err.h>
36 37 #include <sys/debug.h>
37 38 #include <sys/x86_archext.h>
38 39 #include <sys/archsystm.h>
39 40 #include <sys/cpuvar.h>
40 41 #include <sys/psm_defs.h>
41 42 #include <sys/clock.h>
42 43 #include <sys/atomic.h>
43 44 #include <sys/lockstat.h>
44 45 #include <sys/smp_impldefs.h>
45 46 #include <sys/dtrace.h>
46 47 #include <sys/time.h>
47 48 #include <sys/panic.h>
48 49 #include <sys/cpu.h>
49 50 #include <sys/sdt.h>
51 +#include <sys/comm_page.h>
50 52
51 53 /*
52 54 * Using the Pentium's TSC register for gethrtime()
53 55 * ------------------------------------------------
54 56 *
55 57 * The Pentium family, like many chip architectures, has a high-resolution
56 58 * timestamp counter ("TSC") which increments once per CPU cycle. The contents
57 59 * of the timestamp counter are read with the RDTSC instruction.
58 60 *
59 61 * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
60 62 * must be translated into nanoseconds in order to implement gethrtime().
61 63 * We avoid inducing floating point operations in this conversion by
62 64 * implementing the same nsec_scale algorithm as that found in the sun4u
63 65 * platform code. The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
64 66 * a detailed description of the algorithm; the comment is not reproduced
65 67 * here. This implementation differs only in its value for NSEC_SHIFT:
66 68 * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
67 69 * 60 MHz Pentiums.
68 70 *
69 71 * While TSC and %tick are both cycle counting registers, TSC's functionality
70 72 * falls short in several critical ways:
71 73 *
72 74 * (a) TSCs on different CPUs are not guaranteed to be in sync. While in
73 75 * practice they often _are_ in sync, this isn't guaranteed by the
74 76 * architecture.
75 77 *
76 78 * (b) The TSC cannot be reliably set to an arbitrary value. The architecture
77 79 * only supports writing the low 32-bits of TSC, making it impractical
78 80 * to rewrite.
79 81 *
80 82 * (c) The architecture doesn't have the capacity to interrupt based on
81 83 * arbitrary values of TSC; there is no TICK_CMPR equivalent.
82 84 *
83 85 * Together, (a) and (b) imply that software must track the skew between
84 86 * TSCs and account for it (it is assumed that while there may exist skew,
85 87 * there does not exist drift). To determine the skew between CPUs, we
86 88 * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
87 89 * the online operation calls tsc_sync_master().
88 90 *
89 91 * In the absence of time-of-day clock adjustments, gethrtime() must stay in
90 92 * sync with gettimeofday(). This is problematic; given (c), the software
91 93 * cannot drive its time-of-day source from TSC, and yet they must somehow be
92 94 * kept in sync. We implement this by having a routine, tsc_tick(), which
93 95 * is called once per second from the interrupt which drives time-of-day.
94 96 *
95 97 * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
96 98 * atomically with nsec_scale under CLOCK_LOCK. This assures that time
97 99 * monotonically increases.
98 100 */
99 101
100 102 #define NSEC_SHIFT 5
101 103
102 -static uint_t nsec_scale;
103 104 static uint_t nsec_unscale;
104 105
105 106 /*
106 107 * These two variables used to be grouped together inside of a structure that
107 108 * lived on a single cache line. A regression (bug ID 4623398) caused the
108 109 * compiler to emit code that "optimized" away the while-loops below. The
109 110 * result was that no synchronization between the onlining and onlined CPUs
110 111 * took place.
111 112 */
112 113 static volatile int tsc_ready;
113 114 static volatile int tsc_sync_go;
114 115
115 116 /*
116 117 * Used as indices into the tsc_sync_snaps[] array.
117 118 */
118 119 #define TSC_MASTER 0
119 120 #define TSC_SLAVE 1
120 121
121 122 /*
122 123  * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
123 124 */
124 125 #define TSC_SYNC_STOP 1
125 126 #define TSC_SYNC_GO 2
126 127 #define TSC_SYNC_DONE 3
127 128 #define SYNC_ITERATIONS 10
128 129
129 130 #define TSC_CONVERT_AND_ADD(tsc, hrt, scale) { \
130 131 unsigned int *_l = (unsigned int *)&(tsc); \
131 132 (hrt) += mul32(_l[1], scale) << NSEC_SHIFT; \
132 133 (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
133 134 }
134 135
135 136 #define TSC_CONVERT(tsc, hrt, scale) { \
136 137 unsigned int *_l = (unsigned int *)&(tsc); \
137 138 (hrt) = mul32(_l[1], scale) << NSEC_SHIFT; \
138 139 (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
139 140 }
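
A minimal user-space sketch of the conversion these macros perform, using the same NSEC_SHIFT of 5 described in the block comment above (illustrative only; the sketch_* names and the 2.4 GHz frequency are assumptions, not part of this change):

#include <stdint.h>
#include <stdio.h>

#define	SKETCH_NSEC_SHIFT	5
#define	SKETCH_NANOSEC		1000000000ULL

/* Mirror of TSC_CONVERT: split the 64-bit count, scale each half. */
static uint64_t
sketch_tsc_to_ns(uint64_t tsc, uint32_t scale)
{
	uint32_t lo = (uint32_t)tsc;
	uint32_t hi = (uint32_t)(tsc >> 32);
	uint64_t hrt;

	hrt = ((uint64_t)hi * scale) << SKETCH_NSEC_SHIFT;
	hrt += ((uint64_t)lo * scale) >> (32 - SKETCH_NSEC_SHIFT);
	return (hrt);
}

int
main(void)
{
	uint64_t cpu_freq_hz = 2400000000ULL;	/* assumed 2.4 GHz TSC */
	/* Same derivation as in tsc_hrtimeinit(): NANOSEC * 2^27 / freq. */
	uint32_t scale = (uint32_t)((SKETCH_NANOSEC <<
	    (32 - SKETCH_NSEC_SHIFT)) / cpu_freq_hz);
	uint64_t ticks = 10 * cpu_freq_hz;	/* ten seconds of ticks */

	/* Expect a value close to 10 * 10^9 ns. */
	printf("%llu ticks -> %llu ns\n", (unsigned long long)ticks,
	    (unsigned long long)sketch_tsc_to_ns(ticks, scale));
	return (0);
}
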
140 141
141 142 int tsc_master_slave_sync_needed = 1;
142 143
143 -static int tsc_max_delta;
144 -static hrtime_t tsc_sync_tick_delta[NCPU];
145 144 typedef struct tsc_sync {
146 145 volatile hrtime_t master_tsc, slave_tsc;
147 146 } tsc_sync_t;
148 147 static tsc_sync_t *tscp;
149 148 static hrtime_t largest_tsc_delta = 0;
150 149 static ulong_t shortest_write_time = ~0UL;
151 150
152 -static hrtime_t tsc_last = 0;
153 151 static hrtime_t tsc_last_jumped = 0;
154 -static hrtime_t tsc_hrtime_base = 0;
155 152 static int tsc_jumped = 0;
156 153 static uint32_t tsc_wayback = 0;
157 154 /*
158 155  * The cap of 1 second was chosen because tsc_tick() runs once per
159 156  * second, which means that when gethrtime() is called, no more than
160 157  * 1 second should have passed since tsc_last was updated.
161 158 */
162 -static hrtime_t tsc_resume_cap;
163 159 static hrtime_t tsc_resume_cap_ns = NANOSEC; /* 1s */
164 160
165 161 static hrtime_t shadow_tsc_hrtime_base;
166 162 static hrtime_t shadow_tsc_last;
167 163 static uint_t shadow_nsec_scale;
168 164 static uint32_t shadow_hres_lock;
169 165 int get_tsc_ready();
170 166
171 167 static inline
172 168 hrtime_t tsc_protect(hrtime_t a) {
173 169 if (a > tsc_resume_cap) {
174 170 atomic_inc_32(&tsc_wayback);
175 171 		DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
176 172 uint32_t, tsc_wayback);
177 173 return (tsc_resume_cap);
178 174 }
179 175 return (a);
180 176 }
181 177
182 178 hrtime_t
183 179 tsc_gethrtime(void)
184 180 {
185 181 uint32_t old_hres_lock;
186 182 hrtime_t tsc, hrt;
187 183
188 184 do {
189 185 old_hres_lock = hres_lock;
190 186
191 187 if ((tsc = tsc_read()) >= tsc_last) {
192 188 /*
193 189 * It would seem to be obvious that this is true
194 190 * (that is, the past is less than the present),
195 191 * but it isn't true in the presence of suspend/resume
196 192 * cycles. If we manage to call gethrtime()
197 193 * after a resume, but before the first call to
198 194 * tsc_tick(), we will see the jump. In this case,
199 195 * we will simply use the value in TSC as the delta.
200 196 */
201 197 tsc -= tsc_last;
202 198 } else if (tsc >= tsc_last - 2*tsc_max_delta) {
203 199 /*
204 200 * There is a chance that tsc_tick() has just run on
205 201 * another CPU, and we have drifted just enough so that
206 202 * we appear behind tsc_last. In this case, force the
207 203 * delta to be zero.
208 204 */
209 205 tsc = 0;
210 206 } else {
211 207 /*
212 208 * If we reach this else clause we assume that we have
213 209 * gone through a suspend/resume cycle and use the
214 210 * current tsc value as the delta.
215 211 *
216 212 * In rare cases we can reach this else clause due to
217 213 * a lack of monotonicity in the TSC value. In such
218 214 * cases using the current TSC value as the delta would
219 215 * cause us to return a value ~2x of what it should
220 216 * be. To protect against these cases we cap the
221 217 * suspend/resume delta at tsc_resume_cap.
222 218 */
223 219 tsc = tsc_protect(tsc);
224 220 }
225 221
226 222 hrt = tsc_hrtime_base;
227 223
228 224 TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
229 225 } while ((old_hres_lock & ~1) != hres_lock);
230 226
231 227 return (hrt);
232 228 }
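
The retry loop above is effectively a lock-free reader against hres_lock: the writer (tsc_tick(), under CLOCK_LOCK) leaves hres_lock odd while tsc_hrtime_base and tsc_last are being updated and even again once the update is complete, so a reader retries if the value changed or was odd when it started. A rough stand-alone sketch of that pattern with C11 atomics (the sketch_* names are assumptions, not kernel interfaces):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t sketch_lock;	/* stand-in for hres_lock */
static _Atomic uint64_t sketch_base;	/* stand-in for tsc_hrtime_base */
static _Atomic uint64_t sketch_last;	/* stand-in for tsc_last */

/* Writer: make the counter odd, update both values, make it even again. */
static void
sketch_update(uint64_t base, uint64_t last)
{
	sketch_lock++;
	sketch_base = base;
	sketch_last = last;
	sketch_lock++;
}

/* Reader: retry if the counter moved, or was odd when the read started. */
static void
sketch_read(uint64_t *base, uint64_t *last)
{
	uint32_t old;

	do {
		old = sketch_lock;
		*base = sketch_base;
		*last = sketch_last;
	} while ((old & ~1U) != sketch_lock);
}

int
main(void)
{
	uint64_t base, last;

	sketch_update(1000, 2000);
	sketch_read(&base, &last);
	printf("base %llu last %llu\n", (unsigned long long)base,
	    (unsigned long long)last);
	return (0);
}
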
233 229
234 230 hrtime_t
235 231 tsc_gethrtime_delta(void)
236 232 {
237 233 uint32_t old_hres_lock;
238 234 hrtime_t tsc, hrt;
239 235 ulong_t flags;
240 236
241 237 do {
242 238 old_hres_lock = hres_lock;
243 239
244 240 /*
245 241 * We need to disable interrupts here to assure that we
246 242 * don't migrate between the call to tsc_read() and
247 243 * adding the CPU's TSC tick delta. Note that disabling
248 244 * and reenabling preemption is forbidden here because
249 245 * we may be in the middle of a fast trap. In the amd64
250 246 * kernel we cannot tolerate preemption during a fast
251 247 * trap. See _update_sregs().
252 248 */
253 249
254 250 flags = clear_int_flag();
255 251 tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
256 252 restore_int_flag(flags);
257 253
258 254 /* See comments in tsc_gethrtime() above */
259 255
260 256 if (tsc >= tsc_last) {
261 257 tsc -= tsc_last;
262 258 } else if (tsc >= tsc_last - 2 * tsc_max_delta) {
263 259 tsc = 0;
264 260 } else {
265 261 tsc = tsc_protect(tsc);
266 262 }
267 263
268 264 hrt = tsc_hrtime_base;
269 265
270 266 TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
271 267 } while ((old_hres_lock & ~1) != hres_lock);
272 268
273 269 return (hrt);
274 270 }
275 271
276 272 hrtime_t
277 273 tsc_gethrtime_tick_delta(void)
278 274 {
279 275 hrtime_t hrt;
280 276 ulong_t flags;
281 277
282 278 flags = clear_int_flag();
283 279 hrt = tsc_sync_tick_delta[CPU->cpu_id];
284 280 restore_int_flag(flags);
285 281
286 282 return (hrt);
287 283 }
288 284
289 285 /*
290 286 * This is similar to the above, but it cannot actually spin on hres_lock.
291 287 * As a result, it caches all of the variables it needs; if the variables
292 288 * don't change, it's done.
293 289 */
294 290 hrtime_t
295 291 dtrace_gethrtime(void)
296 292 {
297 293 uint32_t old_hres_lock;
298 294 hrtime_t tsc, hrt;
299 295 ulong_t flags;
300 296
301 297 do {
302 298 old_hres_lock = hres_lock;
303 299
304 300 /*
305 301 * Interrupts are disabled to ensure that the thread isn't
306 302 * migrated between the tsc_read() and adding the CPU's
307 303 * TSC tick delta.
308 304 */
309 305 flags = clear_int_flag();
310 306
311 307 tsc = tsc_read();
312 308
313 309 if (gethrtimef == tsc_gethrtime_delta)
314 310 tsc += tsc_sync_tick_delta[CPU->cpu_id];
315 311
316 312 restore_int_flag(flags);
317 313
318 314 /*
319 315 * See the comments in tsc_gethrtime(), above.
320 316 */
321 317 if (tsc >= tsc_last)
322 318 tsc -= tsc_last;
323 319 else if (tsc >= tsc_last - 2*tsc_max_delta)
324 320 tsc = 0;
325 321 else
326 322 tsc = tsc_protect(tsc);
327 323
328 324 hrt = tsc_hrtime_base;
329 325
330 326 TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
331 327
332 328 if ((old_hres_lock & ~1) == hres_lock)
333 329 break;
334 330
335 331 /*
336 332 * If we're here, the clock lock is locked -- or it has been
337 333 * unlocked and locked since we looked. This may be due to
338 334 * tsc_tick() running on another CPU -- or it may be because
339 335 * some code path has ended up in dtrace_probe() with
340 336 * CLOCK_LOCK held. We'll try to determine that we're in
341 337 * the former case by taking another lap if the lock has
342 338 * changed since when we first looked at it.
343 339 */
344 340 if (old_hres_lock != hres_lock)
345 341 continue;
346 342
347 343 /*
348 344 * So the lock was and is locked. We'll use the old data
349 345 * instead.
350 346 */
351 347 old_hres_lock = shadow_hres_lock;
352 348
353 349 /*
354 350 * Again, disable interrupts to ensure that the thread
355 351 * isn't migrated between the tsc_read() and adding
356 352 * the CPU's TSC tick delta.
357 353 */
358 354 flags = clear_int_flag();
359 355
360 356 tsc = tsc_read();
361 357
362 358 if (gethrtimef == tsc_gethrtime_delta)
363 359 tsc += tsc_sync_tick_delta[CPU->cpu_id];
364 360
365 361 restore_int_flag(flags);
366 362
367 363 /*
368 364 * See the comments in tsc_gethrtime(), above.
369 365 */
370 366 if (tsc >= shadow_tsc_last)
371 367 tsc -= shadow_tsc_last;
372 368 else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
373 369 tsc = 0;
374 370 else
375 371 tsc = tsc_protect(tsc);
376 372
377 373 hrt = shadow_tsc_hrtime_base;
378 374
379 375 TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
380 376 } while ((old_hres_lock & ~1) != shadow_hres_lock);
381 377
382 378 return (hrt);
383 379 }
384 380
385 381 hrtime_t
386 382 tsc_gethrtimeunscaled(void)
387 383 {
388 384 uint32_t old_hres_lock;
389 385 hrtime_t tsc;
390 386
391 387 do {
392 388 old_hres_lock = hres_lock;
393 389
394 390 /* See tsc_tick(). */
395 391 tsc = tsc_read() + tsc_last_jumped;
396 392 } while ((old_hres_lock & ~1) != hres_lock);
397 393
398 394 return (tsc);
399 395 }
400 396
401 397 /*
402 398 * Convert a nanosecond based timestamp to tsc
403 399 */
404 400 uint64_t
405 401 tsc_unscalehrtime(hrtime_t nsec)
406 402 {
407 403 hrtime_t tsc;
408 404
409 405 if (tsc_gethrtime_enable) {
410 406 TSC_CONVERT(nsec, tsc, nsec_unscale);
411 407 return (tsc);
412 408 }
413 409 return ((uint64_t)nsec);
414 410 }
415 411
416 412 /* Convert a tsc timestamp to nanoseconds */
417 413 void
418 414 tsc_scalehrtime(hrtime_t *tsc)
419 415 {
420 416 hrtime_t hrt;
421 417 hrtime_t mytsc;
422 418
423 419 if (tsc == NULL)
424 420 return;
425 421 mytsc = *tsc;
426 422
427 423 TSC_CONVERT(mytsc, hrt, nsec_scale);
428 424 *tsc = hrt;
429 425 }
430 426
431 427 hrtime_t
432 428 tsc_gethrtimeunscaled_delta(void)
433 429 {
434 430 hrtime_t hrt;
435 431 ulong_t flags;
436 432
437 433 /*
438 434 * Similarly to tsc_gethrtime_delta, we need to disable preemption
439 435 * to prevent migration between the call to tsc_gethrtimeunscaled
440 436 * and adding the CPU's hrtime delta. Note that disabling and
441 437 * reenabling preemption is forbidden here because we may be in the
442 438 * middle of a fast trap. In the amd64 kernel we cannot tolerate
443 439 * preemption during a fast trap. See _update_sregs().
444 440 */
445 441
446 442 flags = clear_int_flag();
447 443 hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
448 444 restore_int_flag(flags);
449 445
450 446 return (hrt);
451 447 }
452 448
453 449 /*
454 450 * Called by the master in the TSC sync operation (usually the boot CPU).
455 451 * If the slave is discovered to have a skew, gethrtimef will be changed to
456 452 * point to tsc_gethrtime_delta(). Calculating skews is precise only when
457 453 * the master and slave TSCs are read simultaneously; however, there is no
458 454 * algorithm that can read both CPUs in perfect simultaneity. The proposed
459 455 * algorithm is an approximate method based on the behaviour of cache
460 456 * management. The slave CPU continuously reads TSC and then reads a global
461 457  * variable which the master CPU updates. The moment the master's update becomes
462 458  * visible to the slave (forced by an mfence operation), we use the TSC
463 459 * reading taken on the slave. A corresponding TSC read will be taken on the
464 460 * master as soon as possible after finishing the mfence operation. But the
465 461 * delay between causing the slave to notice the invalid cache line and the
466 462  * completion of the mfence is not repeatable. This error is heuristically assumed
467 463  * to be 1/4th of the total write time as measured by the two TSC reads
468 464  * on the master sandwiching the mfence. Furthermore, due to the nature of
469 465  * bus arbitration, contention on the memory bus, etc., the time taken for the write
470 466 * to reflect globally can vary a lot. So instead of taking a single reading,
471 467 * a set of readings are taken and the one with least write time is chosen
472 468 * to calculate the final skew.
473 469 *
474 470 * TSC sync is disabled in the context of virtualization because the CPUs
475 471 * assigned to the guest are virtual CPUs which means the real CPUs on which
476 472  * the guest runs keep changing during the lifetime of the guest OS. So we would end up
477 473 * calculating TSC skews for a set of CPUs during boot whereas the guest
478 474 * might migrate to a different set of physical CPUs at a later point of
479 475 * time.
480 476 */
481 477 void
482 478 tsc_sync_master(processorid_t slave)
483 479 {
484 480 ulong_t flags, source, min_write_time = ~0UL;
485 481 hrtime_t write_time, x, mtsc_after, tdelta;
486 482 tsc_sync_t *tsc = tscp;
487 483 int cnt;
488 484 int hwtype;
489 485
490 486 hwtype = get_hwenv();
491 487 if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
492 488 return;
493 489
494 490 flags = clear_int_flag();
495 491 source = CPU->cpu_id;
496 492
497 493 for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
498 494 while (tsc_sync_go != TSC_SYNC_GO)
499 495 SMT_PAUSE();
500 496
501 497 tsc->master_tsc = tsc_read();
502 498 membar_enter();
503 499 mtsc_after = tsc_read();
504 500 while (tsc_sync_go != TSC_SYNC_DONE)
505 501 SMT_PAUSE();
506 502 write_time = mtsc_after - tsc->master_tsc;
507 503 if (write_time <= min_write_time) {
508 504 min_write_time = write_time;
509 505 /*
510 506 * Apply heuristic adjustment only if the calculated
511 507 * delta is > 1/4th of the write time.
512 508 */
513 509 x = tsc->slave_tsc - mtsc_after;
514 510 if (x < 0)
515 511 x = -x;
516 512 if (x > (min_write_time/4))
517 513 /*
518 514 * Subtract 1/4th of the measured write time
519 515 * from the master's TSC value, as an estimate
520 516 * of how late the mfence completion came
521 517 * after the slave noticed the cache line
522 518 * change.
523 519 */
524 520 tdelta = tsc->slave_tsc -
525 521 (mtsc_after - (min_write_time/4));
526 522 else
527 523 tdelta = tsc->slave_tsc - mtsc_after;
528 524 tsc_sync_tick_delta[slave] =
529 525 tsc_sync_tick_delta[source] - tdelta;
530 526 }
531 527
532 528 tsc->master_tsc = tsc->slave_tsc = write_time = 0;
533 529 membar_enter();
534 530 tsc_sync_go = TSC_SYNC_STOP;
535 531 }
536 532 if (tdelta < 0)
537 533 tdelta = -tdelta;
538 534 if (tdelta > largest_tsc_delta)
539 535 largest_tsc_delta = tdelta;
540 536 if (min_write_time < shortest_write_time)
541 537 shortest_write_time = min_write_time;
542 538 /*
543 539 * Enable delta variants of tsc functions if the largest of all chosen
544 540 * deltas is > smallest of the write time.
545 541 */
546 542 if (largest_tsc_delta > shortest_write_time) {
547 543 gethrtimef = tsc_gethrtime_delta;
548 544 gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
545 + tsc_ncpu = NCPU;
549 546 }
550 547 restore_int_flag(flags);
551 548 }
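
To make the heuristic concrete, the following sketch applies the same arithmetic to a single rendezvous sample (the kernel keeps the sample with the smallest write time over SYNC_ITERATIONS; here one sample stands in for that minimum). The sketch_* names and the example values are illustrative assumptions, not part of this change:

#include <stdint.h>
#include <stdio.h>

typedef int64_t sketch_hrtime_t;

/*
 * 'before' and 'after' are the master's TSC reads around the mfence;
 * 'slave' is the TSC value the slave captured when the master's write
 * became visible to it.
 */
static sketch_hrtime_t
sketch_skew(sketch_hrtime_t before, sketch_hrtime_t after,
    sketch_hrtime_t slave)
{
	sketch_hrtime_t write_time = after - before;
	sketch_hrtime_t x = slave - after;

	if (x < 0)
		x = -x;
	if (x > write_time / 4) {
		/* Credit the master 1/4 of the write time for mfence latency. */
		return (slave - (after - write_time / 4));
	}
	return (slave - after);
}

int
main(void)
{
	/* Example: master read 1000 then 1080; the slave captured 1200. */
	sketch_hrtime_t tdelta = sketch_skew(1000, 1080, 1200);

	/*
	 * A positive tdelta means the slave's TSC runs ahead; the master
	 * then records tsc_sync_tick_delta[slave] =
	 * tsc_sync_tick_delta[source] - tdelta.
	 */
	printf("estimated skew: %lld ticks\n", (long long)tdelta);
	return (0);
}
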
552 549
553 550 /*
554 551 * Called by a CPU which has just been onlined. It is expected that the CPU
555 552 * performing the online operation will call tsc_sync_master().
556 553 *
557 554 * TSC sync is disabled in the context of virtualization. See comments
558 555 * above tsc_sync_master.
559 556 */
560 557 void
561 558 tsc_sync_slave(void)
562 559 {
563 560 ulong_t flags;
564 561 hrtime_t s1;
565 562 tsc_sync_t *tsc = tscp;
566 563 int cnt;
567 564 int hwtype;
568 565
569 566 hwtype = get_hwenv();
570 567 if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
571 568 return;
572 569
573 570 flags = clear_int_flag();
574 571
575 572 for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
576 573 /* Re-fill the cache line */
577 574 s1 = tsc->master_tsc;
578 575 membar_enter();
579 576 tsc_sync_go = TSC_SYNC_GO;
580 577 do {
581 578 /*
582 579 * Do not put an SMT_PAUSE here. For instance,
583 580 * if the master and slave are really the same
584 581 * hyper-threaded CPU, then you want the master
585 582 * to yield to the slave as quickly as possible here,
586 583 * but not the other way.
587 584 */
588 585 s1 = tsc_read();
589 586 } while (tsc->master_tsc == 0);
590 587 tsc->slave_tsc = s1;
591 588 membar_enter();
592 589 tsc_sync_go = TSC_SYNC_DONE;
593 590
594 591 while (tsc_sync_go != TSC_SYNC_STOP)
595 592 SMT_PAUSE();
596 593 }
597 594
598 595 restore_int_flag(flags);
599 596 }
600 597
601 598 /*
602 599 * Called once per second on a CPU from the cyclic subsystem's
603 600  * CY_HIGH_LEVEL interrupt. (No longer restricted to cpu0.)
604 601 */
605 602 void
606 603 tsc_tick(void)
607 604 {
608 605 hrtime_t now, delta;
609 606 ushort_t spl;
610 607
611 608 /*
612 609 * Before we set the new variables, we set the shadow values. This
613 610 * allows for lock free operation in dtrace_gethrtime().
614 611 */
615 612 lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
616 613 ipltospl(CBE_HIGH_PIL), &spl);
617 614
618 615 shadow_tsc_hrtime_base = tsc_hrtime_base;
619 616 shadow_tsc_last = tsc_last;
620 617 shadow_nsec_scale = nsec_scale;
621 618
622 619 shadow_hres_lock++;
623 620 splx(spl);
624 621
625 622 CLOCK_LOCK(&spl);
626 623
627 624 now = tsc_read();
628 625
629 626 if (gethrtimef == tsc_gethrtime_delta)
630 627 now += tsc_sync_tick_delta[CPU->cpu_id];
631 628
632 629 if (now < tsc_last) {
633 630 /*
634 631 * The TSC has just jumped into the past. We assume that
635 632 * this is due to a suspend/resume cycle, and we're going
636 633 * to use the _current_ value of TSC as the delta. This
637 634 * will keep tsc_hrtime_base correct. We're also going to
638 635 	 * assume that the TSC rate does not change across a suspend/
639 636 	 * resume cycle (i.e. nsec_scale remains the same).
640 637 */
641 638 delta = now;
642 639 delta = tsc_protect(delta);
643 640 tsc_last_jumped += tsc_last;
644 641 tsc_jumped = 1;
645 642 } else {
646 643 /*
647 644 * Determine the number of TSC ticks since the last clock
648 645 * tick, and add that to the hrtime base.
649 646 */
650 647 delta = now - tsc_last;
651 648 }
652 649
653 650 TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
654 651 tsc_last = now;
655 652
656 653 CLOCK_UNLOCK(spl);
657 654 }
658 655
659 656 void
660 657 tsc_hrtimeinit(uint64_t cpu_freq_hz)
661 658 {
662 659 extern int gethrtime_hires;
663 660 longlong_t tsc;
664 661 ulong_t flags;
665 662
666 663 /*
667 664 * cpu_freq_hz is the measured cpu frequency in hertz
668 665 */
669 666
670 667 /*
671 668 * We can't accommodate CPUs slower than 31.25 MHz.
672 669 */
673 670 ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
674 671 nsec_scale =
675 672 (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
676 673 nsec_unscale =
677 674 (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);
678 675
679 676 flags = clear_int_flag();
680 677 tsc = tsc_read();
681 678 (void) tsc_gethrtime();
682 679 tsc_max_delta = tsc_read() - tsc;
683 680 restore_int_flag(flags);
684 681 gethrtimef = tsc_gethrtime;
685 682 gethrtimeunscaledf = tsc_gethrtimeunscaled;
686 683 scalehrtimef = tsc_scalehrtime;
687 684 unscalehrtimef = tsc_unscalehrtime;
688 685 hrtime_tick = tsc_tick;
689 686 gethrtime_hires = 1;
690 687 /*
688 + * Being part of the comm page, tsc_ncpu communicates the published
689 + * length of the tsc_sync_tick_delta array. This is kept zeroed to
690 + * ignore the absent delta data while the TSCs are synced.
691 + */
692 + tsc_ncpu = 0;
693 + /*
691 694 * Allocate memory for the structure used in the tsc sync logic.
692 695 * This structure should be aligned on a multiple of cache line size.
693 696 */
694 697 tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);
695 698
696 699 /*
697 700 * Convert the TSC resume cap ns value into its unscaled TSC value.
698 701 * See tsc_gethrtime().
699 702 */
700 703 if (tsc_resume_cap == 0)
701 704 TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
702 705 }
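
For the inverse conversion used by the resume cap above, a quick user-space check (illustrative only; the sketch_* names and the 2.4 GHz figure are assumptions) shows that unscaling one second of nanoseconds recovers roughly one second's worth of ticks:

#include <stdint.h>
#include <stdio.h>

#define	SKETCH_NSEC_SHIFT	5
#define	SKETCH_NANOSEC		1000000000ULL

/* Mirror of TSC_CONVERT applied with nsec_unscale: nanoseconds -> ticks. */
static uint64_t
sketch_ns_to_ticks(uint64_t nsec, uint32_t unscale)
{
	uint32_t lo = (uint32_t)nsec;
	uint32_t hi = (uint32_t)(nsec >> 32);

	return ((((uint64_t)hi * unscale) << SKETCH_NSEC_SHIFT) +
	    (((uint64_t)lo * unscale) >> (32 - SKETCH_NSEC_SHIFT)));
}

int
main(void)
{
	uint64_t cpu_freq_hz = 2400000000ULL;	/* assumed clock rate */
	/* Same derivation as nsec_unscale above: freq * 2^27 / NANOSEC. */
	uint32_t unscale = (uint32_t)((cpu_freq_hz <<
	    (32 - SKETCH_NSEC_SHIFT)) / SKETCH_NANOSEC);

	/* One second of nanoseconds should unscale to ~cpu_freq_hz ticks. */
	printf("resume cap: %llu ticks (cpu_freq_hz = %llu)\n",
	    (unsigned long long)sketch_ns_to_ticks(SKETCH_NANOSEC, unscale),
	    (unsigned long long)cpu_freq_hz);
	return (0);
}
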
703 706
704 707 int
705 708 get_tsc_ready()
706 709 {
707 710 return (tsc_ready);
708 711 }
709 712
710 713 /*
711 714 * Adjust all the deltas by adding the passed value to the array.
712 715  * Then use the "delta" versions of the gethrtime functions.
713 716 * Note that 'tdelta' _could_ be a negative number, which should
714 717 * reduce the values in the array (used, for example, if the Solaris
715 718 * instance was moved by a virtual manager to a machine with a higher
716 719 * value of tsc).
717 720 */
718 721 void
719 722 tsc_adjust_delta(hrtime_t tdelta)
720 723 {
721 724 int i;
722 725
723 726 for (i = 0; i < NCPU; i++) {
724 727 tsc_sync_tick_delta[i] += tdelta;
725 728 }
726 729
727 730 gethrtimef = tsc_gethrtime_delta;
728 731 gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
732 + tsc_ncpu = NCPU;
729 733 }
730 734
731 735 /*
732 736 * Functions to manage TSC and high-res time on suspend and resume.
733 737 */
734 738
735 739 /*
736 740 * declarations needed for time adjustment
737 741 */
738 742 extern void rtcsync(void);
739 743 extern tod_ops_t *tod_ops;
740 744 /* There must be a better way than exposing nsec_scale! */
741 745 extern uint_t nsec_scale;
742 746 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
743 747 static timestruc_t tsc_saved_ts;
744 748 static int tsc_needs_resume = 0; /* We only want to do this once. */
745 749 int tsc_delta_onsuspend = 0;
746 750 int tsc_adjust_seconds = 1;
747 751 int tsc_suspend_count = 0;
748 752 int tsc_resume_in_cyclic = 0;
749 753
750 754 /*
751 755 * Let timestamp.c know that we are suspending. It needs to take
752 756 * snapshots of the current time, and do any pre-suspend work.
753 757 */
754 758 void
755 759 tsc_suspend(void)
756 760 {
757 761 /*
758 762 	 * What we need to do here is record the time at which we suspended, so
759 763 	 * that we know how much to add back on resume.
760 764 * This routine is called by each CPU, so we need to handle reentry.
761 765 */
762 766 if (tsc_gethrtime_enable) {
763 767 /*
764 768 * We put the tsc_read() inside the lock as it
765 769 		 * has no locking constraints, and it puts the
766 770 		 * acquired value closer to the time stamp (in
767 771 * case we delay getting the lock).
768 772 */
769 773 mutex_enter(&tod_lock);
770 774 tsc_saved_tsc = tsc_read();
771 775 tsc_saved_ts = TODOP_GET(tod_ops);
772 776 mutex_exit(&tod_lock);
773 777 /* We only want to do this once. */
774 778 if (tsc_needs_resume == 0) {
775 779 if (tsc_delta_onsuspend) {
776 780 tsc_adjust_delta(tsc_saved_tsc);
777 781 } else {
778 782 tsc_adjust_delta(nsec_scale);
779 783 }
780 784 tsc_suspend_count++;
781 785 }
782 786 }
783 787
784 788 invalidate_cache();
785 789 tsc_needs_resume = 1;
786 790 }
787 791
788 792 /*
789 793 * Restore all timestamp state based on the snapshots taken at
790 794 * suspend time.
791 795 */
792 796 void
793 797 tsc_resume(void)
794 798 {
795 799 /*
796 800 * We only need to (and want to) do this once. So let the first
797 801 * caller handle this (we are locked by the cpu lock), as it
798 802 	 * is preferable that we get the earliest sync.
799 803 */
800 804 if (tsc_needs_resume) {
801 805 /*
802 806 * If using the TSC, adjust the delta based on how long
803 807 * we were sleeping (or away). We also adjust for
804 808 * migration and a grown TSC.
805 809 */
806 810 if (tsc_saved_tsc != 0) {
807 811 timestruc_t ts;
808 812 hrtime_t now, sleep_tsc = 0;
809 813 int sleep_sec;
810 814 extern void tsc_tick(void);
811 815 extern uint64_t cpu_freq_hz;
812 816
813 817 /* tsc_read() MUST be before TODOP_GET() */
814 818 mutex_enter(&tod_lock);
815 819 now = tsc_read();
816 820 ts = TODOP_GET(tod_ops);
817 821 mutex_exit(&tod_lock);
818 822
819 823 /* Compute seconds of sleep time */
820 824 sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;
821 825
822 826 /*
823 827 			 * If the current ts is less than or equal to
824 828 			 * the saved sec, then there is likely a
825 829 * problem with the clock. Assume at least
826 830 * one second has passed, so that time goes forward.
827 831 */
828 832 if (sleep_sec <= 0) {
829 833 sleep_sec = 1;
830 834 }
831 835
832 836 			/* How many TSC ticks should have occurred while sleeping */
833 837 if (tsc_adjust_seconds)
834 838 sleep_tsc = sleep_sec * cpu_freq_hz;
835 839
836 840 /*
837 841 * We also want to subtract from the "sleep_tsc"
838 842 * the current value of tsc_read(), so that our
839 843 * adjustment accounts for the amount of time we
840 844 * have been resumed _or_ an adjustment based on
841 845 * the fact that we didn't actually power off the
842 846 * CPU (migration is another issue, but _should_
843 847 * also comply with this calculation). If the CPU
844 848 * never powered off, then:
845 849 * 'now == sleep_tsc + saved_tsc'
846 850 * and the delta will effectively be "0".
847 851 */
848 852 sleep_tsc -= now;
849 853 if (tsc_delta_onsuspend) {
850 854 tsc_adjust_delta(sleep_tsc);
851 855 } else {
852 856 tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
853 857 }
854 858 tsc_saved_tsc = 0;
855 859
856 860 tsc_tick();
857 861 }
858 862 tsc_needs_resume = 0;
859 863 }
860 864
861 865 }
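
The adjustment arithmetic in tsc_resume() can be checked with a small worked example on the default path (tsc_adjust_seconds set, tsc_delta_onsuspend clear). The values and the sketch below are illustrative assumptions only: if the machine slept for 3 seconds at an assumed 2.4 GHz, the amount passed to tsc_adjust_delta() is saved_tsc + 3 * cpu_freq_hz - now, which collapses to zero when the TSC never actually stopped counting.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cpu_freq_hz = 2400000000ULL;	/* assumed clock rate */
	uint64_t saved_tsc = 5000000000ULL;	/* TSC captured at suspend */
	int sleep_sec = 3;			/* TOD delta across the sleep */

	/* Case 1: the TSC was reset across the sleep and now reads low. */
	uint64_t now = 100000000ULL;
	int64_t sleep_tsc = (int64_t)(sleep_sec * cpu_freq_hz) - (int64_t)now;
	printf("adjustment (reset TSC): %lld ticks\n",
	    (long long)(saved_tsc + sleep_tsc));

	/* Case 2: the TSC kept counting; now == saved_tsc + sleep ticks. */
	now = saved_tsc + sleep_sec * cpu_freq_hz;
	sleep_tsc = (int64_t)(sleep_sec * cpu_freq_hz) - (int64_t)now;
	printf("adjustment (TSC never stopped): %lld ticks\n",
	    (long long)(saved_tsc + sleep_tsc));
	return (0);
}
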