          --- old/usr/src/uts/i86pc/os/timestamp.c
          +++ new/usr/src/uts/i86pc/os/timestamp.c
... 137 lines elided ...
 138  138          (hrt) = mul32(_l[1], scale) << NSEC_SHIFT;      \
 139  139          (hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
 140  140  }
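
Reviewer note: the macro tail above is a 64-bit fixed-point conversion done in
32-bit halves: hrt = tsc * scale / 2^(32 - NSEC_SHIFT), with the high word of
the TSC shifted left instead of right so the intermediate product cannot
overflow 64 bits. A minimal standalone sketch of the same arithmetic, assuming
little-endian word order (_l[1] is the high word), an NSEC_SHIFT of 5, and a
scale precomputed as (NANOSEC << (32 - NSEC_SHIFT)) / tsc_hz; tsc_hz and the
sample reading are hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    #define NANOSEC    1000000000ULL
    #define NSEC_SHIFT 5                /* assumed to match timestamp.c */

    int
    main(void)
    {
            uint64_t tsc_hz = 2400000000ULL;        /* hypothetical 2.4 GHz */
            uint32_t scale = (uint32_t)
                ((NANOSEC << (32 - NSEC_SHIFT)) / tsc_hz);
            uint64_t tsc = 123456789012ULL;         /* sample TSC reading */
            uint64_t lo = tsc & 0xffffffffULL;      /* the macro's _l[0] */
            uint64_t hi = tsc >> 32;                /* the macro's _l[1] */

            /* hrt = tsc * scale / 2^(32 - NSEC_SHIFT), split so that */
            /* neither 32x32 product overflows when shifted. */
            uint64_t hrt = (hi * scale) << NSEC_SHIFT;
            hrt += (lo * scale) >> (32 - NSEC_SHIFT);

            printf("%llu ticks = %llu ns\n",
                (unsigned long long)tsc, (unsigned long long)hrt);
            return (0);
    }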
 141  141  
 142  142  int tsc_master_slave_sync_needed = 1;
 143  143  
 144  144  typedef struct tsc_sync {
 145  145          volatile hrtime_t master_tsc, slave_tsc;
 146  146  } tsc_sync_t;
 147  147  static tsc_sync_t *tscp;
      148 +static hrtime_t largest_tsc_delta = 0;
      149 +static ulong_t shortest_write_time = ~0UL;
 148  150  
 149  151  static hrtime_t tsc_last_jumped = 0;
 150  152  static int      tsc_jumped = 0;
 151  153  static uint32_t tsc_wayback = 0;
 152  154  /*
 153  155   * The cap of 1 second was chosen because the tsc_tick() function runs
 154  156   * once per second, which means that when gethrtime() is called it should
 155  157   * never have been more than 1 second since tsc_last was updated.
 156  158   */
 157  159  static hrtime_t tsc_resume_cap_ns = NANOSEC;     /* 1s */
... 280 lines elided ...
 438  440           */
 439  441  
 440  442          flags = clear_int_flag();
 441  443          hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
 442  444          restore_int_flag(flags);
 443  445  
 444  446          return (hrt);
 445  447  }
 446  448  
 447  449  /*
 448      - * TSC Sync Master
      450 + * Called by the master in the TSC sync operation (usually the boot CPU).
      451 + * If the slave is discovered to have a skew, gethrtimef will be changed to
      452 + * point to tsc_gethrtime_delta(). Calculating skews is precise only when
      453 + * the master and slave TSCs are read simultaneously; however, there is no
      454 + * algorithm that can read both CPUs in perfect simultaneity. The proposed
      455 + * algorithm is an approximate method based on the behaviour of cache
       456 + * management. The slave CPU continuously reads the TSC and then reads a
       457 + * global variable which the master CPU updates. The moment the master's
       458 + * update becomes visible to the slave (forced by an mfence operation), we
       459 + * use the TSC reading taken on the slave. A corresponding TSC read is taken
       460 + * on the master as soon as possible after the mfence completes. But the
       461 + * delay between causing the slave to notice the invalid cache line and the
       462 + * completion of the mfence is not repeatable. This error is heuristically
       463 + * assumed to be 1/4th of the total write time as measured by the two TSC
       464 + * reads on the master sandwiching the mfence. Furthermore, due to bus
       465 + * arbitration, contention on the memory bus, etc., the time taken for the
       466 + * write to become globally visible can vary considerably. So instead of
       467 + * taking a single reading, a set of readings is taken and the one with the
       468 + * least write time is chosen to calculate the final skew.
 449  469   *
 450      - * Typically called on the boot CPU, this attempts to quantify TSC skew between
 451      - * different CPUs.  If an appreciable difference is found, gethrtimef will be
 452      - * changed to point to tsc_gethrtime_delta().
 453      - *
 454      - * Calculating skews is precise only when the master and slave TSCs are read
 455      - * simultaneously; however, there is no algorithm that can read both CPUs in
 456      - * perfect simultaneity.  The proposed algorithm is an approximate method based
 457      - * on the behaviour of cache management.  The slave CPU continuously polls the
 458      - * TSC while reading a global variable updated by the master CPU.  The latest
 459      - * TSC reading is saved when the master's update (forced via mfence) reaches
 460      - * visibility on the slave.  The master will also take a TSC reading
 461      - * immediately following the mfence.
 462      - *
 463      - * While the delay between cache line invalidation on the slave and mfence
 464      - * completion on the master is not repeatable, the error is heuristically
 465      - * assumed to be 1/4th of the write time recorded by the master.  Multiple
 466      - * samples are taken to control for the variance caused by external factors
 467      - * such as bus contention.  Each sample set is independent per-CPU to control
 468      - * for differing memory latency on NUMA systems.
 469      - *
 470  470   * TSC sync is disabled in the context of virtualization because the CPUs
 471  471   * assigned to the guest are virtual CPUs, which means the real CPUs on
 472  472   * which the guest runs keep changing during the lifetime of the guest OS.
 473  473   * So we would end up calculating TSC skews for a set of CPUs during boot,
 474  474   * whereas the guest might migrate to a different set of physical CPUs at
 475  475   * a later point in time.
 476  476   */
 477  477  void
 478  478  tsc_sync_master(processorid_t slave)
 479  479  {
 480  480          ulong_t flags, source, min_write_time = ~0UL;
 481      -        hrtime_t write_time, mtsc_after, last_delta = 0;
      481 +        hrtime_t write_time, x, mtsc_after, tdelta;
 482  482          tsc_sync_t *tsc = tscp;
 483  483          int cnt;
 484  484          int hwtype;
 485  485  
 486  486          hwtype = get_hwenv();
 487  487          if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
 488  488                  return;
 489  489  
 490  490          flags = clear_int_flag();
 491  491          source = CPU->cpu_id;
... 2 lines elided ...
 494  494                  while (tsc_sync_go != TSC_SYNC_GO)
 495  495                          SMT_PAUSE();
 496  496  
 497  497                  tsc->master_tsc = tsc_read();
 498  498                  membar_enter();
 499  499                  mtsc_after = tsc_read();
 500  500                  while (tsc_sync_go != TSC_SYNC_DONE)
 501  501                          SMT_PAUSE();
 502  502                  write_time = mtsc_after - tsc->master_tsc;
 503  503                  if (write_time <= min_write_time) {
 504      -                        hrtime_t tdelta;
 505      -
 506      -                        tdelta = tsc->slave_tsc - mtsc_after;
 507      -                        if (tdelta < 0)
 508      -                                tdelta = -tdelta;
      504 +                        min_write_time = write_time;
 509  505                          /*
 510      -                         * If the margin exists, subtract 1/4th of the measured
 511      -                         * write time from the master's TSC value.  This is an
 512      -                         * estimate of how late the mfence completion came
 513      -                         * after the slave noticed the cache line change.
      506 +                         * Apply heuristic adjustment only if the calculated
      507 +                         * delta is > 1/4th of the write time.
 514  508                           */
 515      -                        if (tdelta > (write_time/4)) {
      509 +                        x = tsc->slave_tsc - mtsc_after;
      510 +                        if (x < 0)
      511 +                                x = -x;
      512 +                        if (x > (min_write_time/4))
      513 +                                /*
      514 +                                 * Subtract 1/4th of the measured write time
      515 +                                 * from the master's TSC value, as an estimate
      516 +                                 * of how late the mfence completion came
      517 +                                 * after the slave noticed the cache line
      518 +                                 * change.
      519 +                                 */
 516  520                                  tdelta = tsc->slave_tsc -
 517      -                                    (mtsc_after - (write_time/4));
 518      -                        } else {
      521 +                                    (mtsc_after - (min_write_time/4));
      522 +                        else
 519  523                                  tdelta = tsc->slave_tsc - mtsc_after;
 520      -                        }
 521      -                        last_delta = tsc_sync_tick_delta[source] - tdelta;
 522      -                        tsc_sync_tick_delta[slave] = last_delta;
 523      -                        min_write_time = write_time;
      524 +                        tsc_sync_tick_delta[slave] =
      525 +                            tsc_sync_tick_delta[source] - tdelta;
 524  526                  }
 525  527  
 526  528                  tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 527  529                  membar_enter();
 528  530                  tsc_sync_go = TSC_SYNC_STOP;
 529  531          }
 530      -
      532 +        if (tdelta < 0)
      533 +                tdelta = -tdelta;
      534 +        if (tdelta > largest_tsc_delta)
      535 +                largest_tsc_delta = tdelta;
      536 +        if (min_write_time < shortest_write_time)
      537 +                shortest_write_time = min_write_time;
 531  538          /*
 532      -         * Only enable the delta variants of the TSC functions if the measured
 533      -         * skew is greater than the fastest write time.
       539 +         * Enable the delta variants of the tsc functions if the largest of
       540 +         * all chosen deltas is greater than the shortest write time.
 534  541           */
 535      -        last_delta = (last_delta < 0) ? -last_delta : last_delta;
 536      -        if (last_delta > min_write_time) {
      542 +        if (largest_tsc_delta > shortest_write_time) {
 537  543                  gethrtimef = tsc_gethrtime_delta;
 538  544                  gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 539  545                  tsc_ncpu = NCPU;
 540  546          }
 541  547          restore_int_flag(flags);
 542  548  }
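
Reviewer note: the sampling heuristic is easier to verify in isolation from
the CPU handshake. Below is a user-level sketch of just the selection logic;
struct sample, pick_skew(), and the data in main() are hypothetical, while the
min-write-time selection and the 1/4-write-time adjustment mirror the new code
above:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t hrtime_t;

    struct sample {
            hrtime_t master_tsc;    /* master TSC just before the mfence */
            hrtime_t mtsc_after;    /* master TSC just after the mfence */
            hrtime_t slave_tsc;     /* slave TSC when the update was seen */
    };

    /*
     * Pick the sample with the smallest write time and compute the skew,
     * subtracting 1/4th of the write time from the master's reading only
     * when the raw delta is large enough for the adjustment to matter.
     */
    static hrtime_t
    pick_skew(const struct sample *s, int nsamples)
    {
            hrtime_t min_write_time = INT64_MAX;
            hrtime_t tdelta = 0;
            int i;

            for (i = 0; i < nsamples; i++) {
                    hrtime_t write_time = s[i].mtsc_after - s[i].master_tsc;

                    if (write_time <= min_write_time) {
                            hrtime_t x = s[i].slave_tsc - s[i].mtsc_after;

                            min_write_time = write_time;
                            if (x < 0)
                                    x = -x;
                            if (x > (min_write_time / 4))
                                    tdelta = s[i].slave_tsc -
                                        (s[i].mtsc_after -
                                        (min_write_time / 4));
                            else
                                    tdelta = s[i].slave_tsc -
                                        s[i].mtsc_after;
                    }
            }
            return (tdelta);
    }

    int
    main(void)
    {
            struct sample s[3] = {
                    { 1000, 1040, 1500 },   /* write time 40 */
                    { 2000, 2020, 2515 },   /* write time 20: best sample */
                    { 3000, 3060, 3510 },   /* write time 60 */
            };

            printf("skew = %lld ticks\n", (long long)pick_skew(s, 3));
            return (0);
    }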
 543  549  
 544  550  /*
 545      - * TSC Sync Slave
 546      - *
 547  551   * Called by a CPU which has just been onlined.  It is expected that the CPU
 548  552   * performing the online operation will call tsc_sync_master().
 549  553   *
 550      - * Like tsc_sync_master, this logic is skipped on virtualized platforms.
      554 + * TSC sync is disabled in the context of virtualization. See comments
      555 + * above tsc_sync_master.
 551  556   */
 552  557  void
 553  558  tsc_sync_slave(void)
 554  559  {
 555  560          ulong_t flags;
 556  561          hrtime_t s1;
 557  562          tsc_sync_t *tsc = tscp;
 558  563          int cnt;
 559  564          int hwtype;
 560  565  
... 3 lines elided ...
 564  569  
 565  570          flags = clear_int_flag();
 566  571  
 567  572          for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
 568  573                  /* Re-fill the cache line */
 569  574                  s1 = tsc->master_tsc;
 570  575                  membar_enter();
 571  576                  tsc_sync_go = TSC_SYNC_GO;
 572  577                  do {
 573  578                          /*
 574      -                         * Do not put an SMT_PAUSE here.  If the master and
 575      -                         * slave are the same hyper-threaded CPU, we want the
 576      -                         * master to yield as quickly as possible to the slave.
      579 +                         * Do not put an SMT_PAUSE here. For instance,
      580 +                         * if the master and slave are really the same
      581 +                         * hyper-threaded CPU, then you want the master
      582 +                         * to yield to the slave as quickly as possible here,
      583 +                         * but not the other way.
 577  584                           */
 578  585                          s1 = tsc_read();
 579  586                  } while (tsc->master_tsc == 0);
 580  587                  tsc->slave_tsc = s1;
 581  588                  membar_enter();
 582  589                  tsc_sync_go = TSC_SYNC_DONE;
 583  590  
 584  591                  while (tsc_sync_go != TSC_SYNC_STOP)
 585  592                          SMT_PAUSE();
 586  593          }
... 110 lines elided ...
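
Reviewer note: the handshake is split across tsc_sync_master() and
tsc_sync_slave(), which makes the protocol hard to read in one pass. Here is a
compressed, user-level sketch of a single iteration using C11 atomics and
pthreads; the state names mirror TSC_SYNC_GO/DONE/STOP, but __rdtsc() (a
GCC/Clang x86 intrinsic), the thread setup, and the printed sample are
illustrative only. Both threads share one TSC here, so the measured delta is
expected to be near zero:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <x86intrin.h>              /* __rdtsc(); GCC/Clang, x86 only */

    enum { SYNC_IDLE, SYNC_GO, SYNC_DONE, SYNC_STOP };

    static _Atomic int sync_go = SYNC_IDLE;
    static _Atomic uint64_t master_tsc;
    static _Atomic uint64_t slave_tsc;

    /* Slave side: poll the TSC until the master's store becomes visible. */
    static void *
    slave(void *arg)
    {
            uint64_t s1;

            (void) atomic_load(&master_tsc);        /* re-fill cache line */
            atomic_store(&sync_go, SYNC_GO);
            do {
                    /* No pause here: notice the master's store ASAP. */
                    s1 = __rdtsc();
            } while (atomic_load(&master_tsc) == 0);
            atomic_store(&slave_tsc, s1);
            atomic_store(&sync_go, SYNC_DONE);
            while (atomic_load(&sync_go) != SYNC_STOP)
                    ;
            return (arg);
    }

    int
    main(void)
    {
            pthread_t t;
            uint64_t before, after;

            pthread_create(&t, NULL, slave, NULL);
            while (atomic_load(&sync_go) != SYNC_GO)
                    ;
            before = __rdtsc();
            atomic_store(&master_tsc, before);      /* seq_cst store: fence */
            after = __rdtsc();
            while (atomic_load(&sync_go) != SYNC_DONE)
                    ;
            printf("write_time %llu, raw delta %lld\n",
                (unsigned long long)(after - before),
                (long long)(int64_t)(atomic_load(&slave_tsc) - after));
            atomic_store(&sync_go, SYNC_STOP);
            pthread_join(t, NULL);
            return (0);
    }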
 697  704                  TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
 698  705  }
 699  706  
 700  707  int
 701  708  get_tsc_ready()
 702  709  {
 703  710          return (tsc_ready);
 704  711  }
 705  712  
 706  713  /*
 707      - * Adjust all the deltas by adding the passed value to the array and activate
 708      - * the "delta" versions of the gethrtime functions.  It is possible that the
 709      - * adjustment could be negative.  Such may occur if the SunOS instance was
 710      - * moved by a virtual manager to a machine with a higher value of TSC.
      714 + * Adjust all the deltas by adding the passed value to the array.
       715 + * Then use the "delta" versions of the gethrtime functions.
      716 + * Note that 'tdelta' _could_ be a negative number, which should
      717 + * reduce the values in the array (used, for example, if the Solaris
      718 + * instance was moved by a virtual manager to a machine with a higher
      719 + * value of tsc).
 711  720   */
 712  721  void
 713  722  tsc_adjust_delta(hrtime_t tdelta)
 714  723  {
 715  724          int             i;
 716  725  
 717  726          for (i = 0; i < NCPU; i++) {
 718  727                  tsc_sync_tick_delta[i] += tdelta;
 719  728          }
 720  729  
 721  730          gethrtimef = tsc_gethrtime_delta;
 722  731          gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 723  732          tsc_ncpu = NCPU;
 724  733  }
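
Reviewer note: a tiny worked example of the semantics described in the comment
above, using hypothetical delta values; only the adjustment loop mirrors
tsc_adjust_delta(). After migrating to a host whose TSC reads 1000 ticks
ahead, a negative adjustment shifts every per-CPU delta down so that
tsc_read() + delta, and hence high-resolution time, stays continuous:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t hrtime_t;
    #define NCPU 4

    /* Hypothetical per-CPU skews measured at boot. */
    static hrtime_t tick_delta[NCPU] = { 0, 7, -3, 12 };

    /* Same loop as tsc_adjust_delta(), minus the gethrtimef switch. */
    static void
    adjust_delta(hrtime_t tdelta)
    {
            int i;

            for (i = 0; i < NCPU; i++)
                    tick_delta[i] += tdelta;
    }

    int
    main(void)
    {
            int i;

            /* The new host's TSC reads 1000 ticks ahead of the old one. */
            adjust_delta(-1000);
            for (i = 0; i < NCPU; i++)
                    printf("cpu%d delta %lld\n", i,
                        (long long)tick_delta[i]);
            return (0);
    }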
 725  734  
 726  735  /*
 727  736   * Functions to manage TSC and high-res time on suspend and resume.
 728  737   */
 729  738  
 730      -/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
      739 +/*
      740 + * declarations needed for time adjustment
      741 + */
      742 +extern void     rtcsync(void);
 731  743  extern tod_ops_t *tod_ops;
 732      -
      744 +/* There must be a better way than exposing nsec_scale! */
      745 +extern uint_t   nsec_scale;
 733  746  static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 734  747  static timestruc_t tsc_saved_ts;
 735  748  static int      tsc_needs_resume = 0;   /* We only want to do this once. */
 736  749  int             tsc_delta_onsuspend = 0;
 737  750  int             tsc_adjust_seconds = 1;
 738  751  int             tsc_suspend_count = 0;
 739  752  int             tsc_resume_in_cyclic = 0;
 740  753  
 741  754  /*
 742      - * Take snapshots of the current time and do any other pre-suspend work.
      755 + * Let timestamp.c know that we are suspending.  It needs to take
       756 + * snapshots of the current time and do any pre-suspend work.
 743  757   */
 744  758  void
 745  759  tsc_suspend(void)
 746  760  {
 747      -        /*
 748      -         * We need to collect the time at which we suspended here so we know
 749      -         * now much should be added during the resume.  This is called by each
 750      -         * CPU, so reentry must be properly handled.
 751      -         */
       761 +        /*
       762 +         * What we need to do here is to record the time at which we
       763 +         * suspended, so that we know how much to add back at resume.
       764 +         * This routine is called by each CPU, so we need to handle reentry.
       765 +         */
 752  766          if (tsc_gethrtime_enable) {
 753  767                  /*
 754      -                 * Perform the tsc_read after acquiring the lock to make it as
 755      -                 * accurate as possible in the face of contention.
       768 +                 * We put the tsc_read() inside the lock as it
       769 +                 * has no locking constraints, and it puts the
       770 +                 * acquired value closer to the time stamp (in
       771 +                 * case we delay getting the lock).
 756  772                   */
 757  773                  mutex_enter(&tod_lock);
 758  774                  tsc_saved_tsc = tsc_read();
 759  775                  tsc_saved_ts = TODOP_GET(tod_ops);
 760  776                  mutex_exit(&tod_lock);
 761  777                  /* We only want to do this once. */
 762  778                  if (tsc_needs_resume == 0) {
 763  779                          if (tsc_delta_onsuspend) {
 764  780                                  tsc_adjust_delta(tsc_saved_tsc);
 765  781                          } else {
... 1 line elided ...
 767  783                          }
 768  784                          tsc_suspend_count++;
 769  785                  }
 770  786          }
 771  787  
 772  788          invalidate_cache();
 773  789          tsc_needs_resume = 1;
 774  790  }
 775  791  
 776  792  /*
 777      - * Restore all timestamp state based on the snapshots taken at suspend time.
      793 + * Restore all timestamp state based on the snapshots taken at
      794 + * suspend time.
 778  795   */
 779  796  void
 780  797  tsc_resume(void)
 781  798  {
 782  799          /*
 783  800           * We only need to (and want to) do this once.  So let the first
 784  801           * caller handle this (we are locked by the cpu lock), as it
 785  802           * is preferable that we get the earliest sync.
 786  803           */
 787  804          if (tsc_needs_resume) {
... 61 lines elided ...
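
Reviewer note: the tsc_resume() body falls in the elided region above, so the
following is not the patch's code; it is one plausible shape of the
resume-side arithmetic implied by the snapshots taken in tsc_suspend(), with
every name and value hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t hrtime_t;

    /* Snapshots taken at suspend time (hypothetical values). */
    static uint64_t tsc_saved_tsc = 1000000000ULL;  /* TSC at suspend */
    static int64_t  tsc_saved_sec = 5000;           /* TOD secs at suspend */
    static uint64_t tsc_per_sec   = 2400000000ULL;  /* calibrated tick rate */

    /* Stand-ins for the TOD clock and a post-resume TSC read. */
    static int64_t  tod_now_sec   = 5030;           /* 30 s suspended */
    static uint64_t tsc_now       = 777000000ULL;   /* TSC reset by firmware */

    int
    main(void)
    {
            /* Wall-clock seconds that passed while suspended. */
            int64_t secs = tod_now_sec - tsc_saved_sec;

            /* Where the TSC "should" be had it kept counting. */
            uint64_t expected = tsc_saved_tsc + (uint64_t)secs * tsc_per_sec;

            /*
             * This difference would be folded into every per-CPU delta
             * via tsc_adjust_delta() so gethrtime() does not jump back.
             */
            hrtime_t tdelta = (hrtime_t)(expected - tsc_now);
            printf("adjust deltas by %lld ticks\n", (long long)tdelta);
            return (0);
    }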