--- old/usr/src/lib/commpage/amd64/cp_subr.s
+++ new/usr/src/lib/commpage/amd64/cp_subr.s
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2016 Joyent, Inc.
14 14 */
15 15
16 16 #include <sys/asm_linkage.h>
17 17 #include <sys/segments.h>
18 18 #include <sys/time_impl.h>
19 19 #include <sys/tsc.h>
20 20 #include <cp_offsets.h>
21 21
22 22 #define GETCPU_GDT_OFFSET SEL_GDT(GDT_CPUID, SEL_UPL)
23 23
24 24 .file "cp_subr.s"
25 25
26 26 /*
27 27 * These are cloned from the TSC and time-related code in the kernel. They
28 28 * should be kept in sync if those source values change.
29 29 * See: uts/i86pc/os/timestamp.c
30 30 */
31 31 #define NSEC_SHIFT 5
32 32 #define ADJ_SHIFT 4
33 33 #define NANOSEC 0x3b9aca00
34 34
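
For context, NSEC_SHIFT only works in concert with the cp_nsec_scale value the
kernel derives from the TSC frequency. A minimal C sketch of that derivation,
following uts/i86pc/os/timestamp.c (the helper name is illustrative):

	#include <stdint.h>

	#define	NANOSEC		1000000000ULL
	#define	NSEC_SHIFT	5

	/*
	 * nsec_scale is picked so that
	 *	nsec = (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 * e.g. a 1 GHz TSC yields nsec_scale == 2^27, making the
	 * conversion an exact no-op.
	 */
	static inline uint32_t
	nsec_scale_for(uint64_t tsc_hz)
	{
		return ((uint32_t)((NANOSEC << (32 - NSEC_SHIFT)) / tsc_hz));
	}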
35 35 /*
36 36 * hrtime_t
37 37 * __cp_tsc_read(comm_page_t *cp)
38 38 *
39 39 * Stack usage: 0 bytes
40 40 */
41 41 ENTRY_NP(__cp_tsc_read)
42 42 movl CP_TSC_TYPE(%rdi), %esi
43 43 movl CP_TSC_NCPU(%rdi), %r8d
44 44 leaq CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
45 45
46 46 cmpl $TSC_TSCP, %esi
47 47 jne 2f
48 48 rdtscp
49 49 /*
50 50 * When the TSC is read, the low 32 bits are placed in %eax while the
51 51 * high 32 bits are placed in %edx. They are shifted and ORed together
52 52 * to obtain the full 64-bit value.
53 53 */
54 54 shlq $0x20, %rdx
55 55 orq %rdx, %rax
56 56 cmpl $0, %r8d
57 57 jne 1f
58 58 ret
59 59 1:
60 60 /*
61 61 * When cp_tsc_ncpu is non-zero, it indicates the length of the
62 62 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
63 63 * TSC. The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
64 64 * is used to look up an offset value in that array and apply it to the
65 65 * TSC reading.
66 66 */
67 67 movq (%r9, %rcx, 8), %rdx
68 68 addq %rdx, %rax
69 69 ret
70 70
71 71 2:
72 72 /*
73 73 * Without rdtscp, there is no way to perform a TSC reading and
74 74 * simultaneously query the current CPU. If tsc_ncpu indicates that
75 75 * per-CPU TSC offsets are present, the ID of the current CPU is
76 76 * queried before performing a TSC reading. It will later be compared
77 77 * to a second CPU ID lookup to catch CPU migrations.
78 78 *
79 79 * This method will catch all but the most pathological scheduling.
80 80 */
81 81 cmpl $0, %r8d
82 82 je 3f
83 83 movl $GETCPU_GDT_OFFSET, %edx
84 84 lsl %dx, %edx
85 85
86 86 3:
87 87 /* Save the most recently queried CPU ID for later comparison. */
88 88 movl %edx, %r10d
89 89
90 90 cmpl $TSC_RDTSC_MFENCE, %esi
91 91 jne 4f
92 92 mfence
93 93 rdtsc
94 94 jmp 7f
95 95
96 96 4:
97 97 cmpl $TSC_RDTSC_LFENCE, %esi
98 98 jne 5f
99 99 lfence
100 100 rdtsc
101 101 jmp 7f
102 102
103 103 5:
104 104 cmpl $TSC_RDTSC_CPUID, %esi
105 105 jne 6f
106 106 /*
107 107 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
108 108 * preserved here. Its contents will be overwritten when cpuid is used
109 109 * as a serializing instruction.
110 110 */
111 111 movq %rbx, %r11
112 112 xorl %eax, %eax
113 113 cpuid
114 114 rdtsc
115 115 movq %r11, %rbx
116 116 jmp 7f
117 117
118 118 6:
119 119 /*
120 120 * Other protections should have prevented this function from being
121 121 * called in the first place. The only sane action is to abort.
122 122 * The easiest means in this context is via SIGILL.
123 123 */
124 124 ud2a
125 125
126 126 7:
127 127 shlq $0x20, %rdx
128 128 orq %rdx, %rax
129 129
130 130 /*
131 131 * Query the current CPU again if a per-CPU offset is being applied to
132 132 * the TSC reading. If the result differs from the earlier reading,
133 133 * then a migration has occurred and the TSC must be read again.
134 134 */
135 135 cmpl $0, %r8d
136 136 je 8f
137 137 movl $GETCPU_GDT_OFFSET, %edx
138 138 lsl %dx, %edx
139 139 cmpl %edx, %r10d
140 140 jne 3b
141 141 movq (%r9, %rdx, 8), %rdx
142 142 addq %rdx, %rax
143 143 8:
144 144 ret
145 145 SET_SIZE(__cp_tsc_read)
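
For readers following along, the rdtscp fast path of __cp_tsc_read corresponds
roughly to this C sketch (a hypothetical helper; __rdtscp is the GCC/Clang
intrinsic, and the two parameters stand in for the CP_TSC_NCPU and
CP_TSC_SYNC_TICK_DELTA fields read above):

	#include <stdint.h>
	#include <x86intrin.h>

	static inline int64_t
	cp_tsc_read_tscp(uint32_t tsc_ncpu, const int64_t *tick_delta)
	{
		unsigned int cpu;
		/* rdtscp reads the TSC and IA32_TSC_AUX (CPU id) together */
		uint64_t tsc = __rdtscp(&cpu);

		if (tsc_ncpu != 0)	/* per-CPU deltas are present */
			tsc += (uint64_t)tick_delta[cpu];
		return ((int64_t)tsc);
	}

The non-TSCP variants (mfence, lfence, cpuid) differ only in how they
serialize rdtsc; the double CPU-ID lookup around them substitutes for the
atomicity that rdtscp provides.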
146 146
147 147
148 148 /*
149 149 * uint_t
150 150 * __cp_getcpu(comm_page_t *)
151 151 *
152 152 * Stack usage: 0 bytes
153 153 */
154 154 ENTRY_NP(__cp_getcpu)
155 155 movl CP_TSC_TYPE(%rdi), %edi
156 156 /*
157 157 * If RDTSCP is available, it is a quick way to grab the cpu_id which
158 158 * is stored in the TSC_AUX MSR by the kernel.
159 159 */
160 160 cmpl $TSC_TSCP, %edi
161 161 jne 1f
162 162 rdtscp
163 163 movl %ecx, %eax
164 164 ret
165 165 1:
166 166 mov $GETCPU_GDT_OFFSET, %eax
167 167 lsl %ax, %eax
168 168 ret
169 169 SET_SIZE(__cp_getcpu)
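
The lsl fallback works because the kernel encodes each CPU's id in the segment
limit of a per-CPU GDT entry (GDT_CPUID), and lsl may read a segment limit
from user mode without trapping. A minimal sketch, assuming only that the
selector passed in is the one built by GETCPU_GDT_OFFSET:

	#include <stdint.h>

	static inline uint32_t
	getcpu_lsl(uint32_t selector)
	{
		uint32_t cpu;

		/* Load the segment limit of the descriptor the selector names. */
		__asm__ __volatile__("lsl %1, %0" : "=r" (cpu) : "r" (selector));
		return (cpu);
	}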
170 170
171 171 /*
172 172 * hrtime_t
173 173 * __cp_gethrtime(comm_page_t *cp)
174 174 *
175 175 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
176 176 *
177 177 * %rsp+0x00 - hrtime_t tsc_last
178 178 * %rsp+0x08 - hrtime_t hrtime_base
179 179 * %rsp+0x10 - commpage_t *cp
180 180 * %rsp+0x18 - int hres_lock
181 181 */
182 182 ENTRY_NP(__cp_gethrtime)
183 183 subq $0x20, %rsp
184 184 movq %rdi, 0x10(%rsp)
185 185 1:
186 186 movl CP_HRES_LOCK(%rdi), %r9d
187 187 movl %r9d, 0x18(%rsp)
188 188
189 189 movq CP_TSC_LAST(%rdi), %rax
190 190 movq CP_TSC_HRTIME_BASE(%rdi), %rdx
191 191 movq %rax, (%rsp)
192 192 movq %rdx, 0x8(%rsp)
193 193
194 194 call __cp_tsc_read
195 195 movq 0x10(%rsp), %rdi
196 196
197 197 movl 0x18(%rsp), %r9d
198 198 movl CP_HRES_LOCK(%rdi), %edx
199 199 andl $0xfffffffe, %r9d
200 200 cmpl %r9d, %edx
201 201 jne 1b
202 202
203 203 /*
204 204 * The in-kernel logic for calculating hrtime performs several checks
205 205 * to protect against edge cases. That logic is summarized as:
206 206 * if (tsc >= tsc_last) {
207 207 * delta -= tsc_last;
208 208 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
209 209 * delta = 0;
210 210 * } else {
211 211 * delta = MIN(tsc, tsc_resume_cap);
212 212 * }
213 213 *
214 214 * The below implementation achieves the same result, although it is
215 215 * structured for speed and optimized for the fast path:
216 216 *
217 217 * delta = tsc - tsc_last;
218 218 * if (delta < 0) {
219 219 * delta += (tsc_max_delta << 1);
220 220 * if (delta >= 0) {
221 221 * delta = 0;
222 222 * } else {
223 223 * delta = MIN(tsc, tsc_resume_cap);
224 224 * }
225 225 * }
226 226 */
227 227 movq (%rsp), %rdx
228 228 subq %rdx, %rax /* delta = tsc - tsc_last */
229 229 jbe 3f /* if (delta < 0) */
230 230
231 231 2:
232 232 /*
233 233 * Optimized TSC_CONVERT_AND_ADD:
234 234 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
235 235 *
236 236 * Since the multiply and shift are done in 128-bit, there is no need
237 237 * to worry about overflow.
238 238 */
239 239 movl CP_NSEC_SCALE(%rdi), %ecx
240 240 mulq %rcx
241 241 shrdq $_CONST(32 - NSEC_SHIFT), %rdx, %rax
242 242 movq 0x8(%rsp), %r8
243 243 addq %r8, %rax
244 244
245 245 addq $0x20, %rsp
246 246 ret
247 247
248 248 3:
249 249 movq %rax, %r9 /* save (tsc - tsc_last) in r9 */
250 250 movl CP_TSC_MAX_DELTA(%rdi), %ecx
251 251 sall $1, %ecx
252 252 addq %rcx, %rax /* delta += (tsc_max_delta << 1) */
253 253 jae 4f /* delta < 0 */
254 254 xorq %rax, %rax
255 255 jmp 2b
256 256
257 257 4:
258 258 /*
259 259 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
260 260 * (which holds tsc - tsc_last)
261 261 */
262 262 movq (%rsp), %rax
263 263 addq %r9, %rax
264 264
265 265 /* delta = MIN(tsc, resume_cap) */
266 266 movq CP_TSC_RESUME_CAP(%rdi), %rcx
267 267 cmpq %rcx, %rax
268 268 jbe 5f
269 269 movq %rcx, %rax
270 270 5:
271 271 jmp 2b
272 272
273 273 SET_SIZE(__cp_gethrtime)
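
Modeled in C, the delta handling and the optimized TSC_CONVERT_AND_ADD above
look roughly like the following (a sketch only: the unsigned __int128 multiply
stands in for the mulq/shrdq pair, and the parameters mirror the comm page
fields loaded from %rdi):

	#include <stdint.h>

	#define	NSEC_SHIFT	5

	static inline int64_t
	cp_tsc_to_hrtime(uint64_t tsc, uint64_t tsc_last,
	    uint32_t tsc_max_delta, uint64_t tsc_resume_cap,
	    uint32_t nsec_scale, int64_t hrtime_base)
	{
		int64_t delta = (int64_t)(tsc - tsc_last);

		if (delta < 0) {
			delta += (int64_t)tsc_max_delta << 1;
			if (delta >= 0) {
				/* small backwards jitter: treat as no motion */
				delta = 0;
			} else {
				/* likely a resume: cap the apparent jump */
				delta = (int64_t)((tsc < tsc_resume_cap) ?
				    tsc : tsc_resume_cap);
			}
		}
		return (hrtime_base + (int64_t)
		    (((unsigned __int128)delta * nsec_scale) >>
		    (32 - NSEC_SHIFT)));
	}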
274 274
275 275 /*
276 276 * int
277 277 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
278 278 *
279 279 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
280 280 *
281 281 * %rsp+0x00 - timespec_t *tsp
282 282 */
283 283 ENTRY_NP(__cp_clock_gettime_monotonic)
284 284 subq $0x8, %rsp
285 285 movq %rsi, (%rsp)
286 286
287 287 call __cp_gethrtime
288 288
289 289 /*
290 290 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
291 291 * This uses the same approach as hrt2ts, although it has been updated
292 292 * to utilize 64-bit math.
293 293 * 1 / 1,000,000,000 =
294 294 * 1000100101110000010111110100000100110110101101001010010110011B-26
295 295 * = 0x112e0be826d694b3 * 2^-26
296 296 *
297 297 * secs = (nsecs * 0x112e0be826d694b3) >> 26
298 298 *
299 299 * In order to account for the 2s-complement of negative inputs, a
300 300 * final operation completes the process:
301 301 *
302 302 * secs -= (nsecs >> 63)
303 303 */
304 304 movq %rax, %r11
305 305 movq $0x112e0be826d694b3, %rdx
306 306 imulq %rdx
307 307 sarq $0x1a, %rdx
308 308 movq %r11, %rax
309 309 sarq $0x3f, %rax
310 310 subq %rax, %rdx
311 311 movq (%rsp), %rsi
312 312 movq %rdx, (%rsi)
313 313 /*
314 314 * Populating tv_nsec is easier:
315 315 * tv_nsec = nsecs - (secs * NANOSEC)
316 316 */
317 317 imulq $NANOSEC, %rdx, %rdx
318 318 subq %rdx, %r11
319 319 movq %r11, 0x8(%rsi)
320 320
321 321 xorl %eax, %eax
322 322 addq $0x8, %rsp
323 323 ret
324 324 SET_SIZE(__cp_clock_gettime_monotonic)
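
A note on the constant: imulq leaves the high 64 bits of the 128-bit product
in %rdx, so the sarq of 26 amounts to a 90-bit shift in total, and
0x112e0be826d694b3 is ceil(2^90 / 10^9). The conversion can be modeled in C as
(hypothetical helper name):

	#include <stdint.h>

	#define	NANOSEC	1000000000LL

	static inline void
	hrt2ts64(int64_t nsecs, int64_t *secp, int64_t *nsecp)
	{
		/* high 64 bits of the product, then >> 26: 90 bits total */
		int64_t secs = (int64_t)
		    (((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);

		secs -= (nsecs >> 63);	/* fix truncation for negative input */
		*secp = secs;
		*nsecp = nsecs - secs * NANOSEC;
	}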
325 325
326 326 /*
327 327 * int
328 328 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
329 329 *
330 330 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
331 331 *
332 332 * %rsp+0x00 - commpage_t *cp
333 333 * %rsp+0x08 - timespec_t *tsp
334 334 * %rsp+0x10 - int hres_lock
335 335 */
336 336 ENTRY_NP(__cp_clock_gettime_realtime)
337 337 subq $0x18, %rsp
338 338 movq %rdi, (%rsp)
339 339 movq %rsi, 0x8(%rsp)
340 340
341 341 1:
342 342 movl CP_HRES_LOCK(%rdi), %eax
343 343 movl %eax, 0x10(%rsp)
344 344
345 345 call __cp_gethrtime
346 346 movq (%rsp), %rdi
347 347 movq CP_HRES_LAST_TICK(%rdi), %rdx
348 348 subq %rdx, %rax /* nslt = hrtime - last_tick */
349 349 jb 1b
350 350 movq CP_HRESTIME(%rdi), %r9
351 351 movq _CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
352 352 movl CP_HRESTIME_ADJ(%rdi), %r11d
353 353
354 354 addq %rax, %r10 /* now.tv_nsec += nslt */
355 355
356 356 cmpl $0, %r11d
357 357 jg 4f /* hres_adj > 0 */
358 358 jl 6f /* hres_adj < 0 */
359 359
360 360 2:
361 361 cmpq $NANOSEC, %r10
362 362 jae 8f /* tv_nsec >= NANOSEC */
363 363
364 364 3:
365 365 movl 0x10(%rsp), %eax
366 366 movl CP_HRES_LOCK(%rdi), %edx
367 367 andl $0xfffffffe, %edx
368 368 cmpl %eax, %edx
369 369 jne 1b
370 370
371 371 movq 0x8(%rsp), %rsi
372 372 movq %r9, (%rsi)
373 373 movq %r10, 0x8(%rsi)
374 374
375 375 xorl %eax, %eax
376 376 addq $0x18, %rsp
377 377 ret
378 378
379 379
380 380 4: /* hres_adj > 0 */
381 381 sarq $ADJ_SHIFT, %rax
382 382 cmpl %r11d, %eax
383 383 jbe 5f
384 384 movl %r11d, %eax
385 385 5:
386 386 addq %rax, %r10
387 387 jmp 2b
388 388
389 389 6: /* hres_adj < 0 */
390 390 sarq $ADJ_SHIFT, %rax
391 391 negl %r11d
392 392 cmpl %r11d, %eax
393 393 jbe 7f
394 394 movl %r11d, %eax
395 395 7:
396 396 subq %rax, %r10
397 397 jmp 2b
398 398
399 399 8: /* tv_nsec >= NANOSEC */
400 400 subq $NANOSEC, %r10
401 401 incq %r9
402 402 cmpq $NANOSEC, %r10
403 403 jae 8b
404 404 jmp 3b
405 405
406 406 SET_SIZE(__cp_clock_gettime_realtime)
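
Finally, the hrestime adjustment above (labels 4 through 8) can be summarized
in C: per call, at most nslt >> ADJ_SHIFT nanoseconds of the outstanding
hrestime_adj are applied, in whichever direction, and the trailing loop
carries tv_nsec overflow into tv_sec. A rough model under those assumptions:

	#include <stdint.h>

	#define	NANOSEC		1000000000LL
	#define	ADJ_SHIFT	4

	static inline void
	cp_hrestime_compose(int64_t nslt, int64_t hres_adj,
	    int64_t *tv_sec, int64_t *tv_nsec)
	{
		*tv_nsec += nslt;		/* now.tv_nsec += nslt */

		if (hres_adj > 0) {
			int64_t adj = nslt >> ADJ_SHIFT;
			if (adj > hres_adj)
				adj = hres_adj;
			*tv_nsec += adj;
		} else if (hres_adj < 0) {
			int64_t adj = nslt >> ADJ_SHIFT;
			if (adj > -hres_adj)
				adj = -hres_adj;
			*tv_nsec -= adj;
		}

		while (*tv_nsec >= NANOSEC) {	/* the loop at label 8 */
			*tv_nsec -= NANOSEC;
			(*tv_sec)++;
		}
	}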