/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)
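
/*
 * The kernel publishes the ID of the running CPU in the segment limit of
 * the GDT_CPUID descriptor, which unprivileged code can fetch with the
 * lsl instruction. As a rough C sketch (the asm statement here is
 * illustrative, not part of this file):
 *
 *	uint_t
 *	getcpu_via_lsl(void)
 *	{
 *		uint_t sel = GETCPU_GDT_OFFSET, cpu;
 *		__asm__ volatile ("lsl %1, %0" : "=r" (cpu) : "r" (sel));
 *		return (cpu);
 *	}
 */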

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time-related code in the kernel. They must
 * be kept in sync should the source values change.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00
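
/*
 * NSEC_SHIFT sets the fixed-point scale used for TSC-to-nanosecond
 * conversion: the kernel derives nsec_scale such that
 *
 *	hrtime = (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
 *
 * which matches the TSC_CONVERT_AND_ADD logic replicated later in this
 * file.
 */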

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx. They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax
	cmpl	$0, %r8d
	jne	1f
	ret
1:
	/*
	 * When cp_tsc_ncpu is non-zero, it indicates the length of the
	 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
	 * TSC. The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
	 * is used to look up an offset value in that array and apply it to the
	 * TSC reading.
	 */
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret
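
	/*
	 * Expressed in C, the rdtscp path above is roughly the following
	 * (with __rdtscp standing in for the instruction, which also
	 * delivers IA32_TSC_AUX into cpu):
	 *
	 *	uint32_t cpu;
	 *	uint64_t tsc = __rdtscp(&cpu);
	 *	if (cp->cp_tsc_ncpu == 0)
	 *		return (tsc);
	 *	return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
	 */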

2:
	/*
	 * Without rdtscp, there is no way to perform a TSC reading and
	 * simultaneously query the current CPU. If tsc_ncpu indicates that
	 * per-CPU TSC offsets are present, the ID of the current CPU is
	 * queried before performing a TSC reading. It will be later compared
	 * to a second CPU ID lookup to catch CPU migrations.
	 *
	 * This method will catch all but the most pathological scheduling.
	 */
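	/*
	 * When per-CPU deltas are in use, the check below is roughly
	 * equivalent to this C-style pseudocode (getcpu and fenced_rdtsc
	 * being stand-ins for the instruction sequences used here):
	 *
	 *	do {
	 *		cpu = getcpu();
	 *		tsc = fenced_rdtsc();
	 *	} while (cpu != getcpu());
	 *	tsc += cp->cp_tsc_sync_tick_delta[cpu];
	 */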
	cmpl	$0, %r8d
	je	3f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx

3:
	/* Save the most recently queried CPU ID for later comparison. */
	movl	%edx, %r10d

	cmpl	$TSC_RDTSC_MFENCE, %esi
	jne	4f
	mfence
	rdtsc
	jmp	7f

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here. Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place. The only sane action is to abort.
	 * The easiest means in this context is via SIGILL.
	 */
	ud2a

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * Query the current CPU again if a per-CPU offset is being applied to
	 * the TSC reading. If the result differs from the earlier reading,
	 * then a migration has occurred and the TSC must be read again.
	 */
	cmpl	$0, %r8d
	je	8f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	cmpl	%edx, %r10d
	jne	3b
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
8:
	ret
	SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	movl	$GETCPU_GDT_OFFSET, %eax
	lsl	%ax, %eax
	ret
	SET_SIZE(__cp_getcpu)
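
/*
 * In C terms, __cp_getcpu amounts to the following sketch (__rdtscp and
 * segment_limit are illustrative stand-ins for the instructions used
 * above):
 *
 *	uint_t
 *	cp_getcpu(comm_page_t *cp)
 *	{
 *		uint32_t cpu;
 *		if (cp->cp_tsc_type == TSC_TSCP) {
 *			(void) __rdtscp(&cpu);
 *			return (cpu);
 *		}
 *		return (segment_limit(GETCPU_GDT_OFFSET));
 *	}
 */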

/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - comm_page_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
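	/*
	 * Snapshot cp_hres_lock before loading tsc_last and hrtime_base.
	 * The kernel increments the lock around updates (leaving it odd
	 * while an update is in flight), so the loaded values are only
	 * used if the saved lock value, with its low bit cleared, still
	 * matches the lock after the TSC has been read; otherwise retry.
	 */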
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read
	movq	0x10(%rsp), %rdi

	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases. That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *	delta = tsc - tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *	delta = 0;
	 * } else {
	 *	delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The implementation below achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *	delta += (tsc_max_delta << 1);
	 *	if (delta >= 0) {
	 *		delta = 0;
	 *	} else {
	 *		delta = MIN(tsc, tsc_resume_cap);
	 *	}
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta <= 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
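	/*
	 * As a worked example with a hypothetical 1GHz TSC: the kernel
	 * derives nsec_scale = (NANOSEC << (32 - NSEC_SHIFT)) / tsc_hz,
	 * which here is 2^27, so (tsc_delta * 2^27) >> 27 == tsc_delta,
	 * as expected when each TSC tick is one nanosecond.
	 */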
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta still < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

	SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 *	1000100101110000010111110100000100110110101101001010010110011B-26
	 *	= 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * In order to account for the 2s-complement of negative inputs, a
	 * final operation completes the process:
	 *
	 *	secs -= (nsecs >> 63)
	 */
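	/*
	 * One possible C rendition of the sequence below, using 128-bit
	 * math for the wide multiply:
	 *
	 *	int64_t secs =
	 *	    (int64_t)(((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);
	 *	secs -= (nsecs >> 63);
	 *	int64_t nsec_rem = nsecs - (secs * NANOSEC);
	 */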
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - comm_page_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jg	4f				/* hres_adj > 0 */
	jl	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %eax
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret

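	/*
	 * The skew applied below mirrors the in-kernel gethrestime() logic:
	 * a fraction of the time since the last tick, nslt >> ADJ_SHIFT, is
	 * applied to tv_nsec, clamped so that it never exceeds the magnitude
	 * of hres_adj. In C-style pseudocode:
	 *
	 *	adj = nslt >> ADJ_SHIFT;
	 *	if (hres_adj > 0)
	 *		now.tv_nsec += MIN(adj, hres_adj);
	 *	else
	 *		now.tv_nsec -= MIN(adj, -hres_adj);
	 */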
4:	/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:	/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:	/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)