--- old/usr/src/lib/commpage/amd64/cp_subr.s
+++ new/usr/src/lib/commpage/amd64/cp_subr.s
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2016 Joyent, Inc.
14 14 */
15 15
16 16 #include <sys/asm_linkage.h>
17 17 #include <sys/segments.h>
18 18 #include <sys/time_impl.h>
19 19 #include <sys/tsc.h>
20 20 #include <cp_offsets.h>
21 21
22 22 #define GETCPU_GDT_OFFSET SEL_GDT(GDT_CPUID, SEL_UPL)
23 23
24 24 .file "cp_subr.s"
25 25
26 26 /*
27 27 * These are cloned from the TSC and time-related code in the kernel. They
28 28 * should be kept in sync if those source values change.
29 29 * See: uts/i86pc/os/timestamp.c
30 30 */
31 31 #define NSEC_SHIFT 5
32 32 #define ADJ_SHIFT 4
33 33 #define NANOSEC 0x3b9aca00
34 34
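
For context, NSEC_SHIFT only works in concert with the cp_nsec_scale value the
kernel derives from the TSC frequency. A minimal C sketch of that derivation,
following uts/i86pc/os/timestamp.c (the helper name is illustrative):

	#include <stdint.h>

	#define	NANOSEC		1000000000ULL
	#define	NSEC_SHIFT	5

	/*
	 * nsec_scale is picked so that
	 *	nsec = (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 * e.g. a 1 GHz TSC yields nsec_scale == 2^27, making the
	 * conversion an exact no-op.
	 */
	static inline uint32_t
	nsec_scale_for(uint64_t tsc_hz)
	{
		return ((uint32_t)((NANOSEC << (32 - NSEC_SHIFT)) / tsc_hz));
	}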
35 35 /*
36 36 * hrtime_t
37 37 * __cp_tsc_read(comm_page_t *cp)
38 38 *
39 39 * Stack usage: 0 bytes
40 40 */
41 41 ENTRY_NP(__cp_tsc_read)
42 42 movl CP_TSC_TYPE(%rdi), %esi
43 43 movl CP_TSC_NCPU(%rdi), %r8d
44 44 leaq CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
45 45
46 46 cmpl $TSC_TSCP, %esi
47 47 jne 2f
48 48 rdtscp
49 49 /*
50 50 * When the TSC is read, the low 32 bits are placed in %eax while the
51 51 * high 32 bits are placed in %edx. They are shifted and ORed together
52 52 * to obtain the full 64-bit value.
53 53 */
54 54 shlq $0x20, %rdx
55 55 orq %rdx, %rax
56 56 cmpl $0, %r8d
57 57 jne 1f
58 58 ret
59 59 1:
60 60 /*
61 61 * When cp_tsc_ncpu is non-zero, it indicates the length of the
62 62 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
63 63 * TSC. The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
64 64 * is used to look up an offset value in that array and apply it to the
65 65 * TSC reading.
66 66 */
67 67 movq (%r9, %rcx, 8), %rdx
68 68 addq %rdx, %rax
69 69 ret
70 70
71 71 2:
72 72 /*
73 73 * Without rdtscp, there is no way to perform a TSC reading and
74 74 * simultaneously query the current CPU. If tsc_ncpu indicates that
75 75 * per-CPU TSC offsets are present, the ID of the current CPU is
76 76 * queried before performing a TSC reading. It will later be compared
77 77 * to a second CPU ID lookup to catch CPU migrations.
78 78 *
79 79 * This method will catch all but the most pathological scheduling.
80 80 */
81 81 cmpl $0, %r8d
82 82 je 3f
83 83 movl $GETCPU_GDT_OFFSET, %edx
84 84 lsl %dx, %edx
85 85
86 86 3:
87 87 /* Save the most recently queried CPU ID for later comparison. */
88 88 movl %edx, %r10d
89 89
90 90 cmpl $TSC_RDTSC_MFENCE, %esi
91 91 jne 4f
92 92 mfence
93 93 rdtsc
94 94 jmp 7f
95 95
96 96 4:
97 97 cmpl $TSC_RDTSC_LFENCE, %esi
98 98 jne 5f
99 99 lfence
100 100 rdtsc
101 101 jmp 7f
102 102
103 103 5:
104 104 cmpl $TSC_RDTSC_CPUID, %esi
105 105 jne 6f
106 106 /*
107 107 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
108 108 * preserved here. Its contents will be overwritten when cpuid is used
109 109 * as a serializing instruction.
110 110 */
111 111 movq %rbx, %r11
112 112 xorl %eax, %eax
113 113 cpuid
114 114 rdtsc
115 115 movq %r11, %rbx
116 116 jmp 7f
117 117
118 118 6:
119 119 /*
120 120 * Other protections should have prevented this function from being
121 121 * called in the first place. The only sane action is to abort.
122 122 * The easiest means in this context is via SIGILL.
123 123 */
124 124 ud2a
125 125
126 126 7:
127 127 shlq $0x20, %rdx
128 128 orq %rdx, %rax
129 129
130 130 /*
131 131 * Query the current CPU again if a per-CPU offset is being applied to
132 132 * the TSC reading. If the result differs from the earlier reading,
133 133 * then a migration has occurred and the TSC must be read again.
134 134 */
135 135 cmpl $0, %r8d
136 136 je 8f
137 137 movl $GETCPU_GDT_OFFSET, %edx
138 138 lsl %dx, %edx
139 139 cmpl %edx, %r10d
140 140 jne 3b
141 141 movq (%r9, %rdx, 8), %rdx
142 142 addq %rdx, %rax
143 143 8:
144 144 ret
145 145 SET_SIZE(__cp_tsc_read)
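
For readers following along, the rdtscp fast path of __cp_tsc_read corresponds
roughly to this C sketch (a hypothetical helper; __rdtscp is the GCC/Clang
intrinsic, and the two parameters stand in for the CP_TSC_NCPU and
CP_TSC_SYNC_TICK_DELTA fields read above):

	#include <stdint.h>
	#include <x86intrin.h>

	static inline int64_t
	cp_tsc_read_tscp(uint32_t tsc_ncpu, const int64_t *tick_delta)
	{
		unsigned int cpu;
		/* rdtscp reads the TSC and IA32_TSC_AUX (CPU id) together */
		uint64_t tsc = __rdtscp(&cpu);

		if (tsc_ncpu != 0)	/* per-CPU deltas are present */
			tsc += (uint64_t)tick_delta[cpu];
		return ((int64_t)tsc);
	}

The non-TSCP variants (mfence, lfence, cpuid) differ only in how they
serialize rdtsc; the double CPU-ID lookup around them substitutes for the
atomicity that rdtscp provides.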
146 146
147 147
148 148 /*
149 149 * uint_t
150 150 * __cp_getcpu(comm_page_t *)
151 151 *
152 152 * Stack usage: 0 bytes
153 153 */
154 154 ENTRY_NP(__cp_getcpu)
155 155 movl CP_TSC_TYPE(%rdi), %edi
156 156 /*
157 157 * If RDTSCP is available, it is a quick way to grab the cpu_id which
158 158 * is stored in the TSC_AUX MSR by the kernel.
159 159 */
160 160 cmpl $TSC_TSCP, %edi
161 161 jne 1f
162 162 rdtscp
163 163 movl %ecx, %eax
164 164 ret
165 165 1:
166 166 mov $GETCPU_GDT_OFFSET, %eax
167 167 lsl %ax, %eax
168 168 ret
169 169 SET_SIZE(__cp_getcpu)
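
The lsl fallback works because the kernel encodes each CPU's id in the segment
limit of a per-CPU GDT entry (GDT_CPUID), and lsl may read a segment limit
from user mode without trapping. A minimal sketch, assuming only that the
selector passed in is the one built by GETCPU_GDT_OFFSET:

	#include <stdint.h>

	static inline uint32_t
	getcpu_lsl(uint32_t selector)
	{
		uint32_t cpu;

		/* Load the segment limit of the descriptor the selector names. */
		__asm__ __volatile__("lsl %1, %0" : "=r" (cpu) : "r" (selector));
		return (cpu);
	}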
170 170
171 171 /*
172 172 * hrtime_t
173 173 * __cp_gethrtime(comm_page_t *cp)
174 174 *
175 175 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
176 176 *
177 177 * %rsp+0x00 - hrtime_t tsc_last
178 178 * %rsp+0x08 - hrtime_t hrtime_base
179 179 * %rsp+0x10 - commpage_t *cp
180 180 * %rsp+0x18 - int hres_lock
181 181 */
182 182 ENTRY_NP(__cp_gethrtime)
183 183 subq $0x20, %rsp
184 184 movq %rdi, 0x10(%rsp)
185 185 1:
186 186 movl CP_HRES_LOCK(%rdi), %r9d
187 187 movl %r9d, 0x18(%rsp)
188 188
189 189 movq CP_TSC_LAST(%rdi), %rax
190 190 movq CP_TSC_HRTIME_BASE(%rdi), %rdx
191 191 movq %rax, (%rsp)
192 192 movq %rdx, 0x8(%rsp)
193 193
194 194 call __cp_tsc_read
195 195 movq 0x10(%rsp), %rdi
196 196
197 197 movl 0x18(%rsp), %r9d
198 198 movl CP_HRES_LOCK(%rdi), %edx
199 199 andl $0xfffffffe, %r9d
200 200 cmpl %r9d, %edx
201 201 jne 1b
202 202
203 203 /*
204 204 * The in-kernel logic for calculating hrtime performs several checks
205 205 * to protect against edge cases. That logic is summarized as:
206 206 * if (tsc >= tsc_last) {
207 207 * delta -= tsc_last;
208 208 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
209 209 * delta = 0;
210 210 * } else {
211 211 * delta = MIN(tsc, tsc_resume_cap);
212 212 * }
213 213 *
214 214 * The below implementation achieves the same result, although it is
215 215 * structured for speed and optimized for the fast path:
216 216 *
217 217 * delta = tsc - tsc_last;
218 218 * if (delta < 0) {
219 219 * delta += (tsc_max_delta << 1);
220 220 * if (delta >= 0) {
221 221 * delta = 0;
222 222 * } else {
223 223 * delta = MIN(tsc, tsc_resume_cap);
224 224 * }
225 225 * }
226 226 */
227 227 movq (%rsp), %rdx
228 228 subq %rdx, %rax /* delta = tsc - tsc_last */
229 229 jbe 3f /* if (delta < 0) */
230 230
231 231 2:
232 232 /*
233 233 * Optimized TSC_CONVERT_AND_ADD:
234 234 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
235 235 *
236 236 * Since the multiply and shift are done in 128-bit, there is no need
237 237 * to worry about overflow.
238 238 */
239 239 movl CP_NSEC_SCALE(%rdi), %ecx
240 240 mulq %rcx
241 241 shrdq $_CONST(32 - NSEC_SHIFT), %rdx, %rax
242 242 movq 0x8(%rsp), %r8
243 243 addq %r8, %rax
244 244
245 245 addq $0x20, %rsp
246 246 ret
247 247
248 248 3:
249 249 movq %rax, %r9 /* save (tsc - tsc_last) in r9 */
250 250 movl CP_TSC_MAX_DELTA(%rdi), %ecx
251 251 sall $1, %ecx
252 252 addq %rcx, %rax /* delta += (tsc_max_delta << 1) */
253 253 jae 4f /* delta < 0 */
254 254 xorq %rax, %rax
255 255 jmp 2b
256 256
257 257 4:
258 258 /*
259 259 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
260 260 * (which holds tsc - tsc_last)
261 261 */
262 262 movq (%rsp), %rax
263 263 addq %r9, %rax
264 264
265 265 /* delta = MIN(tsc, resume_cap) */
266 266 movq CP_TSC_RESUME_CAP(%rdi), %rcx
267 267 cmpq %rcx, %rax
268 268 jbe 5f
269 269 movq %rcx, %rax
270 270 5:
271 271 jmp 2b
272 272
273 273 SET_SIZE(__cp_gethrtime)
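
Modeled in C, the delta handling and the optimized TSC_CONVERT_AND_ADD above
look roughly like the following (a sketch only: the unsigned __int128 multiply
stands in for the mulq/shrdq pair, and the parameters mirror the comm page
fields loaded from %rdi):

	#include <stdint.h>

	#define	NSEC_SHIFT	5

	static inline int64_t
	cp_tsc_to_hrtime(uint64_t tsc, uint64_t tsc_last,
	    uint32_t tsc_max_delta, uint64_t tsc_resume_cap,
	    uint32_t nsec_scale, int64_t hrtime_base)
	{
		int64_t delta = (int64_t)(tsc - tsc_last);

		if (delta < 0) {
			delta += (int64_t)tsc_max_delta << 1;
			if (delta >= 0) {
				/* small backwards jitter: treat as no motion */
				delta = 0;
			} else {
				/* likely a resume: cap the apparent jump */
				delta = (int64_t)((tsc < tsc_resume_cap) ?
				    tsc : tsc_resume_cap);
			}
		}
		return (hrtime_base + (int64_t)
		    (((unsigned __int128)delta * nsec_scale) >>
		    (32 - NSEC_SHIFT)));
	}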
274 274
275 275 /*
276 276 * int
277 277 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
278 278 *
279 279 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
280 280 *
281 281 * %rsp+0x00 - timespec_t *tsp
282 282 */
283 283 ENTRY_NP(__cp_clock_gettime_monotonic)
284 284 subq $0x8, %rsp
285 285 movq %rsi, (%rsp)
286 286
287 287 call __cp_gethrtime
288 288
289 289 /*
290 290 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
291 291 * This uses the same approach as hrt2ts, although it has been updated
292 292 * to utilize 64-bit math.
293 293 * 1 / 1,000,000,000 =
294 294 * 1000100101110000010111110100000100110110101101001010010110011B-26
295 295 * = 0x112e0be826d694b3 * 2^-26
296 296 *
297 297 * secs = (nsecs * 0x112e0be826d694b3) >> 26
298 298 *
299 299 * In order to account for the 2s-complement of negative inputs, a
300 300 * final operation completes the process:
301 301 *
302 302 * secs -= (nsecs >> 63)
303 303 */
304 304 movq %rax, %r11
305 305 movq $0x112e0be826d694b3, %rdx
306 306 imulq %rdx
307 307 sarq $0x1a, %rdx
308 308 movq %r11, %rax
309 309 sarq $0x3f, %rax
310 310 subq %rax, %rdx
311 311 movq (%rsp), %rsi
312 312 movq %rdx, (%rsi)
313 313 /*
314 314 * Populating tv_nsec is easier:
315 315 * tv_nsec = nsecs - (secs * NANOSEC)
316 316 */
317 317 imulq $NANOSEC, %rdx, %rdx
318 318 subq %rdx, %r11
319 319 movq %r11, 0x8(%rsi)
320 320
321 321 xorl %eax, %eax
322 322 addq $0x8, %rsp
323 323 ret
324 324 SET_SIZE(__cp_clock_gettime_monotonic)
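
A note on the constant: imulq leaves the high 64 bits of the 128-bit product
in %rdx, so the sarq of 26 amounts to a 90-bit shift in total, and
0x112e0be826d694b3 is ceil(2^90 / 10^9). The conversion can be modeled in C as
(hypothetical helper name):

	#include <stdint.h>

	#define	NANOSEC	1000000000LL

	static inline void
	hrt2ts64(int64_t nsecs, int64_t *secp, int64_t *nsecp)
	{
		/* high 64 bits of the product, then >> 26: 90 bits total */
		int64_t secs = (int64_t)
		    (((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);

		secs -= (nsecs >> 63);	/* fix truncation for negative input */
		*secp = secs;
		*nsecp = nsecs - secs * NANOSEC;
	}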
325 325
326 326 /*
327 327 * int
328 328 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
329 329 *
330 330 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
331 331 *
332 332 * %rsp+0x00 - commpage_t *cp
333 333 * %rsp+0x08 - timespec_t *tsp
334 334 * %rsp+0x10 - int hres_lock
335 335 */
336 336 ENTRY_NP(__cp_clock_gettime_realtime)
337 337 subq $0x18, %rsp
338 338 movq %rdi, (%rsp)
339 339 movq %rsi, 0x8(%rsp)
340 340
341 341 1:
342 342 movl CP_HRES_LOCK(%rdi), %eax
343 343 movl %eax, 0x10(%rsp)
344 344
345 345 call __cp_gethrtime
346 346 movq (%rsp), %rdi
347 347 movq CP_HRES_LAST_TICK(%rdi), %rdx
348 348 subq %rdx, %rax /* nslt = hrtime - last_tick */
349 349 jb 1b
350 350 movq CP_HRESTIME(%rdi), %r9
351 351 movq _CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
352 352 movl CP_HRESTIME_ADJ(%rdi), %r11d
353 353
354 354 addq %rax, %r10 /* now.tv_nsec += nslt */
355 355
356 356 cmpl $0, %r11d
357 357 jg 4f /* hres_adj > 0 */
358 358 jl 6f /* hres_adj < 0 */
359 359
360 360 2:
361 361 cmpq $NANOSEC, %r10
362 362 jae 8f /* tv_nsec >= NANOSEC */
363 363
364 364 3:
365 365 movl 0x10(%rsp), %eax
366 366 movl CP_HRES_LOCK(%rdi), %edx
367 367 andl $0xfffffffe, %edx
368 368 cmpl %eax, %edx
369 369 jne 1b
370 370
371 371 movq 0x8(%rsp), %rsi
372 372 movq %r9, (%rsi)
373 373 movq %r10, 0x8(%rsi)
374 374
375 375 xorl %eax, %eax
376 376 addq $0x18, %rsp
377 377 ret
378 378
379 379
380 380 4: /* hres_adj > 0 */
381 381 sarq $ADJ_SHIFT, %rax
382 382 cmpl %r11d, %eax
383 383 jbe 5f
384 384 movl %r11d, %eax
385 385 5:
386 386 addq %rax, %r10
387 387 jmp 2b
388 388
389 389 6: /* hres_adj < 0 */
390 390 sarq $ADJ_SHIFT, %rax
391 391 negl %r11d
392 392 cmpl %r11d, %eax
393 393 jbe 7f
394 394 movl %r11d, %eax
395 395 7:
396 396 subq %rax, %r10
397 397 jmp 2b
398 398
399 399 8: /* tv_nsec >= NANOSEC */
400 400 subq $NANOSEC, %r10
401 401 incq %r9
402 402 cmpq $NANOSEC, %r10
403 403 jae 8b
404 404 jmp 3b
405 405
406 406 SET_SIZE(__cp_clock_gettime_realtime)
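
Finally, the hrestime adjustment above (labels 4 through 8) can be summarized
in C: per call, at most nslt >> ADJ_SHIFT nanoseconds of the outstanding
hrestime_adj are applied, in whichever direction, and the trailing loop
carries tv_nsec overflow into tv_sec. A rough model under those assumptions:

	#include <stdint.h>

	#define	NANOSEC		1000000000LL
	#define	ADJ_SHIFT	4

	static inline void
	cp_hrestime_compose(int64_t nslt, int64_t hres_adj,
	    int64_t *tv_sec, int64_t *tv_nsec)
	{
		*tv_nsec += nslt;		/* now.tv_nsec += nslt */

		if (hres_adj > 0) {
			int64_t adj = nslt >> ADJ_SHIFT;
			if (adj > hres_adj)
				adj = hres_adj;
			*tv_nsec += adj;
		} else if (hres_adj < 0) {
			int64_t adj = nslt >> ADJ_SHIFT;
			if (adj > -hres_adj)
				adj = -hres_adj;
			*tv_nsec -= adj;
		}

		while (*tv_nsec >= NANOSEC) {	/* the loop at label 8 */
			*tv_nsec -= NANOSEC;
			(*tv_sec)++;
		}
	}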