Print this page
15254 %ymm registers not restored after signal handler
15367 x86 getfpregs() summons corrupting %xmm ghosts
15333 want x86 /proc xregs support (libc_db, libproc, mdb, etc.)
15336 want libc functions for extended ucontext_t
15334 want ps_lwphandle-specific reg routines
15328 FPU_CW_INIT mistreats reserved bit
15335 i86pc fpu_subr.c isn't really platform-specific
15332 setcontext(2) isn't actually noreturn
15331 need <sys/stdalign.h>
Change-Id: I7060aa86042dfb989f77fc3323c065ea2eafa9ad
Conflicts:
usr/src/uts/common/fs/proc/prcontrol.c
usr/src/uts/intel/os/archdep.c
usr/src/uts/intel/sys/ucontext.h
usr/src/uts/intel/syscall/getcontext.c
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/intel/os/fpu.c
+++ new/usr/src/uts/intel/os/fpu.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2021 Joyent, Inc.
24 24 * Copyright 2021 RackTop Systems, Inc.
25 - * Copyright 2022 Oxide Computer Company
25 + * Copyright 2023 Oxide Computer Company
26 26 */
27 27
28 28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
30 30 /* All Rights Reserved */
31 31
32 32 /* Copyright (c) 1987, 1988 Microsoft Corporation */
33 33 /* All Rights Reserved */
34 34
35 35 /*
36 36 * Copyright (c) 2009, Intel Corporation.
37 37 * All rights reserved.
38 38 */
39 39
40 40 #include <sys/types.h>
41 41 #include <sys/param.h>
42 42 #include <sys/signal.h>
43 43 #include <sys/regset.h>
44 44 #include <sys/privregs.h>
45 45 #include <sys/psw.h>
46 46 #include <sys/trap.h>
47 47 #include <sys/fault.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/user.h>
50 50 #include <sys/file.h>
51 51 #include <sys/proc.h>
52 52 #include <sys/pcb.h>
53 53 #include <sys/lwp.h>
54 54 #include <sys/cpuvar.h>
55 55 #include <sys/thread.h>
|
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
56 56 #include <sys/disp.h>
57 57 #include <sys/fp.h>
58 58 #include <sys/siginfo.h>
59 59 #include <sys/archsystm.h>
60 60 #include <sys/kmem.h>
61 61 #include <sys/debug.h>
62 62 #include <sys/x86_archext.h>
63 63 #include <sys/sysmacros.h>
64 64 #include <sys/cmn_err.h>
65 65 #include <sys/kfpu.h>
66 +#include <sys/stdbool.h>
67 +#include <sys/stdalign.h>
68 +#include <sys/procfs_isa.h>
69 +#include <sys/sunddi.h>
66 70
67 71 /*
68 72 * FPU Management Overview
69 73 * -----------------------
70 74 *
71 75 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
72 76 * however, many aspects of its life as a coprocessor are still around in x86.
73 77 *
74 78 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
75 79 * While that state still exists, there is much more that is covered by the FPU.
76 80 * Today, this includes not just traditional FPU state, but also supervisor only
77 81 * state. The following state is currently managed and covered logically by the
78 - * idea of the FPU registers:
82 + * idea of the FPU registers and more generally is called the Extended Processor
83 + * States:
79 84 *
80 85 * o Traditional x87 FPU
81 86 * o Vector Registers (%xmm, %ymm, %zmm)
82 87 * o Memory Protection Extensions (MPX) Bounds Registers
83 88 * o Protected Key Rights Registers (PKRU)
84 89 * o Processor Trace data
90 + * o Control-Flow Enforcement state
91 + * o Hardware Duty Cycle
92 + * o Hardware P-states
85 93 *
86 94 * The rest of this covers how the FPU is managed and controlled, how state is
87 95 * saved and restored between threads, interactions with hypervisors, and other
88 - * information exported to user land through aux vectors. A lot of background
96 + * information exported to userland through aux vectors. A lot of background
89 97 * information is here to synthesize major parts of the Intel SDM, but
90 98 * unfortunately, it is not a replacement for reading it.
91 99 *
92 100 * FPU Control Registers
93 101 * ---------------------
94 102 *
95 103 * Because the x87 FPU began its life as a co-processor and the FPU was
96 104 * optional there are several bits that show up in %cr0 that we have to
97 105 * manipulate when dealing with the FPU. These are:
98 106 *
99 107 * o CR0.ET The 'extension type' bit. This was used originally to indicate
100 108 * that the FPU co-processor was present. Now it is forced on for
101 109 * compatibility. This is often used to verify whether or not the
102 110 * FPU is present.
103 111 *
104 112 * o CR0.NE The 'native error' bit. Used to indicate that native error
105 113 * mode should be enabled. This indicates that we should take traps
106 114 * on FPU errors. The OS enables this early in boot.
107 115 *
108 116 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
109 117 * wait/fwait instructions generate a #NM if CR0.TS is set.
110 118 *
111 119 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
112 120 * operations (x87 through SSE4) to trap with a #UD so they can be
113 121 * emulated. The system never sets this bit, but makes sure it is
114 122 * clear on processor start up.
115 123 *
116 124 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
117 125 * point operation will generate a #NM. An fwait will as well,
118 126 * depending on the value in CR0.MP.
119 127 *
120 128 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
121 129 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
122 130 * complicated role. Historically it has been used to allow running systems to
123 131 * restore the FPU registers lazily. This will be discussed in greater depth
124 132 * later on.
125 133 *
126 134 * %cr4 is also used as part of the FPU control. Specifically we need to worry
127 135 * about the following bits in the system:
128 136 *
129 137 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
130 138 * supports the execution of the fxsave and fxrstor
131 139 * instructions. This bit is required to be set to enable
132 140 * the use of the SSE->SSE4 instructions.
133 141 *
134 142 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
135 143 * and take a SIMD floating point exception (#XM). This bit
136 144 * is always enabled by the system.
137 145 *
138 146 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
139 147 * supports the execution of the xsave and xrstor family of
140 148 * instructions. This bit is required to use any of the AVX
141 149 * and newer feature sets.
142 150 *
143 151 * Because all supported processors are 64-bit, they'll always support the XMM
144 152 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
145 153 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
146 154 *
147 155 * %xcr0 is used to manage the behavior of the xsave feature set and is only
148 156 * present on the system if xsave is supported. %xcr0 is read and written to
149 157 * through the xgetbv and xsetbv instructions. This register is present
150 158 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
151 159 * different component of the xsave state and controls whether or not that
152 160 * information is saved and restored. For newer feature sets like AVX and MPX,
153 161 * it also controls whether or not the corresponding instructions can be
154 162 * executed (much like CR4.OSFXSR does for the SSE feature sets).
155 163 *
156 164 * Everything in %xcr0 is around features available to users. There is also the
157 165 * IA32_XSS MSR which is used to control supervisor-only features that are still
158 166 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
159 167 * IA32_XSS and vice versa. This is an important property that is particularly
160 168 * relevant to how the xsave instructions operate.
161 169 *
162 170 * Save Mechanisms
163 171 * ---------------
164 172 *
165 173 * When switching between running threads the FPU state needs to be saved and
166 174 * restored by the OS. If this state was not saved, users would rightfully
167 175 * complain about corrupt state. There are three mechanisms that exist on the
168 176 * processor for saving and restoring these state images:
169 177 *
170 178 * o fsave
171 179 * o fxsave
172 180 * o xsave
173 181 *
174 182 * fsave saves and restores only the x87 FPU and is the oldest of these
175 183 * mechanisms. This mechanism is never used in the kernel today because we are
176 184 * always running on systems that support fxsave.
177 185 *
178 186 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
179 187 * state to be saved and restored to and from a struct fxsave_state. This is the
180 188 * default mechanism that is used to save and restore the FPU on amd64. An
181 189 * important aspect of fxsave that was different from the original i386 fsave
182 190 * mechanism is that the restoring of FPU state with pending exceptions will not
183 191 * generate an exception, it will be deferred to the next use of the FPU.
184 192 *
185 193 * The final and by far the most complex mechanism is that of the xsave set.
186 194 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
187 195 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
188 196 * registers.
189 197 *
190 198 * Data is saved and restored into and out of a struct xsave_state. The first
191 199 * part of the struct xsave_state is equivalent to the struct fxsave_state.
192 200 * After that, there is a header which is used to describe the remaining
193 201 * portions of the state. The header is a 64-byte value of which the first two
194 202 * uint64_t values are defined and the rest are reserved and must be zero. The
195 203 * first uint64_t is the xstate_bv member. This describes which values in the
196 204 * xsave_state are actually valid and present. This is updated on a save and
197 205 * used on restore. The second member is the xcomp_bv member. Its top bit (63)
198 206 * determines whether or not a compressed version of the structure is used.
199 207 *
200 208 * When the uncompressed structure is used (currently the only format we
201 209 * support), then each state component is at a fixed offset in the structure,
202 210 * even if it is not being used. For example, if you only saved the AVX related
203 211 * state, but did not save the MPX related state, the offset would not change
204 212 * for any component. With the compressed format, components that aren't used
205 213 * are all elided (though the x87 and SSE state are always there).
206 214 *
207 215 * Unlike fxsave which saves all state, the xsave family does not always save
208 216 * and restore all the state that could be covered by the xsave_state. The
209 217 * instructions all take an argument which is a mask of what to consider. This
210 218 * is the same mask that will be used in the xstate_bv vector and it is also the
211 219 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
212 220 * considered with the xsaves and xrstors instructions.
213 221 *
214 222 * When a save or restore is requested, a bitwise and is performed between the
215 223 * requested bits and those that have been enabled in %xcr0. Only the bits that
216 224 * match that are then saved or restored. Others will be silently ignored by
217 225 * the processor. This idea is used often in the OS. We will always request that
218 226 * we save and restore all of the state, but only those portions that are
219 227 * actually enabled in %xcr0 will be touched.
220 228 *
221 229 * If a feature has been asked to be restored that is not set in the xstate_bv
222 230 * feature vector of the save state, then it will be set to its initial state by
223 231 * the processor (usually zeros). Also, when asked to save state, the processor
224 232 * may not write out data that is in its initial state as an optimization. This
225 233 * optimization only applies to saving data and not to restoring data.
226 234 *
227 235 * There are a few different variants of the xsave and xrstor instruction. They
228 236 * are:
229 237 *
230 238 * o xsave This is the original save instruction. It will save all of the
231 239 * requested data in the xsave state structure. It only saves data
232 240 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
233 241 * executed at all privilege levels.
234 242 *
235 243 * o xrstor This is the original restore instruction. It will restore all of
236 244 * the requested data. The xrstor function can handle both the
237 245 * compressed and uncompressed formats. It may be executed at all
238 246 * privilege levels.
239 247 *
240 248 * o xsaveopt This is a variant of the xsave instruction that employs
241 249 * optimizations to try and only write out state that has been
242 250 * modified since the last time an xrstor instruction was called.
243 251 * The processor tracks a tuple of information about the last
244 252 * xrstor and tries to ensure that the same buffer is being used
245 253 * when this optimization is being used. However, because of the
246 254 * way that it tracks the xrstor buffer based on the address of it,
247 255 * it is not suitable for use if that buffer can be easily reused.
248 256 * The most common case is trying to save data to the stack in
249 257 * rtld. It may be executed at all privilege levels.
250 258 *
251 259 * o xsavec This is a variant of the xsave instruction that writes out the
252 260 * compressed form of the xsave_state. Otherwise it behaves as
253 261 * xsave. It may be executed at all privilege levels.
254 262 *
255 263 * o xsaves This is a variant of the xsave instruction. It is similar to
256 264 * xsavec in that it always writes the compressed form of the
257 265 * buffer. Unlike all the other forms, this instruction looks at
258 266 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
259 267 * what to save and restore. xsaves also implements the same
260 268 * optimization that xsaveopt does around modified pieces. User
261 269 * land may not execute the instruction.
262 270 *
263 271 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
264 272 * it can save and restore both the user and privileged states.
265 273 * Unlike xrstor it can only operate on the compressed form.
266 274 * User land may not execute the instruction.
267 275 *
268 276 * Based on all of these, the kernel has a precedence for what it will use.
269 277 * Basically, xsaves (not supported) is preferred to xsaveopt, which is
270 278 * preferred to xsave. A similar scheme is used when informing rtld (more later)
271 279 * about what it should use. xsavec is preferred to xsave. xsaveopt is not
272 280 * recommended due to the modified optimization not being appropriate for this
273 281 * use.
274 282 *
275 283 * Finally, there is one last gotcha with the xsave state. Importantly some AMD
276 284 * processors did not always save and restore some of the FPU exception state in
277 285 * some cases like Intel did. In those cases the OS will make up for this fact
278 286 * itself.
279 287 *
280 288 * FPU Initialization
281 289 * ------------------
282 290 *
283 291 * One difference with the FPU registers is that not all threads have FPU state,
284 292 * only those that have an lwp. Generally this means kernel threads, which all
285 293 * share p0 and its lwp, do not have FPU state. Though there are definitely
286 294 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
287 295 * and lwp interchangeably, just think of thread meaning a thread that has a
288 296 * lwp.
289 297 *
290 298 * Each lwp has its FPU state allocated in its pcb (process control block). The
291 299 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
292 300 * dynamically at start up based on the save mechanism that we're using and the
293 301 * amount of memory required for it. This is dynamic because the xsave_state
294 302 * size varies based on the supported feature set.
295 303 *
296 304 * The hardware side of the FPU is initialized early in boot before we mount the
297 305 * root file system. This is effectively done in fpu_probe(). This is where we
298 306 * make the final decision about what the save and restore mechanisms we should
299 307 * use are, create the fpsave_cachep kmem cache, and initialize a number of
300 308 * function pointers that use saving and restoring logic.
301 309 *
302 310 * The thread/lwp side is a little more involved. There are two different
303 311 * things that we need to concern ourselves with. The first is how the FPU
304 312 * resources are allocated and the second is how the FPU state is initialized
305 313 * for a given lwp.
306 314 *
307 315 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
308 316 * This is always called unconditionally by the system as part of creating an
309 317 * LWP.
310 318 *
311 319 * There are three different initialization paths that we deal with. The first
312 320 * is when we are executing a new process. As part of exec all of the register
313 321 * state is reset. The exec case is particularly important because init is born
314 322 * like Athena, sprouting from the head of the kernel, without any true parent
315 323 * to fork from. The second is used whenever we fork or create a new lwp. The
316 324 * third is to deal with special lwps like the agent lwp.
317 325 *
318 326 * During exec, we will call fp_exec() which will initialize and set up the FPU
319 327 * state for the process. That will fill in the initial state for the FPU and
320 328 * also set that state in the FPU itself. As part of fp_exec() we also install a
321 329 * thread context operations vector that takes care of dealing with the saving
322 330 * and restoring of the FPU. These context handlers will also be called whenever
323 331 * an lwp is created or forked. In those cases, to initialize the FPU we will
324 332 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
325 333 * operations vector for the new thread.
|
↓ open down ↓ |
227 lines elided |
↑ open up ↑ |
326 334 *
327 335 * Next we'll end up in the context operation fp_new_lwp(). This saves the
328 336 * current thread's state, initializes the new thread's state, and copies over
329 337 * the relevant parts of the originating thread's state. It's at this point that
330 338 * we also install the FPU context operations into the new thread, which ensures
331 339 * that all future threads that are descendants of the current one get the
332 340 * thread context operations (unless they call exec).
333 341 *
334 342 * To deal with some things like the agent lwp, we double check the state of the
335 343 * FPU in sys_rtt_common() to make sure that it has been enabled before
336 - * returning to user land. In general, this path should be rare, but it's useful
344 + * returning to userland. In general, this path should be rare, but it's useful
337 345 * for the odd lwp here and there.
338 346 *
339 347 * The FPU state will remain valid most of the time. There are times that
340 348 * the state will be rewritten. For example in restorecontext, due to /proc, or
341 349 * the lwp calls exec(). Whether the context is being freed or we are resetting
342 350 * the state, we will call fp_free() to disable the FPU and our context.
343 351 *
344 352 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
345 353 * state by calling fp_lwp_cleanup().
346 354 *
347 355 * Kernel FPU Multiplexing
348 356 * -----------------------
349 357 *
350 358 * Just as the kernel has to maintain all of the general purpose registers when
351 359 * switching between scheduled threads, the same is true of the FPU registers.
352 360 *
353 361 * When a thread has FPU state, it also has a set of context operations
354 362 * installed. These context operations take care of making sure that the FPU is
|
↓ open down ↓ |
8 lines elided |
↑ open up ↑ |
355 363 * properly saved and restored during a context switch (fpsave_ctxt and
356 364 * fprestore_ctxt respectively). This means that the current implementation of
357 365 * the FPU is 'eager', when a thread is running the CPU will have its FPU state
358 366 * loaded. While this is always true when executing in userland, there are a few
359 367 * cases where this is not true in the kernel.
360 368 *
361 369 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
362 370 * employed. This meant that the FPU would be saved on a context switch and the
363 371 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
364 372 * then take a #NM trap, at which point we would restore the FPU from the save
365 - * area and return to user land. Given the frequency of use of the FPU alone by
366 - * libc, there's no point returning to user land just to trap again.
373 + * area and return to userland. Given the frequency of use of the FPU alone by
374 + * libc, there's no point returning to userland just to trap again.
367 375 *
368 376 * There are a few cases though where the FPU state may need to be changed for a
369 377 * thread on its behalf. The most notable cases are in the case of processes
370 378 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
371 379 * will force a thread's FPU state to be saved into the PCB through the fp_save()
372 380 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
373 381 * pcb. This indicates that the save state holds currently valid data. As a side
374 382 * effect of this, CR0.TS will be set. To make sure that all of the state is
375 - * updated before returning to user land, in these cases, we set a flag on the
383 + * updated before returning to userland, in these cases, we set a flag on the
376 384 * PCB that says the FPU needs to be updated. This will make sure that we take
377 385 * the slow path out of a system call to fix things up for the thread. Due to
378 386 * the fact that this is a rather rare case, effectively setting the equivalent
379 387 * of t_postsys is acceptable.
380 388 *
381 389 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
382 390 * Generally this means it will be cleared immediately by the new thread that is
383 391 * running in a context switch. However, this isn't the case for kernel threads.
384 392 * They currently operate with CR0.TS set as no kernel state is restored for
385 393 * them. This means that using the FPU will cause a #NM and panic.
386 394 *
387 395 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
388 396 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
389 397 * However, because we eagerly restore, the only time that CR0.TS should be set
390 398 * for a non-kernel thread is during operations where it will be cleared before
391 - * returning to user land and importantly, the only data that is in it is its
399 + * returning to userland and importantly, the only data that is in it is its
392 400 * own.
393 401 *
394 402 * Kernel FPU Usage
395 403 * ----------------
396 404 *
397 405 * Traditionally the kernel never used the FPU since it had no need for
398 406 * floating point operations. However, modern FPU hardware supports a variety
399 407 * of SIMD extensions which can speed up code such as parity calculations or
400 408 * encryption.
401 409 *
402 410 * To allow the kernel to take advantage of these features, the
403 411 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
404 412 * around any usage of the FPU by the kernel to ensure that user-level context
405 413 * is properly saved/restored, as well as to properly setup the FPU for use by
406 414 * the kernel. There are a variety of ways this wrapping can be used, as
407 415 * discussed in this section below.
408 416 *
409 417 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
410 418 * operations, the kernel_fpu_alloc() function should be used to allocate a
411 419 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
412 420 * state. This structure is not tied to any thread. That is, different threads
413 421 * can reuse the same kfpu_state_t structure, although not concurrently. A
414 422 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
415 423 *
416 424 * In some cases, the kernel may need to use the FPU for a short operation
417 425 * without the overhead to manage a kfpu_state_t structure and without
418 426 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
419 427 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
420 428 * parameter. This indicates that there is no kfpu_state_t. When used this way,
421 429 * kernel preemption should be disabled by the caller (kpreempt_disable) before
422 430 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
423 431 * For this usage, it is important to limit the kernel's FPU use to short
424 432 * operations. The tradeoff between using the FPU without a kfpu_state_t
425 433 * structure vs. the overhead of allowing a context switch while using the FPU
426 434 * should be carefully considered on a case by case basis.
427 435 *
428 436 * In other cases, kernel threads have an LWP, but never execute in user space.
429 437 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
430 438 * kernel's FPU state if the thread is context switched, instead of having to
431 439 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
432 440 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
433 441 * enable this behavior. It is the caller's responsibility to ensure that this
|
↓ open down ↓ |
32 lines elided |
↑ open up ↑ |
434 442 * is only used for a kernel thread which never executes in user space.
435 443 *
436 444 * FPU Exceptions
437 445 * --------------
438 446 *
439 447 * Certain operations can cause the kernel to take traps due to FPU activity.
440 448 * Generally these events will cause a user process to receive a SIGFPE and if
441 449 * the kernel receives it in kernel context, we will die. Traditionally the #NM
442 450 * (Device Not Available / No Math) exception generated by CR0.TS would have
443 451 * caused us to restore the FPU. Now it is a fatal event regardless of whether
444 - * or not user land causes it.
452 + * or not userland causes it.
445 453 *
446 454 * While there are some cases where the kernel uses the FPU, it is up to the
447 455 * kernel to use the FPU in a way such that it cannot receive a trap or to use
448 456 * the appropriate trap protection mechanisms.
449 457 *
450 458 * Hypervisors
451 459 * -----------
452 460 *
453 461 * When providing support for hypervisors things are a little bit more
454 462 * complicated because the FPU is not virtualized at all. This means that they
455 463 * need to save and restore the FPU and %xcr0 across entry and exit to the
456 464 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
457 465 * allow us to use the full native state to make sure that we are always saving
458 466 * and restoring the full FPU that the host sees, even when the guest is using a
459 467 * subset.
460 468 *
461 469 * One tricky aspect of this is that the guest may be using a subset of %xcr0
462 470 * and therefore changing our %xcr0 on the fly. It is vital that when we're
463 471 * saving and restoring the FPU that we always use the largest %xcr0 contents
464 472 * otherwise we will end up leaving behind data in it.
465 473 *
|
↓ open down ↓ |
11 lines elided |
↑ open up ↑ |
466 474 * ELF PLT Support
467 475 * ---------------
468 476 *
469 477 * rtld has to preserve a subset of the FPU when it is saving and restoring
470 478 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
471 479 * more information. As a result, we set up an aux vector that contains
472 480 * information about what save and restore mechanisms it should be using and
473 481 * the sizing thereof based on what the kernel supports. This is passed down in
474 482 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
475 483 * initialized in fpu_subr.c.
484 + *
485 + * Signal Handling and the ucontext_t
486 + * ----------------------------------
487 + *
488 + * One of the many gifts that signals give us is the twofold fact that when a
489 + * signal occurs, the signal handler is allowed to change the CPU's state
490 + * arbitrarily and when the signal handler is done executing, we must restore it
491 + * back to the original state. However, the second part of this is that the
492 + * signal handler is actually allowed to modify the state that the thread will
493 + * return to! To create this facade, the kernel will create a full ucontext_t
494 + * state, effectively calling getcontext(2) on the thread's behalf, and a
495 + * pointer to that is given to the signal handler (the void * argument for the
496 + * sa_sigaction function pointer in sigaction(2)). When libc is done with a
497 + * signal, it will call setcontext(2) with that same ucontext_t.
498 + *
499 + * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
500 + * it's often declared on the stack itself, with the signal handler spilling all
501 + * this state to the stack. The ucontext_t machine portion was broken into the
502 + * general purpose and floating point registers. In 64-bit code, the floating
503 + * point registers were mostly the same as the results of the fxsave instruction
504 + * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
505 + * starting point for information, it is transformed into a different shape to
506 + * deal with the history of the 32-bit SYS V ABI.
507 + *
508 + * While this worked, if you're reading this, you're aware that the x86 FPU and
509 + * extended register states didn't stop at the initial 16 128-bit %xmm
510 + * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
511 + * opmask registers. None of these fit inside the standard ucontext_t; however,
512 + * they must all be preserved and restored across a signal. While the various
513 + * x86 platform-specific ABIs all suggest that these registers are not preserved
514 + * across a function call, receiving a signal is not a function call and must be
515 + * thought of like a process receiving an interrupt. In other words, this
516 + * extended state must be preserved.
517 + *
518 + * To facilitate this, we have extended the ucontext_t structure with an
519 + * additional flag, UC_XSAVE, which indicates that the traditional padding
520 + * member, uc_xsave, actually is a pointer to the extended state. While this is
521 + * accessible outside of a signal handling context through the combination of
522 + * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
523 + * state is focused on signal handling. Signal handling spills all this state to
524 + * the stack and if we cannot spill the entire state to the stack then our
525 + * inability to deliver the signal results in the process being killed! While
526 + * there are separate efforts to ensure that the signal stack sizing that is
527 + * used for the minimum and maximum signal sizes are sufficient, we still need
528 + * to do our part to minimize the likelihood here.
529 + *
530 + * In designing this, we make the following observations which have helped us
531 + * focus our design:
532 + *
533 + * o While the start of an xsave area is the traditional 512-byte fxsave XMM
534 + * region, we already have that in the fpregs. Thus there is no reason to
535 + * duplicate it. This not only saves 512 bytes of additional stack space,
536 + * but it also means we don't have to ask which version of it to take
537 + * if they were to differ.
538 + *
539 + * o Many applications out there aren't necessarily using the extended vectors
540 + * and even when we do make libc and others take advantage of it, it will
541 + * behoove us to ensure that they are put back into their initial state
542 + * after use. This leads us to expect that in a number of cases, the actual
543 + * extended register state will be in its initial state.
544 + *
545 + * o While the signal handler does allow contents to be modified, we are
546 + * starting with making the interface private and thus allowing us to excise
547 + * components that are in their initial state.
548 + *
549 + * o There are similarities to what we want to create with the compressed
550 + * xsave format; however, because we don't always have support for the
551 + * compressed format, we can't just arbitrarily say let's do a compressed
552 + * save to the user stack.
553 + *
554 + * o Because we are not handing this state directly to and from hardware, we
555 + * don't need to meet some of the constraints of the compressed xsave format
556 + * around wanting alignment for the initial save or additional components.
557 + *
558 + * All of the above lead us to our own unique format for this data. When the
559 + * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
560 + * uc_xsave_t structure which has a magic version number, a 32-bit length of the
561 + * overall structure, and the 64-bit state bit-vector to represent which
562 + * components are valid. Following this 16-byte header, each component that is
563 + * present in the bit vector is immediately written out in roughly ascending bit
564 + * order (the order is determined based on the order of the fpu_xsave_info
565 + * array).
566 + *
567 + * This makes the rough logic that we have here when taking a signal and writing
568 + * out this state as:
569 + *
570 + * 1. Ensure that the FPU is saved and that the contents of the pcb save area
571 + * are valid. That is, call fp_save() if the state is not already flagged
572 + * with FPU_VALID.
573 + *
574 + * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
575 + * and XFEATURE_SSE bits as these are already carried in the fpregs.
576 + *
577 + * 3. Initialize the uc_xsave_t by setting our version field, initializing the
578 + * length to the length of the current structure, and then setting the
579 + * modified bit vector above.
580 + *
581 + * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out
582 + * its extended state starting at the current length in the header and then
583 + * increase the header size by that length.
584 + *
585 + * 5. Finally write out the final uc_xsave_t structure.
586 + *
587 + * The above process is also used when someone manually calls getcontext_extd(2)
588 + * to get this state. The main difference between the two is which copyout
589 + * function we use. This deserves some explanation. Our main starting point for
590 + * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
591 + * the signal handling context to operate with a different copyout than we
592 + * normally use in say getcontext_extd(2).
593 + *
594 + * When we've received a signal, we're at the intersection of several different
595 + * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
596 + * the watchpoints effectively set a copyout override function (t_copyops) that
597 + * we end up vectoring to rather than a normal copyout. This allows the data to
598 + * be modified and for the watchpoint to fire. While this is all well and good
599 + * normally, it is problematic if we are trying to handle a signal. The signal
600 + * delivery logic, sendsig(), goes through and disables the watchpoint for the
601 + * region of the stack that we are copying out to. However, disabling
602 + * watchpoints is not sufficient, we also need to use the copyout_noerr
603 + * variants.
604 + *
605 + * These variants also require the use of on_fault() and no_fault() for error
606 + * handling. While it is tempting to try and on_fault() the entire
607 + * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
608 + * The first is that we don't want to disable faults during the entire operation
609 + * as if the kernel messes up we will treat that as a user error. That isn't
610 + * theoretical and happened during development. The second and perhaps more
611 + * important issue is that correctly bounding the on_fault() / no_fault() means
612 + * being careful about state. For example, kernel pre-emption is often disabled
613 + * during parts of these operations, but it needs to be re-enabled when we're
614 + * done. This would require tracking in some volatile variable that this had
615 + * been enabled and disabled and tracking that.
616 + *
617 + * Instead, this is why fpu_signal_copyout() takes a copy out function as an
618 + * argument. When we're in signal handling context, the function will use
619 + * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
620 + *
621 + * RESTORING STATE
622 + *
623 + * Copying out our current state is the easier half of this problem. When the
624 + * signal handler returns, libc calls setcontext(2) with the ucontext_t we
625 + * assembled for it as described above. setcontext(2) isn't just used for
626 + * returning from signals.
627 + *
628 + * The process for this goes in two steps. The first step is to copy in,
629 + * validate, and transform the ucontext_t UC_XSAVE that we created above into an
630 + * equivalent xsave format that we can use the appropriate xrstor function on.
631 + * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
632 + * come back through a second phase that is driven out of restorecontext() and
633 + * is implemented in fpu_set_xsave().
634 + *
635 + * Let's start by discussing the second part of this, which is more
636 + * straightforward. In particular, the second phase assumes that all of the
637 + * validation and error handling has been done by the first phase. This means
638 + * here, we have a buffer that is already the appropriate size
639 + * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
640 + * replace the actual save state with the current one.
641 + *
642 + * The only piece of shenanigans we have to do is around the kernel provided
643 + * notion of 'status' and 'xstatus', which are cached versions of the x87 and
644 + * SSE exception vectors. These are part of the fpregset ABI and therefore we
645 + * need to propagate them from the temporary storage that part 1 sets up in the
646 + * ignored region of the fxsave data. We use that because it is not persisted by
647 + * the CPU, so clobbering it is generally alright.
648 + *
649 + * Once that is done, we simply note that we need a PCB update to occur to
650 + * refresh the FPU state before we return to userland. Given that someone has
651 + * called setcontext(2), this was always going to happen because we have to
652 + * update segment registers and related, so this isn't so bad. With that, let's
653 + * move onto the more nuanced part (1).
654 + *
655 + * When we're handling a setcontext(2) we have, in userland, a data structure
656 + * that should match one we serialized out, though we cannot assume that a user
657 + * has not modified it either accidentally or maliciously. Our goal is to set up
658 + * the appropriate xsave state that can be passed to the CPU's xrstor. The first
659 + * problem we have to deal with is where do we actually put this state?
660 + *
661 + * While not many programs actually call setcontext(2) of their own volition,
662 + * this is going to get hit every time we take a signal. The first thought was
663 + * to re-use the existing thread's save area; however, that's a bit challenging
664 + * for a few reasons. In particular, we would need to ensure that we don't go
665 + * off-CPU for any reason, which we cannot assume with a copyin from a user
666 + * address space. In particular, it is trivial for us to hit a case where the
667 + * stack has been paged out for some reason, which eschews that path.
668 + *
669 + * Instead, whenever a thread first calls setcontext(2), generally from signal
670 + * context, we will at that time allocate another entry from the 'fpsave_cachep'
671 + * kmem cache, giving us a buffer of the appropriate space to handle this. Once
672 + * this buffer has been allocated, we leave it assigned to the thread's pcb and
673 + * only tear it down when the thread itself finally exits. We reason that a
674 + * thread that takes a signal once is either going to have the process exit
675 + * shortly thereafter or is much more likely to take a signal again in the
676 + * future. Many daemons and other processes set things up so signals are
677 + * dispatched via one location, masking signals in other threads, using
678 + * sigsuspend(2), signalfd(3C), or something similar.
679 + *
680 + * With this buffer in hand, we begin our task of reassembling state. Note, all
681 + * of this is conditional on UC_XSAVE being set in the uc_flags member of the
682 + * ucontext_t. If it is not set, then we assume that there is no extended state
683 + * and will use the traditional path of setting the fpregset_t into the system
684 + * via setfpregs().
685 + *
686 + * We first will copyin and validate the uc_xsave_t. In particular, we need to
687 + * make sure the version makes sense and that the xsave component bit-vector
688 + * doesn't have anything unexpected and more importantly unsupported in it, and
689 + * that the addresses we've been given are within the user address space. At
690 + * this point we can walk through our table of implemented bits and process
691 + * them.
692 + *
693 + * For most components in here, the processing is straightforward. We continue
694 + * walking our cursor and copy data into the kernel and place it in the
695 + * appropriate place in our xsave state. If a component's bit in the bit-vector
696 + * isn't set, then we must ensure that we have the item in the initial state,
697 + * which for everything other than the x87/SSE state is the memory being zeroed.
698 + *
699 + * The most unique case in the copyin state is that of the x87/SSE state. You
700 + * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
701 + * but instead have opted to use the single definition in the fpregset_t. Thus
702 + * here, we copy it out of the fpregset_t, which the kernel has helpfully
703 + * already unified into the 64-bit fxsave version prior to calling us, and
704 + * install that into the save area we're building up.
705 + *
706 + * As part of this, there are two important pieces to be aware of. The first is
707 + * that because the fpregset_t has both the status and xstatus members
708 + * mentioned earlier, we temporarily copy them to the software-usable ignored
709 + * areas of the fxsave state so we can corral this extra state into part (2)
710 + * without needing to allocate additional space. The second piece is that when
711 + * we're done processing this we explicitly remove the UC_FPU flag that would
712 + * tell the kernel to proceed with updating that region. The problem is that
713 + * that goes directly into the pcb's save area and not to the intermediate
714 + * buffer as it uses the same entry point as /proc, namely setfpregs().
715 + *
716 + * We don't do much validation of the actual contents of the registers that are
717 + * being set with the exception of ensuring that no reserved bits of the mxcsr
718 + * are used. This is not as strict as /proc, but failure here means the process
719 + * is likely going to die (returning from setcontext() in a signal handler is
720 + * fatal).
721 + *
722 + * /proc xregs
723 + * -----------
724 + *
725 + * Observability of the state of the extended registers is important for
726 + * understanding the system. While on the surface this is similar to signal
727 + * handling, it is crucially different in a number of ways:
728 + *
729 + * o In signal handling, we're trying to conserve every byte of stack that we
730 + * can.
731 + * o The /proc xregs file will end up in core files, which means that we need
732 + * a way of knowing what components are present and not present in it,
733 + * because this will vary from CPU to CPU due to the addition of
734 + * architectural features. For example, some CPUs support AVX-512, but
735 + * others do not.
736 + * o The signal handling structure is private and we're not trying to have
737 + * software modify it, on the other hand, the /proc interfaces that we
738 + * support we do want software to be able to interrogate and manipulate.
739 + * These need to be something that we can introduce additional components
740 + * into and make other changes that still allow it to work.
741 + *
742 + * The x86 xregs format is documented in proc(5). The short form is that the
743 + * prxregset_hdr_t has a number of information entries, which are of the type
744 + * prxregset_info_t. Each of the information headers has a type, size, and
745 + * offset which indicate where to find the additional data.
746 + *
747 + * Each entry is described as one of the entries in the fpu_xsave_info[]. These
748 + * items either are a 1:1 correspondence with a xsave related feature (e.g.
749 + * there is one entry for each of the three AVX-512 components) or it is
750 + * something synthetic that we provide as additional information such as the
751 + * PRX_INFO_XCR, which is a way of getting information about the system such as
752 + * what is enabled in %xcr0 out there.
753 + *
754 + * Unlike signal handling, we are given the buffer to place everything that
755 + * needs to be written out. This is partially the design of the /proc APIs. That
756 + * is, we will always assemble everything into the entire buffer that /proc asks
757 + * us to, and then it will use as much or as little of it as is required.
758 + * Similarly, when setting things, we don't have to worry about copying in
759 + * information in the same way as signal handling does, because /proc takes care
760 + * of it and always hands us a full buffer. Sizing that is a little nuanced, but
761 + * is all handled in prmachdep.c.
762 + *
763 + * When someone performs a read of the xregs and thus is asking us for the
764 + * current state, there is a little bit of nuance that we need to deal with
765 + * here. The first is whether or not the FPU is enabled and the second is, if
766 + * the FPU is enabled, whether a given component is noted as being in its
767 + * initial state. This basically gives us three possible states for a given
768 + * component:
769 + *
770 + * 1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
771 + * the illumos FPU default for an item. More on that in a moment.
772 + * 2. The saved xsave state indicates that the bit for a given component is
773 + * zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
774 + * In this case, we must take the CPU's default for an item. This is
775 + * usually the same as illumos, but not always.
776 + * 3. The saved xsave state indicates that a given component's state bit is
777 + * valid. The simplest of our cases. We can just take what we have from the
778 + * xsave state.
779 + *
780 + * The CPU's default state for most components other than the x87/SSE state is
781 + * to have it be zeroed. This is what we treat as our default state as well. The
782 + * primary difference is in the initialization of the x87/SSE state. The SYS V
783 + * ABI requires that we enable a different floating point control word than the
784 + * hardware default. This means that when we're dealing with case (1) for
785 + * x87/SSE we have to be more careful than the other components. Thankfully for
786 + * everything else this is just keeping it zeroed.
787 + *
788 + * A reasonable question would be why not just skip components that aren't
789 + * marked as present. There are two reasons we take a different approach and
790 + * always include them. Both of these are to make lives simpler for consumers. In
791 + * the first case, when someone is performing a read and wants to reassemble and
792 + * answer the question of 'what is the value of %ymm0 or %zmm15', they have
793 + * to combine multiple disparate parts. If one knows that the data we put into
794 + * there is always valid and represents what is in hardware and doesn't have to
795 + * keep track of what are the defaults in different circumstances, then that
796 + * greatly simplifies consumers' lives. It also helps us for core files and other
797 + * observability cases because the answer to what is the operating system's
798 + * default may change over time.
799 + *
800 + * Similarly, including all the possible structures means that we have
801 + * simplified things for a writer. Writes are always setting the full state
802 + * of a thread, meaning that if someone wants to modify only a single register
803 + * they must do a read, modify, and write. By including everything that they
804 + * might need, it makes it easier for consumers to do this and not have to cons
805 + * up the whole structure on their own.
806 + *
807 + * When we're setting state, things change around a little bit. We have a few
808 + * constraints that are laid out in proc(5). In particular, we require that the
809 + * PRX_INFO_XSAVE component always be present to tell us which other components
810 + * we expect to be here and which ones we don't. We also are much stricter about
811 + * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
812 + * and may not be modified by a calling process. In addition, when we have
813 + * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
814 + * segments, if they are being written to and have modifications, then we will
815 + * indicate an error there.
816 + *
817 + * Because we are given the entire buffer from userland and don't need to have
818 + * an intermediate place to copy it in, we will validate the entire thing in
819 + * advance. Once it has been validated and we consider it legal, then we will
820 + * translate each entry into its corresponding entry in pcb's normal floating
821 + * point state. This is different from signal handling mostly because of the
822 + * fact that we are not using copyin, and once we get to this point, there is
823 + * no more validation, so we don't have the same concerns around blocking while
824 + * pre-emption is disabled.
825 + *
826 + * The Wrinkle with fpregs
827 + * -----------------------
828 + *
829 + * When we instead turn our attention to the fpregs, whether we're gathering
830 + * them as part of the ucontext_t or as part of /proc, there are a few
831 + * complications that we need to be aware of when we're operating on a kernel
832 + * that is using xsave as the save mechanism. When we're using fxsave as the
833 + * save mechanism, the CPU will always save the entire 512-byte fxsave region.
834 + * The fpregs ABI that the kernel expects is basically this structure itself,
835 + * which is transformed into a 32-bit compatible form in archdep.c.
836 + *
837 + * But xsave makes this much more complex and has been a source of historical
838 + * bugs in the system. In particular, unlike fxsave, xsave has its component bit
839 + * vector that is written out to indicate validity. This means that blindly
840 + * copying the fxsave area without checking those bits will lead us to do the
841 + * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
842 + * while the x87 legacy fp flag covers the rest of the state. This is all good,
843 + * aside from the MXCSR.
844 + *
845 + * One of the more complicated pieces of xsave state management is correctly
846 + * answering the question of when the MXCSR is written out to xsave_state. In
847 + * practice, this is rather convoluted and varies. If either the XMM or AVX
848 + * feature bits are set then the CPU will write out the MXCSR and its mask
849 + * register into the traditional fxsave state region. This behavior is dependent
850 + * on the type of save function that we use. xsave and xsaveopt will look at the
851 + * AVX feature bit; however, xsavec does not and only considers the SSE feature
852 + * bit. This means that when we're retrieving things, we need to check both of
853 + * those bits to determine if we should use the initial state or the value
854 + * written out.
855 + *
856 + * When we come to someone trying to set the fpregs through /proc, the main
857 + * question we have is what happens to the extended registers. We have opted to
858 + * implement and document it such that a write to the fpregs only impacts the
859 + * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
860 + * copying the data into the save area, set the state bits for x87 and XMM
861 + * state, and then set the FPU to be restored. All in all, this basically means
862 + * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
863 + * that we might have present.
864 + *
865 + * Forward Looking: Adding Intel AMX Support
866 + * -----------------------------------------
867 + *
868 + * Nothing can stop the march of features being added into the FPU. One of the
869 + * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
870 + * Extensions (AMX), which add a large chunk of xsave state to each process.
871 + * While things like AVX and AVX-512 have been enabled by default, the broader
872 + * OS community has not been wanting to do this for AMX, because of the size of
873 + * the state which exceeds 8 KiB. While the signal handling state went out of
874 + * its way to minimize the size it wrote to the stack, if this is used, it would
875 + * need to be preserved.
876 + *
877 + * To deal with this reality and the fact that folks don't really want to
878 + * enable it by default for all purposes when its use will be quite special
879 + * purpose, Intel has also added a MSR around extended feature disable or xfd.
880 + * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
881 + * assumption, and the reason that so much of the /proc and signal logic ensures
882 + * that we have the thread and process around, taking as an example the unused
883 + * process argument in fpu_proc_xregs_info(), is that we will follow suit and
884 + * default to having support disabled, but that a process will be able to opt
885 + * into it, which will result in several different assumptions around signal
886 + * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
887 + *
888 + * The following is a list of items to pay attention to for future folks who
889 + * work on this:
890 + *
891 + * o We will want to confirm whether other systems have opted to make this
892 + * process-wide or thread-wide. Assuming process-wide, we will need to do a
893 + * hold of all lwps while making a change. The interface for that probably
894 + * doesn't want to be /proc, as a process probably doesn't want to write to
895 + * its own control file. Changing it for another process could be done
896 + * through the agent-lwp.
897 + * o Opting into this should probably be a one-way street.
898 + * o Opting into this will need to evaluate all threads and in particular
899 + * stack sizes to confirm they adhere to the new minimum.
900 + * o We will need to make sure that setting and clearing the xfd MSR is part
901 + * of the FPU context ops and something we set by default on every CPU.
902 + * o We will need to add a new interface to allow opting into this feature.
903 + * o We will need to ensure that all subsequently created signal stacks adhere
904 + * to a required minimum size that we communicate through libc.
905 + * o We will need to make sure that both rtld and libc no longer rely on a
906 + * static value of the AT_SUN_FPSIZE, but rather realize that this can be
907 + * dynamic. At that time, we should evaluate if we can get away with not
908 + * needing to save this for rtld, even though signal handlers should assume
909 + * they will.
910 + * o The various components (because there is more than one) will want to be
911 + * added to the fpu_xsave_info[]. Consulting the process's xfd will be
912 + * required and probably require logic changes.
913 + *
914 + * The above is not exhaustive. We'll probably have some other issues and fun
915 + * while doing this.
476 916 */
477 917
918 +/*
919 + * The kind of FPU we advertise to rtld so it knows what to do when working
920 + * through the PLT.
921 + */
922 +int fp_elf = AT_386_FPINFO_FXSAVE;
923 +
924 +/*
925 + * Mechanism to save FPU state.
926 + */
927 +int fp_save_mech = FP_FXSAVE;
928 +
478 929 kmem_cache_t *fpsave_cachep;
479 930
480 931 /* Legacy fxsave layout + xsave header + ymm */
481 932 #define AVX_XSAVE_SIZE (512 + 64 + 256)
482 933
483 934 /*
484 935 * Various sanity checks.
485 936 */
486 937 CTASSERT(sizeof (struct fxsave_state) == 512);
487 938 CTASSERT(sizeof (struct fnsave_state) == 108);
488 939 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
489 940 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
490 941
491 942 /*
943 + * Basic architectural alignment information.
944 + */
945 +#define FPU_ALIGN_XMM 16
946 +#define FPU_ALIGN_YMM 32
947 +#define FPU_ALIGN_ZMM 64
948 +
949 +/*
492 950 * This structure is the x86 implementation of the kernel FPU that is defined in
493 951 * uts/common/sys/kfpu.h.
494 952 */
495 953
496 954 typedef enum kfpu_flags {
497 955 /*
498 956 * This indicates that the save state has initial FPU data.
499 957 */
500 958 KFPU_F_INITIALIZED = 0x01
501 959 } kfpu_flags_t;
502 960
503 961 struct kfpu_state {
504 962 fpu_ctx_t kfpu_ctx;
505 963 kfpu_flags_t kfpu_flags;
506 964 kthread_t *kfpu_curthread;
507 965 };
508 966
509 967 /*
510 968 * Initial kfpu state for SSE/SSE2 used by fpinit()
511 969 */
512 970 const struct fxsave_state sse_initial = {
513 971 FPU_CW_INIT, /* fx_fcw */
514 972 0, /* fx_fsw */
515 973 0, /* fx_fctw */
516 974 0, /* fx_fop */
517 975 0, /* fx_rip */
518 976 0, /* fx_rdp */
519 977 SSE_MXCSR_INIT /* fx_mxcsr */
520 978 /* rest of structure is zero */
521 979 };
522 980
523 981 /*
524 982 * Initial kfpu state for AVX used by fpinit()
525 983 */
526 984 const struct xsave_state avx_initial = {
527 985 /*
528 986 * The definition below needs to be identical with sse_initial
529 987 * defined above.
530 988 */
531 989 .xs_fxsave = {
532 990 .fx_fcw = FPU_CW_INIT,
533 991 .fx_mxcsr = SSE_MXCSR_INIT,
534 992 },
535 993 .xs_header = {
536 994 /*
537 995 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
538 996 * valid, and CPU should initialize XMM/YMM.
539 997 */
540 998 .xsh_xstate_bv = 1,
541 999 .xsh_xcomp_bv = 0,
542 1000 },
|
↓ open down ↓ |
41 lines elided |
↑ open up ↑ |
543 1001 };
544 1002
545 1003 /*
546 1004 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
547 1005 * the #gp exception caused by setting unsupported bits in the
548 1006 * MXCSR register
549 1007 */
550 1008 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
551 1009
552 1010 /*
553 - * Initial kfpu state for x87 used by fpinit()
554 - */
555 -const struct fnsave_state x87_initial = {
556 - FPU_CW_INIT, /* f_fcw */
557 - 0, /* __f_ign0 */
558 - 0, /* f_fsw */
559 - 0, /* __f_ign1 */
560 - 0xffff, /* f_ftw */
561 - /* rest of structure is zero */
562 -};
563 -
564 -/*
565 1011 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
566 1012 * have an XSAVE-capable chip in fpu_probe.
567 1013 */
568 1014 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
569 1015 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
570 1016
571 1017 /*
572 1018 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
573 1019 */
574 1020 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
575 1021
576 1022 static int fpe_sicode(uint_t);
577 1023 static int fpe_simd_sicode(uint_t);
578 1024 static void fp_new_lwp(void *, void *);
579 1025 static void fp_free_ctx(void *, int);
580 1026
581 1027 static struct ctxop *
582 1028 fp_ctxop_allocate(struct fpu_ctx *fp)
583 1029 {
584 1030 const struct ctxop_template tpl = {
585 1031 .ct_rev = CTXOP_TPL_REV,
586 1032 .ct_save = fpsave_ctxt,
587 1033 .ct_restore = fprestore_ctxt,
588 1034 .ct_fork = fp_new_lwp,
589 1035 .ct_lwp_create = fp_new_lwp,
590 1036 .ct_free = fp_free_ctx,
591 1037 };
592 1038 return (ctxop_allocate(&tpl, fp));
593 1039 }
594 1040
595 1041 /*
596 1042 * Copy the state of parent lwp's floating point context into the new lwp.
597 1043 * Invoked for both fork() and lwp_create().
598 1044 *
599 1045 * Note that we inherit -only- the control state (e.g. exception masks,
600 1046 * rounding, precision control, etc.); the FPU registers are otherwise
601 1047 * reset to their initial state.
602 1048 */
603 1049 static void
604 1050 fp_new_lwp(void *parent, void *child)
605 1051 {
606 1052 kthread_id_t t = parent, ct = child;
607 1053 struct fpu_ctx *fp; /* parent fpu context */
608 1054 struct fpu_ctx *cfp; /* new fpu context */
609 1055 struct fxsave_state *fx, *cfx;
610 1056 struct xsave_state *cxs;
611 1057
612 1058 ASSERT(fp_kind != FP_NO);
613 1059
614 1060 fp = &t->t_lwp->lwp_pcb.pcb_fpu;
615 1061 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
616 1062
617 1063 /*
618 1064 * If the parent FPU state is still in the FPU hw then save it;
619 1065 * conveniently, fp_save() already does this for us nicely.
620 1066 */
621 1067 fp_save(fp);
622 1068
623 1069 cfp->fpu_flags = FPU_EN | FPU_VALID;
624 1070 cfp->fpu_regs.kfpu_status = 0;
625 1071 cfp->fpu_regs.kfpu_xstatus = 0;
626 1072
627 1073 /*
628 1074 * Make sure that the child's FPU is cleaned up and made ready for user
629 1075 * land.
630 1076 */
631 1077 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
632 1078
633 1079 switch (fp_save_mech) {
634 1080 case FP_FXSAVE:
635 1081 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
636 1082 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
637 1083 bcopy(&sse_initial, cfx, sizeof (*cfx));
638 1084 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
639 1085 cfx->fx_fcw = fx->fx_fcw;
640 1086 break;
641 1087
642 1088 case FP_XSAVE:
643 1089 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
644 1090
645 1091 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
646 1092
647 1093 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
648 1094 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
649 1095 cfx = &cxs->xs_fxsave;
650 1096
651 1097 bcopy(&avx_initial, cxs, sizeof (*cxs));
652 1098 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
653 1099 cfx->fx_fcw = fx->fx_fcw;
|
↓ open down ↓ |
79 lines elided |
↑ open up ↑ |
654 1100 cxs->xs_header.xsh_xstate_bv |=
655 1101 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
656 1102 break;
657 1103 default:
658 1104 panic("Invalid fp_save_mech");
659 1105 /*NOTREACHED*/
660 1106 }
661 1107
662 1108 /*
663 1109 * Mark that both the parent and child need to have the FPU cleaned up
664 - * before returning to user land.
1110 + * before returning to userland.
665 1111 */
666 1112
667 1113 ctxop_attach(ct, fp_ctxop_allocate(cfp));
668 1114 }
669 1115
670 1116 /*
671 1117 * Free any state associated with floating point context.
672 1118 * Fp_free can be called in three cases:
673 1119 * 1) from reaper -> thread_free -> freectx-> fp_free
674 1120 * fp context belongs to a thread on deathrow
675 1121 * nothing to do, thread will never be resumed
676 1122 * thread calling ctxfree is reaper
677 1123 *
678 1124 * 2) from exec -> freectx -> fp_free
679 1125 * fp context belongs to the current thread
680 1126 * must disable fpu, thread calling ctxfree is curthread
681 1127 *
682 1128 * 3) from restorecontext -> setfpregs -> fp_free
683 1129 * we have a modified context in the memory (lwp->pcb_fpu)
684 1130 * disable fpu and release the fp context for the CPU
685 1131 *
686 1132 */
687 1133 void
688 1134 fp_free(struct fpu_ctx *fp)
689 1135 {
690 1136 ASSERT(fp_kind != FP_NO);
691 1137
692 1138 if (fp->fpu_flags & FPU_VALID)
693 1139 return;
694 1140
695 1141 kpreempt_disable();
696 1142 /*
697 1143 * We want to do fpsave rather than fpdisable so that we can
698 1144 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
699 1145 */
700 1146 fp->fpu_flags |= FPU_VALID;
701 1147 /* If for current thread disable FP to track FPU_VALID */
702 1148 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
703 1149 /* Clear errors if any to prevent frstor from complaining */
704 1150 (void) fperr_reset();
705 1151 if (fp_kind & __FP_SSE)
706 1152 (void) fpxerr_reset();
707 1153 fpdisable();
708 1154 }
709 1155 kpreempt_enable();
710 1156 }
711 1157
712 1158 /*
713 1159 * Wrapper for freectx to make the types line up for fp_free()
714 1160 */
715 1161 static void
716 1162 fp_free_ctx(void *arg, int isexec __unused)
717 1163 {
718 1164 fp_free((struct fpu_ctx *)arg);
719 1165 }
720 1166
721 1167 /*
722 1168 * Store the floating point state and disable the floating point unit.
723 1169 */
724 1170 void
725 1171 fp_save(struct fpu_ctx *fp)
726 1172 {
727 1173 ASSERT(fp_kind != FP_NO);
728 1174
729 1175 kpreempt_disable();
730 1176 if (!fp || fp->fpu_flags & FPU_VALID ||
731 1177 (fp->fpu_flags & FPU_EN) == 0) {
732 1178 kpreempt_enable();
733 1179 return;
734 1180 }
735 1181 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
736 1182
737 1183 switch (fp_save_mech) {
738 1184 case FP_FXSAVE:
739 1185 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
740 1186 break;
741 1187
742 1188 case FP_XSAVE:
743 1189 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
744 1190 break;
745 1191 default:
746 1192 panic("Invalid fp_save_mech");
747 1193 /*NOTREACHED*/
748 1194 }
749 1195
750 1196 fp->fpu_flags |= FPU_VALID;
751 1197
752 1198 /*
753 1199 * We save the FPU as part of forking, execing, modifications via /proc,
754 1200 * restorecontext, etc. As such, we need to make sure that we return to
755 1201 * userland with valid state in the FPU. If we're context switched out
756 1202 * before we hit sys_rtt_common() we'll end up having restored the FPU
757 1203 * as part of the context ops operations. The restore logic always makes
758 1204 * sure that FPU_VALID is set before doing a restore so we don't restore
759 1205 * it a second time.
760 1206 */
761 1207 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
762 1208
763 1209 kpreempt_enable();
764 1210 }
765 1211
766 1212 /*
767 1213 * Restore the FPU context for the thread:
768 1214 * The possibilities are:
769 1215 * 1. No active FPU context: Load the new context into the FPU hw
770 1216 * and enable the FPU.
771 1217 */
772 1218 void
773 1219 fp_restore(struct fpu_ctx *fp)
774 1220 {
775 1221 switch (fp_save_mech) {
776 1222 case FP_FXSAVE:
777 1223 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
778 1224 break;
779 1225
780 1226 case FP_XSAVE:
781 1227 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
782 1228 break;
783 1229 default:
784 1230 panic("Invalid fp_save_mech");
785 1231 /*NOTREACHED*/
786 1232 }
787 1233
788 1234 fp->fpu_flags &= ~FPU_VALID;
789 1235 }
790 1236
791 1237 /*
792 1238 * Reset the FPU such that it is in a valid state for a new thread that is
793 1239 * coming out of exec. The FPU will be in a usable state at this point. At this
794 1240 * point we know that the FPU state has already been allocated and if this
795 1241 * wasn't an init process, then it will have had fp_free() previously called.
796 1242 */
797 1243 void
798 1244 fp_exec(void)
799 1245 {
800 1246 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
801 1247
802 1248 if (fp_save_mech == FP_XSAVE) {
803 1249 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
804 1250 }
805 1251
806 1252 struct ctxop *ctx = fp_ctxop_allocate(fp);
807 1253 /*
808 1254 * Make sure that we're not preempted in the middle of initializing the
809 1255 * FPU on CPU.
810 1256 */
811 1257 kpreempt_disable();
812 1258 ctxop_attach(curthread, ctx);
813 1259 fpinit();
814 1260 fp->fpu_flags = FPU_EN;
815 1261 kpreempt_enable();
816 1262 }
817 1263
818 1264
819 1265 /*
820 1266 * Seeds the initial state for the current thread. The possibilities are:
821 1267 * 1. Another process has modified the FPU state before we have done any
822 1268 * initialization: Load the FPU state from the LWP state.
823 1269 * 2. The FPU state has not been externally modified: Load a clean state.
824 1270 */
825 1271 void
826 1272 fp_seed(void)
827 1273 {
828 1274 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
829 1275
830 1276 ASSERT(curthread->t_preempt >= 1);
831 1277 ASSERT((fp->fpu_flags & FPU_EN) == 0);
832 1278
833 1279 /*
834 1280 * Always initialize a new context and initialize the hardware.
835 1281 */
836 1282 if (fp_save_mech == FP_XSAVE) {
837 1283 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
838 1284 }
839 1285
840 1286 ctxop_attach(curthread, fp_ctxop_allocate(fp));
841 1287 fpinit();
842 1288
843 1289 /*
844 1290 * If FPU_VALID is set, it means someone has modified registers via
845 1291 * /proc. In this case, restore the current lwp's state.
846 1292 */
847 1293 if (fp->fpu_flags & FPU_VALID)
848 1294 fp_restore(fp);
849 1295
850 1296 ASSERT((fp->fpu_flags & FPU_VALID) == 0);
851 1297 fp->fpu_flags = FPU_EN;
852 1298 }
853 1299
854 1300 /*
855 1301 * When using xsave/xrstor, these three functions are used by the lwp code to
856 1302 * manage the memory for the xsave area.
857 1303 */
858 1304 void
|
↓ open down ↓ |
184 lines elided |
↑ open up ↑ |
859 1305 fp_lwp_init(struct _klwp *lwp)
860 1306 {
861 1307 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
862 1308
863 1309 /*
864 1310 * We keep a copy of the pointer in lwp_fpu so that we can restore the
865 1311 * value in forklwp() after we duplicate the parent's LWP state.
866 1312 */
867 1313 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
868 1314 kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
1315 + fp->fpu_signal = NULL;
869 1316
870 1317 if (fp_save_mech == FP_XSAVE) {
871 1318 /*
872 1319 *
873 1320 * We bzero since the fpinit() code path will only
874 1321 * partially initialize the xsave area using avx_initial.
875 1322 */
876 1323 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
877 1324 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
878 1325 }
879 1326 }
880 1327
|
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
881 1328 void
882 1329 fp_lwp_cleanup(struct _klwp *lwp)
883 1330 {
884 1331 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
885 1332
886 1333 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
887 1334 kmem_cache_free(fpsave_cachep,
888 1335 fp->fpu_regs.kfpu_u.kfpu_generic);
889 1336 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
890 1337 }
1338 +
1339 + if (fp->fpu_signal != NULL) {
1340 + kmem_cache_free(fpsave_cachep, fp->fpu_signal);
1341 + fp->fpu_signal = NULL;
1342 + }
891 1343 }
892 1344
893 1345 /*
894 1346 * Called during the process of forklwp(). The kfpu_u pointer will have been
895 1347 * overwritten while copying the parent's LWP structure. We have a valid copy
896 1348 * stashed in the child's lwp_fpu which we use to restore the correct value.
897 1349 */
898 1350 void
899 1351 fp_lwp_dup(struct _klwp *lwp)
900 1352 {
901 1353 void *xp = lwp->lwp_fpu;
902 1354 size_t sz;
903 1355
904 1356 switch (fp_save_mech) {
905 1357 case FP_FXSAVE:
906 1358 sz = sizeof (struct fxsave_state);
907 1359 break;
908 1360 case FP_XSAVE:
909 1361 sz = cpuid_get_xsave_size();
|
↓ open down ↓ |
9 lines elided |
↑ open up ↑ |
910 1362 break;
911 1363 default:
912 1364 panic("Invalid fp_save_mech");
913 1365 /*NOTREACHED*/
914 1366 }
915 1367
916 1368 /* copy the parent's values into the new lwp's struct */
917 1369 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
918 1370 /* now restore the pointer */
919 1371 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1372 + /* Ensure that we don't inherit our parent's signal state */
1373 + lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
920 1374 }
921 1375
922 1376 /*
923 1377 * Handle a processor extension error fault
924 1378 * Returns non zero for error.
925 1379 */
926 1380
927 1381 /*ARGSUSED*/
928 1382 int
929 1383 fpexterrflt(struct regs *rp)
930 1384 {
931 1385 uint32_t fpcw, fpsw;
932 1386 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
933 1387
934 1388 ASSERT(fp_kind != FP_NO);
935 1389
936 1390 /*
937 1391 * Now we can enable the interrupts.
938 1392 * (NOTE: x87 fp exceptions come thru interrupt gate)
939 1393 */
940 1394 sti();
941 1395
942 1396 if (!fpu_exists)
943 1397 return (FPE_FLTINV);
944 1398
945 1399 /*
946 1400 * Do an unconditional save of the FP state. If it's dirty (TS=0),
947 1401 * it'll be saved into the fpu context area passed in (that of the
948 1402 * current thread). If it's not dirty (it may not be, due to
949 1403 * an intervening save due to a context switch between the sti()
950 1404 * above and here), then it's safe to just use the stored values in
951 1405 * the context save area to determine the cause of the fault.
952 1406 */
953 1407 fp_save(fp);
954 1408
955 1409 /* clear exception flags in saved state, as if by fnclex */
956 1410 switch (fp_save_mech) {
957 1411 case FP_FXSAVE:
958 1412 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
959 1413 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
960 1414 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
961 1415 break;
962 1416
963 1417 case FP_XSAVE:
964 1418 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
965 1419 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
966 1420 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
967 1421 /*
968 1422 * Always set LEGACY_FP as it may have been cleared by XSAVE
969 1423 * instruction
970 1424 */
971 1425 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
972 1426 XFEATURE_LEGACY_FP;
973 1427 break;
974 1428 default:
975 1429 panic("Invalid fp_save_mech");
976 1430 /*NOTREACHED*/
977 1431 }
978 1432
979 1433 fp->fpu_regs.kfpu_status = fpsw;
980 1434
981 1435 if ((fpsw & FPS_ES) == 0)
982 1436 return (0); /* No exception */
983 1437
984 1438 /*
985 1439 * "and" the exception flags with the complement of the mask
986 1440 * bits to determine which exception occurred
987 1441 */
988 1442 return (fpe_sicode(fpsw & ~fpcw & 0x3f));
989 1443 }
990 1444
991 1445 /*
992 1446 * Handle an SSE/SSE2 precise exception.
993 1447 * Returns a non-zero sicode for error.
994 1448 */
995 1449 /*ARGSUSED*/
996 1450 int
997 1451 fpsimderrflt(struct regs *rp)
998 1452 {
999 1453 uint32_t mxcsr, xmask;
1000 1454 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1001 1455
1002 1456 ASSERT(fp_kind & __FP_SSE);
1003 1457
1004 1458 /*
1005 1459 * NOTE: Interrupts are disabled during execution of this
1006 1460 * function. They are enabled by the caller in trap.c.
1007 1461 */
1008 1462
1009 1463 /*
1010 1464 * The only way we could have gotten here if there is no FP unit
1011 1465 * is via a user executing an INT $19 instruction, so there is
1012 1466 * no fault in that case.
1013 1467 */
1014 1468 if (!fpu_exists)
1015 1469 return (0);
1016 1470
1017 1471 /*
1018 1472 * Do an unconditional save of the FP state. If it's dirty (TS=0),
1019 1473 * it'll be saved into the fpu context area passed in (that of the
1020 1474 * current thread). If it's not dirty, then it's safe to just use
1021 1475 * the stored values in the context save area to determine the
1022 1476 * cause of the fault.
1023 1477 */
1024 1478 fp_save(fp); /* save the FPU state */
1025 1479
1026 1480 if (fp_save_mech == FP_XSAVE) {
1027 1481 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1028 1482 fp->fpu_regs.kfpu_status =
1029 1483 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1030 1484 } else {
1031 1485 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1032 1486 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1033 1487 }
1034 1488 fp->fpu_regs.kfpu_xstatus = mxcsr;
1035 1489
1036 1490 /*
1037 1491 * compute the mask that determines which conditions can cause
1038 1492 * a #xm exception, and use this to clean the status bits so that
1039 1493 * we can identify the true cause of this one.
1040 1494 */
1041 1495 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1042 1496 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1043 1497 }
1044 1498
1045 1499 /*
1046 1500 * In the unlikely event that someone is relying on this subcode being
1047 1501 * FPE_FLTILL for denormalize exceptions, it can always be patched back
1048 1502 * again to restore old behaviour.
1049 1503 */
1050 1504 int fpe_fltden = FPE_FLTDEN;
1051 1505
1052 1506 /*
1053 1507 * Map from the FPU status word to the FP exception si_code.
1054 1508 */
1055 1509 static int
1056 1510 fpe_sicode(uint_t sw)
1057 1511 {
1058 1512 if (sw & FPS_IE)
1059 1513 return (FPE_FLTINV);
1060 1514 if (sw & FPS_ZE)
1061 1515 return (FPE_FLTDIV);
1062 1516 if (sw & FPS_DE)
1063 1517 return (fpe_fltden);
1064 1518 if (sw & FPS_OE)
1065 1519 return (FPE_FLTOVF);
1066 1520 if (sw & FPS_UE)
1067 1521 return (FPE_FLTUND);
1068 1522 if (sw & FPS_PE)
1069 1523 return (FPE_FLTRES);
1070 1524 return (FPE_FLTINV); /* default si_code for other exceptions */
1071 1525 }
1072 1526
1073 1527 /*
1074 1528 * Map from the SSE status word to the FP exception si_code.
1075 1529 */
1076 1530 static int
1077 1531 fpe_simd_sicode(uint_t sw)
1078 1532 {
1079 1533 if (sw & SSE_IE)
1080 1534 return (FPE_FLTINV);
1081 1535 if (sw & SSE_ZE)
1082 1536 return (FPE_FLTDIV);
1083 1537 if (sw & SSE_DE)
1084 1538 return (FPE_FLTDEN);
1085 1539 if (sw & SSE_OE)
1086 1540 return (FPE_FLTOVF);
1087 1541 if (sw & SSE_UE)
1088 1542 return (FPE_FLTUND);
1089 1543 if (sw & SSE_PE)
1090 1544 return (FPE_FLTRES);
1091 1545 return (FPE_FLTINV); /* default si_code for other exceptions */
1092 1546 }
1093 1547
1094 1548 /*
1095 1549 * This routine is invoked as part of libc's __fpstart implementation
1096 1550 * via sysi86(2).
1097 1551 *
1098 1552 * It may be called -before- any context has been assigned in which case
1099 1553 * we try and avoid touching the hardware. Or it may be invoked well
1100 1554 * after the context has been assigned and fiddled with, in which case
1101 1555 * just tweak it directly.
1102 1556 */
1103 1557 void
1104 1558 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1105 1559 {
1106 1560 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1107 1561 struct fxsave_state *fx;
1108 1562
1109 1563 if (!fpu_exists || fp_kind == FP_NO)
1110 1564 return;
1111 1565
1112 1566 if ((fp->fpu_flags & FPU_EN) == 0) {
1113 1567 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1114 1568 /*
1115 1569 * Common case. Floating point unit not yet
1116 1570 * enabled, and kernel already intends to initialize
1117 1571 * the hardware the way the caller wants.
1118 1572 */
1119 1573 return;
1120 1574 }
1121 1575 /*
1122 1576 * Hmm. Userland wants a different default.
1123 1577 * Do a fake "first trap" to establish the context, then
1124 1578 * handle as if we already had a context before we came in.
1125 1579 */
1126 1580 kpreempt_disable();
1127 1581 fp_seed();
1128 1582 kpreempt_enable();
1129 1583 }
1130 1584
1131 1585 /*
1132 1586 * Ensure that the current hardware state is flushed back to the
1133 1587 * pcb, then modify that copy. Next use of the fp will
1134 1588 * restore the context.
1135 1589 */
1136 1590 fp_save(fp);
1137 1591
1138 1592 switch (fp_save_mech) {
1139 1593 case FP_FXSAVE:
1140 1594 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1141 1595 fx->fx_fcw = fcw;
1142 1596 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1143 1597 break;
1144 1598
1145 1599 case FP_XSAVE:
1146 1600 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1147 1601 fx->fx_fcw = fcw;
1148 1602 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1149 1603 /*
1150 1604 * Always set LEGACY_FP as it may have been cleared by XSAVE
1151 1605 * instruction
1152 1606 */
1153 1607 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1154 1608 XFEATURE_LEGACY_FP;
1155 1609 break;
1156 1610 default:
1157 1611 panic("Invalid fp_save_mech");
1158 1612 /*NOTREACHED*/
1159 1613 }
1160 1614 }
1161 1615
1162 1616 static void
1163 1617 kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1164 1618 {
1165 1619 struct xsave_state *xs;
1166 1620
1167 1621 switch (fp_save_mech) {
1168 1622 case FP_FXSAVE:
1169 1623 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1170 1624 sizeof (struct fxsave_state));
1171 1625 kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1172 1626 break;
1173 1627 case FP_XSAVE:
1174 1628 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1175 1629 bzero(xs, cpuid_get_xsave_size());
1176 1630 bcopy(&avx_initial, xs, sizeof (*xs));
1177 1631 xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1178 1632 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1179 1633 break;
1180 1634 default:
1181 1635 panic("invalid fp_save_mech");
1182 1636 }
1183 1637
1184 1638 /*
1185 1639 * Set the corresponding flags that the system expects on the FPU state
1186 1640 * to indicate that this is our state. The FPU_EN flag is required to
1187 1641 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
1188 1642 * not set below as it represents that this state is being suppressed
1189 1643 * by the kernel.
1190 1644 */
1191 1645 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1192 1646 kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1193 1647 }
1194 1648
1195 1649 kfpu_state_t *
1196 1650 kernel_fpu_alloc(int kmflags)
1197 1651 {
1198 1652 kfpu_state_t *kfpu;
1199 1653
1200 1654 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1201 1655 return (NULL);
1202 1656 }
1203 1657
1204 1658 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1205 1659 kmem_cache_alloc(fpsave_cachep, kmflags);
1206 1660 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1207 1661 kmem_free(kfpu, sizeof (kfpu_state_t));
1208 1662 return (NULL);
1209 1663 }
1210 1664
1211 1665 kernel_fpu_fpstate_init(kfpu);
1212 1666
1213 1667 return (kfpu);
1214 1668 }
1215 1669
1216 1670 void
1217 1671 kernel_fpu_free(kfpu_state_t *kfpu)
1218 1672 {
1219 1673 kmem_cache_free(fpsave_cachep,
1220 1674 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1221 1675 kmem_free(kfpu, sizeof (kfpu_state_t));
1222 1676 }
1223 1677
1224 1678 static void
1225 1679 kernel_fpu_ctx_save(void *arg)
1226 1680 {
1227 1681 kfpu_state_t *kfpu = arg;
1228 1682 fpu_ctx_t *pf;
1229 1683
1230 1684 if (kfpu == NULL) {
1231 1685 /*
1232 1686 * A NULL kfpu implies this is a kernel thread with an LWP and
1233 1687 * no user-level FPU usage. Use the lwp fpu save area.
1234 1688 */
1235 1689 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1236 1690
1237 1691 ASSERT(curthread->t_procp->p_flag & SSYS);
1238 1692 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1239 1693
1240 1694 fp_save(pf);
1241 1695 } else {
1242 1696 pf = &kfpu->kfpu_ctx;
1243 1697
1244 1698 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1245 1699 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1246 1700
1247 1701 /*
1248 1702 * Note, we can't use fp_save because it assumes that we're
1249 1703 * saving to the thread's PCB and not somewhere else. Because
1250 1704 * this is a different FPU context, we instead have to do this
1251 1705 * ourselves.
1252 1706 */
1253 1707 switch (fp_save_mech) {
1254 1708 case FP_FXSAVE:
1255 1709 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1256 1710 break;
1257 1711 case FP_XSAVE:
1258 1712 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1259 1713 break;
1260 1714 default:
1261 1715 panic("Invalid fp_save_mech");
1262 1716 }
1263 1717
1264 1718 /*
1265 1719 * Because we have saved context here, our save state is no
1266 1720 * longer valid and therefore needs to be reinitialized.
1267 1721 */
1268 1722 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1269 1723 }
1270 1724
1271 1725 pf->fpu_flags |= FPU_VALID;
1272 1726
1273 1727 /*
1274 1728 * Clear KFPU flag. This allows swtch to check for improper kernel
1275 1729 * usage of the FPU (i.e. switching to a new thread while the old
1276 1730 * thread was in the kernel and using the FPU, but did not perform a
1277 1731 * context save).
1278 1732 */
1279 1733 curthread->t_flag &= ~T_KFPU;
1280 1734 }
1281 1735
1282 1736 static void
1283 1737 kernel_fpu_ctx_restore(void *arg)
1284 1738 {
1285 1739 kfpu_state_t *kfpu = arg;
1286 1740 fpu_ctx_t *pf;
1287 1741
1288 1742 if (kfpu == NULL) {
1289 1743 /*
1290 1744 * A NULL kfpu implies this is a kernel thread with an LWP and
1291 1745 * no user-level FPU usage. Use the lwp fpu save area.
1292 1746 */
1293 1747 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1294 1748
1295 1749 ASSERT(curthread->t_procp->p_flag & SSYS);
1296 1750 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1297 1751 } else {
1298 1752 pf = &kfpu->kfpu_ctx;
1299 1753
1300 1754 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1301 1755 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1302 1756 }
1303 1757
1304 1758 fp_restore(pf);
1305 1759 curthread->t_flag |= T_KFPU;
1306 1760 }
1307 1761
1308 1762 /*
1309 1763 * Validate that the thread is not switching off-cpu while actively using the
1310 1764 * FPU within the kernel.
1311 1765 */
1312 1766 void
1313 1767 kernel_fpu_no_swtch(void)
1314 1768 {
1315 1769 if ((curthread->t_flag & T_KFPU) != 0) {
1316 1770 panic("curthread swtch-ing while the kernel is using the FPU");
1317 1771 }
1318 1772 }
1319 1773
1320 1774 static const struct ctxop_template kfpu_ctxop_tpl = {
1321 1775 .ct_rev = CTXOP_TPL_REV,
1322 1776 .ct_save = kernel_fpu_ctx_save,
1323 1777 .ct_restore = kernel_fpu_ctx_restore,
1324 1778 };
1325 1779
1326 1780 void
1327 1781 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1328 1782 {
1329 1783 klwp_t *pl = curthread->t_lwp;
1330 1784 struct ctxop *ctx;
1331 1785
1332 1786 if ((curthread->t_flag & T_KFPU) != 0) {
1333 1787 panic("curthread attempting to nest kernel FPU states");
1334 1788 }
1335 1789
1336 1790 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1337 1791 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1338 1792 (KFPU_USE_LWP | KFPU_NO_STATE));
1339 1793
1340 1794 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1341 1795 /*
1342 1796 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1343 1797 * hold our kernel FPU context, we depend on the caller doing
1344 1798 * kpreempt_disable for the duration of our FPU usage. This
1345 1799 * should only be done for very short periods of time.
1346 1800 */
1347 1801 ASSERT(curthread->t_preempt > 0);
1348 1802 ASSERT(kfpu == NULL);
1349 1803
1350 1804 if (pl != NULL) {
1351 1805 /*
1352 1806 * We might have already saved once so FPU_VALID could
1353 1807 * be set. This is handled in fp_save.
1354 1808 */
1355 1809 fp_save(&pl->lwp_pcb.pcb_fpu);
1356 1810 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1357 1811 }
1358 1812
1359 1813 curthread->t_flag |= T_KFPU;
1360 1814
1361 1815 /* Always restore the fpu to the initial state. */
1362 1816 fpinit();
1363 1817
1364 1818 return;
1365 1819 }
1366 1820
1367 1821 /*
1368 1822 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1369 1823 */
1370 1824
1371 1825 if ((flags & KFPU_USE_LWP) == 0) {
1372 1826 if (kfpu->kfpu_curthread != NULL)
1373 1827 panic("attempting to reuse kernel FPU state at %p when "
1374 1828 "another thread already is using", kfpu);
1375 1829
1376 1830 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1377 1831 kernel_fpu_fpstate_init(kfpu);
1378 1832
1379 1833 kfpu->kfpu_curthread = curthread;
1380 1834 }
1381 1835
1382 1836 /*
1383 1837 * Not all threads may have an active LWP. If they do and we're not
1384 1838 * going to re-use the LWP, then we should go ahead and save the state.
1385 1839 * We must also note that the fpu is now being used by the kernel and
1386 1840 * therefore we do not want to manage the fpu state via the user-level
1387 1841 * thread's context handlers.
1388 1842 *
1389 1843 * We might have already saved once (due to a prior use of the kernel
1390 1844 * FPU or another code path) so FPU_VALID could be set. This is handled
1391 1845 * by fp_save, as is the FPU_EN check.
1392 1846 */
1393 1847 ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1394 1848 kpreempt_disable();
1395 1849 if (pl != NULL) {
1396 1850 if ((flags & KFPU_USE_LWP) == 0)
1397 1851 fp_save(&pl->lwp_pcb.pcb_fpu);
1398 1852 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1399 1853 }
1400 1854
1401 1855 /*
1402 1856 * Set the context operations for kernel FPU usage. Because kernel FPU
1403 1857 * setup and ctxop attachment needs to happen under the protection of
1404 1858 * kpreempt_disable(), we allocate the ctxop outside the guard so its
1405 1859 * sleeping allocation will not cause a voluntary swtch(). This allows
1406 1860 * the rest of the initialization to proceed, ensuring valid state for
1407 1861 * the ctxop handlers.
1408 1862 */
1409 1863 ctxop_attach(curthread, ctx);
1410 1864 curthread->t_flag |= T_KFPU;
1411 1865
1412 1866 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1413 1867 /*
1414 1868 * For pure kernel threads with an LWP, we can use the LWP's
1415 1869 * pcb_fpu to save/restore context.
1416 1870 */
1417 1871 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1418 1872
1419 1873 VERIFY(curthread->t_procp->p_flag & SSYS);
1420 1874 VERIFY(kfpu == NULL);
1421 1875 ASSERT((pf->fpu_flags & FPU_EN) == 0);
1422 1876
1423 1877 /* Always restore the fpu to the initial state. */
1424 1878 if (fp_save_mech == FP_XSAVE)
1425 1879 pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1426 1880 fpinit();
1427 1881 pf->fpu_flags = FPU_EN | FPU_KERNEL;
1428 1882 } else {
1429 1883 /* initialize the kfpu state */
1430 1884 kernel_fpu_ctx_restore(kfpu);
1431 1885 }
1432 1886 kpreempt_enable();
1433 1887 }
1434 1888
1435 1889 void
1436 1890 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1437 1891 {
1438 1892 if ((curthread->t_flag & T_KFPU) == 0) {
1439 1893 panic("curthread attempting to clear kernel FPU state "
1440 1894 "without using it");
1441 1895 }
1442 1896
1443 1897 /*
1444 1898 * General comments on why the rest of this function is structured the
1445 1899 * way it is. Be aware that there is a lot of subtlety here.
1446 1900 *
1447 1901 * If a user-level thread ever uses the fpu while in the kernel, then
1448 1902 * we cannot call fpdisable since that does STTS. That will set the
1449 1903 * ts bit in %cr0 which will cause an exception if anything touches the
1450 1904 * fpu. However, the user-level context switch handler (fpsave_ctxt)
1451 1905 * needs to access the fpu to save the registers into the pcb.
1452 1906 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
1453 1907 * fprestore_ctxt when the thread context switched onto the CPU.
1454 1908 *
1455 1909 * Calling fpdisable only affects the current CPU's %cr0 register.
1456 1910 *
1457 1911 * During ctxop_remove and kpreempt_enable, we can voluntarily context
1458 1912 * switch, so the CPU we were on when we entered this function might
1459 1913 * not be the same one we're on when we return from ctxop_remove or end
1460 1914 * the function. Note there can be user-level context switch handlers
1461 1915 * still installed if this is a user-level thread.
1462 1916 *
1463 1917 * We also must be careful in the unlikely chance we're running in an
1464 1918 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1465 1919 * incorrectly for the "real" thread to resume on this CPU.
1466 1920 */
1467 1921
1468 1922 if ((flags & KFPU_NO_STATE) == 0) {
1469 1923 kpreempt_disable();
1470 1924 } else {
1471 1925 ASSERT(curthread->t_preempt > 0);
1472 1926 }
1473 1927
1474 1928 curthread->t_flag &= ~T_KFPU;
1475 1929
1476 1930 /*
1477 1931 * When we are ending things, we explicitly don't save the current
1478 1932 * kernel FPU state back to the temporary state. The kfpu API is not
1479 1933 * intended to be a permanent save location.
1480 1934 *
1481 1935 * If this is a user-level thread and we were to context switch
1482 1936 * before returning to user-land, fpsave_ctxt will be a no-op since we
1483 1937 * already saved the user-level FPU state the first time we run
1484 1938 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1485 1939 * the user-level fpu state). The fpsave_ctxt functions only save if
1486 1940 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
1487 1941 * fprestore_ctxt will be done in sys_rtt_common when the thread
1488 1942 * finally returns to user-land.
1489 1943 */
1490 1944
1491 1945 if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1492 1946 curthread->t_intr == NULL) {
1493 1947 /*
1494 1948 * A kernel thread which is not an interrupt thread, so we
1495 1949 * STTS now.
1496 1950 */
1497 1951 fpdisable();
1498 1952 }
1499 1953
1500 1954 if ((flags & KFPU_NO_STATE) == 0) {
1501 1955 ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1502 1956
1503 1957 if (kfpu != NULL) {
1504 1958 if (kfpu->kfpu_curthread != curthread) {
1505 1959 panic("attempting to end kernel FPU state "
1506 1960 "for %p, but active thread is not "
1507 1961 "curthread", kfpu);
1508 1962 } else {
1509 1963 kfpu->kfpu_curthread = NULL;
1510 1964 }
1511 1965 }
1512 1966
1513 1967 kpreempt_enable();
1514 1968 }
1515 1969
|
↓ open down ↓ |
586 lines elided |
↑ open up ↑ |
1516 1970 if (curthread->t_lwp != NULL) {
1517 1971 uint_t f;
1518 1972
1519 1973 if (flags & KFPU_USE_LWP) {
1520 1974 f = FPU_EN | FPU_KERNEL;
1521 1975 } else {
1522 1976 f = FPU_KERNEL;
1523 1977 }
1524 1978 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1525 1979 }
1980 +}
1981 +
1982 +/*
1983 + * Fill in FPU information that is required by exec.
1984 + */
1985 +void
1986 +fpu_auxv_info(int *typep, size_t *lenp)
1987 +{
1988 + *typep = fp_elf;
1989 + switch (fp_save_mech) {
1990 + case FP_FXSAVE:
1991 + *lenp = sizeof (struct fxsave_state);
1992 + break;
1993 + case FP_XSAVE:
1994 + *lenp = cpuid_get_xsave_size();
1995 + break;
1996 + default:
1997 + *lenp = 0;
1998 + break;
1999 + }
2000 +}
2001 +
2002 +/*
2003 + * This function exists to transform an xsave_state into an fxsave_state. The
2004 + * way that we have to do this is nuanced. We assume that callers have already
2005 + * handled FPU_EN and thus we only need to consider the xsave_state and its
2006 + * component vector itself. This results in the following cases that we need to
2007 + * consider:
2008 + *
2009 + * o Neither the x87 nor the XMM state bit is set. We use the hardware default
2010 + * and need to ensure that we copy the xsave header.
2011 + * o Both x87 / XMM state bits are set. We can copy everything.
2012 + * o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2013 + * state be in the initial case.
2014 + * o Only the XMM bit is set. The reverse of the above case.
2015 + *
2016 + * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2017 + * generally the same; however, the default floating point control word is
2018 + * different.
2019 + *
2020 + * Finally, we have the complication of the MXCSR and MXCSR_MASK registers.
2021 + * Because we are using xsave and xsaveopt in the kernel right now and not
2022 + * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2023 + * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2024 + * is set, we must also come back and copy out the MXCSR register. Sorry, we
2025 + * don't make the rules.
2026 + */
2027 +static void
2028 +fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
2029 +{
2030 + const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
2031 +
2032 + switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
2033 + case XFEATURE_LEGACY_FP | XFEATURE_SSE:
2034 + bcopy(xsave, fx, sizeof (*fx));
2035 + return;
2036 + case XFEATURE_LEGACY_FP:
2037 + bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
2038 + fx->fx_mxcsr = SSE_MXCSR_INIT;
2039 + fx->fx_mxcsr_mask = 0;
2040 + break;
2041 + case XFEATURE_SSE:
2042 + bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
2043 + fx_mxcsr));
2044 +
2045 + fx->fx_fcw = FPU_CW_INIT_HW;
2046 + fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2047 + fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2048 + bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
2049 + break;
2050 + default:
2051 + bcopy(&sse_initial, fx, sizeof (*fx));
2052 + fx->fx_fcw = FPU_CW_INIT_HW;
2053 + break;
2054 + }
2055 +
2056 + /*
2057 + * Account for the AVX causing MXCSR to be valid.
2058 + */
2059 + if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
2060 + (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
2061 + fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2062 + fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2063 + }
2064 +}
2065 +
2066 +/*
2067 + * This function is designed to answer the question of are we using any xsave
2068 + * family of instructions in context switch and therefore we have this state.
2069 + * This should still remain true if we are using xsavec or xsaves in the kernel
2070 + * in the future.
2071 + */
2072 +boolean_t
2073 +fpu_xsave_enabled(void)
2074 +{
2075 + return (fp_save_mech == FP_XSAVE);
2076 +}
2077 +
2078 +/*
2079 + * The following structure is used to track and manage the programmatic
2080 + * construction of /proc and signal stack spilling of xsave information. All
2081 + * known xsave types that the kernel supports must be included here.
2082 + */
2083 +typedef struct xsave_proc_info {
2084 + /*
2085 + * This matches the /proc xregs type that this data represents. This is
2086 + * used for /proc only.
2087 + */
2088 + uint32_t xi_type;
2089 + /*
2090 + * This indicates the size of the /proc data that we're operating on.
2091 + * This is only used for /proc.
2092 + */
2093 + size_t xi_size;
2094 + /*
2095 + * This indicates the alignment that we want to have for the member when
2096 + * we're writing out. This is not used when setting data. This is only
2097 + * used for /proc.
2098 + */
2099 + size_t xi_align;
2100 + /*
2101 + * This indicates whether this member must always be considered or not.
2102 + * This is used in both /proc and context/signal handling.
2103 + */
2104 + bool xi_always;
2105 + /*
2106 + * This contains the corresponding bits in the xsave bit vector that
2107 + * correspond to this entry. This is used for both /proc and
2108 + * context/signal handling.
2109 + */
2110 + uint64_t xi_bits;
2111 + /*
2112 + * The xi_fill function pointer is used to write out the /proc regset
2113 + * data (e.g. when a user reads xregs). This is only used for the /proc
2114 + * handling. The xi_valid function pointer is used instead to validate a
2115 + * given set of data that we've read in, while the xi_set pointer is
2116 + * used to actually transform the data in the underlying fpu save area.
2117 + */
2118 + void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
2119 + void *);
2120 + bool (*xi_valid)(model_t, const void *);
2121 + void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
2122 + uint64_t, const void *);
2123 + /*
2124 + * The xi_signal_in and xi_signal_out function pointers are used for
2125 + * extended context and signal handling information. They are used when
2126 + * reading in data from a ucontext_t and writing it out respectively.
2127 + * These are only used for context/signal handling.
2128 + */
2129 + int (*xi_signal_in)(const struct xsave_proc_info *,
2130 + const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
2131 + const uintptr_t);
2132 + int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
2133 + uc_xsave_t *, const void *fpup, uintptr_t);
2134 +} xsave_proc_info_t;
2135 +
2136 +static bool
2137 +fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2138 +{
2139 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
2140 + return (B_TRUE);
2141 + }
2142 +
2143 + return ((fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv &
2144 + feats) == 0);
2145 +}
2146 +
2147 +static void
2148 +fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2149 + void *datap)
2150 +{
2151 + prxregset_xcr_t *xcr = datap;
2152 +
2153 + xcr->prx_xcr_xcr0 = xsave_bv_all;
2154 +}
2155 +
2156 +/*
2157 + * Unlike other instruction portions, we treat the xsave header and the legacy
2158 + * XMM section together as both are somewhat tied at the instruction hip. Unlike
2159 + * the latter values, the initial state here is not quite the same.
2160 + */
2161 +static void
2162 +fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2163 + void *datap)
2164 +{
2165 + prxregset_xsave_t *prxsave = datap;
2166 + const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
2167 + size_t hdr_off;
2168 +
2169 + /*
2170 + * In the x87/XMM case, the no device vs. initial state is different
2171 + * because the initial state case still wants us to copy the real xsave
2172 + * header. It's also worth calling out that the actual illumos default
2173 + * fxsave state is not the same as what Intel documents. The main
2174 + * difference is in what the x87 FPU control word is. This results in
2175 + * the following different cases that we need to think about:
2176 + *
2177 + * o FPU_EN is not set. So we use the illumos default.
2178 + */
2179 + if ((fpu->fpu_flags & FPU_EN) == 0) {
2180 + bcopy(&avx_initial, prxsave, sizeof (*prxsave));
2181 + return;
2182 + }
2183 +
2184 + /*
2185 + * Convert all the fxsave region while taking into account the validity
2186 + * of the xsave bits. The prxregset_xsave_t structure is identical in
2187 + * the first 512-bits to the prxsave structure.
2188 + */
2189 + fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
2190 +
2191 + /*
2192 + * Now that we've dealt with the x87 and XMM state, take care of the
2193 + * header.
2194 + */
2195 + hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
2196 + bcopy((const void *)((uintptr_t)xsave + hdr_off),
2197 + (void *)((uintptr_t)prxsave + hdr_off),
2198 + sizeof (struct xsave_header));
2199 +}
2200 +
2201 +static void
2202 +fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2203 + void *datap)
2204 +{
2205 + if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2206 + size_t size, off;
2207 + const void *xsave_off;
2208 +
2209 + cpuid_get_xsave_info(info->xi_bits, &size, &off);
2210 + ASSERT3U(size, ==, info->xi_size);
2211 + xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2212 + off);
2213 + bcopy(xsave_off, datap, info->xi_size);
2214 + }
2215 +}
2216 +
2217 +/*
2218 + * Users are not allowed to actually set the xcr information this way. However,
2219 + * to make it easier for someone to just do a read, modify, write, of the xregs
2220 + * data, if it is identical, then we will accept it (and do nothing).
2221 + */
2222 +static bool
2223 +fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2224 +{
2225 + const prxregset_xcr_t *xcr = datap;
2226 +
2227 + return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2228 + xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2229 +}
2230 +
2231 +/*
2232 + * To match traditional /proc semantics, we do not error if reserved bits of
2233 + * MXCSR are set, they will be masked off when writing data. We do not allow
2234 + * someone to indicate that they are asking for compressed xsave data, hence the
2235 + * check that prx_xsh_comp_bv is zero. Finally, we will check that each
2236 + * component that was indicated in the xstate_bv is present as another item as
2237 + * part of the broader validation path.
2238 + */
2239 +static bool
2240 +fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2241 +{
2242 + const prxregset_xsave_t *xsave = datap;
2243 + uint64_t rsvd[6] = { 0 };
2244 +
2245 + if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2246 + xsave->prx_xsh_xcomp_bv != 0) {
2247 + return (false);
2248 + }
2249 +
2250 + if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2251 + return (false);
2252 + }
2253 +
2254 + return (true);
2255 +}
2256 +
2257 +/*
2258 + * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2259 + * on x86; however, when operating in ILP32, subsets are reserved. We basically
2260 + * require that all reserved portions are set to zero as our way to accept them.
2261 + */
2262 +static bool
2263 +fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2264 +{
2265 + upad128_t ymm_zero[8];
2266 + const prxregset_ymm_t *ymm = datap;
2267 +
2268 + if (model == DATAMODEL_LP64) {
2269 + return (true);
2270 + }
2271 +
2272 + bzero(&ymm_zero, sizeof (ymm_zero));
2273 + return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2274 +}
2275 +
2276 +static bool
2277 +fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2278 +{
2279 + upad256_t zmm_zero[8];
2280 + const prxregset_zmm_t *zmm = datap;
2281 +
2282 + if (model == DATAMODEL_LP64) {
2283 + return (true);
2284 + }
2285 +
2286 + bzero(&zmm_zero, sizeof (zmm_zero));
2287 + return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2288 +}
2289 +
2290 +static bool
2291 +fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2292 +{
2293 + prxregset_hi_zmm_t hi_zmm_zero;
2294 + const prxregset_hi_zmm_t *hi_zmm = datap;
2295 +
2296 + if (model == DATAMODEL_LP64) {
2297 + return (true);
2298 + }
2299 +
2300 + bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2301 + return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2302 +}
2303 +
2304 +/*
2305 + * The xsave state consists of the first 512 bytes of the XMM state and then the
2306 + * xsave header itself. Because of the xsave header, this structure is marked
2307 + * with xi_always, so we must always process and consider it.
2308 + *
2309 + * Semantically if either of the bits around SSE / x87 is set, then we will copy
2310 + * the entire thing. This may mean that we end up copying a region that is not
2311 + * valid into the save area; however, that should be OK as we still have the
2312 + * specific bit flags that indicate what we should consider or not.
2313 + *
2314 + * There is one additional wrinkle we need to consider and honor here. The CPU
2315 + * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
2316 + * anything else. So if this is set and we do not have valid x87/XMM bits
2317 + * set then we will set the MXCSR to its default state in case the processor
2318 + * tries to load it. For reference see:
2319 + *
2320 + * o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
2321 + * o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
2322 + *
2323 + * Note, the behavior around this changes depending on whether using the
2324 + * compressed xrstor or not. We are not, but it's worth being aware of. We do
2325 + * not worry about MXCSR_MASK because the instructions ignore it.
2326 + */
2327 +static void
2328 +fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2329 + uint64_t xsave_bv, const void *datap)
2330 +{
2331 + const struct xsave_state *xs = datap;
2332 +
2333 + if ((xsave_bv & info->xi_bits) != 0) {
2334 + bcopy(&xs->xs_fxsave, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave,
2335 + sizeof (struct fxsave_state));
2336 + } else if ((xsave_bv & XFEATURE_AVX) != 0) {
2337 + fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr =
2338 + SSE_MXCSR_INIT;
2339 + }
2340 +
2341 + bcopy(&xs->xs_header, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header,
2342 + sizeof (struct xsave_header));
2343 + fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2344 +}
2345 +
2346 +static void
2347 +fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2348 + uint64_t xsave_bv, const void *datap)
2349 +{
2350 + size_t size, off;
2351 + void *xsave_off;
2352 +
2353 + cpuid_get_xsave_info(info->xi_bits, &size, &off);
2354 + xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2355 + off);
2356 + bcopy(datap, xsave_off, size);
2357 +}
2358 +
2359 +/*
2360 + * Dealing with XMM data is a little more annoying here. If UC_FPU is set, it
2361 + * also contains a copy of the XMM region. That must take priority over anything
2362 + * we have here. In the copyout code we do not set the XMM bits here as
2363 + * something to copy, therefore if they are set, we currently treat that as an
2364 + * error.
2365 + *
2366 + * The system has always gone through and cleaned up the reserved bits in the
2367 + * fxsave state when someone calls setcontext(). Therefore we need to do the
2368 + * same thing which is why you see the masking of the mxcsr below.
2369 + *
2370 + * Finally, there is one last wrinkle here that we need to consider. The
2371 + * fpregset_t has historically had two private words that are used to convey the
2372 + * status which cache the status/exception information. Therefore, we well...
2373 + * cheat. Intel has left bytes 464 (0x1d0) through 511 (0x1ff) available for us
2374 + * to do what we want. So we will pass this through that for the moment to help
2375 + * us pass this state around without too much extra allocation.
2376 + */
2377 +static int
2378 +fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
2379 + const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2380 + const uintptr_t max_udata)
2381 +{
2382 + struct xsave_state *xsave = fpup;
2383 +
2384 + if ((ucx->ucx_bv & info->xi_bits) != 0) {
2385 + return (EINVAL);
2386 + }
2387 +
2388 + if ((kuc->uc_flags & UC_FPU) != 0) {
2389 + bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
2390 + sizeof (struct fxsave_state));
2391 + xsave->xs_fxsave.__fx_ign2[3]._l[0] =
2392 + kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
2393 + xsave->xs_fxsave.__fx_ign2[3]._l[1] =
2394 + kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
2395 + xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2396 + xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2397 + }
2398 +
2399 + return (0);
2400 +}
2401 +
2402 +static int
2403 +fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2404 + const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2405 + const uintptr_t max_udata)
2406 +{
2407 + size_t len, xsave_off;
2408 + void *copy_to;
2409 + struct xsave_state *xsave = fpup;
2410 +
2411 + cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2412 + if (*udatap + len > max_udata) {
2413 + return (EOVERFLOW);
2414 + }
2415 +
2416 + copy_to = (void *)((uintptr_t)fpup + xsave_off);
2417 + if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2418 + return (EFAULT);
2419 + }
2420 +
2421 + xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2422 + *udatap = *udatap + len;
2423 +
2424 + return (0);
2425 +}
2426 +
2427 +static int
2428 +fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2429 + uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2430 +{
2431 + size_t len, xsave_off;
2432 + const void *copy_from;
2433 + void *copy_to;
2434 + int ret;
2435 +
2436 + cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2437 + copy_from = (void *)(uintptr_t)fpup + xsave_off;
2438 + copy_to = (void *)(udatap + ucx->ucx_len);
2439 +
2440 + ret = copyfunc(copy_from, copy_to, len);
2441 + if (ret != 0) {
2442 + return (ret);
2443 + }
2444 +
2445 + ucx->ucx_len += len;
2446 + ucx->ucx_bv |= info->xi_bits;
2447 + return (0);
2448 +}
2449 +
2450 +/*
2451 + * This table contains information about the extended FPU states and synthetic
2452 + * information we create for /proc, the ucontext_t, and signal handling. The
2453 + * definition of the xsave_proc_info_t describes how each member is used.
2454 + *
2455 + * In general, this table is expected to be in the order of the xsave data
2456 + * structure itself. Synthetic elements that we create can go anywhere and new
2457 + * ones should be inserted at the end. This structure is walked in order to
2458 + * produce the /proc and signal handling logic, so changing the order is
2459 + * meaningful for those and probably should not be done lightly.
2460 + */
2461 +static const xsave_proc_info_t fpu_xsave_info[] = { {
2462 + .xi_type = PRX_INFO_XCR,
2463 + .xi_size = sizeof (prxregset_xcr_t),
2464 + .xi_align = alignof (prxregset_xcr_t),
2465 + .xi_always = true,
2466 + .xi_bits = 0,
2467 + .xi_fill = fpu_proc_xregs_xcr_fill,
2468 + .xi_valid = fpu_proc_xregs_xcr_valid
2469 +}, {
2470 + /*
2471 + * The XSAVE entry covers both the xsave header and the %xmm registers.
2472 + * Note, there is no signal copyout information for the %xmm registers
2473 + * because it is expected that that data is already in the fpregset_t.
2474 + */
2475 + .xi_type = PRX_INFO_XSAVE,
2476 + .xi_size = sizeof (prxregset_xsave_t),
2477 + .xi_align = FPU_ALIGN_XMM,
2478 + .xi_always = true,
2479 + .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
2480 + .xi_fill = fpu_proc_xregs_xsave_fill,
2481 + .xi_set = fpu_proc_xregs_xsave_set,
2482 + .xi_valid = fpu_proc_xregs_xsave_valid,
2483 + .xi_signal_in = fpu_signal_copyin_xmm
2484 +}, {
2485 + .xi_type = PRX_INFO_YMM,
2486 + .xi_size = sizeof (prxregset_ymm_t),
2487 + .xi_align = FPU_ALIGN_YMM,
2488 + .xi_always = false,
2489 + .xi_bits = XFEATURE_AVX,
2490 + .xi_fill = fpu_proc_xregs_std_fill,
2491 + .xi_set = fpu_proc_xregs_std_set,
2492 + .xi_signal_in = fpu_signal_copyin_std,
2493 + .xi_valid = fpu_proc_xregs_ymm_valid,
2494 + .xi_signal_out = fpu_signal_copyout_std
2495 +}, {
2496 + /*
2497 + * There is no /proc validation function for the mask registers because
2498 + * they are the same in ILP32 / LP64 and there is nothing for us to
2499 + * actually validate.
2500 + */
2501 + .xi_type = PRX_INFO_OPMASK,
2502 + .xi_size = sizeof (prxregset_opmask_t),
2503 + .xi_align = alignof (prxregset_opmask_t),
2504 + .xi_always = false,
2505 + .xi_bits = XFEATURE_AVX512_OPMASK,
2506 + .xi_fill = fpu_proc_xregs_std_fill,
2507 + .xi_set = fpu_proc_xregs_std_set,
2508 + .xi_signal_in = fpu_signal_copyin_std,
2509 + .xi_signal_out = fpu_signal_copyout_std
2510 +}, {
2511 + .xi_type = PRX_INFO_ZMM,
2512 + .xi_size = sizeof (prxregset_zmm_t),
2513 + .xi_align = FPU_ALIGN_ZMM,
2514 + .xi_always = false,
2515 + .xi_bits = XFEATURE_AVX512_ZMM,
2516 + .xi_fill = fpu_proc_xregs_std_fill,
2517 + .xi_set = fpu_proc_xregs_std_set,
2518 + .xi_valid = fpu_proc_xregs_zmm_valid,
2519 + .xi_signal_in = fpu_signal_copyin_std,
2520 + .xi_signal_out = fpu_signal_copyout_std
2521 +}, {
2522 + .xi_type = PRX_INFO_HI_ZMM,
2523 + .xi_size = sizeof (prxregset_hi_zmm_t),
2524 + .xi_align = FPU_ALIGN_ZMM,
2525 + .xi_always = false,
2526 + .xi_bits = XFEATURE_AVX512_HI_ZMM,
2527 + .xi_fill = fpu_proc_xregs_std_fill,
2528 + .xi_set = fpu_proc_xregs_std_set,
2529 + .xi_valid = fpu_proc_xregs_hi_zmm_valid,
2530 + .xi_signal_in = fpu_signal_copyin_std,
2531 + .xi_signal_out = fpu_signal_copyout_std
2532 +} };
2533 +
2534 +static bool
2535 +fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2536 +{
2537 + return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2538 +}
2539 +
2540 +void
2541 +fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
2542 + uint32_t *dstart)
2543 +{
2544 + size_t ret = sizeof (prxregset_hdr_t);
2545 + uint32_t ninfo = 0;
2546 +
2547 + ASSERT(fpu_xsave_enabled());
2548 +
2549 + /*
2550 + * Right now the set of flags that are enabled in the FPU is global.
2551 + * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the
2552 + * actual things that might show up and we care about are all about what
2553 + * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
2554 + * move to per-process FPU enablement which is likely to come with AMX,
2555 + * then this will need the proc_t to look at, hence why we've set things
2556 + * up with the unused variable above.
2557 + *
2558 + * We take two passes through the array. The first is just to count up
2559 + * how many informational entries we need.
2560 + */
2561 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2562 + if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2563 + continue;
2564 + ninfo++;
2565 + }
2566 +
2567 + ASSERT3U(ninfo, >, 0);
2568 + ret += sizeof (prxregset_info_t) * ninfo;
2569 +
2570 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2571 + size_t curphase;
2572 + if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2573 + continue;
2574 +
2575 + curphase = ret % fpu_xsave_info[i].xi_align;
2576 + if (ret < fpu_xsave_info[i].xi_align) {
2577 + ret = fpu_xsave_info[i].xi_align;
2578 + } else if (curphase != 0) {
2579 + ret += curphase;
2580 + }
2581 +
2582 + if (i == 0 && dstart != NULL) {
2583 + *dstart = ret;
2584 + }
2585 +
2586 + ret += fpu_xsave_info[i].xi_size;
2587 + }
2588 +
2589 + VERIFY3U(ret, <=, UINT32_MAX);
2590 + if (sizep != NULL) {
2591 + *sizep = ret;
2592 + }
2593 +
2594 + if (ninfop != NULL) {
2595 + *ninfop = ninfo;
2596 + }
2597 +}
2598 +
2599 +/*
2600 + * This function supports /proc. Because /proc does not have a process locked
2601 + * while processing a PCSXREG, so this tries to establish an upper bound that we
2602 + * will validate later in fpu_proc_xregs_set(). We basically say that if you
2603 + * take the maximum xsave size and add 1 KiB that is a good enough approximation
2604 + * for the maximum size.
2605 + */
2606 +size_t
2607 +fpu_proc_xregs_max_size(void)
2608 +{
2609 + VERIFY(fpu_xsave_enabled());
2610 + return (cpuid_get_xsave_size() + 0x1000);
2611 +}
2612 +
2613 +/*
2614 + * This functions supports /proc. In particular, it's meant to perform the
2615 + * following:
2616 + *
2617 + * o Potentially save the current thread's registers.
2618 + * o Write out the x86 xsave /proc xregs format data from the xsave data we
2619 + * actually have. Note, this can be a little weird for cases where the FPU is
2620 + * not actually enabled, which happens for system processes.
2621 + * /proc let us read this state?
2622 + */
2623 +void
2624 +fpu_proc_xregs_get(struct _klwp *lwp, void *buf)
2625 +{
2626 + uint32_t size, ninfo, curinfo, dstart;
2627 + fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2628 + prxregset_hdr_t *hdr = buf;
2629 +
2630 + ASSERT(fpu_xsave_enabled());
2631 + fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
2632 +
2633 + /*
2634 + * Before we get going, defensively zero out all the data buffer so that
2635 + * the rest of the fill functions can assume a specific base.
2636 + */
2637 + bzero(buf, size);
2638 +
2639 + kpreempt_disable();
2640 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2641 + /*
2642 + * This case suggests that thread in question doesn't have a
2643 + * valid FPU save state which should only happen when it is on
2644 + * CPU. If this is the case, we must ensure that we save the
2645 + * current FPU state before proceeding. We also sanity check
2646 + * several things here before doing this as using /proc on
2647 + * yourself is always exciting. fp_save() will ensure that the
2648 + * thread is flagged to go back to being an eager FPU before
2649 + * returning back to userland.
2650 + */
2651 + VERIFY3P(curthread, ==, lwptot(lwp));
2652 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2653 + fp_save(fpu);
2654 + }
2655 + kpreempt_enable();
2656 +
2657 + hdr->pr_type = PR_TYPE_XSAVE;
2658 + hdr->pr_size = size;
2659 + hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
2660 + hdr->pr_pad[3] = 0;
2661 + hdr->pr_ninfo = ninfo;
2662 +
2663 + curinfo = 0;
2664 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2665 + void *startp;
2666 + uint32_t phase;
2667 +
2668 + if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2669 + continue;
2670 +
2671 + phase = dstart % fpu_xsave_info[i].xi_align;
2672 + if (dstart < fpu_xsave_info[i].xi_align) {
2673 + ASSERT3U(i, !=, 0);
2674 + dstart = fpu_xsave_info[i].xi_align;
2675 + } else if (phase != 0) {
2676 + ASSERT3U(i, !=, 0);
2677 + dstart += phase;
2678 + }
2679 +
2680 + hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
2681 + hdr->pr_info[curinfo].pri_flags = 0;
2682 + hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
2683 + hdr->pr_info[curinfo].pri_offset = dstart;
2684 +
2685 + startp = (void *)((uintptr_t)buf + dstart);
2686 + fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
2687 + dstart += fpu_xsave_info[i].xi_size;
2688 + ASSERT3U(curinfo, <=, ninfo);
2689 + curinfo++;
2690 + }
2691 +}
2692 +
2693 +/*
2694 + * We have been asked to set the data in the FPU for a given thread. Our
2695 + * prmachdep code has already validated that the raw semantics of the data that
2696 + * we have are valid (that is the appropriate sizes, offsets, and flags). We now
2697 + * apply additional checking here:
2698 + *
2699 + * o The xsave structure is present and only valid bits are set.
2700 + * o If the xsave component bit-vector is set, we have the corresponding proc
2701 + * info item.
2702 + * o Read-only items are ignored if and only if they actually match what we
2703 + * gave the user mostly as a courtesy to simplify things here.
2704 + * o ILP32 processes which can't support many of the regions are allowed to
2705 + * have the items here (as we likely gave them to them), but they must be
2706 + * zero if they are set.
2707 + *
2708 + * We take a first pass through all the data, validating it makes sense for the
2709 + * FPU. Only after that point do we ensure that we have the FPU data in question
2710 + * and then we clobber all the FPU data. Part of the semantics of setting this
2711 + * is that we're setting the entire extended FPU.
2712 + */
2713 +int
2714 +fpu_proc_xregs_set(struct _klwp *lwp, void *buf)
2715 +{
2716 + prxregset_hdr_t *prx = buf;
2717 + model_t model = lwp_getdatamodel(lwp);
2718 + uint64_t bv_found = 0;
2719 + const prxregset_xsave_t *xsave = NULL;
2720 + fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2721 +
2722 + VERIFY(fpu_xsave_enabled());
2723 +
2724 + /*
2725 + * First, walk each note info header that we have from the user and
2726 + * proceed to validate it. The prmachdep code has already validated that
2727 + * the size, type, and offset information is valid, but it has not
2728 + * validated the semantic contents of this or if someone is trying to
2729 + * write something they shouldn't.
2730 + *
2731 + * While we walk this, we keep track of where the xsave header is. We
2732 + * also track all of the bits that we have found along the way so we can
2733 + * match up and ensure that everything that was set has a corresponding
2734 + * bit in the xsave bitmap. If we have something in the xsave bitmap,
2735 + * but not its corresponding data, then that is an error. However, we
2736 + * allow folks to write data regions without the bit set in the xsave
2737 + * data to make the read, modify, write process simpler.
2738 + */
2739 + for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2740 + const prxregset_info_t *info = &prx->pr_info[i];
2741 + bool found = false;
2742 +
2743 + for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2744 + void *data;
2745 + if (info->pri_type != fpu_xsave_info[pt].xi_type)
2746 + continue;
2747 +
2748 + found = true;
2749 + data = (void *)((uintptr_t)buf + info->pri_offset);
2750 + if (fpu_xsave_info[pt].xi_valid != NULL &&
2751 + !fpu_xsave_info[pt].xi_valid(model, data)) {
2752 + return (EINVAL);
2753 + }
2754 +
2755 + if (info->pri_type == PRX_INFO_XSAVE) {
2756 + xsave = data;
2757 + }
2758 + bv_found |= fpu_xsave_info[pt].xi_bits;
2759 + break;
2760 + }
2761 +
2762 + if (!found) {
2763 + return (EINVAL);
2764 + }
2765 + }
2766 +
2767 + /*
2768 + * No xsave data, no dice.
2769 + */
2770 + if (xsave == NULL) {
2771 + return (EINVAL);
2772 + }
2773 +
2774 + /*
2775 + * If anything is set in the xsave header that was not found as we
2776 + * walked structures, then that is an error. The opposite is not true as
2777 + * discussed above.
2778 + */
2779 + if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
2780 + return (EINVAL);
2781 + }
2782 +
2783 + /*
2784 + * At this point, we consider all the data actually valid. Now we must
2785 + * set up this information in the save area. If this is our own lwp, we
2786 + * must disable it first. Otherwise, we expect that it is already valid.
2787 + * To try to sanitize this, we will defensively zero the entire region
2788 + * as we are setting everything that will result in here.
2789 + */
2790 + kpreempt_disable();
2791 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2792 + /*
2793 + * This case suggests that thread in question doesn't have a
2794 + * valid FPU save state which should only happen when it is on
2795 + * CPU. If this is the case, we explicitly disable the FPU, but
2796 + * do not save it before proceeding. We also sanity check
2797 + * several things here before doing this as using /proc on
2798 + * yourself is always exciting. Unlike fp_save(), fp_free() does
2799 + * not signal that an update is required, so we unconditionally
2800 + * set that for all threads.
2801 + */
2802 + VERIFY3P(curthread, ==, lwptot(lwp));
2803 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2804 + fp_free(fpu);
2805 + }
2806 + PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
2807 + bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2808 + cpuid_get_xsave_size());
2809 +
2810 + for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2811 + const prxregset_info_t *info = &prx->pr_info[i];
2812 + bool found = false;
2813 +
2814 + for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2815 + const void *data;
2816 + if (info->pri_type != fpu_xsave_info[pt].xi_type)
2817 + continue;
2818 +
2819 + /*
2820 + * Check if we have a set function and if we should
2821 + * include this. We may not if this is something like
2822 + * PRX_INFO_XCR which is read-only.
2823 + *
2824 + * We may not include a given entry as it may not have
2825 + * been set in the actual xsave state that we have been
2826 + * asked to restore, in which case to not break the
2827 + * xsaveopt logic, we must leave it in its initial
2828 + * state, e.g. zeroed (generally). XMM data initial
2829 + * state is not zeroed, but is marked with xi_always to
2830 + * help account for this.
2831 + */
2832 + found = true;
2833 + if (fpu_xsave_info[pt].xi_set == NULL)
2834 + break;
2835 + if (!fpu_xsave_info[pt].xi_always &&
2836 + (xsave->prx_xsh_xstate_bv &
2837 + fpu_xsave_info[pt].xi_bits) !=
2838 + fpu_xsave_info[pt].xi_bits) {
2839 + break;
2840 + }
2841 +
2842 + data = (void *)((uintptr_t)buf + info->pri_offset);
2843 + fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
2844 + xsave->prx_xsh_xstate_bv, data);
2845 + }
2846 +
2847 + VERIFY(found);
2848 + }
2849 + kpreempt_enable();
2850 +
2851 + return (0);
2852 +}
2853 +
2854 +/*
2855 + * To be included in the signal copyout logic we must have a copy function and
2856 + * the bit in question must be included. Note, we don't consult xi_always here
2857 + * as that is really part of what is always present for xsave logic and
2858 + * therefore isn't really pertinent here because of our custom format. See the
2859 + * big theory statement for more info.
2860 + */
2861 +static bool
2862 +fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2863 +{
2864 + return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2865 + infop->xi_signal_out != NULL);
2866 +}
2867 +
2868 +/*
2869 + * We need to fill out the xsave related data into the ucontext_t that we've
2870 + * been given. We should have a valid user pointer at this point in the uc_xsave
2871 + * member. This is much simpler than the copyin that we have. Here are the
2872 + * current assumptions:
2873 + *
2874 + * o This is being called for the current thread. This is not meant to operate
2875 + * on an arbitrary thread's state.
2876 + * o We cannot assume whether the FPU is valid in the pcb or not. While most
2877 + * callers will have just called getfpregs() which saved the state, don't
2878 + * assume that.
2879 + * o We assume that the user address has the requisite required space for this
2880 + * to be copied out.
2881 + * o We assume that copyfunc() will ensure we are not copying into a kernel
2882 + * address.
2883 + *
2884 + * For more information on the format of the data, see the 'Signal Handling and
2885 + * the ucontext_t' portion of the big theory statement. We copy out all the
2886 + * constituent parts and then come back and write out the actual final header
2887 + * information.
2888 + */
2889 +int
2890 +fpu_signal_copyout(struct _klwp *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2891 +{
2892 + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2893 + uint64_t xs_bv;
2894 + uc_xsave_t ucx;
2895 + int ret;
2896 +
2897 + VERIFY3P(curthread, ==, lwptot(lwp));
2898 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2899 + ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2900 +
2901 + if (!fpu_xsave_enabled()) {
2902 + return (ENOTSUP);
2903 + }
2904 +
2905 + /*
2906 + * Unlike when we're dealing with /proc, we can unconditionally call
2907 + * fp_save() because this is always called in the context that the lwp
2908 + * we're operating on is always the one on CPU (which is what fp_save()
2909 + * asserts).
2910 + */
2911 + fp_save(fpu);
2912 +
2913 + bzero(&ucx, sizeof (ucx));
2914 + ucx.ucx_vers = UC_XSAVE_VERS;
2915 + ucx.ucx_len += sizeof (uc_xsave_t);
2916 +
2917 + xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2918 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2919 + const xsave_proc_info_t *info = &fpu_xsave_info[i];
2920 +
2921 + if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
2922 + continue;
2923 + ret = info->xi_signal_out(info, copyfunc, &ucx,
2924 + lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2925 + uaddr);
2926 + if (ret != 0) {
2927 + kpreempt_enable();
2928 + return (ret);
2929 + }
2930 + }
2931 +
2932 + /*
2933 + * Now that everything has been copied out, we should have an accurate
2934 + * value in the uc_xsave_t header and we can copy that out at the start
2935 + * of the user data.
2936 + */
2937 + ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
2938 + return (ret);
2939 +}
2940 +
2941 +/*
2942 + * Here we've been given a ucontext_t which potentially has a user pointer to
2943 + * xsave state that we've copied out previously. In this case we need to do the
2944 + * following, assuming UC_XSAVE is present:
2945 + *
2946 + * o Copy in our header and validate it.
2947 + * o Allocate an fpu context to use as a holding ground for all this data.
2948 + * o If UC_FPU is set, override the xsave structure with the saved XMM state,
2949 + * clear UC_FPU, and make sure that the correct xsave_bv bits are set.
2950 + *
2951 + * Currently we always allocate the additional state as a holding ground for the
2952 + * FPU. What we're copying in may not be valid and we don't want to clobber the
2953 + * existing FPU state or deal with merging it until we believe it's reasonable
2954 + * enough. The proc_t is here to set us up for when we have per-process settings
2955 + * in the extended feature disable MSRs.
2956 + */
2957 +int
2958 +fpu_signal_copyin(struct _klwp *lwp, ucontext_t *kuc)
2959 +{
2960 + uc_xsave_t ucx;
2961 + uint64_t bv;
2962 + uintptr_t data, max_data;
2963 + void *fpu;
2964 + proc_t *p = lwp->lwp_procp;
2965 + size_t ksize;
2966 +
2967 + /*
2968 + * Because this has been opaque filler and the kernel has never
2969 + * historically looked at it, we don't really care about the uc_xsave
2970 + * pointer being garbage in the case that the flag is not set. While
2971 + * this isn't perhaps the most sporting choice in some cases, this is on
2972 + * the other hand, pragmatic.
2973 + */
2974 + if ((kuc->uc_flags & UC_XSAVE) != 0) {
2975 + if (kuc->uc_xsave == 0) {
2976 + return (EINVAL);
2977 + }
2978 +
2979 + if (!fpu_xsave_enabled()) {
2980 + return (ENOTSUP);
2981 + }
2982 + } else {
2983 + return (0);
2984 + }
2985 +
2986 + if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
2987 + 0) {
2988 + return (EFAULT);
2989 + }
2990 +
2991 + ksize = cpuid_get_xsave_size();
2992 + if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
2993 + ucx.ucx_len > ksize ||
2994 + (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
2995 + (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
2996 + (uintptr_t)kuc->uc_xsave) {
2997 + return (EINVAL);
2998 + }
2999 +
3000 + /*
3001 + * OK, our goal right now is to recreate a valid xsave_state structure
3002 + * that we'll ultimately end up having to merge with our existing one in
3003 + * the FPU save state. The reason we describe this as a merge is to help
3004 + * future us when we want to retain supervisor state which will never be
3005 + * part of userland signal state. The design of the userland signal
3006 + * state is basically to compress it as much as we can. This is done for
3007 + * two reasons:
3008 + *
3009 + * 1) We currently consider this a private interface.
3010 + * 2) We really want to minimize the actual amount of stack space we
3011 + * use as much as possible. Most applications aren't using AVX-512
3012 + * right now, so doing our own compression style is worthwhile. If
3013 + * libc adopts AVX-512 routines, we may want to change this.
3014 + *
3015 + * On the allocation below, our assumption is that if a thread has taken
3016 + * a signal, then it is likely to take a signal again in the future (or
3017 + * be shortly headed to its demise). As such, when that happens we will
3018 + * leave the allocated signal stack around for the process. Most
3019 + * applications don't allow all threads to take signals, so this should
3020 + * hopefully help amortize the cost of the allocation.
3021 + */
3022 + max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
3023 + data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
3024 + bv = ucx.ucx_bv;
3025 + if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
3026 + lwp->lwp_pcb.pcb_fpu.fpu_signal =
3027 + kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
3028 + }
3029 + fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;
3030 +
3031 + /*
3032 + * Unconditionally initialize the memory we get in here to ensure that
3033 + * it is in a reasonable state for ourselves. This ensures that unused
3034 + * regions are mostly left in their initial state (the main exception
3035 + * here is the x87/XMM state, but that should be OK). We don't fill in
3036 + * the initial xsave state as we expect that to happen as part of our
3037 + * processing.
3038 + */
3039 + bzero(fpu, ksize);
3040 +
3041 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3042 + int ret;
3043 + const xsave_proc_info_t *info = &fpu_xsave_info[i];
3044 + if (!info->xi_always && (info->xi_bits & bv) == 0)
3045 + continue;
3046 + bv &= ~info->xi_bits;
3047 +
3048 + if (info->xi_signal_in == NULL)
3049 + continue;
3050 + ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
3051 + if (ret != 0) {
3052 + return (ret);
3053 + }
3054 + }
3055 + ASSERT0(bv);
3056 +
3057 + /*
3058 + * As described in the big theory statement section 'Signal Handling and
3059 + * the ucontext_t', we always remove UC_FPU from here as we've taken
3060 + * care of reassembling it ourselves.
3061 + */
3062 + kuc->uc_flags &= ~UC_FPU;
3063 + kuc->uc_xsave = (uintptr_t)fpu;
3064 +
3065 + return (0);
3066 +}
3067 +
3068 +/*
3069 + * This determines the size of the signal stack that we need for our custom form
3070 + * of the xsave state.
3071 + */
3072 +size_t
3073 +fpu_signal_size(struct _klwp *lwp)
3074 +{
3075 + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3076 + size_t len = sizeof (uc_xsave_t);
3077 + uint64_t xs_bv;
3078 +
3079 + VERIFY3P(curthread, ==, lwptot(lwp));
3080 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3081 + ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3082 +
3083 + if (!fpu_xsave_enabled()) {
3084 + return (0);
3085 + }
3086 +
3087 + kpreempt_disable();
3088 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3089 + fp_save(fpu);
3090 + }
3091 +
3092 + xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3093 + for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3094 + size_t comp_size;
3095 +
3096 + if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3097 + continue;
3098 +
3099 + cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3100 + NULL);
3101 + len += comp_size;
3102 + }
3103 +
3104 + kpreempt_enable();
3105 + return (len);
3106 +}
3107 +
3108 +/*
3109 + * This function is used in service of restorecontext() to set the specified
3110 + * thread's extended FPU state to the passed in data. Our assumptions at this
3111 + * point from the system are:
3112 + *
3113 + * o Someone has already verified that the actual xsave header is correct.
3114 + * o Any traditional XMM state that causes a #gp has been clamped.
3115 + * o That data is basically the correct sized xsave state structure. Right now
3116 + * that means it is not compressed and follows the CPUID-based rules for
3117 + * constructing and laying out data.
3118 + * o That the lwp argument does refer to the current thread.
3119 + *
3120 + * Our primary purpose here is to merge the current FPU state with what exists
3121 + * here. Right now, "merge", strictly speaking is just "replace". We can get
3122 + * away with just replacing everything because all we currently save are user
3123 + * states. If we start saving kernel states in here, this will get more nuanced
3124 + * and we will need to be more careful about how we store data here.
3125 + */
3126 +void
3127 +fpu_set_xsave(struct _klwp *lwp, const void *data)
3128 +{
3129 + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3130 + uint32_t status, xstatus;
3131 + struct xsave_state *dst_xsave;
3132 +
3133 + ASSERT(fpu_xsave_enabled());
3134 + VERIFY3P(curthread, ==, lwptot(lwp));
3135 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3136 + ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3137 +
3138 + /*
3139 + * We use fp_save() here rather than a stock fpdisable() so we can
3140 + * attempt to honor our invariants that when the thread state has been
3141 + * saved, the valid flag is set, even though we're going to be
3142 + * overwriting it shortly. If we just called fpdisable() then we would
3143 + * basically be asking for trouble.
3144 + *
3145 + * Because we are modifying the state here and we don't want the system
3146 + * to end up in an odd state, we are being a little paranoid and
3147 + * disabling preemption across this operation. In particular, once the
3148 + * state is properly tagged with FPU_VALID, there should be no other way
3149 + * that this thread can return to userland and get cleared out because
3150 + * we're resetting its context; however, we let paranoia win out.
3151 + */
3152 + kpreempt_disable();
3153 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3154 + fp_save(fpu);
3155 + }
3156 +
3157 + bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
3158 + cpuid_get_xsave_size());
3159 + dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
3160 + status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
3161 + xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
3162 + dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
3163 + dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;
3164 +
3165 + /*
3166 + * These two status words are information that the kernel itself uses to
3167 + * track additional information and is part of the traditional fpregset,
3168 + * but is not part of our xregs information. Because we are setting this
3169 + * state, we leave it up to the rest of the kernel to determine whether
3170 + * this came from an fpregset_t or is being reset to the default of 0.
3171 + */
3172 + fpu->fpu_regs.kfpu_status = status;
3173 + fpu->fpu_regs.kfpu_xstatus = xstatus;
3174 +
3175 + fpu->fpu_flags |= FPU_VALID;
3176 + PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3177 + kpreempt_enable();
3178 +}
3179 +
3180 +/*
3181 + * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
3182 + * kernel, this is just an fxsave_state with additional values for the status
3183 + * and xstatus members.
3184 + *
3185 + * This has the same nuance as the xregs cases discussed above, but is simpler
3186 + * in that we only need to handle the fxsave state, but more complicated because
3187 + * we need to check our save mechanism.
3188 + */
3189 +void
3190 +fpu_get_fpregset(struct _klwp *lwp, fpregset_t *fp)
3191 +{
3192 + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3193 +
3194 + kpreempt_disable();
3195 + fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
3196 + fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;
3197 +
3198 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3199 + /*
3200 + * If we're requesting the fpregs of a thread that isn't
3201 + * currently valid and isn't the one that we're executing, then
3202 + * we consider getting this information to be a best-effort and
3203 + * we will not stop the thread in question to serialize it,
3204 + * which means possibly getting stale data. This is the
3205 + * traditional semantics that the system has used to service
3206 + * this for /proc.
3207 + */
3208 + if (curthread == lwptot(lwp)) {
3209 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3210 + fp_save(fpu);
3211 + }
3212 + }
3213 +
3214 + /*
3215 + * If the FPU is not enabled and the state isn't valid (due to someone
3216 + * else setting it), just copy the initial state.
3217 + */
3218 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
3219 + bcopy(&sse_initial, fp, sizeof (sse_initial));
3220 + kpreempt_enable();
3221 + return;
3222 + }
3223 +
3224 + /*
3225 + * Given that we have an enabled FPU, we must look at the type of FPU
3226 + * save mechanism to clean this up. In particular, while we can just
3227 + * copy the save area with FXSAVE, with XSAVE we must carefully copy
3228 + * only the bits that are valid and reset the rest to their default
3229 + * state.
3230 + */
3231 + switch (fp_save_mech) {
3232 + case FP_FXSAVE:
3233 + bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
3234 + sizeof (struct fxsave_state));
3235 + break;
3236 + case FP_XSAVE:
3237 + fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
3238 + (struct fxsave_state *)fp);
3239 + break;
3240 + default:
3241 + panic("Invalid fp_save_mech");
3242 + }
3243 +
3244 + kpreempt_enable();
3245 +}
3246 +
3247 +/*
3248 + * This is a request to set the ABI fpregset_t into our actual hardware state.
3249 + * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
3250 + * 512-byte fxsave area.
3251 + */
3252 +void
3253 +fpu_set_fpregset(struct _klwp *lwp, const fpregset_t *fp)
3254 +{
3255 + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3256 +
3257 + kpreempt_disable();
3258 + if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3259 + /*
3260 + * We always save the entire FPU. This is required if we're
3261 + * using xsave. If we're using fxsave, we could skip the
3262 + * 512-byte write and instead just disable the FPU since we'd be
3263 + * replacing it all. For now we don't bother with more
3264 + * conditional logic.
3265 + */
3266 + VERIFY3P(curthread, ==, lwptot(lwp));
3267 + VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3268 + fp_save(fpu);
3269 + }
3270 +
3271 + fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
3272 + fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
3273 + switch (fp_save_mech) {
3274 + case FP_FXSAVE:
3275 + bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
3276 + sizeof (struct fxsave_state));
3277 + break;
3278 + case FP_XSAVE:
3279 + bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
3280 + sizeof (struct fxsave_state));
3281 + fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
3282 + XFEATURE_LEGACY_FP | XFEATURE_SSE;
3283 + break;
3284 + default:
3285 + panic("Invalid fp_save_mech");
3286 + }
3287 +
3288 + fpu->fpu_flags |= FPU_VALID;
3289 + PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3290 + kpreempt_enable();
1526 3291 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX