1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2021 Joyent, Inc.
24 * Copyright 2021 RackTop Systems, Inc.
25 * Copyright 2022 Oxide Computer Company
26 */
27
28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
30 /* All Rights Reserved */
31
32 /* Copyright (c) 1987, 1988 Microsoft Corporation */
33 /* All Rights Reserved */
34
35 /*
36 * Copyright (c) 2009, Intel Corporation.
37 * All rights reserved.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/signal.h>
43 #include <sys/regset.h>
44 #include <sys/privregs.h>
45 #include <sys/psw.h>
46 #include <sys/trap.h>
47 #include <sys/fault.h>
48 #include <sys/systm.h>
49 #include <sys/user.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/pcb.h>
53 #include <sys/lwp.h>
54 #include <sys/cpuvar.h>
55 #include <sys/thread.h>
56 #include <sys/disp.h>
57 #include <sys/fp.h>
58 #include <sys/siginfo.h>
59 #include <sys/archsystm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/x86_archext.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cmn_err.h>
65 #include <sys/kfpu.h>
66
67 /*
68 * FPU Management Overview
69 * -----------------------
70 *
71 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
72 * however, many aspects of its life as a coprocessor are still around in x86.
73 *
74 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
75 * While that state still exists, there is much more that is covered by the FPU.
 * This includes not just traditional FPU state, but also supervisor-only
 * state. The following state is currently managed and covered logically by the
78 * idea of the FPU registers:
79 *
80 * o Traditional x87 FPU
81 * o Vector Registers (%xmm, %ymm, %zmm)
82 * o Memory Protection Extensions (MPX) Bounds Registers
83 * o Protected Key Rights Registers (PKRU)
84 * o Processor Trace data
85 *
86 * The rest of this covers how the FPU is managed and controlled, how state is
87 * saved and restored between threads, interactions with hypervisors, and other
88 * information exported to user land through aux vectors. A lot of background
89 * information is here to synthesize major parts of the Intel SDM, but
90 * unfortunately, it is not a replacement for reading it.
91 *
92 * FPU Control Registers
93 * ---------------------
94 *
95 * Because the x87 FPU began its life as a co-processor and the FPU was
 * optional, there are several bits that show up in %cr0 that we have to
97 * manipulate when dealing with the FPU. These are:
98 *
99 * o CR0.ET The 'extension type' bit. This was used originally to indicate
100 * that the FPU co-processor was present. Now it is forced on for
101 * compatibility. This is often used to verify whether or not the
102 * FPU is present.
103 *
104 * o CR0.NE The 'native error' bit. Used to indicate that native error
105 * mode should be enabled. This indicates that we should take traps
106 * on FPU errors. The OS enables this early in boot.
107 *
108 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
109 * wait/fwait instructions generate a #NM if CR0.TS is set.
110 *
111 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
112 * operations (x87 through SSE4) to trap with a #UD so they can be
113 * emulated. The system never sets this bit, but makes sure it is
114 * clear on processor start up.
115 *
116 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
117 * point operation will generate a #NM. An fwait will as well,
118 * depending on the value in CR0.MP.
119 *
120 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
121 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
122 * complicated role. Historically it has been used to allow running systems to
123 * restore the FPU registers lazily. This will be discussed in greater depth
124 * later on.
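 *
 * As a rough sketch of that policy (illustrative only, not the literal boot
 * code), using the CR0_* flags from <sys/controlregs.h> and the
 * getcr0()/setcr0() accessors:
 *
 *	ulong_t cr0 = getcr0();
 *	cr0 |= CR0_ET | CR0_NE | CR0_MP;
 *	cr0 &= ~CR0_EM;
 *	setcr0(cr0);
 *
 * CR0.TS is deliberately left out of the sketch; its handling is what much of
 * the rest of this comment is about.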
125 *
126 * %cr4 is also used as part of the FPU control. Specifically we need to worry
127 * about the following bits in the system:
128 *
129 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
130 * supports the execution of the fxsave and fxrstor
131 * instructions. This bit is required to be set to enable
132 * the use of the SSE->SSE4 instructions.
133 *
134 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
135 * and take a SIMD floating point exception (#XM). This bit
136 * is always enabled by the system.
137 *
138 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
139 * supports the execution of the xsave and xrstor family of
140 * instructions. This bit is required to use any of the AVX
141 * and newer feature sets.
142 *
143 * Because all supported processors are 64-bit, they'll always support the XMM
 * extensions, and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
145 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
146 *
147 * %xcr0 is used to manage the behavior of the xsave feature set and is only
 * present on the system if xsave is supported. %xcr0 is read and written
 * through the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a
151 * different component of the xsave state and controls whether or not that
152 * information is saved and restored. For newer feature sets like AVX and MPX,
153 * it also controls whether or not the corresponding instructions can be
 * executed (much like CR4.OSFXSR does for the SSE feature sets).
155 *
 * Everything in %xcr0 covers features available to user land. There is also the
157 * IA32_XSS MSR which is used to control supervisor-only features that are still
158 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
159 * IA32_XSS and vice versa. This is an important property that is particularly
160 * relevant to how the xsave instructions operate.
161 *
162 * Save Mechanisms
163 * ---------------
164 *
165 * When switching between running threads the FPU state needs to be saved and
166 * restored by the OS. If this state was not saved, users would rightfully
167 * complain about corrupt state. There are three mechanisms that exist on the
168 * processor for saving and restoring these state images:
169 *
170 * o fsave
171 * o fxsave
172 * o xsave
173 *
174 * fsave saves and restores only the x87 FPU and is the oldest of these
175 * mechanisms. This mechanism is never used in the kernel today because we are
176 * always running on systems that support fxsave.
177 *
178 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
179 * state to be saved and restored to and from a struct fxsave_state. This is the
180 * default mechanism that is used to save and restore the FPU on amd64. An
181 * important aspect of fxsave that was different from the original i386 fsave
 * mechanism is that restoring FPU state with pending exceptions will not
 * generate an exception; instead, it is deferred to the next use of the FPU.
184 *
185 * The final and by far the most complex mechanism is that of the xsave set.
186 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
187 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
188 * registers.
189 *
190 * Data is saved and restored into and out of a struct xsave_state. The first
191 * part of the struct xsave_state is equivalent to the struct fxsave_state.
192 * After that, there is a header which is used to describe the remaining
193 * portions of the state. The header is a 64-byte value of which the first two
194 * uint64_t values are defined and the rest are reserved and must be zero. The
195 * first uint64_t is the xstate_bv member. This describes which values in the
196 * xsave_state are actually valid and present. This is updated on a save and
 * used on restore. The second member is the xcomp_bv member. Its most
 * significant bit (bit 63) determines whether the compressed form of the
 * structure is used.
199 *
200 * When the uncompressed structure is used (currently the only format we
201 * support), then each state component is at a fixed offset in the structure,
202 * even if it is not being used. For example, if you only saved the AVX related
203 * state, but did not save the MPX related state, the offset would not change
204 * for any component. With the compressed format, components that aren't used
205 * are all elided (though the x87 and SSE state are always there).
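 *
 * As a sketch in terms of the struct xsave_state used later in this file, the
 * uncompressed layout looks like:
 *
 *	offset 0:	struct fxsave_state xs_fxsave (x87 and SSE, 512 bytes)
 *	offset 512:	64-byte header (xsh_xstate_bv, xsh_xcomp_bv, reserved)
 *	offset 576+:	extended components (AVX, MPX, etc.) at fixed,
 *			cpuid-reported offsets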
206 *
207 * Unlike fxsave which saves all state, the xsave family does not always save
208 * and restore all the state that could be covered by the xsave_state. The
209 * instructions all take an argument which is a mask of what to consider. This
 * is the same mask that will be used in the xstate_bv vector and the same set
 * of values that are present in %xcr0 and IA32_XSS, though IA32_XSS is only
 * consulted by the xsaves and xrstors instructions.
213 *
214 * When a save or restore is requested, a bitwise and is performed between the
215 * requested bits and those that have been enabled in %xcr0. Only the bits that
216 * match that are then saved or restored. Others will be silently ignored by
217 * the processor. This idea is used often in the OS. We will always request that
218 * we save and restore all of the state, but only those portions that are
219 * actually enabled in %xcr0 will be touched.
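 *
 * Expressed as a small sketch (using the get_xcr() accessor seen later in
 * this file), if 'rfbm' is the requested-feature bitmap passed to one of the
 * non-supervisor variants, then the set of components actually touched is:
 *
 *	uint64_t touched = rfbm & get_xcr(XFEATURE_ENABLED_MASK);
 *
 * This is, for example, why fp_save() below can always pass fpu_xsave_mask
 * (XFEATURE_FP_ALL) and rely on the hardware to ignore components that are
 * not enabled in %xcr0.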
220 *
221 * If a feature has been asked to be restored that is not set in the xstate_bv
222 * feature vector of the save state, then it will be set to its initial state by
223 * the processor (usually zeros). Also, when asked to save state, the processor
224 * may not write out data that is in its initial state as an optimization. This
225 * optimization only applies to saving data and not to restoring data.
226 *
227 * There are a few different variants of the xsave and xrstor instruction. They
228 * are:
229 *
230 * o xsave This is the original save instruction. It will save all of the
231 * requested data in the xsave state structure. It only saves data
232 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
233 * executed at all privilege levels.
234 *
235 * o xrstor This is the original restore instruction. It will restore all of
236 * the requested data. The xrstor function can handle both the
237 * compressed and uncompressed formats. It may be executed at all
238 * privilege levels.
239 *
240 * o xsaveopt This is a variant of the xsave instruction that employs
241 * optimizations to try and only write out state that has been
242 * modified since the last time an xrstor instruction was called.
243 * The processor tracks a tuple of information about the last
244 * xrstor and tries to ensure that the same buffer is being used
245 * when this optimization is being used. However, because of the
246 * way that it tracks the xrstor buffer based on the address of it,
247 * it is not suitable for use if that buffer can be easily reused.
248 * The most common case is trying to save data to the stack in
249 * rtld. It may be executed at all privilege levels.
250 *
251 * o xsavec This is a variant of the xsave instruction that writes out the
252 * compressed form of the xsave_state. Otherwise it behaves as
253 * xsave. It may be executed at all privilege levels.
254 *
255 * o xsaves This is a variant of the xsave instruction. It is similar to
256 * xsavec in that it always writes the compressed form of the
257 * buffer. Unlike all the other forms, this instruction looks at
258 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
259 * what to save and restore. xsaves also implements the same
260 * optimization that xsaveopt does around modified pieces. User
261 * land may not execute the instruction.
262 *
263 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
264 * it can save and restore both the user and privileged states.
265 * Unlike xrstor it can only operate on the compressed form.
266 * User land may not execute the instruction.
267 *
268 * Based on all of these, the kernel has a precedence for what it will use.
 * Basically, xsaves (which the kernel does not currently support) is preferred
 * to xsaveopt, which is preferred to xsave. A similar scheme is used when
 * informing rtld (more later) about what it should use: xsavec is preferred to
 * xsave, and xsaveopt is not recommended because its modified-state
 * optimization is not appropriate for that use.
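 *
 * As a hedged sketch of the kernel side of this precedence (the predicate
 * below is a stand-in for however fpu_probe() actually detects the xsaveopt
 * capability; the rtld side is handled in fpu_subr.c):
 *
 *	xsavep = cpu_has_xsaveopt ? xsaveopt : xsave;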
274 *
 * Finally, there is one last gotcha with the xsave state. Importantly, some AMD
 * processors did not always save and restore certain FPU exception state the
 * way Intel processors did. In those cases the OS makes up for this fact
 * itself.
279 *
280 * FPU Initialization
281 * ------------------
282 *
283 * One difference with the FPU registers is that not all threads have FPU state,
284 * only those that have an lwp. Generally this means kernel threads, which all
 * share p0 and its lwp, do not have FPU state, though there are exceptions
 * such as kcfpoold. In the rest of this discussion we'll use thread and lwp
 * interchangeably; just think of thread as meaning a thread that has an lwp.
289 *
290 * Each lwp has its FPU state allocated in its pcb (process control block). The
291 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
292 * dynamically at start up based on the save mechanism that we're using and the
293 * amount of memory required for it. This is dynamic because the xsave_state
294 * size varies based on the supported feature set.
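 *
 * A hedged sketch of that sizing decision (the real code lives in fpu_subr.c;
 * the cache name and the remaining kmem_cache_create() arguments below are
 * elided or purely illustrative):
 *
 *	size_t sz = (fp_save_mech == FP_XSAVE) ?
 *	    cpuid_get_xsave_size() : sizeof (struct fxsave_state);
 *	fpsave_cachep = kmem_cache_create("fpsave_cache", sz, ...);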
295 *
296 * The hardware side of the FPU is initialized early in boot before we mount the
297 * root file system. This is effectively done in fpu_probe(). This is where we
 * make the final decision about which save and restore mechanisms we should
 * use, create the fpsave_cachep kmem cache, and initialize a number of
 * function pointers that implement the save and restore logic.
301 *
 * The thread/lwp side is a little more involved. There are two different
303 * things that we need to concern ourselves with. The first is how the FPU
304 * resources are allocated and the second is how the FPU state is initialized
305 * for a given lwp.
306 *
307 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
308 * This is always called unconditionally by the system as part of creating an
309 * LWP.
310 *
311 * There are three different initialization paths that we deal with. The first
312 * is when we are executing a new process. As part of exec all of the register
313 * state is reset. The exec case is particularly important because init is born
314 * like Athena, sprouting from the head of the kernel, without any true parent
315 * to fork from. The second is used whenever we fork or create a new lwp. The
316 * third is to deal with special lwps like the agent lwp.
317 *
318 * During exec, we will call fp_exec() which will initialize and set up the FPU
319 * state for the process. That will fill in the initial state for the FPU and
320 * also set that state in the FPU itself. As part of fp_exec() we also install a
321 * thread context operations vector that takes care of dealing with the saving
322 * and restoring of the FPU. These context handlers will also be called whenever
323 * an lwp is created or forked. In those cases, to initialize the FPU we will
324 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
325 * operations vector for the new thread.
326 *
327 * Next we'll end up in the context operation fp_new_lwp(). This saves the
328 * current thread's state, initializes the new thread's state, and copies over
 * the relevant parts of the originating thread's state. It's at this point that
330 * we also install the FPU context operations into the new thread, which ensures
331 * that all future threads that are descendants of the current one get the
332 * thread context operations (unless they call exec).
333 *
334 * To deal with some things like the agent lwp, we double check the state of the
335 * FPU in sys_rtt_common() to make sure that it has been enabled before
336 * returning to user land. In general, this path should be rare, but it's useful
337 * for the odd lwp here and there.
338 *
339 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten: for example in restorecontext(), due to /proc,
 * or when the lwp calls exec(). Whether the context is being freed or we are
 * resetting the state, we will call fp_free() to disable the FPU and our
 * context.
343 *
344 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
345 * state by calling fp_lwp_cleanup().
346 *
347 * Kernel FPU Multiplexing
348 * -----------------------
349 *
350 * Just as the kernel has to maintain all of the general purpose registers when
351 * switching between scheduled threads, the same is true of the FPU registers.
352 *
353 * When a thread has FPU state, it also has a set of context operations
354 * installed. These context operations take care of making sure that the FPU is
355 * properly saved and restored during a context switch (fpsave_ctxt and
356 * fprestore_ctxt respectively). This means that the current implementation of
 * the FPU is 'eager': when a thread is running, the CPU will have its FPU state
358 * loaded. While this is always true when executing in userland, there are a few
359 * cases where this is not true in the kernel.
360 *
361 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
362 * employed. This meant that the FPU would be saved on a context switch and the
363 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
364 * then take a #NM trap, at which point we would restore the FPU from the save
365 * area and return to user land. Given the frequency of use of the FPU alone by
366 * libc, there's no point returning to user land just to trap again.
367 *
368 * There are a few cases though where the FPU state may need to be changed for a
369 * thread on its behalf. The most notable cases are in the case of processes
370 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
 * will force a thread's FPU state to be saved into the PCB through the fp_save()
372 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
373 * pcb. This indicates that the save state holds currently valid data. As a side
374 * effect of this, CR0.TS will be set. To make sure that all of the state is
375 * updated before returning to user land, in these cases, we set a flag on the
376 * PCB that says the FPU needs to be updated. This will make sure that we take
377 * the slow path out of a system call to fix things up for the thread. Due to
378 * the fact that this is a rather rare case, effectively setting the equivalent
379 * of t_postsys is acceptable.
380 *
381 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
382 * Generally this means it will be cleared immediately by the new thread that is
383 * running in a context switch. However, this isn't the case for kernel threads.
384 * They currently operate with CR0.TS set as no kernel state is restored for
385 * them. This means that using the FPU will cause a #NM and panic.
386 *
387 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
388 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
389 * However, because we eagerly restore, the only time that CR0.TS should be set
390 * for a non-kernel thread is during operations where it will be cleared before
391 * returning to user land and importantly, the only data that is in it is its
392 * own.
393 *
394 * Kernel FPU Usage
395 * ----------------
396 *
397 * Traditionally the kernel never used the FPU since it had no need for
398 * floating point operations. However, modern FPU hardware supports a variety
399 * of SIMD extensions which can speed up code such as parity calculations or
400 * encryption.
401 *
402 * To allow the kernel to take advantage of these features, the
403 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
404 * around any usage of the FPU by the kernel to ensure that user-level context
405 * is properly saved/restored, as well as to properly setup the FPU for use by
406 * the kernel. There are a variety of ways this wrapping can be used, as
407 * discussed in this section below.
408 *
409 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
410 * operations, the kernel_fpu_alloc() function should be used to allocate a
411 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
412 * state. This structure is not tied to any thread. That is, different threads
413 * can reuse the same kfpu_state_t structure, although not concurrently. A
414 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
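 *
 * A minimal usage sketch of that pattern (no special flags; error handling
 * and the actual SIMD work are elided):
 *
 *	kfpu_state_t *kfpu = kernel_fpu_alloc(KM_SLEEP);
 *
 *	kernel_fpu_begin(kfpu, 0);
 *	... extended FPU/SIMD work; the thread may block or be preempted ...
 *	kernel_fpu_end(kfpu, 0);
 *
 *	kernel_fpu_free(kfpu);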
415 *
416 * In some cases, the kernel may need to use the FPU for a short operation
 * without the overhead of managing a kfpu_state_t structure and without
418 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
419 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
420 * parameter. This indicates that there is no kfpu_state_t. When used this way,
421 * kernel preemption should be disabled by the caller (kpreempt_disable) before
422 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
423 * For this usage, it is important to limit the kernel's FPU use to short
424 * operations. The tradeoff between using the FPU without a kfpu_state_t
425 * structure vs. the overhead of allowing a context switch while using the FPU
426 * should be carefully considered on a case by case basis.
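 *
 * A sketch of this short-operation form (the FPU work must stay brief and
 * non-blocking while preemption is disabled):
 *
 *	kpreempt_disable();
 *	kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *	... short FPU/SIMD work ...
 *	kernel_fpu_end(NULL, KFPU_NO_STATE);
 *	kpreempt_enable();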
427 *
428 * In other cases, kernel threads have an LWP, but never execute in user space.
429 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
430 * kernel's FPU state if the thread is context switched, instead of having to
431 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
432 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
433 * enable this behavior. It is the caller's responsibility to ensure that this
434 * is only used for a kernel thread which never executes in user space.
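 *
 * A sketch of the KFPU_USE_LWP form for such a kernel-only thread (again, the
 * FPU work itself is elided):
 *
 *	kernel_fpu_begin(NULL, KFPU_USE_LWP);
 *	... FPU/SIMD work; the LWP's pcb_fpu carries the state across swtch() ...
 *	kernel_fpu_end(NULL, KFPU_USE_LWP);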
435 *
436 * FPU Exceptions
437 * --------------
438 *
439 * Certain operations can cause the kernel to take traps due to FPU activity.
 * Generally these events will cause a user process to receive a SIGFPE and, if
 * they occur in kernel context, we will die. Traditionally the #NM
442 * (Device Not Available / No Math) exception generated by CR0.TS would have
443 * caused us to restore the FPU. Now it is a fatal event regardless of whether
444 * or not user land causes it.
445 *
446 * While there are some cases where the kernel uses the FPU, it is up to the
447 * kernel to use the FPU in a way such that it cannot receive a trap or to use
448 * the appropriate trap protection mechanisms.
449 *
450 * Hypervisors
451 * -----------
452 *
 * When providing support for hypervisors, things are a little bit more
454 * complicated because the FPU is not virtualized at all. This means that they
455 * need to save and restore the FPU and %xcr0 across entry and exit to the
456 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
457 * allow us to use the full native state to make sure that we are always saving
458 * and restoring the full FPU that the host sees, even when the guest is using a
459 * subset.
460 *
461 * One tricky aspect of this is that the guest may be using a subset of %xcr0
 * and therefore changing our %xcr0 on the fly. It is vital that when we save
 * and restore the FPU we always use the largest %xcr0 contents; otherwise we
 * will end up leaving data behind.
465 *
466 * ELF PLT Support
467 * ---------------
468 *
469 * rtld has to preserve a subset of the FPU when it is saving and restoring
 * registers due to the amd64 System V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
471 * more information. As a result, we set up an aux vector that contains
472 * information about what save and restore mechanisms it should be using and
473 * the sizing thereof based on what the kernel supports. This is passed down in
474 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
475 * initialized in fpu_subr.c.
476 */
477
478 kmem_cache_t *fpsave_cachep;
479
480 /* Legacy fxsave layout + xsave header + ymm */
481 #define AVX_XSAVE_SIZE (512 + 64 + 256)
482
483 /*
484 * Various sanity checks.
485 */
486 CTASSERT(sizeof (struct fxsave_state) == 512);
487 CTASSERT(sizeof (struct fnsave_state) == 108);
488 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
489 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
490
491 /*
492 * This structure is the x86 implementation of the kernel FPU that is defined in
493 * uts/common/sys/kfpu.h.
494 */
495
496 typedef enum kfpu_flags {
497 /*
498 * This indicates that the save state has initial FPU data.
499 */
500 KFPU_F_INITIALIZED = 0x01
501 } kfpu_flags_t;
502
503 struct kfpu_state {
504 fpu_ctx_t kfpu_ctx;
505 kfpu_flags_t kfpu_flags;
506 kthread_t *kfpu_curthread;
507 };
508
509 /*
510 * Initial kfpu state for SSE/SSE2 used by fpinit()
511 */
512 const struct fxsave_state sse_initial = {
513 FPU_CW_INIT, /* fx_fcw */
514 0, /* fx_fsw */
515 0, /* fx_fctw */
516 0, /* fx_fop */
517 0, /* fx_rip */
518 0, /* fx_rdp */
519 SSE_MXCSR_INIT /* fx_mxcsr */
520 /* rest of structure is zero */
521 };
522
523 /*
524 * Initial kfpu state for AVX used by fpinit()
525 */
526 const struct xsave_state avx_initial = {
527 /*
528 * The definition below needs to be identical with sse_initial
529 * defined above.
530 */
531 .xs_fxsave = {
532 .fx_fcw = FPU_CW_INIT,
533 .fx_mxcsr = SSE_MXCSR_INIT,
534 },
535 .xs_header = {
536 /*
537 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
538 * valid, and CPU should initialize XMM/YMM.
539 */
540 .xsh_xstate_bv = 1,
541 .xsh_xcomp_bv = 0,
542 },
543 };
544
545 /*
546 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
547 * the #gp exception caused by setting unsupported bits in the
548 * MXCSR register
549 */
550 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
551
552 /*
553 * Initial kfpu state for x87 used by fpinit()
554 */
555 const struct fnsave_state x87_initial = {
556 FPU_CW_INIT, /* f_fcw */
557 0, /* __f_ign0 */
558 0, /* f_fsw */
559 0, /* __f_ign1 */
560 0xffff, /* f_ftw */
561 /* rest of structure is zero */
562 };
563
564 /*
 * These vectors are patched (e.g. to xsave_ctxt() or xsaveopt_ctxt() for the
 * save side) if we discover we have an XSAVE-capable chip in fpu_probe().
567 */
568 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
569 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
570
571 /*
572 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
573 */
574 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
575
576 static int fpe_sicode(uint_t);
577 static int fpe_simd_sicode(uint_t);
578 static void fp_new_lwp(void *, void *);
579 static void fp_free_ctx(void *, int);
580
581 static struct ctxop *
582 fp_ctxop_allocate(struct fpu_ctx *fp)
583 {
584 const struct ctxop_template tpl = {
585 .ct_rev = CTXOP_TPL_REV,
586 .ct_save = fpsave_ctxt,
587 .ct_restore = fprestore_ctxt,
588 .ct_fork = fp_new_lwp,
589 .ct_lwp_create = fp_new_lwp,
590 .ct_free = fp_free_ctx,
591 };
592 return (ctxop_allocate(&tpl, fp));
593 }
594
595 /*
596 * Copy the state of parent lwp's floating point context into the new lwp.
597 * Invoked for both fork() and lwp_create().
598 *
599 * Note that we inherit -only- the control state (e.g. exception masks,
600 * rounding, precision control, etc.); the FPU registers are otherwise
601 * reset to their initial state.
602 */
603 static void
604 fp_new_lwp(void *parent, void *child)
605 {
606 kthread_id_t t = parent, ct = child;
607 struct fpu_ctx *fp; /* parent fpu context */
608 struct fpu_ctx *cfp; /* new fpu context */
609 struct fxsave_state *fx, *cfx;
610 struct xsave_state *cxs;
611
612 ASSERT(fp_kind != FP_NO);
613
614 fp = &t->t_lwp->lwp_pcb.pcb_fpu;
615 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
616
617 /*
618 * If the parent FPU state is still in the FPU hw then save it;
619 * conveniently, fp_save() already does this for us nicely.
620 */
621 fp_save(fp);
622
623 cfp->fpu_flags = FPU_EN | FPU_VALID;
624 cfp->fpu_regs.kfpu_status = 0;
625 cfp->fpu_regs.kfpu_xstatus = 0;
626
627 /*
628 * Make sure that the child's FPU is cleaned up and made ready for user
629 * land.
630 */
631 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
632
633 switch (fp_save_mech) {
634 case FP_FXSAVE:
635 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
636 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
637 bcopy(&sse_initial, cfx, sizeof (*cfx));
638 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
639 cfx->fx_fcw = fx->fx_fcw;
640 break;
641
642 case FP_XSAVE:
643 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
644
645 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
646
647 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
648 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
649 cfx = &cxs->xs_fxsave;
650
651 bcopy(&avx_initial, cxs, sizeof (*cxs));
652 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
653 cfx->fx_fcw = fx->fx_fcw;
654 cxs->xs_header.xsh_xstate_bv |=
655 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
656 break;
657 default:
658 panic("Invalid fp_save_mech");
659 /*NOTREACHED*/
660 }
661
662 /*
663 * Mark that both the parent and child need to have the FPU cleaned up
664 * before returning to user land.
665 */
666
667 ctxop_attach(ct, fp_ctxop_allocate(cfp));
668 }
669
670 /*
671 * Free any state associated with floating point context.
672 * Fp_free can be called in three cases:
673 * 1) from reaper -> thread_free -> freectx-> fp_free
674 * fp context belongs to a thread on deathrow
675 * nothing to do, thread will never be resumed
676 * thread calling ctxfree is reaper
677 *
678 * 2) from exec -> freectx -> fp_free
679 * fp context belongs to the current thread
680 * must disable fpu, thread calling ctxfree is curthread
681 *
682 * 3) from restorecontext -> setfpregs -> fp_free
683 * we have a modified context in the memory (lwp->pcb_fpu)
684 * disable fpu and release the fp context for the CPU
685 *
686 */
687 void
688 fp_free(struct fpu_ctx *fp)
689 {
690 ASSERT(fp_kind != FP_NO);
691
692 if (fp->fpu_flags & FPU_VALID)
693 return;
694
695 kpreempt_disable();
696 /*
697 * We want to do fpsave rather than fpdisable so that we can
698 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
699 */
700 fp->fpu_flags |= FPU_VALID;
701 /* If for current thread disable FP to track FPU_VALID */
702 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
703 /* Clear errors if any to prevent frstor from complaining */
704 (void) fperr_reset();
705 if (fp_kind & __FP_SSE)
706 (void) fpxerr_reset();
707 fpdisable();
708 }
709 kpreempt_enable();
710 }
711
712 /*
713 * Wrapper for freectx to make the types line up for fp_free()
714 */
715 static void
716 fp_free_ctx(void *arg, int isexec __unused)
717 {
718 fp_free((struct fpu_ctx *)arg);
719 }
720
721 /*
722 * Store the floating point state and disable the floating point unit.
723 */
724 void
725 fp_save(struct fpu_ctx *fp)
726 {
727 ASSERT(fp_kind != FP_NO);
728
729 kpreempt_disable();
730 if (!fp || fp->fpu_flags & FPU_VALID ||
731 (fp->fpu_flags & FPU_EN) == 0) {
732 kpreempt_enable();
733 return;
734 }
735 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
736
737 switch (fp_save_mech) {
738 case FP_FXSAVE:
739 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
740 break;
741
742 case FP_XSAVE:
743 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
744 break;
745 default:
746 panic("Invalid fp_save_mech");
747 /*NOTREACHED*/
748 }
749
750 fp->fpu_flags |= FPU_VALID;
751
752 /*
753 * We save the FPU as part of forking, execing, modifications via /proc,
754 * restorecontext, etc. As such, we need to make sure that we return to
755 * userland with valid state in the FPU. If we're context switched out
756 * before we hit sys_rtt_common() we'll end up having restored the FPU
757 * as part of the context ops operations. The restore logic always makes
758 * sure that FPU_VALID is set before doing a restore so we don't restore
759 * it a second time.
760 */
761 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
762
763 kpreempt_enable();
764 }
765
766 /*
767 * Restore the FPU context for the thread:
768 * The possibilities are:
769 * 1. No active FPU context: Load the new context into the FPU hw
770 * and enable the FPU.
771 */
772 void
773 fp_restore(struct fpu_ctx *fp)
774 {
775 switch (fp_save_mech) {
776 case FP_FXSAVE:
777 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
778 break;
779
780 case FP_XSAVE:
781 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
782 break;
783 default:
784 panic("Invalid fp_save_mech");
785 /*NOTREACHED*/
786 }
787
788 fp->fpu_flags &= ~FPU_VALID;
789 }
790
791 /*
792 * Reset the FPU such that it is in a valid state for a new thread that is
793 * coming out of exec. The FPU will be in a usable state at this point. At this
794 * point we know that the FPU state has already been allocated and if this
795 * wasn't an init process, then it will have had fp_free() previously called.
796 */
797 void
798 fp_exec(void)
799 {
800 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
801
802 if (fp_save_mech == FP_XSAVE) {
803 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
804 }
805
806 struct ctxop *ctx = fp_ctxop_allocate(fp);
807 /*
808 * Make sure that we're not preempted in the middle of initializing the
809 * FPU on CPU.
810 */
811 kpreempt_disable();
812 ctxop_attach(curthread, ctx);
813 fpinit();
814 fp->fpu_flags = FPU_EN;
815 kpreempt_enable();
816 }
817
818
819 /*
820 * Seeds the initial state for the current thread. The possibilities are:
821 * 1. Another process has modified the FPU state before we have done any
822 * initialization: Load the FPU state from the LWP state.
823 * 2. The FPU state has not been externally modified: Load a clean state.
824 */
825 void
826 fp_seed(void)
827 {
828 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
829
830 ASSERT(curthread->t_preempt >= 1);
831 ASSERT((fp->fpu_flags & FPU_EN) == 0);
832
833 /*
834 * Always initialize a new context and initialize the hardware.
835 */
836 if (fp_save_mech == FP_XSAVE) {
837 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
838 }
839
840 ctxop_attach(curthread, fp_ctxop_allocate(fp));
841 fpinit();
842
843 /*
844 * If FPU_VALID is set, it means someone has modified registers via
845 * /proc. In this case, restore the current lwp's state.
846 */
847 if (fp->fpu_flags & FPU_VALID)
848 fp_restore(fp);
849
850 ASSERT((fp->fpu_flags & FPU_VALID) == 0);
851 fp->fpu_flags = FPU_EN;
852 }
853
854 /*
855 * When using xsave/xrstor, these three functions are used by the lwp code to
856 * manage the memory for the xsave area.
857 */
858 void
859 fp_lwp_init(struct _klwp *lwp)
860 {
861 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
862
863 /*
864 * We keep a copy of the pointer in lwp_fpu so that we can restore the
865 * value in forklwp() after we duplicate the parent's LWP state.
866 */
867 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
868 kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
869
870 if (fp_save_mech == FP_XSAVE) {
871 /*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
875 */
876 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
877 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
878 }
879 }
880
881 void
882 fp_lwp_cleanup(struct _klwp *lwp)
883 {
884 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
885
886 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
887 kmem_cache_free(fpsave_cachep,
888 fp->fpu_regs.kfpu_u.kfpu_generic);
889 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
890 }
891 }
892
893 /*
894 * Called during the process of forklwp(). The kfpu_u pointer will have been
895 * overwritten while copying the parent's LWP structure. We have a valid copy
896 * stashed in the child's lwp_fpu which we use to restore the correct value.
897 */
898 void
899 fp_lwp_dup(struct _klwp *lwp)
900 {
901 void *xp = lwp->lwp_fpu;
902 size_t sz;
903
904 switch (fp_save_mech) {
905 case FP_FXSAVE:
906 sz = sizeof (struct fxsave_state);
907 break;
908 case FP_XSAVE:
909 sz = cpuid_get_xsave_size();
910 break;
911 default:
912 panic("Invalid fp_save_mech");
913 /*NOTREACHED*/
914 }
915
916 /* copy the parent's values into the new lwp's struct */
917 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
918 /* now restore the pointer */
919 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
920 }
921
922 /*
923 * Handle a processor extension error fault
924 * Returns non zero for error.
925 */
926
927 /*ARGSUSED*/
928 int
929 fpexterrflt(struct regs *rp)
930 {
931 uint32_t fpcw, fpsw;
932 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
933
934 ASSERT(fp_kind != FP_NO);
935
936 /*
937 * Now we can enable the interrupts.
938 * (NOTE: x87 fp exceptions come thru interrupt gate)
939 */
940 sti();
941
942 if (!fpu_exists)
943 return (FPE_FLTINV);
944
945 /*
946 * Do an unconditional save of the FP state. If it's dirty (TS=0),
947 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty (it may not be, due to
	 * an intervening save caused by a context switch between the sti()
	 * above and here), then it's safe to just use the stored values in
951 * the context save area to determine the cause of the fault.
952 */
953 fp_save(fp);
954
955 /* clear exception flags in saved state, as if by fnclex */
956 switch (fp_save_mech) {
957 case FP_FXSAVE:
958 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
959 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
960 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
961 break;
962
963 case FP_XSAVE:
964 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
965 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
966 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
967 /*
968 * Always set LEGACY_FP as it may have been cleared by XSAVE
969 * instruction
970 */
971 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
972 XFEATURE_LEGACY_FP;
973 break;
974 default:
975 panic("Invalid fp_save_mech");
976 /*NOTREACHED*/
977 }
978
979 fp->fpu_regs.kfpu_status = fpsw;
980
981 if ((fpsw & FPS_ES) == 0)
982 return (0); /* No exception */
983
984 /*
985 * "and" the exception flags with the complement of the mask
986 * bits to determine which exception occurred
987 */
988 return (fpe_sicode(fpsw & ~fpcw & 0x3f));
989 }
990
991 /*
992 * Handle an SSE/SSE2 precise exception.
993 * Returns a non-zero sicode for error.
994 */
995 /*ARGSUSED*/
996 int
997 fpsimderrflt(struct regs *rp)
998 {
999 uint32_t mxcsr, xmask;
1000 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1001
1002 ASSERT(fp_kind & __FP_SSE);
1003
1004 /*
1005 * NOTE: Interrupts are disabled during execution of this
1006 * function. They are enabled by the caller in trap.c.
1007 */
1008
1009 /*
	 * The only way we could have gotten here, if there is no FP unit,
	 * is via a user executing an INT $19 instruction, so there is
1012 * no fault in that case.
1013 */
1014 if (!fpu_exists)
1015 return (0);
1016
1017 /*
1018 * Do an unconditional save of the FP state. If it's dirty (TS=0),
1019 * it'll be saved into the fpu context area passed in (that of the
1020 * current thread). If it's not dirty, then it's safe to just use
1021 * the stored values in the context save area to determine the
1022 * cause of the fault.
1023 */
1024 fp_save(fp); /* save the FPU state */
1025
1026 if (fp_save_mech == FP_XSAVE) {
1027 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1028 fp->fpu_regs.kfpu_status =
1029 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1030 } else {
1031 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1032 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1033 }
1034 fp->fpu_regs.kfpu_xstatus = mxcsr;
1035
1036 /*
	 * Compute the mask that determines which conditions can cause
	 * a #XM exception, and use this to clean the status bits so that
1039 * we can identify the true cause of this one.
1040 */
1041 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1042 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1043 }
1044
1045 /*
1046 * In the unlikely event that someone is relying on this subcode being
1047 * FPE_FLTILL for denormalize exceptions, it can always be patched back
1048 * again to restore old behaviour.
1049 */
1050 int fpe_fltden = FPE_FLTDEN;
1051
1052 /*
1053 * Map from the FPU status word to the FP exception si_code.
1054 */
1055 static int
1056 fpe_sicode(uint_t sw)
1057 {
1058 if (sw & FPS_IE)
1059 return (FPE_FLTINV);
1060 if (sw & FPS_ZE)
1061 return (FPE_FLTDIV);
1062 if (sw & FPS_DE)
1063 return (fpe_fltden);
1064 if (sw & FPS_OE)
1065 return (FPE_FLTOVF);
1066 if (sw & FPS_UE)
1067 return (FPE_FLTUND);
1068 if (sw & FPS_PE)
1069 return (FPE_FLTRES);
1070 return (FPE_FLTINV); /* default si_code for other exceptions */
1071 }
1072
1073 /*
1074 * Map from the SSE status word to the FP exception si_code.
1075 */
1076 static int
1077 fpe_simd_sicode(uint_t sw)
1078 {
1079 if (sw & SSE_IE)
1080 return (FPE_FLTINV);
1081 if (sw & SSE_ZE)
1082 return (FPE_FLTDIV);
1083 if (sw & SSE_DE)
1084 return (FPE_FLTDEN);
1085 if (sw & SSE_OE)
1086 return (FPE_FLTOVF);
1087 if (sw & SSE_UE)
1088 return (FPE_FLTUND);
1089 if (sw & SSE_PE)
1090 return (FPE_FLTRES);
1091 return (FPE_FLTINV); /* default si_code for other exceptions */
1092 }
1093
1094 /*
1095 * This routine is invoked as part of libc's __fpstart implementation
1096 * via sysi86(2).
1097 *
1098 * It may be called -before- any context has been assigned in which case
1099 * we try and avoid touching the hardware. Or it may be invoked well
1100 * after the context has been assigned and fiddled with, in which case
1101 * just tweak it directly.
1102 */
1103 void
1104 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1105 {
1106 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1107 struct fxsave_state *fx;
1108
1109 if (!fpu_exists || fp_kind == FP_NO)
1110 return;
1111
1112 if ((fp->fpu_flags & FPU_EN) == 0) {
1113 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1114 /*
1115 * Common case. Floating point unit not yet
1116 * enabled, and kernel already intends to initialize
1117 * the hardware the way the caller wants.
1118 */
1119 return;
1120 }
1121 /*
1122 * Hmm. Userland wants a different default.
1123 * Do a fake "first trap" to establish the context, then
1124 * handle as if we already had a context before we came in.
1125 */
1126 kpreempt_disable();
1127 fp_seed();
1128 kpreempt_enable();
1129 }
1130
1131 /*
1132 * Ensure that the current hardware state is flushed back to the
1133 * pcb, then modify that copy. Next use of the fp will
1134 * restore the context.
1135 */
1136 fp_save(fp);
1137
1138 switch (fp_save_mech) {
1139 case FP_FXSAVE:
1140 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1141 fx->fx_fcw = fcw;
1142 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1143 break;
1144
1145 case FP_XSAVE:
1146 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1147 fx->fx_fcw = fcw;
1148 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1149 /*
1150 * Always set LEGACY_FP as it may have been cleared by XSAVE
1151 * instruction
1152 */
1153 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1154 XFEATURE_LEGACY_FP;
1155 break;
1156 default:
1157 panic("Invalid fp_save_mech");
1158 /*NOTREACHED*/
1159 }
1160 }
1161
1162 static void
1163 kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1164 {
1165 struct xsave_state *xs;
1166
1167 switch (fp_save_mech) {
1168 case FP_FXSAVE:
1169 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1170 sizeof (struct fxsave_state));
1171 kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1172 break;
1173 case FP_XSAVE:
1174 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1175 bzero(xs, cpuid_get_xsave_size());
1176 bcopy(&avx_initial, xs, sizeof (*xs));
1177 xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1178 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1179 break;
1180 default:
1181 panic("invalid fp_save_mech");
1182 }
1183
1184 /*
1185 * Set the corresponding flags that the system expects on the FPU state
1186 * to indicate that this is our state. The FPU_EN flag is required to
	 * indicate that FPU usage is allowed. The FPU_KERNEL flag is explicitly
1188 * not set below as it represents that this state is being suppressed
1189 * by the kernel.
1190 */
1191 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1192 kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1193 }
1194
1195 kfpu_state_t *
1196 kernel_fpu_alloc(int kmflags)
1197 {
1198 kfpu_state_t *kfpu;
1199
1200 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1201 return (NULL);
1202 }
1203
1204 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1205 kmem_cache_alloc(fpsave_cachep, kmflags);
1206 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1207 kmem_free(kfpu, sizeof (kfpu_state_t));
1208 return (NULL);
1209 }
1210
1211 kernel_fpu_fpstate_init(kfpu);
1212
1213 return (kfpu);
1214 }
1215
1216 void
1217 kernel_fpu_free(kfpu_state_t *kfpu)
1218 {
1219 kmem_cache_free(fpsave_cachep,
1220 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1221 kmem_free(kfpu, sizeof (kfpu_state_t));
1222 }
1223
1224 static void
1225 kernel_fpu_ctx_save(void *arg)
1226 {
1227 kfpu_state_t *kfpu = arg;
1228 fpu_ctx_t *pf;
1229
1230 if (kfpu == NULL) {
1231 /*
1232 * A NULL kfpu implies this is a kernel thread with an LWP and
1233 * no user-level FPU usage. Use the lwp fpu save area.
1234 */
1235 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1236
1237 ASSERT(curthread->t_procp->p_flag & SSYS);
1238 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1239
1240 fp_save(pf);
1241 } else {
1242 pf = &kfpu->kfpu_ctx;
1243
1244 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1245 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1246
1247 /*
1248 * Note, we can't use fp_save because it assumes that we're
1249 * saving to the thread's PCB and not somewhere else. Because
1250 * this is a different FPU context, we instead have to do this
1251 * ourselves.
1252 */
1253 switch (fp_save_mech) {
1254 case FP_FXSAVE:
1255 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1256 break;
1257 case FP_XSAVE:
1258 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1259 break;
1260 default:
1261 panic("Invalid fp_save_mech");
1262 }
1263
1264 /*
1265 * Because we have saved context here, our save state is no
1266 * longer valid and therefore needs to be reinitialized.
1267 */
1268 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1269 }
1270
1271 pf->fpu_flags |= FPU_VALID;
1272
1273 /*
	 * Clear the T_KFPU flag. This allows swtch to check for improper kernel
1275 * usage of the FPU (i.e. switching to a new thread while the old
1276 * thread was in the kernel and using the FPU, but did not perform a
1277 * context save).
1278 */
1279 curthread->t_flag &= ~T_KFPU;
1280 }
1281
1282 static void
1283 kernel_fpu_ctx_restore(void *arg)
1284 {
1285 kfpu_state_t *kfpu = arg;
1286 fpu_ctx_t *pf;
1287
1288 if (kfpu == NULL) {
1289 /*
1290 * A NULL kfpu implies this is a kernel thread with an LWP and
1291 * no user-level FPU usage. Use the lwp fpu save area.
1292 */
1293 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1294
1295 ASSERT(curthread->t_procp->p_flag & SSYS);
1296 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1297 } else {
1298 pf = &kfpu->kfpu_ctx;
1299
1300 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1301 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1302 }
1303
1304 fp_restore(pf);
1305 curthread->t_flag |= T_KFPU;
1306 }
1307
1308 /*
1309 * Validate that the thread is not switching off-cpu while actively using the
1310 * FPU within the kernel.
1311 */
1312 void
1313 kernel_fpu_no_swtch(void)
1314 {
1315 if ((curthread->t_flag & T_KFPU) != 0) {
1316 panic("curthread swtch-ing while the kernel is using the FPU");
1317 }
1318 }
1319
1320 static const struct ctxop_template kfpu_ctxop_tpl = {
1321 .ct_rev = CTXOP_TPL_REV,
1322 .ct_save = kernel_fpu_ctx_save,
1323 .ct_restore = kernel_fpu_ctx_restore,
1324 };
1325
1326 void
1327 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1328 {
1329 klwp_t *pl = curthread->t_lwp;
1330 struct ctxop *ctx;
1331
1332 if ((curthread->t_flag & T_KFPU) != 0) {
1333 panic("curthread attempting to nest kernel FPU states");
1334 }
1335
1336 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1337 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1338 (KFPU_USE_LWP | KFPU_NO_STATE));
1339
1340 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1341 /*
1342 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1343 * hold our kernel FPU context, we depend on the caller doing
1344 * kpreempt_disable for the duration of our FPU usage. This
1345 * should only be done for very short periods of time.
1346 */
1347 ASSERT(curthread->t_preempt > 0);
1348 ASSERT(kfpu == NULL);
1349
1350 if (pl != NULL) {
1351 /*
1352 * We might have already saved once so FPU_VALID could
1353 * be set. This is handled in fp_save.
1354 */
1355 fp_save(&pl->lwp_pcb.pcb_fpu);
1356 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1357 }
1358
1359 curthread->t_flag |= T_KFPU;
1360
1361 /* Always restore the fpu to the initial state. */
1362 fpinit();
1363
1364 return;
1365 }
1366
1367 /*
1368 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1369 */
1370
1371 if ((flags & KFPU_USE_LWP) == 0) {
1372 if (kfpu->kfpu_curthread != NULL)
1373 panic("attempting to reuse kernel FPU state at %p when "
1374 "another thread already is using", kfpu);
1375
1376 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1377 kernel_fpu_fpstate_init(kfpu);
1378
1379 kfpu->kfpu_curthread = curthread;
1380 }
1381
1382 /*
1383 * Not all threads may have an active LWP. If they do and we're not
1384 * going to re-use the LWP, then we should go ahead and save the state.
1385 * We must also note that the fpu is now being used by the kernel and
1386 * therefore we do not want to manage the fpu state via the user-level
1387 * thread's context handlers.
1388 *
1389 * We might have already saved once (due to a prior use of the kernel
1390 * FPU or another code path) so FPU_VALID could be set. This is handled
1391 * by fp_save, as is the FPU_EN check.
1392 */
1393 ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1394 kpreempt_disable();
1395 if (pl != NULL) {
1396 if ((flags & KFPU_USE_LWP) == 0)
1397 fp_save(&pl->lwp_pcb.pcb_fpu);
1398 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1399 }
1400
1401 /*
1402 * Set the context operations for kernel FPU usage. Because kernel FPU
1403 * setup and ctxop attachment needs to happen under the protection of
1404 * kpreempt_disable(), we allocate the ctxop outside the guard so its
1405 * sleeping allocation will not cause a voluntary swtch(). This allows
1406 * the rest of the initialization to proceed, ensuring valid state for
1407 * the ctxop handlers.
1408 */
1409 ctxop_attach(curthread, ctx);
1410 curthread->t_flag |= T_KFPU;
1411
1412 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1413 /*
1414 * For pure kernel threads with an LWP, we can use the LWP's
1415 * pcb_fpu to save/restore context.
1416 */
1417 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1418
1419 VERIFY(curthread->t_procp->p_flag & SSYS);
1420 VERIFY(kfpu == NULL);
1421 ASSERT((pf->fpu_flags & FPU_EN) == 0);
1422
1423 /* Always restore the fpu to the initial state. */
1424 if (fp_save_mech == FP_XSAVE)
1425 pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1426 fpinit();
1427 pf->fpu_flags = FPU_EN | FPU_KERNEL;
1428 } else {
1429 /* initialize the kfpu state */
1430 kernel_fpu_ctx_restore(kfpu);
1431 }
1432 kpreempt_enable();
1433 }
1434
1435 void
1436 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1437 {
1438 if ((curthread->t_flag & T_KFPU) == 0) {
1439 panic("curthread attempting to clear kernel FPU state "
1440 "without using it");
1441 }
1442
1443 /*
1444 * General comments on why the rest of this function is structured the
1445 * way it is. Be aware that there is a lot of subtlety here.
1446 *
1447 * If a user-level thread ever uses the fpu while in the kernel, then
1448 * we cannot call fpdisable since that does STTS. That will set the
1449 * ts bit in %cr0 which will cause an exception if anything touches the
1450 * fpu. However, the user-level context switch handler (fpsave_ctxt)
1451 * needs to access the fpu to save the registers into the pcb.
	 * fpsave_ctxt relies on fprestore_ctxt having done CLTS to clear the
	 * ts bit when the thread was context switched onto the CPU.
1454 *
	 * Calling fpdisable only affects the current CPU's %cr0 register.
1456 *
1457 * During ctxop_remove and kpreempt_enable, we can voluntarily context
1458 * switch, so the CPU we were on when we entered this function might
1459 * not be the same one we're on when we return from ctxop_remove or end
1460 * the function. Note there can be user-level context switch handlers
1461 * still installed if this is a user-level thread.
1462 *
1463 * We also must be careful in the unlikely chance we're running in an
1464 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1465 * incorrectly for the "real" thread to resume on this CPU.
1466 */
1467
1468 if ((flags & KFPU_NO_STATE) == 0) {
1469 kpreempt_disable();
1470 } else {
1471 ASSERT(curthread->t_preempt > 0);
1472 }
1473
1474 curthread->t_flag &= ~T_KFPU;
1475
1476 /*
1477 * When we are ending things, we explicitly don't save the current
1478 * kernel FPU state back to the temporary state. The kfpu API is not
1479 * intended to be a permanent save location.
1480 *
1481 * If this is a user-level thread and we were to context switch
1482 * before returning to user-land, fpsave_ctxt will be a no-op since we
1483 * already saved the user-level FPU state the first time we run
1484 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1485 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also calls PCB_SET_UPDATE_FPU() so
1487 * fprestore_ctxt will be done in sys_rtt_common when the thread
1488 * finally returns to user-land.
1489 */
1490
1491 if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1492 curthread->t_intr == NULL) {
1493 /*
		 * This is a kernel thread which is not an interrupt thread,
		 * so we STTS (set CR0.TS via fpdisable) now.
1496 */
1497 fpdisable();
1498 }
1499
1500 if ((flags & KFPU_NO_STATE) == 0) {
1501 ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1502
1503 if (kfpu != NULL) {
1504 if (kfpu->kfpu_curthread != curthread) {
1505 panic("attempting to end kernel FPU state "
1506 "for %p, but active thread is not "
1507 "curthread", kfpu);
1508 } else {
1509 kfpu->kfpu_curthread = NULL;
1510 }
1511 }
1512
1513 kpreempt_enable();
1514 }
1515
1516 if (curthread->t_lwp != NULL) {
1517 uint_t f;
1518
1519 if (flags & KFPU_USE_LWP) {
1520 f = FPU_EN | FPU_KERNEL;
1521 } else {
1522 f = FPU_KERNEL;
1523 }
1524 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1525 }
1526 }