base-ij-webrev Wdiff usr/src/uts/intel/os/fpu.c

Print this page

15254 %ymm registers not restored after signal handler
15367 x86 getfpregs() summons corrupting %xmm ghosts
15333 want x86 /proc xregs support (libc_db, libproc, mdb, etc.)
15336 want libc functions for extended ucontext_t
15334 want ps_lwphandle-specific reg routines
15328 FPU_CW_INIT mistreats reserved bit
15335 i86pc fpu_subr.c isn't really platform-specific
15332 setcontext(2) isn't actually noreturn
15331 need <sys/stdalign.h>
Change-Id: I7060aa86042dfb989f77fc3323c065ea2eafa9ad
Conflicts:
    usr/src/uts/common/fs/proc/prcontrol.c
    usr/src/uts/intel/os/archdep.c
    usr/src/uts/intel/sys/ucontext.h
    usr/src/uts/intel/syscall/getcontext.c

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/intel/os/fpu.c
          +++ new/usr/src/uts/intel/os/fpu.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2021 Joyent, Inc.
  24   24   * Copyright 2021 RackTop Systems, Inc.
  25      - * Copyright 2022 Oxide Computer Company
       25 + * Copyright 2023 Oxide Computer Company
  26   26   */
  27   27  
  28   28  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  29   29  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  30   30  /*              All Rights Reserved                             */
  31   31  
  32   32  /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  33   33  /*              All Rights Reserved                             */
  34   34  
  35   35  /*

  36   36   * Copyright (c) 2009, Intel Corporation.
  37   37   * All rights reserved.
  38   38   */
  39   39  
  40   40  #include <sys/types.h>
  41   41  #include <sys/param.h>
  42   42  #include <sys/signal.h>
  43   43  #include <sys/regset.h>
  44   44  #include <sys/privregs.h>
  45   45  #include <sys/psw.h>
  46   46  #include <sys/trap.h>
  47   47  #include <sys/fault.h>
  48   48  #include <sys/systm.h>
  49   49  #include <sys/user.h>
  50   50  #include <sys/file.h>
  51   51  #include <sys/proc.h>
  52   52  #include <sys/pcb.h>
  53   53  #include <sys/lwp.h>
  54   54  #include <sys/cpuvar.h>
  55   55  #include <sys/thread.h>

↓ open down ↓

20 lines elided

↑ open up ↑

  56   56  #include <sys/disp.h>
  57   57  #include <sys/fp.h>
  58   58  #include <sys/siginfo.h>
  59   59  #include <sys/archsystm.h>
  60   60  #include <sys/kmem.h>
  61   61  #include <sys/debug.h>
  62   62  #include <sys/x86_archext.h>
  63   63  #include <sys/sysmacros.h>
  64   64  #include <sys/cmn_err.h>
  65   65  #include <sys/kfpu.h>
       66 +#include <sys/stdbool.h>
       67 +#include <sys/stdalign.h>
       68 +#include <sys/procfs_isa.h>
       69 +#include <sys/sunddi.h>
  66   70  
  67   71  /*
  68   72   * FPU Management Overview
  69   73   * -----------------------
  70   74   *
  71   75   * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
  72   76   * however, many aspects of its life as a coprocessor are still around in x86.
  73   77   *
  74   78   * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
  75   79   * While that state still exists, there is much more that is covered by the FPU.
  76   80   * Today, this includes not just traditional FPU state, but also supervisor only
  77   81   * state. The following state is currently managed and covered logically by the
  78      - * idea of the FPU registers:
       82 + * idea of the FPU registers and more generally is called the Extended Processor
       83 + * States:
  79   84   *
  80   85   *    o Traditional x87 FPU
  81   86   *    o Vector Registers (%xmm, %ymm, %zmm)
  82   87   *    o Memory Protection Extensions (MPX) Bounds Registers
  83   88   *    o Protection Key Rights Register (PKRU)
  84   89   *    o Processor Trace data
       90 + *    o Control-Flow Enforcement state
       91 + *    o Hardware Duty Cycle
       92 + *    o Hardware P-states
  85   93   *
  86   94   * The rest of this covers how the FPU is managed and controlled, how state is
  87   95   * saved and restored between threads, interactions with hypervisors, and other
  88      - * information exported to user land through aux vectors. A lot of background
       96 + * information exported to userland through aux vectors. A lot of background
  89   97   * information is here to synthesize major parts of the Intel SDM, but
  90   98   * unfortunately, it is not a replacement for reading it.
  91   99   *
  92  100   * FPU Control Registers
  93  101   * ---------------------
  94  102   *
  95  103   * Because the x87 FPU began its life as a co-processor and the FPU was
  96  104   * optional there are several bits that show up in %cr0 that we have to
  97  105   * manipulate when dealing with the FPU. These are:
  98  106   *

99 107 * o CR0.ET The 'extension type' bit. This was used originally to indicate
100 108 * that the FPU co-processor was present. Now it is forced on for
101 109 * compatibility. This is often used to verify whether or not the
102 110 * FPU is present.
103 111 *
104 112 * o CR0.NE The 'native error' bit. Used to indicate that native error
105 113 * mode should be enabled. This indicates that we should take traps
106 114 * on FPU errors. The OS enables this early in boot.
107 115 *
108 116 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
109 117 * wait/fwait instructions generate a #NM if CR0.TS is set.
110 118 *
111 119 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
112 120 * operations (x87 through SSE4) to trap with a #UD so they can be
113 121 * emulated. The system never sets this bit, but makes sure it is
114 122 * clear on processor start up.
115 123 *
116 124 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
117 125 * point operation will generate a #NM. An fwait will as well,
118 126 * depending on the value in CR0.MP.
119 127 *
120 128 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
121 129 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
122 130 * complicated role. Historically it has been used to allow running systems to
123 131 * restore the FPU registers lazily. This will be discussed in greater depth
124 132 * later on.
125 133 *
126 134 * %cr4 is also used as part of the FPU control. Specifically we need to worry
127 135 * about the following bits in the system:
128 136 *
129 137 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
130 138 * supports the execution of the fxsave and fxrstor
131 139 * instructions. This bit is required to be set to enable
132 140 * the use of the SSE->SSE4 instructions.
133 141 *
134 142 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
135 143 * and take a SIMD floating point exception (#XM). This bit
136 144 * is always enabled by the system.
137 145 *
138 146 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
139 147 * supports the execution of the xsave and xrstor family of
140 148 * instructions. This bit is required to use any of the AVX
141 149 * and newer feature sets.
142 150 *
143 151 * Because all supported processors are 64-bit, they'll always support the XMM
144 152 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
145 153 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
146 154 *
147 155 * %xcr0 is used to manage the behavior of the xsave feature set and is only
148 156 * present on the system if xsave is supported. %xcr0 is read and written to
149 157 * through the xgetbv and xsetbv instructions. This register is present
150 158 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
151 159 * different component of the xsave state and controls whether or not that
152 160 * information is saved and restored. For newer feature sets like AVX and MPX,
153 161 * it also controls whether or not the corresponding instructions can be
154 162 * executed (much like CR4.OSFXSR does for the SSE feature sets).
155 163 *
156 164 * Everything in %xcr0 is around features available to users. There is also the
157 165 * IA32_XSS MSR which is used to control supervisor-only features that are still
158 166 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
159 167 * IA32_XSS and vice versa. This is an important property that is particularly
160 168 * relevant to how the xsave instructions operate.
161 169 *
162 170 * Save Mechanisms
163 171 * ---------------
164 172 *
165 173 * When switching between running threads the FPU state needs to be saved and
166 174 * restored by the OS. If this state was not saved, users would rightfully
167 175 * complain about corrupt state. There are three mechanisms that exist on the
168 176 * processor for saving and restoring these state images:
169 177 *
170 178 * o fsave
171 179 * o fxsave
172 180 * o xsave
173 181 *
174 182 * fsave saves and restores only the x87 FPU and is the oldest of these
175 183 * mechanisms. This mechanism is never used in the kernel today because we are
176 184 * always running on systems that support fxsave.
177 185 *
178 186 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
179 187 * state to be saved and restored to and from a struct fxsave_state. This is the
180 188 * default mechanism that is used to save and restore the FPU on amd64. An
181 189 * important aspect of fxsave that was different from the original i386 fsave
182 190 * mechanism is that the restoring of FPU state with pending exceptions will not
183 191 * generate an exception, it will be deferred to the next use of the FPU.
184 192 *
185 193 * The final and by far the most complex mechanism is that of the xsave set.
186 194 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
187 195 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
188 196 * registers.
189 197 *
190 198 * Data is saved and restored into and out of a struct xsave_state. The first
191 199 * part of the struct xsave_state is equivalent to the struct fxsave_state.
192 200 * After that, there is a header which is used to describe the remaining
193 201 * portions of the state. The header is a 64-byte value of which the first two
194 202 * uint64_t values are defined and the rest are reserved and must be zero. The
195 203 * first uint64_t is the xstate_bv member. This describes which values in the
196 204 * xsave_state are actually valid and present. This is updated on a save and
197 205 * used on restore. The second member is the xcomp_bv member. Its last bit
198 206 * determines whether or not a compressed version of the structure is used.
199 207 *
200 208 * When the uncompressed structure is used (currently the only format we
201 209 * support), then each state component is at a fixed offset in the structure,
202 210 * even if it is not being used. For example, if you only saved the AVX related
203 211 * state, but did not save the MPX related state, the offset would not change
204 212 * for any component. With the compressed format, components that aren't used
205 213 * are all elided (though the x87 and SSE state are always there).
206 214 *
207 215 * Unlike fxsave which saves all state, the xsave family does not always save
208 216 * and restore all the state that could be covered by the xsave_state. The
209 217 * instructions all take an argument which is a mask of what to consider. This
210 218 * is the same mask that will be used in the xstate_bv vector and it is also the
211 219 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
212 220 * considered with the xsaves and xrstors instructions.
213 221 *
214 222 * When a save or restore is requested, a bitwise and is performed between the
215 223 * requested bits and those that have been enabled in %xcr0. Only the bits that
216 224 * match that are then saved or restored. Others will be silently ignored by
217 225 * the processor. This idea is used often in the OS. We will always request that
218 226 * we save and restore all of the state, but only those portions that are
219 227 * actually enabled in %xcr0 will be touched.
220 228 *
221 229 * If a feature has been asked to be restored that is not set in the xstate_bv
222 230 * feature vector of the save state, then it will be set to its initial state by
223 231 * the processor (usually zeros). Also, when asked to save state, the processor
224 232 * may not write out data that is in its initial state as an optimization. This
225 233 * optimization only applies to saving data and not to restoring data.
226 234 *
227 235 * There are a few different variants of the xsave and xrstor instruction. They
228 236 * are:
229 237 *
230 238 * o xsave This is the original save instruction. It will save all of the
231 239 * requested data in the xsave state structure. It only saves data
232 240 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
233 241 * executed at all privilege levels.
234 242 *
235 243 * o xrstor This is the original restore instruction. It will restore all of
236 244 * the requested data. The xrstor instruction can handle both the
237 245 * compressed and uncompressed formats. It may be executed at all
238 246 * privilege levels.
239 247 *
240 248 * o xsaveopt This is a variant of the xsave instruction that employs
241 249 * optimizations to try and only write out state that has been
242 250 * modified since the last time an xrstor instruction was called.
243 251 * The processor tracks a tuple of information about the last
244 252 * xrstor and tries to ensure that the same buffer is being used
245 253 * when this optimization is being used. However, because of the
246 254 * way that it tracks the xrstor buffer based on the address of it,
247 255 * it is not suitable for use if that buffer can be easily reused.
248 256 * The most common case is trying to save data to the stack in
249 257 * rtld. It may be executed at all privilege levels.
250 258 *
251 259 * o xsavec This is a variant of the xsave instruction that writes out the
252 260 * compressed form of the xsave_state. Otherwise it behaves as
253 261 * xsave. It may be executed at all privilege levels.
254 262 *
255 263 * o xsaves This is a variant of the xsave instruction. It is similar to
256 264 * xsavec in that it always writes the compressed form of the
257 265 * buffer. Unlike all the other forms, this instruction looks at
258 266 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
259 267 * what to save and restore. xsaves also implements the same
260 268 * optimization that xsaveopt does around modified pieces. User
261 269 * land may not execute the instruction.
262 270 *
263 271 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
264 272 * it can restore both the user and privileged states.
265 273 * Unlike xrstor it can only operate on the compressed form.
266 274 * User land may not execute the instruction.
267 275 *
268 276 * Based on all of these, the kernel has a precedence for what it will use.
269 277 * Basically, xsaves (not supported) is preferred to xsaveopt, which is
270 278 * preferred to xsave. A similar scheme is used when informing rtld (more later)
271 279 * about what it should use. xsavec is preferred to xsave. xsaveopt is not
272 280 * recommended due to the modified optimization not being appropriate for this
273 281 * use.
274 282 *
275 283 * Finally, there is one last gotcha with the xsave state. Importantly some AMD
276 284 * processors did not always save and restore some of the FPU exception state in
277 285 * some cases like Intel did. In those cases the OS will make up for this fact
278 286 * itself.
279 287 *
280 288 * FPU Initialization
281 289 * ------------------
282 290 *
283 291 * One difference with the FPU registers is that not all threads have FPU state,
284 292 * only those that have an lwp. Generally this means kernel threads, which all
285 293 * share p0 and its lwp, do not have FPU state. Though there are definitely
286 294 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
287 295 * and lwp interchangeably, just think of thread meaning a thread that has a
288 296 * lwp.
289 297 *
290 298 * Each lwp has its FPU state allocated in its pcb (process control block). The
291 299 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
292 300 * dynamically at start up based on the save mechanism that we're using and the
293 301 * amount of memory required for it. This is dynamic because the xsave_state
294 302 * size varies based on the supported feature set.
295 303 *
296 304 * The hardware side of the FPU is initialized early in boot before we mount the
297 305 * root file system. This is effectively done in fpu_probe(). This is where we
298 306 * make the final decision about what the save and restore mechanisms we should
299 307 * use are, create the fpsave_cachep kmem cache, and initialize a number of
300 308 * function pointers that use save and restoring logic.
301 309 *
302 310 * The thread/lwp side is a little more involved. There are two different
303 311 * things that we need to concern ourselves with. The first is how the FPU
304 312 * resources are allocated and the second is how the FPU state is initialized
305 313 * for a given lwp.
306 314 *
307 315 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
308 316 * This is always called unconditionally by the system as part of creating an
309 317 * LWP.
310 318 *
311 319 * There are three different initialization paths that we deal with. The first
312 320 * is when we are executing a new process. As part of exec all of the register
313 321 * state is reset. The exec case is particularly important because init is born
314 322 * like Athena, sprouting from the head of the kernel, without any true parent
315 323 * to fork from. The second is used whenever we fork or create a new lwp. The
316 324 * third is to deal with special lwps like the agent lwp.
317 325 *
318 326 * During exec, we will call fp_exec() which will initialize and set up the FPU
319 327 * state for the process. That will fill in the initial state for the FPU and
320 328 * also set that state in the FPU itself. As part of fp_exec() we also install a
321 329 * thread context operations vector that takes care of dealing with the saving
322 330 * and restoring of the FPU. These context handlers will also be called whenever
323 331 * an lwp is created or forked. In those cases, to initialize the FPU we will
324 332 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
325 333 * operations vector for the new thread.

↓ open down ↓

227 lines elided

↑ open up ↑

 326  334   *
 327  335   * Next we'll end up in the context operation fp_new_lwp(). This saves the
 328  336   * current thread's state, initializes the new thread's state, and copies over
 329  337   * the relevant parts of the originating thread's state. It's at this point that
 330  338   * we also install the FPU context operations into the new thread, which ensures
 331  339   * that all future threads that are descendants of the current one get the
 332  340   * thread context operations (unless they call exec).
 333  341   *
 334  342   * To deal with some things like the agent lwp, we double check the state of the
 335  343   * FPU in sys_rtt_common() to make sure that it has been enabled before
 336      - * returning to user land. In general, this path should be rare, but it's useful
      344 + * returning to userland. In general, this path should be rare, but it's useful
 337  345   * for the odd lwp here and there.
 338  346   *
 339  347   * The FPU state will remain valid most of the time. There are times that
 340  348   * the state will be rewritten. For example in restorecontext, due to /proc, or
 341  349   * the lwp calls exec(). Whether the context is being freed or we are resetting
 342  350   * the state, we will call fp_free() to disable the FPU and our context.
 343  351   *
 344  352   * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
 345  353   * state by calling fp_lwp_cleanup().
 346  354   *

 347  355   * Kernel FPU Multiplexing
 348  356   * -----------------------
 349  357   *
 350  358   * Just as the kernel has to maintain all of the general purpose registers when
 351  359   * switching between scheduled threads, the same is true of the FPU registers.
 352  360   *
 353  361   * When a thread has FPU state, it also has a set of context operations
 354  362   * installed. These context operations take care of making sure that the FPU is

↓ open down ↓

8 lines elided

↑ open up ↑

 355  363   * properly saved and restored during a context switch (fpsave_ctxt and
 356  364   * fprestore_ctxt respectively). This means that the current implementation of
 357  365   * the FPU is 'eager', when a thread is running the CPU will have its FPU state
 358  366   * loaded. While this is always true when executing in userland, there are a few
 359  367   * cases where this is not true in the kernel.
 360  368   *
 361  369   * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
 362  370   * employed. This meant that the FPU would be saved on a context switch and the
 363  371   * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
 364  372   * then take a #NM trap, at which point we would restore the FPU from the save
 365      - * area and return to user land. Given the frequency of use of the FPU alone by
 366      - * libc, there's no point returning to user land just to trap again.
      373 + * area and return to userland. Given the frequency of use of the FPU alone by
      374 + * libc, there's no point returning to userland just to trap again.
 367  375   *
 368  376   * There are a few cases though where the FPU state may need to be changed for a
 369  377   * thread on its behalf. The most notable cases are in the case of processes
 370  378   * using /proc, restorecontext, forking, etc. In all of these cases the kernel
 371  379   * will force a thread's FPU state to be saved into the PCB through the fp_save()
 372  380   * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
 373  381   * pcb. This indicates that the save state holds currently valid data. As a side
 374  382   * effect of this, CR0.TS will be set. To make sure that all of the state is
 375      - * updated before returning to user land, in these cases, we set a flag on the
      383 + * updated before returning to userland, in these cases, we set a flag on the
 376  384   * PCB that says the FPU needs to be updated. This will make sure that we take
 377  385   * the slow path out of a system call to fix things up for the thread. Due to
 378  386   * the fact that this is a rather rare case, effectively setting the equivalent
 379  387   * of t_postsys is acceptable.
 380  388   *
 381  389   * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 382  390   * Generally this means it will be cleared immediately by the new thread that is
 383  391   * running in a context switch. However, this isn't the case for kernel threads.
 384  392   * They currently operate with CR0.TS set as no kernel state is restored for
 385  393   * them. This means that using the FPU will cause a #NM and panic.
 386  394   *
 387  395   * The FPU_VALID flag on the currently executing thread's pcb is meant to track
 388  396   * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
 389  397   * However, because we eagerly restore, the only time that CR0.TS should be set
 390  398   * for a non-kernel thread is during operations where it will be cleared before
 391      - * returning to user land and importantly, the only data that is in it is its
      399 + * returning to userland and importantly, the only data that is in it is its
 392  400   * own.
 393  401   *
 394  402   * Kernel FPU Usage
 395  403   * ----------------
 396  404   *
 397  405   * Traditionally the kernel never used the FPU since it had no need for
 398  406   * floating point operations. However, modern FPU hardware supports a variety
 399  407   * of SIMD extensions which can speed up code such as parity calculations or
 400  408   * encryption.
 401  409   *

 402  410   * To allow the kernel to take advantage of these features, the
 403  411   * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
 404  412   * around any usage of the FPU by the kernel to ensure that user-level context
 405  413   * is properly saved/restored, as well as to properly setup the FPU for use by
 406  414   * the kernel. There are a variety of ways this wrapping can be used, as
 407  415   * discussed in this section below.
 408  416   *
 409  417   * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
 410  418   * operations, the kernel_fpu_alloc() function should be used to allocate a
 411  419   * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
 412  420   * state. This structure is not tied to any thread. That is, different threads
 413  421   * can reuse the same kfpu_state_t structure, although not concurrently. A
 414  422   * kfpu_state_t structure is freed by the kernel_fpu_free() function.
 415  423   *
 416  424   * In some cases, the kernel may need to use the FPU for a short operation
 417  425   * without the overhead to manage a kfpu_state_t structure and without
 418  426   * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
 419  427   * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
 420  428   * parameter. This indicates that there is no kfpu_state_t. When used this way,
 421  429   * kernel preemption should be disabled by the caller (kpreempt_disable) before
 422  430   * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
 423  431   * For this usage, it is important to limit the kernel's FPU use to short
 424  432   * operations. The tradeoff between using the FPU without a kfpu_state_t
 425  433   * structure vs. the overhead of allowing a context switch while using the FPU
 426  434   * should be carefully considered on a case by case basis.
 427  435   *
 428  436   * In other cases, kernel threads have an LWP, but never execute in user space.
 429  437   * In this situation, the LWP's pcb_fpu area can be used to save/restore the
 430  438   * kernel's FPU state if the thread is context switched, instead of having to
 431  439   * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
 432  440   * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
 433  441   * enable this behavior. It is the caller's responsibility to ensure that this

↓ open down ↓

32 lines elided

↑ open up ↑

 434  442   * is only used for a kernel thread which never executes in user space.
 435  443   *
 436  444   * FPU Exceptions
 437  445   * --------------
 438  446   *
 439  447   * Certain operations can cause the kernel to take traps due to FPU activity.
 440  448   * Generally these events will cause a user process to receive a SIGFPE and if
 441  449   * the kernel receives it in kernel context, we will die. Traditionally the #NM
 442  450   * (Device Not Available / No Math) exception generated by CR0.TS would have
 443  451   * caused us to restore the FPU. Now it is a fatal event regardless of whether
 444      - * or not user land causes it.
      452 + * or not userland causes it.
 445  453   *
 446  454   * While there are some cases where the kernel uses the FPU, it is up to the
 447  455   * kernel to use the FPU in a way such that it cannot receive a trap or to use
 448  456   * the appropriate trap protection mechanisms.
 449  457   *
 450  458   * Hypervisors
 451  459   * -----------
 452  460   *
 453  461   * When providing support for hypervisors things are a little bit more
 454  462   * complicated because the FPU is not virtualized at all. This means that they

 455  463   * need to save and restore the FPU and %xcr0 across entry and exit to the
 456  464   * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
 457  465   * allow us to use the full native state to make sure that we are always saving
 458  466   * and restoring the full FPU that the host sees, even when the guest is using a
 459  467   * subset.
 460  468   *
 461  469   * One tricky aspect of this is that the guest may be using a subset of %xcr0
 462  470   * and therefore changing our %xcr0 on the fly. It is vital that when we're
 463  471   * saving and restoring the FPU that we always use the largest %xcr0 contents
 464  472   * otherwise we will end up leaving behind data in it.
 465  473   *

↓ open down ↓

11 lines elided

↑ open up ↑

 466  474   * ELF PLT Support
 467  475   * ---------------
 468  476   *
 469  477   * rtld has to preserve a subset of the FPU when it is saving and restoring
 470  478   * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
 471  479   * more information. As a result, we set up an aux vector that contains
 472  480   * information about what save and restore mechanisms it should be using and
 473  481   * the sizing thereof based on what the kernel supports. This is passed down in
 474  482   * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
 475  483   * initialized in fpu_subr.c.
      484 + *
      485 + * Signal Handling and the ucontext_t
      486 + * ----------------------------------
      487 + *
      488 + * One of the many gifts that signals give us is the twofold fact that when a
      489 + * signal occurs, the signal handler is allowed to change the CPU's state
      490 + * arbitrarily and when the signal handler is done executing, we must restore it
      491 + * back to the original state. However, the second part of this is that the
      492 + * signal handler is actually allowed to modify the state that the thread will
      493 + * return to! To create this facade, the kernel will create a full ucontext_t
      494 + * state, effectively calling getcontext(2) on the thread's behalf, and a
      495 + * pointer to that is given to the signal handler (the void * argument for the
      496 + * sa_sigaction function pointer in sigaction(2)). When libc is done with a
      497 + * signal, it will call setcontext(2) with that same ucontext_t.
      498 + *
      499 + * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
      500 + * it's often declared on the stack itself, with the signal handler spilling all
      501 + * this state to the stack. The ucontext_t machine portion was broken into the
      502 + * general purpose and floating point registers. In 64-bit code, the floating
      503 + * point registers were mostly the same as the results of the fxsave instruction
      504 + * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
      505 + * starting point for information, it is transformed into a different shape to
      506 + * deal with the history of the 32-bit SYS V ABI.
      507 + *
      508 + * While this worked, if you're reading this, you're aware that the x86 FPU and
      509 + * extended register states didn't stop at the initial 16 128-bit %xmm
      510 + * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
      511 + * opmask registers. None of these fit inside the standard ucontext_t; however,
      512 + * they must all be preserved and restored across a signal. While the various
      513 + * x86 platform-specific ABIs all suggest that these registers are not preserved
      514 + * across a function call, receiving a signal is not a function call and must be
      515 + * thought of like a process receiving an interrupt. In other words, this
      516 + * extended state must be preserved.
      517 + *
      518 + * To facilitate this, we have extended the ucontext_t structure with an
      519 + * additional flag, UC_XSAVE, which indicates that the traditional padding
      520 + * member, uc_xsave, actually is a pointer to the extended state. While this is
      521 + * accessible outside of a signal handling context through the combination of
      522 + * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
      523 + * state is focused on signal handling. Signal handling spills all this state to
      524 + * the stack and if we cannot spill the entire state to the stack then our
      525 + * inability to deliver the signal results in the process being killed! While
      526 + * there are separate efforts to ensure that the signal stack sizing that is
      527 + * used for the minimum and maximum signal sizes are sufficient, we still need
      528 + * to do our part to minimize the likelihood here.
      529 + *
      530 + * In designing this, we make the following observations which have helped us
      531 + * focus our design:
      532 + *
      533 + *   o While the start of an xsave area is the traditional 512-byte fxsave XMM
      534 + *     region, we already have that in the fpregs. Thus there is no reason to
      535 + *     duplicate it. This not only saves 512 bytes of additional stack space,
      536 + *     but it also means we don't have to ask which version of it to take
      537 + *     if they were to differ.
      538 + *
      539 + *   o Many applications out there aren't necessarily using the extended vectors
      540 + *     and even when we do make libc and others take advantage of it, it will
      541 + *     behoove us to ensure that they are put back into their initial state
      542 + *     after use. This leads us to expect that in a number of cases, the actual
      543 + *     extended register state will be in its initial state.
      544 + *
      545 + *   o While the signal handler does allow contents to be modified, we are
      546 + *     starting with making the interface private and thus allowing us to excise
      547 + *     components that are in their initial state.
      548 + *
      549 + *   o There are similarities to what we want to create with the compressed
      550 + *     xsave format; however, because we don't always have support for the
      551 + *     compressed format, we can't just arbitrarily say let's do a compressed
      552 + *     save to the user stack.
      553 + *
      554 + *   o Because we are not handing this state directly to and from hardware, we
      555 + *     don't need to meet some of the constraints of the compressed xsave format
      556 + *     around wanting alignment for the initial save or additional components.
      557 + *
      558 + * All of the above lead us to our own unique format for this data. When the
      559 + * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
      560 + * uc_xsave_t structure which has a magic version number, a 32-bit length of the
      561 + * overall structure, and the 64-bit state bit-vector to represent which
      562 + * components are valid. Following this 16-byte header, each component that is
      563 + * present in the bit vector is immediately written out in roughly ascending bit
      564 + * order (the order is determined based on the order of the fpu_xsave_info
      565 + * array).
      566 + *
      567 + * This makes the rough logic that we have here when taking a signal and writing
      568 + * out this state as:
      569 + *
      570 + *   1. Ensure that the FPU is saved and that the contents of the pcb save area
      571 + *      are valid. That is, call fp_save() if the state is not already flagged
      572 + *      with FPU_VALID.
      573 + *
      574 + *   2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
      575 + *      and XFEATURE_SSE bits as this state is carried in the fpregs instead.
      576 + *
      577 + *   3. Initialize the uc_xsave_t by setting our version field, initializing the
      578 + *      length to the length of the current structure, and then setting the
      579 + *      modified bit vector above.
      580 + *
      581 + *   4. Walk each remaining bit of the bit-vector. For each set bit, copy out
      582 + *      its extended state starting at the current length in the header and then
      583 + *      increase the header size by that length.
      584 + *
      585 + *   5. Finally write out the final uc_xsave_t structure.
      586 + *
      587 + * The above process is also used when someone manually calls getcontext_extd(2)
      588 + * to get this state. The main difference between the two is which copyout
      589 + * function we use. This deserves some explanation. Our main starting point for
      590 + * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
      591 + * the signal handling context to operate with a different copyout than we
      592 + * normally use in say getcontext_extd(2).
      593 + *
      594 + * When we've received a signal, we're at the intersection of several different
      595 + * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
      596 + * the watchpoints effectively set a copyout override function (t_copyops) that
      597 + * we end up vectoring to rather than a normal copyout. This allows the data to
      598 + * be modified and for the watchpoint to fire. While this is all well and good
      599 + * normally, it is problematic if we are trying to handle a signal. The signal
      600 + * delivery logic, sendsig(), goes through and disables the watchpoint for the
      601 + * region of the stack that we are copying out to. However, disabling
      602 + * watchpoints is not sufficient, we also need to use the copyout_noerr
      603 + * variants.
      604 + *
      605 + * These variants also require the use of on_fault() and no_fault() for error
      606 + * handling. While it is tempting to try and on_fault() the entire
      607 + * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
      608 + * The first is that we don't want to disable faults during the entire operation
      609 + * as if the kernel messes up we will treat that as a user error. That isn't
      610 + * theoretical and happened during development. The second and perhaps more
      611 + * important issue is that correctly bounding the on_fault() / no_fault() means
      612 + * being careful about state. For example, kernel pre-emption is often disabled
      613 + * during parts of these operations, but it needs to be re-enabled when we're
      614 + * done. This would require tracking in some volatile variable that this had
      615 + * been enabled and disabled and tracking that.
      616 + *
      617 + * Instead, this is why fpu_signal_copyout() takes a copy out function as an
      618 + * argument. When we're in signal handling context, the function will use
      619 + * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
      620 + *
      621 + * RESTORING STATE
      622 + *
      623 + * Copying out our current state is the easier half of this problem. When libc
      624 + * is done with a signal it calls setcontext(2) with the ucontext_t we
      625 + * assembled for it as described above. setcontext(2) isn't just used for
      626 + * returning from signals.
      627 + *
      628 + * The process for this goes in two steps. The first step is to copy in,
      629 + * validate, and transform the ucontext_t UC_XSAVE that we created above into an
      630 + * equivalent xsave format that we can use the appropriate xrstor function on.
      631 + * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
      632 + * come back through a second phase that is driven out of restorecontext() and
      633 + * is implemented in fpu_set_xsave().
      634 + *
      635 + * Let's start by discussing the second part of this, which is more
      636 + * straightforward. In particular, the second phase assumes that all of the
      637 + * validation and error handling has been done by the first phase. This means
      638 + * here, we have a buffer that is already the appropriate size
      639 + * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
      640 + * replace the actual save state with the current one.
      641 + *
      642 + * The only piece of shenanigans we have to do is around the kernel provided
      643 + * notion of 'status' and 'xstatus', which are cached versions of the x87 and
      644 + * SSE exception vectors. These are part of the fpregset ABI and therefore we
      645 + * need to propagate them from the temporary storage that part 1 sets up in the
      646 + * ignored region of the fxsave data. We use that because it is not persisted by
      647 + * the CPU, so clobbering it is generally alright.
      648 + *
      649 + * Once that is done, we simply note that we need a PCB update to occur to
      650 + * refresh the FPU state before we return to userland. Given that someone has
      651 + * called setcontext(2), this was always going to happen because we have to
      652 + * update segment registers and related, so this isn't so bad. With that, let's
      653 + * move onto the more nuanced part (1).
      654 + *
      655 + * When we're handling a setcontext(2) we have, in userland, a data structure
      656 + * that should match one we serialized out, though we cannot assume that a user
      657 + * has not modified it either accidentally or maliciously. Our goal is to set up
      658 + * the appropriate xsave state that can be passed to the CPU's xrstor. The first
      659 + * problem we have to deal with is where do we actually put this state?
      660 + *
      661 + * While not many programs actually call setcontext(2) on their own volition,
      662 + * this is going to get hit every time we take a signal. The first thought was
      663 + * to re-use the existing thread's save area; however, that's a bit challenging
      664 + * for a few reasons. In particular, we would need to ensure that we don't go
      665 + * off-CPU for any reason, which we cannot assume with a copyin from a user
      666 + * address space. In particular, it is trivial for us to hit a case where the
      667 + * stack has been paged out for some reason, which eschews that path.
      668 + *
      669 + * Instead, whenever a thread first calls setcontext(2), generally from signal
      670 + * context, we will at that time allocate another entry from the 'fpsave_cachep'
      671 + * kmem cache, giving us a buffer of the appropriate space to handle this. Once
      672 + * this buffer has been allocated, we leave it assigned to the thread's pcb and
      673 + * only tear it down when the thread itself finally exits. We reason that a
      674 + * thread that takes a signal once is either going to have the process exit
      675 + * shortly thereafter or is much more likely to take a signal again in the
      676 + * future. Many daemons and other processes set things up so signals are
      677 + * dispatched via one location, masking signals in other threads, using
      678 + * sigsuspend(2), signalfd(3C), or something similar.
      679 + *
      680 + * With this buffer in hand, we begin our task of reassembling state. Note, all
      681 + * of this is conditional on UC_XSAVE being set in the uc_flags member of the
      682 + * ucontext_t. If it is not set, then we assume that there is no extended state
      683 + * and will use the traditional path of setting the fpregset_t into the system
      684 + * via setfpregs().
      685 + *
      686 + * We first will copyin and validate the uc_xsave_t. In particular, we need to
      687 + * make sure the version makes sense and that the xsave component bit-vector
      688 + * doesn't have anything unexpected and more importantly unsupported in it, and
      689 + * that the addresses we've been given are within the user address space. At
      690 + * this point we can walk through our table of implemented bits and process
      691 + * them.
      692 + *
      693 + * For most components in here, the processing is straightforward. We continue
      694 + * walking our cursor and copy data into the kernel and place it in the
      695 + * appropriate place in our xsave state. If a xsave state component bit-vector
      696 + * isn't set, then we must ensure that we have the item in the initial state,
      697 + * which for everything other than the x87/SSE state is the memory being zeroed.
      698 + *
      699 + * The most unique case in the copyin state is that of the x87/SSE state. You
      700 + * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
      701 + * but instead have opted to use the single definition in the fpregset_t. Thus
      702 + * here, we copy it out of the fpregset_t, which the kernel has helpfully
      703 + * already unified into the 64-bit fxsave version prior to calling us, and
      704 + * install that into the save area we're building up.
      705 + *
      706 + * As part of this, there are two important pieces to be aware of. The first is
      707 + * that because the fpregset_t has both the status and xstatus members
      708 + * mentioned earlier, we temporarily copy them to the software-usable ignored
      709 + * areas of the fxsave state so we can corral this extra state into part (2)
      710 + * without needing to allocate additional space. The second piece is that when
      711 + * we're done processing this we explicitly remove the UC_FPU flag that would
      712 + * tell the kernel to proceed with updating that region. The problem is that
      713 + * that goes directly into the pcb's save area and not to the intermediate
      714 + * buffer as it uses the same entry point as /proc, namely setfpregs().
      715 + *
      716 + * We don't do much validation of the actual contents of the registers that are
      717 + * being set with the exception of ensuring that no reserved bits of the mxcsr
      718 + * are used. This is not as strict as /proc, but failure here means the process
      719 + * is likely going to die (returning from setcontext() in a signal handler is
      720 + * fatal).
      721 + *
      722 + * /proc xregs
      723 + * -----------
      724 + *
      725 + * Observability of the state of the extended registers is important for
      726 + * understanding the system. While on the surface this is similar to signal
      727 + * handling, it is crucially different in a number of ways:
      728 + *
      729 + *   o In signal handling, we're trying to conserve every byte of stack that we
      730 + *     can.
      731 + *   o The /proc xregs file will end up in core files, which means that we need
      732 + *     a way of knowing what components are present and not present in it,
      733 + *     because this will vary from CPU to CPU due to the addition of
      734 + *     architectural features. For example, some CPUs support AVX-512, but
      735 + *     others do not.
      736 + *   o The signal handling structure is private and we're not trying to have
      737 + *     software modify it, on the other hand, the /proc interfaces that we
      738 + *     support we do want software to be able to interrogate and manipulate.
      739 + *     These need to be something that we can introduce additional components
      740 + *     into and make other changes that still allow it to work.
      741 + *
      742 + * The x86 xregs format is documented in proc(5). The short form is that the
      743 + * prxregset_hdr_t has a number of information entries, which are of the type
      744 + * prxregset_info_t. Each of the information headers has a type, size, and
      745 + * offset which indicate where to find the additional data.
      746 + *
      747 + * Each entry is described as one of the entries in the fpu_xsave_info[]. These
      748 + * items either are a 1:1 correspondence with a xsave related feature (e.g.
      749 + * there is one entry for each of the three AVX-512 components) or it is
      750 + * something synthetic that we provide as additional information such as the
      751 + * PRX_INFO_XCR, which is a way of getting information about the system such as
      752 + * what is enabled in %xcr0 out there.
      753 + *
      754 + * Unlike signal handling, we are given the buffer to place everything that
      755 + * needs to be written out. This is partially the design of the /proc APIs. That
      756 + * is, we will always assemble everything into the entire buffer that /proc asks
      757 + * us to, and then it will use as much or as little of it as is required.
      758 + * Similarly, when setting things, we don't have to worry about copying in
      759 + * information in the same way as signal handling does, because /proc takes care
      760 + * of it and always hands us a full buffer. Sizing that is a little nuanced, but
      761 + * is all handled in prmachdep.c.
      762 + *
      763 + * When someone performs a read of the xregs and thus is asking us for the
      764 + * current state, there is a little bit of nuance that we need to deal with
      765 + * here. The first, is whether or not the FPU is enabled and the second is if
      766 + * the FPU is enabled, whether a given component is noted as being in its
      767 + * initial state. This basically gives us three possible states for a given
      768 + * component:
      769 + *
      770 + *   1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
      771 + *      the illumos FPU default for an item. More on that in a moment.
      772 + *   2. The saved xsave state indicates that the bit for a given component is
      773 + *      zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
      774 + *      In this case, we must take the CPU's default for an item. This is
      775 + *      usually the same as illumos, but not always.
      776 + *   3. The saved xsave state indicates that a given component's state bit is
      777 + *      valid. The simplest of our cases. We can just take what we have from the
      778 + *      xsave state.
      779 + *
      780 + * The CPU's default state for most components other than the x87/SSE state is
      781 + * to have it be zeroed. This is what we treat as our default state as well. The
      782 + * primary difference is in the initialization of the x87/SSE state. The SYS V
      783 + * ABI requires that we enable a different floating point control word than the
      784 + * hardware default. This means that when we're dealing with case (1) for
      785 + * x87/SSE we have to be more careful than the other components. Thankfully for
      786 + * everything else this is just keeping it zeroed.
      787 + *
      788 + * A reasonable question would be why not just skip components that aren't
      789 + * marked as present. There are a few reasons we take a different approach and
      790 + * always include it. Both of these are to make lives simpler for consumers. In
      791 + * the first case, when someone is performing a read and wants to reassemble and
      792 + * answer the question of 'what is the value of %ymm0 or %zmm15', they have
      793 + * to combine multiple disparate parts. If one knows that the data we put into
      794 + * there is always valid and represents what is in hardware and doesn't have to
      795 + * keep track of what are the defaults in different circumstances, then that
      796 + * greatly simplifies consumers lives. It also helps us for core files and other
      797 + * observability cases because the answer to what is the operating system's
      798 + * default may change over time.
      799 + *
      800 + * Similarly, including all the possible structures means that we have
      801 + * simplified someone who does a write. Writes are always setting the full state
      802 + * of a thread, meaning that if someone wants to modify only a single register
      803 + * they must do a read, modify, and write. By including everything that they
      804 + * might need, it makes it easier for consumers to do this and not have to cons
      805 + * up the whole structure on their own.
      806 + *
      807 + * When we're setting state, things change around a little bit. We have a few
      808 + * constraints that are laid out in proc(5). In particular, we require that the
      809 + * PRX_INFO_XSAVE component always be present to tell us which other components
      810 + * we expect to be here and which ones we don't. We also are much stricter about
      811 + * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
      812 + * and may not be modified by a calling process. In addition, when we have
      813 + * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
      814 + * segments, if they are being written to and have modifications, then we will
      815 + * indicate an error there.
      816 + *
      817 + * Because we are given the entire buffer from userland and don't need to have
      818 + * an intermediate place to copy it in, we will validate the entire thing in
      819 + * advance. Once it has been validated and we consider it legal, then we will
      820 + * translate each entry into its corresponding entry in pcb's normal floating
      821 + * point state. This is different from signal handling mostly because of the
      822 + * fact that we are not using copyin, and once we get to this point, there is
      823 + * no more validation, so we don't have the same concerns around blocking while
      824 + * pre-emption is disabled.
      825 + *
      826 + * The Wrinkle with fpregs
      827 + * -----------------------
      828 + *
      829 + * When we instead turn our attention to the fpregs, whether we're gathering
      830 + * them as part of the ucontext_t or as part of /proc, there are a few
      831 + * complications that we need to be aware of when we're operating on a kernel
      832 + * that is using xsave as the save mechanism. When we're using fxsave as the
      833 + * save mechanism, the CPU will always save the entire 512-byte fxsave region.
      834 + * The fpregs ABI that the kernel expects is basically this structure itself,
      835 + * which is transformed into a 32-bit compatible form in archdep.c.
      836 + *
      837 + * But xsave makes this much more complex and has been a source of historical
      838 + * bugs in the system. In particular, unlike fxsave, xsave has its component bit
      839 + * vector that is written out to indicate validity. This means that blindly
      840 + * copying the fxsave area without checking those bits will lead us to do the
      841 + * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
      842 + * while the x87 legacy fp flag covers the rest of the state. This is all good,
      843 + * aside from the MXCSR.
      844 + *
      845 + * One of the more complicated pieces of xsave state management is correctly
      846 + * answering the question of when the MXCSR is written out to xsave_state. In
      847 + * practice, this is rather convoluted and varies. If either the XMM or AVX
      848 + * feature bits are set then the CPU will write out the MXCSR and its mask
      849 + * register into the traditional fxsave state region. This behavior is dependent
      850 + * on the type of save function that we use. xsave and xsaveopt will look at the
      851 + * AVX feature bit; however, xsavec does not and only considers the SSE feature
      852 + * bit. This means that when we're retrieving things, we need to check both of
      853 + * those bits to determine if we should use the initial state or the value
      854 + * written out.
      855 + *
      856 + * When we come to someone trying to set the fpregs through /proc, the main
      857 + * question we have is what happens to the extended registers. We have opted to
      858 + * implement and document it such that a write to the fpregs only impacts the
      859 + * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
      860 + * copying the data into the save area, set the state bits for x87 and XMM
      861 + * state, and then set the FPU to be restored. All in all, this basically means
      862 + * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
      863 + * that we might have present.
      864 + *
      865 + * Forward Looking: Adding Intel AMX Support
      866 + * -----------------------------------------
      867 + *
      868 + * Nothing can stop the march of features being added into the FPU. One of the
      869 + * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
      870 + * Extensions (AMX), which add a large chunk of xsave state to each process.
      871 + * While things like AVX and AVX-512 have been enabled by default, the broader
      872 + * OS community has not been wanting to do this for AMX, because of the size of
      873 + * the state which exceeds 8 KiB. While the signal handling state went out of
      874 + * its way to minimize the size it wrote to the stack, if this is used, it would
      875 + * need to be preserved.
      876 + *
      877 + * To deal with this reality and the fact that folks don't really want to
      878 + * enable it by default for all purposes when its use will be quite special
      879 + * purpose, Intel has also added a MSR around extended feature disable or xfd.
      880 + * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
      881 + * assumption, and the reason that so much of the /proc and signal logic ensures
      882 + * that we have the thread and process around, taking as an example the unused
      883 + * process argument in fpu_proc_xregs_info(), is that we will follow suit and
      884 + * default to having support disabled, but that a process will be able to opt
      885 + * into it, which will result in several different assumptions around signal
      886 + * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
      887 + *
      888 + * The following is a list of items to pay attention to for future folks who
      889 + * work on this:
      890 + *
      891 + *   o We will want to confirm whether other systems have opted to make this
      892 + *     process-wide or thread-wide. Assuming process-wide, we will need to do a
      893 + *     hold of all lwps while making a change. The interface for that probably
      894 + *     doesn't want to be /proc, as a process probably doesn't want to write to
      895 + *     its own control file. Changing it for another process could be done
      896 + *     through the agent-lwp.
      897 + *   o Opting into this should probably be a one-way street.
      898 + *   o Opting into this will need to evaluate all threads and in particular
      899 + *     stack sizes to confirm they adhere to the new minimum.
      900 + *   o We will need to make sure that setting and clearing the xfd MSR is part
      901 + *     of the FPU context ops and something we set by default on every CPU.
      902 + *   o We will need to add a new interface to allow opting into this feature.
      903 + *   o We will need to ensure that all subsequently created signal stacks adhere
      904 + *     to a required minimum size that we communicate through libc.
      905 + *   o We will need to make sure that both rtld and libc no longer rely on a
      906 + *     static value of the AT_SUN_FPSIZE, but rather realize that this can be
      907 + *     dynamic. At that time, we should evaluate if we can get away with not
      908 + *     needing to save this for rtld, even though signal handlers should assume
      909 + *     they will.
      910 + *   o The various components (because there is more than one) will want to be
      911 + *     added to the fpu_xsave_info[]. Consulting the process's xfd will be
      912 + *     required and probably require logic changes.
      913 + *
      914 + * The above is not exhaustive. We'll probably have some other issues and fun
      915 + * while doing this.
 476  916   */
 477  917  
      918 +/*
      919 + * The kind of FPU we advertise to rtld so it knows what to do when working
      920 + * through the PLT.
      921 + */
      922 +int fp_elf = AT_386_FPINFO_FXSAVE;
      923 +
      924 +/*
      925 + * Mechanism to save FPU state.
      926 + */
      927 +int fp_save_mech = FP_FXSAVE;
      928 +
 478  929  kmem_cache_t *fpsave_cachep;
 479  930  
 480  931  /* Legacy fxsave layout + xsave header + ymm */
 481  932  #define AVX_XSAVE_SIZE          (512 + 64 + 256)
 482  933  
 483  934  /*
 484  935   * Various sanity checks.
 485  936   */
 486  937  CTASSERT(sizeof (struct fxsave_state) == 512);
 487  938  CTASSERT(sizeof (struct fnsave_state) == 108);
 488  939  CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
 489  940  CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
 490  941  
 491  942  /*
      943 + * Basic architectural alignment information.
      944 + */
      945 +#define FPU_ALIGN_XMM   16
      946 +#define FPU_ALIGN_YMM   32
      947 +#define FPU_ALIGN_ZMM   64
      948 +
      949 +/*
 492  950   * This structure is the x86 implementation of the kernel FPU that is defined in
 493  951   * uts/common/sys/kfpu.h.
 494  952   */
 495  953  
 496  954  typedef enum kfpu_flags {
 497  955          /*
 498  956           * This indicates that the save state has initial FPU data.
 499  957           */
 500  958          KFPU_F_INITIALIZED = 0x01
 501  959  } kfpu_flags_t;

 502  960  
 503  961  struct kfpu_state {
 504  962          fpu_ctx_t       kfpu_ctx;
 505  963          kfpu_flags_t    kfpu_flags;
 506  964          kthread_t       *kfpu_curthread;
 507  965  };
 508  966  
 509  967  /*
 510  968   * Initial kfpu state for SSE/SSE2 used by fpinit()
 511  969   */
 512  970  const struct fxsave_state sse_initial = {
 513  971          FPU_CW_INIT,    /* fx_fcw */
 514  972          0,              /* fx_fsw */
 515  973          0,              /* fx_fctw */
 516  974          0,              /* fx_fop */
 517  975          0,              /* fx_rip */
 518  976          0,              /* fx_rdp */
 519  977          SSE_MXCSR_INIT  /* fx_mxcsr */
 520  978          /* rest of structure is zero */
 521  979  };
 522  980  
 523  981  /*
 524  982   * Initial kfpu state for AVX used by fpinit()
 525  983   */
 526  984  const struct xsave_state avx_initial = {
 527  985          /*
 528  986           * The definition below needs to be identical with sse_initial
 529  987           * defined above.
 530  988           */
 531  989          .xs_fxsave = {
 532  990                  .fx_fcw = FPU_CW_INIT,
 533  991                  .fx_mxcsr = SSE_MXCSR_INIT,
 534  992          },
 535  993          .xs_header = {
 536  994                  /*
 537  995                   * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
 538  996                   * valid, and CPU should initialize XMM/YMM.
 539  997                   */
 540  998                  .xsh_xstate_bv = 1,
 541  999                  .xsh_xcomp_bv = 0,
 542 1000          },

↓ open down ↓

41 lines elided

↑ open up ↑

 543 1001  };
 544 1002  
 545 1003  /*
 546 1004   * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 547 1005   * the #gp exception caused by setting unsupported bits in the
 548 1006   * MXCSR register
 549 1007   */
 550 1008  uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
 551 1009  
 552 1010  /*
 553      - * Initial kfpu state for x87 used by fpinit()
 554      - */
 555      -const struct fnsave_state x87_initial = {
 556      -        FPU_CW_INIT,    /* f_fcw */
 557      -        0,              /* __f_ign0 */
 558      -        0,              /* f_fsw */
 559      -        0,              /* __f_ign1 */
 560      -        0xffff,         /* f_ftw */
 561      -        /* rest of structure is zero */
 562      -};
 563      -
 564      -/*
 565 1011   * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 566 1012   * have an XSAVE-capable chip in fpu_probe.
 567 1013   */
 568 1014  void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
 569 1015  void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
 570 1016  
 571 1017  /*
 572 1018   * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 573 1019   */
 574 1020  void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

 575 1021  
 576 1022  static int fpe_sicode(uint_t);
 577 1023  static int fpe_simd_sicode(uint_t);
 578 1024  static void fp_new_lwp(void *, void *);
 579 1025  static void fp_free_ctx(void *, int);
 580 1026  
 581 1027  static struct ctxop *
 582 1028  fp_ctxop_allocate(struct fpu_ctx *fp)
 583 1029  {
 584 1030          const struct ctxop_template tpl = {
 585 1031                  .ct_rev         = CTXOP_TPL_REV,
 586 1032                  .ct_save        = fpsave_ctxt,
 587 1033                  .ct_restore     = fprestore_ctxt,
 588 1034                  .ct_fork        = fp_new_lwp,
 589 1035                  .ct_lwp_create  = fp_new_lwp,
 590 1036                  .ct_free        = fp_free_ctx,
 591 1037          };
 592 1038          return (ctxop_allocate(&tpl, fp));
 593 1039  }
 594 1040  
 595 1041  /*
 596 1042   * Copy the state of parent lwp's floating point context into the new lwp.
 597 1043   * Invoked for both fork() and lwp_create().
 598 1044   *
 599 1045   * Note that we inherit -only- the control state (e.g. exception masks,
 600 1046   * rounding, precision control, etc.); the FPU registers are otherwise
 601 1047   * reset to their initial state.
 602 1048   */
 603 1049  static void
 604 1050  fp_new_lwp(void *parent, void *child)
 605 1051  {
 606 1052          kthread_id_t t = parent, ct = child;
 607 1053          struct fpu_ctx *fp;             /* parent fpu context */
 608 1054          struct fpu_ctx *cfp;            /* new fpu context */
 609 1055          struct fxsave_state *fx, *cfx;
 610 1056          struct xsave_state *cxs;
 611 1057  
 612 1058          ASSERT(fp_kind != FP_NO);
 613 1059  
 614 1060          fp = &t->t_lwp->lwp_pcb.pcb_fpu;
 615 1061          cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
 616 1062  
 617 1063          /*
 618 1064           * If the parent FPU state is still in the FPU hw then save it;
 619 1065           * conveniently, fp_save() already does this for us nicely.
 620 1066           */
 621 1067          fp_save(fp);
 622 1068  
 623 1069          cfp->fpu_flags = FPU_EN | FPU_VALID;
 624 1070          cfp->fpu_regs.kfpu_status = 0;
 625 1071          cfp->fpu_regs.kfpu_xstatus = 0;
 626 1072  
 627 1073          /*
 628 1074           * Make sure that the child's FPU is cleaned up and made ready for user
 629 1075           * land.
 630 1076           */
 631 1077          PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
 632 1078  
 633 1079          switch (fp_save_mech) {
 634 1080          case FP_FXSAVE:
 635 1081                  fx = fp->fpu_regs.kfpu_u.kfpu_fx;
 636 1082                  cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
 637 1083                  bcopy(&sse_initial, cfx, sizeof (*cfx));
 638 1084                  cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
 639 1085                  cfx->fx_fcw = fx->fx_fcw;
 640 1086                  break;
 641 1087  
 642 1088          case FP_XSAVE:
 643 1089                  cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
 644 1090  
 645 1091                  VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
 646 1092  
 647 1093                  fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
 648 1094                  cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
 649 1095                  cfx = &cxs->xs_fxsave;
 650 1096  
 651 1097                  bcopy(&avx_initial, cxs, sizeof (*cxs));
 652 1098                  cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
 653 1099                  cfx->fx_fcw = fx->fx_fcw;

↓ open down ↓

79 lines elided

↑ open up ↑

 654 1100                  cxs->xs_header.xsh_xstate_bv |=
 655 1101                      (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
 656 1102                  break;
 657 1103          default:
 658 1104                  panic("Invalid fp_save_mech");
 659 1105                  /*NOTREACHED*/
 660 1106          }
 661 1107  
 662 1108          /*
 663 1109           * Mark that both the parent and child need to have the FPU cleaned up
 664      -         * before returning to user land.
     1110 +         * before returning to userland.
 665 1111           */
 666 1112  
 667 1113          ctxop_attach(ct, fp_ctxop_allocate(cfp));
 668 1114  }
 669 1115  
 670 1116  /*
 671 1117   * Free any state associated with floating point context.
 672 1118   * fp_free() can be called in three cases:
 673 1119   * 1) from reaper -> thread_free -> freectx -> fp_free
 674 1120   *      fp context belongs to a thread on deathrow

 675 1121   *      nothing to do,  thread will never be resumed
 676 1122   *      thread calling ctxfree is reaper
 677 1123   *
 678 1124   * 2) from exec -> freectx -> fp_free
 679 1125   *      fp context belongs to the current thread
 680 1126   *      must disable fpu, thread calling ctxfree is curthread
 681 1127   *
 682 1128   * 3) from restorecontext -> setfpregs -> fp_free
 683 1129   *      we have a modified context in the memory (lwp->pcb_fpu)
 684 1130   *      disable fpu and release the fp context for the CPU
 685 1131   *
 686 1132   */
 687 1133  void
 688 1134  fp_free(struct fpu_ctx *fp)
 689 1135  {
 690 1136          ASSERT(fp_kind != FP_NO);
 691 1137  
 692 1138          if (fp->fpu_flags & FPU_VALID)
 693 1139                  return;
 694 1140  
 695 1141          kpreempt_disable();
 696 1142          /*
 697 1143           * We want to do fpsave rather than fpdisable so that we can
 698 1144           * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
 699 1145           */
 700 1146          fp->fpu_flags |= FPU_VALID;
 701 1147          /* If for current thread disable FP to track FPU_VALID */
 702 1148          if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
 703 1149                  /* Clear errors if any to prevent frstor from complaining */
 704 1150                  (void) fperr_reset();
 705 1151                  if (fp_kind & __FP_SSE)
 706 1152                          (void) fpxerr_reset();
 707 1153                  fpdisable();
 708 1154          }
 709 1155          kpreempt_enable();
 710 1156  }
 711 1157  
 712 1158  /*
 713 1159   * Wrapper for freectx to make the types line up for fp_free()
 714 1160   */
 715 1161  static void
 716 1162  fp_free_ctx(void *arg, int isexec __unused)
 717 1163  {
 718 1164          fp_free((struct fpu_ctx *)arg);
 719 1165  }
 720 1166  
 721 1167  /*
 722 1168   * Store the floating point state and disable the floating point unit.
 723 1169   */
 724 1170  void
 725 1171  fp_save(struct fpu_ctx *fp)
 726 1172  {
 727 1173          ASSERT(fp_kind != FP_NO);
 728 1174  
 729 1175          kpreempt_disable();
 730 1176          if (!fp || fp->fpu_flags & FPU_VALID ||
 731 1177              (fp->fpu_flags & FPU_EN) == 0) {
 732 1178                  kpreempt_enable();
 733 1179                  return;
 734 1180          }
 735 1181          ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
 736 1182  
 737 1183          switch (fp_save_mech) {
 738 1184          case FP_FXSAVE:
 739 1185                  fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
 740 1186                  break;
 741 1187  
 742 1188          case FP_XSAVE:
 743 1189                  xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
 744 1190                  break;
 745 1191          default:
 746 1192                  panic("Invalid fp_save_mech");
 747 1193                  /*NOTREACHED*/
 748 1194          }
 749 1195  
 750 1196          fp->fpu_flags |= FPU_VALID;
 751 1197  
 752 1198          /*
 753 1199           * We save the FPU as part of forking, execing, modifications via /proc,
 754 1200           * restorecontext, etc. As such, we need to make sure that we return to
 755 1201           * userland with valid state in the FPU. If we're context switched out
 756 1202           * before we hit sys_rtt_common() we'll end up having restored the FPU
 757 1203           * as part of the context ops operations. The restore logic always makes
 758 1204           * sure that FPU_VALID is set before doing a restore so we don't restore
 759 1205           * it a second time.
 760 1206           */
 761 1207          PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
 762 1208  
 763 1209          kpreempt_enable();
 764 1210  }
 765 1211  
 766 1212  /*
 767 1213   * Restore the FPU context for the thread:
 768 1214   * The possibilities are:
 769 1215   *      1. No active FPU context: Load the new context into the FPU hw
 770 1216   *         and enable the FPU.
 771 1217   */
 772 1218  void
 773 1219  fp_restore(struct fpu_ctx *fp)
 774 1220  {
 775 1221          switch (fp_save_mech) {
 776 1222          case FP_FXSAVE:
 777 1223                  fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
 778 1224                  break;
 779 1225  
 780 1226          case FP_XSAVE:
 781 1227                  xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
 782 1228                  break;
 783 1229          default:
 784 1230                  panic("Invalid fp_save_mech");
 785 1231                  /*NOTREACHED*/
 786 1232          }
 787 1233  
 788 1234          fp->fpu_flags &= ~FPU_VALID;
 789 1235  }
 790 1236  
 791 1237  /*
 792 1238   * Reset the FPU such that it is in a valid state for a new thread that is
 793 1239   * coming out of exec. The FPU will be in a usable state at this point. At this
 794 1240   * point we know that the FPU state has already been allocated and if this
 795 1241   * wasn't an init process, then it will have had fp_free() previously called.
 796 1242   */
 797 1243  void
 798 1244  fp_exec(void)
 799 1245  {
 800 1246          struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 801 1247  
 802 1248          if (fp_save_mech == FP_XSAVE) {
 803 1249                  fp->fpu_xsave_mask = XFEATURE_FP_ALL;
 804 1250          }
 805 1251  
 806 1252          struct ctxop *ctx = fp_ctxop_allocate(fp);
 807 1253          /*
 808 1254           * Make sure that we're not preempted in the middle of initializing the
 809 1255           * FPU on CPU.
 810 1256           */
 811 1257          kpreempt_disable();
 812 1258          ctxop_attach(curthread, ctx);
 813 1259          fpinit();
 814 1260          fp->fpu_flags = FPU_EN;
 815 1261          kpreempt_enable();
 816 1262  }
 817 1263  
 818 1264  
 819 1265  /*
 820 1266   * Seeds the initial state for the current thread.  The possibilities are:
 821 1267   *      1. Another process has modified the FPU state before we have done any
 822 1268   *         initialization: Load the FPU state from the LWP state.
 823 1269   *      2. The FPU state has not been externally modified:  Load a clean state.
 824 1270   */
 825 1271  void
 826 1272  fp_seed(void)
 827 1273  {
 828 1274          struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 829 1275  
 830 1276          ASSERT(curthread->t_preempt >= 1);
 831 1277          ASSERT((fp->fpu_flags & FPU_EN) == 0);
 832 1278  
 833 1279          /*
 834 1280           * Always initialize a new context and initialize the hardware.
 835 1281           */
 836 1282          if (fp_save_mech == FP_XSAVE) {
 837 1283                  fp->fpu_xsave_mask = XFEATURE_FP_ALL;
 838 1284          }
 839 1285  
 840 1286          ctxop_attach(curthread, fp_ctxop_allocate(fp));
 841 1287          fpinit();
 842 1288  
 843 1289          /*
 844 1290           * If FPU_VALID is set, it means someone has modified registers via
 845 1291           * /proc.  In this case, restore the current lwp's state.
 846 1292           */
 847 1293          if (fp->fpu_flags & FPU_VALID)
 848 1294                  fp_restore(fp);
 849 1295  
 850 1296          ASSERT((fp->fpu_flags & FPU_VALID) == 0);
 851 1297          fp->fpu_flags = FPU_EN;
 852 1298  }
 853 1299  
 854 1300  /*
 855 1301   * When using xsave/xrstor, these three functions are used by the lwp code to
 856 1302   * manage the memory for the xsave area.
 857 1303   */
 858 1304  void

↓ open down ↓

184 lines elided

↑ open up ↑

 859 1305  fp_lwp_init(struct _klwp *lwp)
 860 1306  {
 861 1307          struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
 862 1308  
 863 1309          /*
 864 1310           * We keep a copy of the pointer in lwp_fpu so that we can restore the
 865 1311           * value in forklwp() after we duplicate the parent's LWP state.
 866 1312           */
 867 1313          lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
 868 1314              kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
     1315 +        fp->fpu_signal = NULL;
 869 1316  
 870 1317          if (fp_save_mech == FP_XSAVE) {
 871 1318                  /*
 872 1319                   *
 873 1320                   * We bzero since the fpinit() code path will only
 874 1321                   * partially initialize the xsave area using avx_initial.
 875 1322                   */
 876 1323                  ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
 877 1324                  bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
 878 1325          }

 879 1326  }
 880 1327

↓ open down ↓

2 lines elided

↑ open up ↑

 881 1328  void
 882 1329  fp_lwp_cleanup(struct _klwp *lwp)
 883 1330  {
 884 1331          struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
 885 1332  
 886 1333          if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
 887 1334                  kmem_cache_free(fpsave_cachep,
 888 1335                      fp->fpu_regs.kfpu_u.kfpu_generic);
 889 1336                  lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
 890 1337          }
     1338 +
     1339 +        if (fp->fpu_signal != NULL) {
     1340 +                kmem_cache_free(fpsave_cachep, fp->fpu_signal);
     1341 +                fp->fpu_signal = NULL;
     1342 +        }
 891 1343  }
 892 1344  
 893 1345  /*
 894 1346   * Called during the process of forklwp(). The kfpu_u pointer will have been
 895 1347   * overwritten while copying the parent's LWP structure. We have a valid copy
 896 1348   * stashed in the child's lwp_fpu which we use to restore the correct value.
 897 1349   */
 898 1350  void
 899 1351  fp_lwp_dup(struct _klwp *lwp)
 900 1352  {

 901 1353          void *xp = lwp->lwp_fpu;
 902 1354          size_t sz;
 903 1355  
 904 1356          switch (fp_save_mech) {
 905 1357          case FP_FXSAVE:
 906 1358                  sz = sizeof (struct fxsave_state);
 907 1359                  break;
 908 1360          case FP_XSAVE:
 909 1361                  sz = cpuid_get_xsave_size();

↓ open down ↓

9 lines elided

↑ open up ↑

 910 1362                  break;
 911 1363          default:
 912 1364                  panic("Invalid fp_save_mech");
 913 1365                  /*NOTREACHED*/
 914 1366          }
 915 1367  
 916 1368          /* copy the parent's values into the new lwp's struct */
 917 1369          bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
 918 1370          /* now restore the pointer */
 919 1371          lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
     1372 +        /* Ensure that we don't inherit our parent's signal state */
     1373 +        lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
 920 1374  }
 921 1375  
 922 1376  /*
 923 1377   * Handle a processor extension error fault
 924 1378   * Returns non zero for error.
 925 1379   */
 926 1380  
 927 1381  /*ARGSUSED*/
 928 1382  int
 929 1383  fpexterrflt(struct regs *rp)

 930 1384  {
 931 1385          uint32_t fpcw, fpsw;
 932 1386          fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 933 1387  
 934 1388          ASSERT(fp_kind != FP_NO);
 935 1389  
 936 1390          /*
 937 1391           * Now we can enable the interrupts.
 938 1392           * (NOTE: x87 fp exceptions come through an interrupt gate)
 939 1393           */
 940 1394          sti();
 941 1395  
 942 1396          if (!fpu_exists)
 943 1397                  return (FPE_FLTINV);
 944 1398  
 945 1399          /*
 946 1400           * Do an unconditional save of the FP state.  If it's dirty (TS=0),
 947 1401           * it'll be saved into the fpu context area passed in (that of the
 948 1402           * current thread).  If it's not dirty (it may not be, due to
 949 1403           * an intervening save due to a context switch between the sti()
 950 1404           * above and here), then it's safe to just use the stored values in
 951 1405           * the context save area to determine the cause of the fault.
 952 1406           */
 953 1407          fp_save(fp);
 954 1408  
 955 1409          /* clear exception flags in saved state, as if by fnclex */
 956 1410          switch (fp_save_mech) {
 957 1411          case FP_FXSAVE:
 958 1412                  fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
 959 1413                  fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
 960 1414                  fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
 961 1415                  break;
 962 1416  
 963 1417          case FP_XSAVE:
 964 1418                  fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
 965 1419                  fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
 966 1420                  fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
 967 1421                  /*
 968 1422                   * Always set LEGACY_FP as it may have been cleared by the
 969 1423                   * XSAVE instruction.
 970 1424                   */
 971 1425                  fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
 972 1426                      XFEATURE_LEGACY_FP;
 973 1427                  break;
 974 1428          default:
 975 1429                  panic("Invalid fp_save_mech");
 976 1430                  /*NOTREACHED*/
 977 1431          }
 978 1432  
 979 1433          fp->fpu_regs.kfpu_status = fpsw;
 980 1434  
 981 1435          if ((fpsw & FPS_ES) == 0)
 982 1436                  return (0);             /* No exception */
 983 1437  
 984 1438          /*
 985 1439           * "and" the exception flags with the complement of the mask
 986 1440           * bits to determine which exception occurred
 987 1441           */
 988 1442          return (fpe_sicode(fpsw & ~fpcw & 0x3f));
 989 1443  }
 990 1444  
 991 1445  /*
 992 1446   * Handle an SSE/SSE2 precise exception.
 993 1447   * Returns a non-zero sicode for error.
 994 1448   */
 995 1449  /*ARGSUSED*/
 996 1450  int
 997 1451  fpsimderrflt(struct regs *rp)
 998 1452  {
 999 1453          uint32_t mxcsr, xmask;
1000 1454          fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1001 1455  
1002 1456          ASSERT(fp_kind & __FP_SSE);
1003 1457  
1004 1458          /*
1005 1459           * NOTE: Interrupts are disabled during execution of this
1006 1460           * function.  They are enabled by the caller in trap.c.
1007 1461           */
1008 1462  
1009 1463          /*
1010 1464           * The only way we could have gotten here if there is no FP unit
1011 1465           * is via a user executing an INT $19 instruction, so there is
1012 1466           * no fault in that case.
1013 1467           */
1014 1468          if (!fpu_exists)
1015 1469                  return (0);
1016 1470  
1017 1471          /*
1018 1472           * Do an unconditional save of the FP state.  If it's dirty (TS=0),
1019 1473           * it'll be saved into the fpu context area passed in (that of the
1020 1474           * current thread).  If it's not dirty, then it's safe to just use
1021 1475           * the stored values in the context save area to determine the
1022 1476           * cause of the fault.
1023 1477           */
1024 1478          fp_save(fp);            /* save the FPU state */
1025 1479  
1026 1480          if (fp_save_mech == FP_XSAVE) {
1027 1481                  mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1028 1482                  fp->fpu_regs.kfpu_status =
1029 1483                      fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1030 1484          } else {
1031 1485                  mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1032 1486                  fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1033 1487          }
1034 1488          fp->fpu_regs.kfpu_xstatus = mxcsr;
1035 1489  
1036 1490          /*
1037 1491           * compute the mask that determines which conditions can cause
1038 1492           * a #xm exception, and use this to clean the status bits so that
1039 1493           * we can identify the true cause of this one.
1040 1494           */
1041 1495          xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1042 1496          return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1043 1497  }
1044 1498  
1045 1499  /*
1046 1500   * In the unlikely event that someone is relying on this subcode being
1047 1501   * FPE_FLTILL for denormalize exceptions, it can always be patched back
1048 1502   * again to restore old behaviour.
1049 1503   */
1050 1504  int fpe_fltden = FPE_FLTDEN;
1051 1505  
1052 1506  /*
1053 1507   * Map from the FPU status word to the FP exception si_code.
1054 1508   */
1055 1509  static int
1056 1510  fpe_sicode(uint_t sw)
1057 1511  {
1058 1512          if (sw & FPS_IE)
1059 1513                  return (FPE_FLTINV);
1060 1514          if (sw & FPS_ZE)
1061 1515                  return (FPE_FLTDIV);
1062 1516          if (sw & FPS_DE)
1063 1517                  return (fpe_fltden);
1064 1518          if (sw & FPS_OE)
1065 1519                  return (FPE_FLTOVF);
1066 1520          if (sw & FPS_UE)
1067 1521                  return (FPE_FLTUND);
1068 1522          if (sw & FPS_PE)
1069 1523                  return (FPE_FLTRES);
1070 1524          return (FPE_FLTINV);    /* default si_code for other exceptions */
1071 1525  }
1072 1526  
1073 1527  /*
1074 1528   * Map from the SSE status word to the FP exception si_code.
1075 1529   */
1076 1530  static int
1077 1531  fpe_simd_sicode(uint_t sw)
1078 1532  {
1079 1533          if (sw & SSE_IE)
1080 1534                  return (FPE_FLTINV);
1081 1535          if (sw & SSE_ZE)
1082 1536                  return (FPE_FLTDIV);
1083 1537          if (sw & SSE_DE)
1084 1538                  return (FPE_FLTDEN);
1085 1539          if (sw & SSE_OE)
1086 1540                  return (FPE_FLTOVF);
1087 1541          if (sw & SSE_UE)
1088 1542                  return (FPE_FLTUND);
1089 1543          if (sw & SSE_PE)
1090 1544                  return (FPE_FLTRES);
1091 1545          return (FPE_FLTINV);    /* default si_code for other exceptions */
1092 1546  }
1093 1547  
1094 1548  /*
1095 1549   * This routine is invoked as part of libc's __fpstart implementation
1096 1550   * via sysi86(2).
1097 1551   *
1098 1552   * It may be called -before- any context has been assigned in which case
1099 1553   * we try to avoid touching the hardware.  Or it may be invoked well
1100 1554   * after the context has been assigned and fiddled with, in which case
1101 1555   * just tweak it directly.
1102 1556   */
1103 1557  void
1104 1558  fpsetcw(uint16_t fcw, uint32_t mxcsr)
1105 1559  {
1106 1560          struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1107 1561          struct fxsave_state *fx;
1108 1562  
1109 1563          if (!fpu_exists || fp_kind == FP_NO)
1110 1564                  return;
1111 1565  
1112 1566          if ((fp->fpu_flags & FPU_EN) == 0) {
1113 1567                  if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1114 1568                          /*
1115 1569                           * Common case.  Floating point unit not yet
1116 1570                           * enabled, and kernel already intends to initialize
1117 1571                           * the hardware the way the caller wants.
1118 1572                           */
1119 1573                          return;
1120 1574                  }
1121 1575                  /*
1122 1576                   * Hmm.  Userland wants a different default.
1123 1577                   * Do a fake "first trap" to establish the context, then
1124 1578                   * handle as if we already had a context before we came in.
1125 1579                   */
1126 1580                  kpreempt_disable();
1127 1581                  fp_seed();
1128 1582                  kpreempt_enable();
1129 1583          }
1130 1584  
1131 1585          /*
1132 1586           * Ensure that the current hardware state is flushed back to the
1133 1587           * pcb, then modify that copy.  Next use of the fp will
1134 1588           * restore the context.
1135 1589           */
1136 1590          fp_save(fp);
1137 1591  
1138 1592          switch (fp_save_mech) {
1139 1593          case FP_FXSAVE:
1140 1594                  fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1141 1595                  fx->fx_fcw = fcw;
1142 1596                  fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1143 1597                  break;
1144 1598  
1145 1599          case FP_XSAVE:
1146 1600                  fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1147 1601                  fx->fx_fcw = fcw;
1148 1602                  fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1149 1603                  /*
1150 1604                   * Always set LEGACY_FP as it may have been cleared by the
1151 1605                   * XSAVE instruction.
1152 1606                   */
1153 1607                  fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1154 1608                      XFEATURE_LEGACY_FP;
1155 1609                  break;
1156 1610          default:
1157 1611                  panic("Invalid fp_save_mech");
1158 1612                  /*NOTREACHED*/
1159 1613          }
1160 1614  }
1161 1615  
1162 1616  static void
1163 1617  kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1164 1618  {
1165 1619          struct xsave_state *xs;
1166 1620  
1167 1621          switch (fp_save_mech) {
1168 1622          case FP_FXSAVE:
1169 1623                  bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1170 1624                      sizeof (struct fxsave_state));
1171 1625                  kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1172 1626                  break;
1173 1627          case FP_XSAVE:
1174 1628                  xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1175 1629                  bzero(xs, cpuid_get_xsave_size());
1176 1630                  bcopy(&avx_initial, xs, sizeof (*xs));
1177 1631                  xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1178 1632                  kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1179 1633                  break;
1180 1634          default:
1181 1635                  panic("invalid fp_save_mech");
1182 1636          }
1183 1637  
1184 1638          /*
1185 1639           * Set the corresponding flags that the system expects on the FPU state
1186 1640           * to indicate that this is our state. The FPU_EN flag is required to
1187 1641           * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
1188 1642           * not set below as it represents that this state is being suppressed
1189 1643           * by the kernel.
1190 1644           */
1191 1645          kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1192 1646          kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1193 1647  }
1194 1648  
1195 1649  kfpu_state_t *
1196 1650  kernel_fpu_alloc(int kmflags)
1197 1651  {
1198 1652          kfpu_state_t *kfpu;
1199 1653  
1200 1654          if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1201 1655                  return (NULL);
1202 1656          }
1203 1657  
1204 1658          kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1205 1659              kmem_cache_alloc(fpsave_cachep, kmflags);
1206 1660          if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1207 1661                  kmem_free(kfpu, sizeof (kfpu_state_t));
1208 1662                  return (NULL);
1209 1663          }
1210 1664  
1211 1665          kernel_fpu_fpstate_init(kfpu);
1212 1666  
1213 1667          return (kfpu);
1214 1668  }
1215 1669  
1216 1670  void
1217 1671  kernel_fpu_free(kfpu_state_t *kfpu)
1218 1672  {
1219 1673          kmem_cache_free(fpsave_cachep,
1220 1674              kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1221 1675          kmem_free(kfpu, sizeof (kfpu_state_t));
1222 1676  }
1223 1677  
1224 1678  static void
1225 1679  kernel_fpu_ctx_save(void *arg)
1226 1680  {
1227 1681          kfpu_state_t *kfpu = arg;
1228 1682          fpu_ctx_t *pf;
1229 1683  
1230 1684          if (kfpu == NULL) {
1231 1685                  /*
1232 1686                   * A NULL kfpu implies this is a kernel thread with an LWP and
1233 1687                   * no user-level FPU usage. Use the lwp fpu save area.
1234 1688                   */
1235 1689                  pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1236 1690  
1237 1691                  ASSERT(curthread->t_procp->p_flag & SSYS);
1238 1692                  ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1239 1693  
1240 1694                  fp_save(pf);
1241 1695          } else {
1242 1696                  pf = &kfpu->kfpu_ctx;
1243 1697  
1244 1698                  ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1245 1699                  ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1246 1700  
1247 1701                  /*
1248 1702                   * Note, we can't use fp_save because it assumes that we're
1249 1703                   * saving to the thread's PCB and not somewhere else. Because
1250 1704                   * this is a different FPU context, we instead have to do this
1251 1705                   * ourselves.
1252 1706                   */
1253 1707                  switch (fp_save_mech) {
1254 1708                  case FP_FXSAVE:
1255 1709                          fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1256 1710                          break;
1257 1711                  case FP_XSAVE:
1258 1712                          xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1259 1713                          break;
1260 1714                  default:
1261 1715                          panic("Invalid fp_save_mech");
1262 1716                  }
1263 1717  
1264 1718                  /*
1265 1719                   * Because we have saved context here, our save state is no
1266 1720                   * longer valid and therefore needs to be reinitialized.
1267 1721                   */
1268 1722                  kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1269 1723          }
1270 1724  
1271 1725          pf->fpu_flags |= FPU_VALID;
1272 1726  
1273 1727          /*
1274 1728           * Clear KFPU flag. This allows swtch to check for improper kernel
1275 1729           * usage of the FPU (i.e. switching to a new thread while the old
1276 1730           * thread was in the kernel and using the FPU, but did not perform a
1277 1731           * context save).
1278 1732           */
1279 1733          curthread->t_flag &= ~T_KFPU;
1280 1734  }
1281 1735  
1282 1736  static void
1283 1737  kernel_fpu_ctx_restore(void *arg)
1284 1738  {
1285 1739          kfpu_state_t *kfpu = arg;
1286 1740          fpu_ctx_t *pf;
1287 1741  
1288 1742          if (kfpu == NULL) {
1289 1743                  /*
1290 1744                   * A NULL kfpu implies this is a kernel thread with an LWP and
1291 1745                   * no user-level FPU usage. Use the lwp fpu save area.
1292 1746                   */
1293 1747                  pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1294 1748  
1295 1749                  ASSERT(curthread->t_procp->p_flag & SSYS);
1296 1750                  ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1297 1751          } else {
1298 1752                  pf = &kfpu->kfpu_ctx;
1299 1753  
1300 1754                  ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1301 1755                  ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1302 1756          }
1303 1757  
1304 1758          fp_restore(pf);
1305 1759          curthread->t_flag |= T_KFPU;
1306 1760  }
1307 1761  
1308 1762  /*
1309 1763   * Validate that the thread is not switching off-cpu while actively using the
1310 1764   * FPU within the kernel.
1311 1765   */
1312 1766  void
1313 1767  kernel_fpu_no_swtch(void)
1314 1768  {
1315 1769          if ((curthread->t_flag & T_KFPU) != 0) {
1316 1770                  panic("curthread swtch-ing while the kernel is using the FPU");
1317 1771          }
1318 1772  }
1319 1773  
1320 1774  static const struct ctxop_template kfpu_ctxop_tpl = {
1321 1775          .ct_rev         = CTXOP_TPL_REV,
1322 1776          .ct_save        = kernel_fpu_ctx_save,
1323 1777          .ct_restore     = kernel_fpu_ctx_restore,
1324 1778  };
1325 1779  
1326 1780  void
1327 1781  kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1328 1782  {
1329 1783          klwp_t *pl = curthread->t_lwp;
1330 1784          struct ctxop *ctx;
1331 1785  
1332 1786          if ((curthread->t_flag & T_KFPU) != 0) {
1333 1787                  panic("curthread attempting to nest kernel FPU states");
1334 1788          }
1335 1789  
1336 1790          /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1337 1791          ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1338 1792              (KFPU_USE_LWP | KFPU_NO_STATE));
1339 1793  
1340 1794          if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1341 1795                  /*
1342 1796                   * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1343 1797                   * hold our kernel FPU context, we depend on the caller doing
1344 1798                   * kpreempt_disable for the duration of our FPU usage. This
1345 1799                   * should only be done for very short periods of time.
1346 1800                   */
1347 1801                  ASSERT(curthread->t_preempt > 0);
1348 1802                  ASSERT(kfpu == NULL);
1349 1803  
1350 1804                  if (pl != NULL) {
1351 1805                          /*
1352 1806                           * We might have already saved once so FPU_VALID could
1353 1807                           * be set. This is handled in fp_save.
1354 1808                           */
1355 1809                          fp_save(&pl->lwp_pcb.pcb_fpu);
1356 1810                          pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1357 1811                  }
1358 1812  
1359 1813                  curthread->t_flag |= T_KFPU;
1360 1814  
1361 1815                  /* Always restore the fpu to the initial state. */
1362 1816                  fpinit();
1363 1817  
1364 1818                  return;
1365 1819          }
1366 1820  
1367 1821          /*
1368 1822           * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1369 1823           */
1370 1824  
1371 1825          if ((flags & KFPU_USE_LWP) == 0) {
1372 1826                  if (kfpu->kfpu_curthread != NULL)
1373 1827                          panic("attempting to reuse kernel FPU state at %p when "
1374 1828                              "another thread already is using", kfpu);
1375 1829  
1376 1830                  if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1377 1831                          kernel_fpu_fpstate_init(kfpu);
1378 1832  
1379 1833                  kfpu->kfpu_curthread = curthread;
1380 1834          }
1381 1835  
1382 1836          /*
1383 1837           * Not all threads may have an active LWP. If they do and we're not
1384 1838           * going to re-use the LWP, then we should go ahead and save the state.
1385 1839           * We must also note that the fpu is now being used by the kernel and
1386 1840           * therefore we do not want to manage the fpu state via the user-level
1387 1841           * thread's context handlers.
1388 1842           *
1389 1843           * We might have already saved once (due to a prior use of the kernel
1390 1844           * FPU or another code path) so FPU_VALID could be set. This is handled
1391 1845           * by fp_save, as is the FPU_EN check.
1392 1846           */
1393 1847          ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1394 1848          kpreempt_disable();
1395 1849          if (pl != NULL) {
1396 1850                  if ((flags & KFPU_USE_LWP) == 0)
1397 1851                          fp_save(&pl->lwp_pcb.pcb_fpu);
1398 1852                  pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1399 1853          }
1400 1854  
1401 1855          /*
1402 1856           * Set the context operations for kernel FPU usage.  Because kernel FPU
1403 1857           * setup and ctxop attachment needs to happen under the protection of
1404 1858           * kpreempt_disable(), we allocate the ctxop outside the guard so its
1405 1859           * sleeping allocation will not cause a voluntary swtch().  This allows
1406 1860           * the rest of the initialization to proceed, ensuring valid state for
1407 1861           * the ctxop handlers.
1408 1862           */
1409 1863          ctxop_attach(curthread, ctx);
1410 1864          curthread->t_flag |= T_KFPU;
1411 1865  
1412 1866          if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1413 1867                  /*
1414 1868                   * For pure kernel threads with an LWP, we can use the LWP's
1415 1869                   * pcb_fpu to save/restore context.
1416 1870                   */
1417 1871                  fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1418 1872  
1419 1873                  VERIFY(curthread->t_procp->p_flag & SSYS);
1420 1874                  VERIFY(kfpu == NULL);
1421 1875                  ASSERT((pf->fpu_flags & FPU_EN) == 0);
1422 1876  
1423 1877                  /* Always restore the fpu to the initial state. */
1424 1878                  if (fp_save_mech == FP_XSAVE)
1425 1879                          pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1426 1880                  fpinit();
1427 1881                  pf->fpu_flags = FPU_EN | FPU_KERNEL;
1428 1882          } else {
1429 1883                  /* initialize the kfpu state */
1430 1884                  kernel_fpu_ctx_restore(kfpu);
1431 1885          }
1432 1886          kpreempt_enable();
1433 1887  }
1434 1888  
1435 1889  void
1436 1890  kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1437 1891  {
1438 1892          if ((curthread->t_flag & T_KFPU) == 0) {
1439 1893                  panic("curthread attempting to clear kernel FPU state "
1440 1894                      "without using it");
1441 1895          }
1442 1896  
1443 1897          /*
1444 1898           * General comments on why the rest of this function is structured the
1445 1899           * way it is. Be aware that there is a lot of subtlety here.
1446 1900           *
1447 1901           * If a user-level thread ever uses the fpu while in the kernel, then
1448 1902           * we cannot call fpdisable since that does STTS. That will set the
1449 1903           * ts bit in %cr0 which will cause an exception if anything touches the
1450 1904           * fpu. However, the user-level context switch handler (fpsave_ctxt)
1451 1905           * needs to access the fpu to save the registers into the pcb.
1452 1906           * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
1453 1907           * fprestore_ctxt when the thread context switched onto the CPU.
1454 1908           *
1455 1909           * Calling fpdisable only affects the current CPU's %cr0 register.
1456 1910           *
1457 1911           * During ctxop_remove and kpreempt_enable, we can voluntarily context
1458 1912           * switch, so the CPU we were on when we entered this function might
1459 1913           * not be the same one we're on when we return from ctxop_remove or end
1460 1914           * the function. Note there can be user-level context switch handlers
1461 1915           * still installed if this is a user-level thread.
1462 1916           *
1463 1917           * We also must be careful in the unlikely chance we're running in an
1464 1918           * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1465 1919           * incorrectly for the "real" thread to resume on this CPU.
1466 1920           */
1467 1921  
1468 1922          if ((flags & KFPU_NO_STATE) == 0) {
1469 1923                  kpreempt_disable();
1470 1924          } else {
1471 1925                  ASSERT(curthread->t_preempt > 0);
1472 1926          }
1473 1927  
1474 1928          curthread->t_flag &= ~T_KFPU;
1475 1929  
1476 1930          /*
1477 1931           * When we are ending things, we explicitly don't save the current
1478 1932           * kernel FPU state back to the temporary state. The kfpu API is not
1479 1933           * intended to be a permanent save location.
1480 1934           *
1481 1935           * If this is a user-level thread and we were to context switch
1482 1936           * before returning to user-land, fpsave_ctxt will be a no-op since we
1483 1937           * already saved the user-level FPU state the first time we run
1484 1938           * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1485 1939           * the user-level fpu state). The fpsave_ctxt functions only save if
1486 1940           * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
1487 1941           * fprestore_ctxt will be done in sys_rtt_common when the thread
1488 1942           * finally returns to user-land.
1489 1943           */
1490 1944  
1491 1945          if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1492 1946              curthread->t_intr == NULL) {
1493 1947                  /*
1494 1948                   * A kernel thread which is not an interrupt thread, so we
1495 1949                   * STTS now.
1496 1950                   */
1497 1951                  fpdisable();
1498 1952          }
1499 1953  
1500 1954          if ((flags & KFPU_NO_STATE) == 0) {
1501 1955                  ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1502 1956  
1503 1957                  if (kfpu != NULL) {
1504 1958                          if (kfpu->kfpu_curthread != curthread) {
1505 1959                                  panic("attempting to end kernel FPU state "
1506 1960                                      "for %p, but active thread is not "
1507 1961                                      "curthread", kfpu);
1508 1962                          } else {
1509 1963                                  kfpu->kfpu_curthread = NULL;
1510 1964                          }
1511 1965                  }
1512 1966  
1513 1967                  kpreempt_enable();
1514 1968          }
1515 1969

↓ open down ↓

586 lines elided

↑ open up ↑

1516 1970          if (curthread->t_lwp != NULL) {
1517 1971                  uint_t f;
1518 1972  
1519 1973                  if (flags & KFPU_USE_LWP) {
1520 1974                          f = FPU_EN | FPU_KERNEL;
1521 1975                  } else {
1522 1976                          f = FPU_KERNEL;
1523 1977                  }
1524 1978                  curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1525 1979          }
     1980 +}
     1981 +
     1982 +/*
     1983 + * Fill in FPU information that is required by exec.
     1984 + */
     1985 +void
     1986 +fpu_auxv_info(int *typep, size_t *lenp)
     1987 +{
     1988 +        *typep = fp_elf;
     1989 +        switch (fp_save_mech) {
     1990 +        case FP_FXSAVE:
     1991 +                *lenp = sizeof (struct fxsave_state);
     1992 +                break;
     1993 +        case FP_XSAVE:
     1994 +                *lenp = cpuid_get_xsave_size();
     1995 +                break;
     1996 +        default:
     1997 +                *lenp = 0;
     1998 +                break;
     1999 +        }
     2000 +}
     2001 +
     2002 +/*
     2003 + * This function exists to transform an xsave_state into an fxsave_state. The
     2004 + * way that we have to do this is nuanced. We assume that callers have already
     2005 + * handled FPU_EN and thus we only need to consider the xsave_state and its
     2006 + * component vector itself. This results in the following cases that we need to
     2007 + * consider:
     2008 + *
     2009 + *   o Neither the x87 / XMM state bits are set. We use the hardware default and
     2010 + *     need to ensure to copy the xsave header.
     2011 + *   o Both x87 / XMM state bits are set. We can copy everything.
     2012 + *   o Only the x87 bit is set. We need to copy the x87 state but make the XMM
     2013 + *     state be in the initial case.
     2014 + *   o Only the XMM bit is set. The reverse of the above case.
     2015 + *
     2016 + * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
     2017 + * generally the same; however, the default floating point control word is
     2018 + * different.
     2019 + *
     2020 + * Finally, we have the complication of the MXCSR and MXCSR_MASK registers.
     2021 + * Because we are using xsave and xsaveopt in the kernel right now and not
     2022 + * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
     2023 + * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
     2024 + * is set, we must also come back and copy out the MXCSR register. Sorry, we
     2025 + * don't make the rules.
     2026 + */
     2027 +static void
     2028 +fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
     2029 +{
     2030 +        const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
     2031 +
     2032 +        switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
     2033 +        case XFEATURE_LEGACY_FP | XFEATURE_SSE:
     2034 +                bcopy(xsave, fx, sizeof (*fx));
     2035 +                return;
     2036 +        case XFEATURE_LEGACY_FP:
     2037 +                bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
     2038 +                fx->fx_mxcsr = SSE_MXCSR_INIT;
     2039 +                fx->fx_mxcsr_mask = 0;
     2040 +                break;
     2041 +        case XFEATURE_SSE:
     2042 +                bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
     2043 +                    fx_mxcsr));
     2044 +
     2045 +                fx->fx_fcw = FPU_CW_INIT_HW;
     2046 +                fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
     2047 +                fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
     2048 +                bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
     2049 +                break;
     2050 +        default:
     2051 +                bcopy(&sse_initial, fx, sizeof (*fx));
     2052 +                fx->fx_fcw = FPU_CW_INIT_HW;
     2053 +                break;
     2054 +        }
     2055 +
     2056 +        /*
     2057 +         * Account for the AVX causing MXCSR to be valid.
     2058 +         */
     2059 +        if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
     2060 +            (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
     2061 +                fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
     2062 +                fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
     2063 +        }
     2064 +}
     2065 +
     2066 +/*
     2067 + * This function is designed to answer the question of are we using any xsave
     2068 + * family of instructions in context switch and therefore we have this state.
     2069 + * This should still remain true if we are using xsavec or xsaves in the kernel
     2070 + * in the future.
     2071 + */
     2072 +boolean_t
     2073 +fpu_xsave_enabled(void)
     2074 +{
     2075 +        return (fp_save_mech == FP_XSAVE);
     2076 +}
     2077 +
     2078 +/*
     2079 + * The following structure is used to track and manage the programmatic
     2080 + * construction of /proc and signal stack spilling of xsave information. All
     2081 + * known xsave types that the kernel supports must be included here.
     2082 + */
     2083 +typedef struct xsave_proc_info {
     2084 +        /*
     2085 +         * This matches the /proc xregs type that this data represents. This is
     2086 +         * used for /proc only.
     2087 +         */
     2088 +        uint32_t xi_type;
     2089 +        /*
     2090 +         * This indicates the size of the /proc data that we're operating on.
     2091 +         * This is only used for /proc.
     2092 +         */
     2093 +        size_t  xi_size;
     2094 +        /*
     2095 +         * This indicates the alignment that we want to have for the member when
     2096 +         * we're writing out. This is not used when setting data. This is only
     2097 +         * used for /proc.
     2098 +         */
     2099 +        size_t  xi_align;
     2100 +        /*
     2101 +         * This indicates whether this member must always be considered or not.
     2102 +         * This is used in both /proc and context/signal handling.
     2103 +         */
     2104 +        bool    xi_always;
     2105 +        /*
     2106 +         * This contains the corresponding bits in the xsave bit vector that
     2107 +         * corresponds to this entry. This is used for both /proc and
     2108 +         * context/signal handling.
     2109 +         */
     2110 +        uint64_t xi_bits;
     2111 +        /*
     2112 +         * The xi_fill function pointer is used to write out the /proc regset
     2113 +         * data (e.g. when a user reads xregs). This is only used for the /proc
     2114 +         * handling. The xi_valid function pointer is used instead to validate a
     2115 +         * given set of data that we've read in, while the xi_set pointer is
     2116 +         * used to actually transform the data in the underlying fpu save area.
     2117 +         */
     2118 +        void    (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
     2119 +            void *);
     2120 +        bool    (*xi_valid)(model_t, const void *);
     2121 +        void    (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
     2122 +            uint64_t, const void *);
     2123 +        /*
     2124 +         * The xi_signal_in and xi_signal_out function pointers are used for
     2125 +         * extended context and signal handling information. They are used when
     2126 +         * reading in data from a ucontext_t and writing it out respectively.
     2127 +         * These are only used for context/signal handling.
     2128 +         */
     2129 +        int     (*xi_signal_in)(const struct xsave_proc_info *,
     2130 +            const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
     2131 +            const uintptr_t);
     2132 +        int     (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
     2133 +            uc_xsave_t *, const void *fpup, uintptr_t);
     2134 +} xsave_proc_info_t;
     2135 +
     2136 +static bool
     2137 +fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
     2138 +{
     2139 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
     2140 +                return (B_TRUE);
     2141 +        }
     2142 +
     2143 +        return ((fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv &
     2144 +            feats) == 0);
     2145 +}
     2146 +
     2147 +static void
     2148 +fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
     2149 +    void *datap)
     2150 +{
     2151 +        prxregset_xcr_t *xcr = datap;
     2152 +
     2153 +        xcr->prx_xcr_xcr0 = xsave_bv_all;
     2154 +}
     2155 +
     2156 +/*
     2157 + * Unlike other instruction portions, we treat the xsave header and the legacy
     2158 + * XMM section together as both are somewhat tied at the instruction hip. Unlike
     2159 + * the latter values, the initial state here is not quite the same.
     2160 + */
     2161 +static void
     2162 +fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
     2163 +    void *datap)
     2164 +{
     2165 +        prxregset_xsave_t *prxsave = datap;
     2166 +        const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
     2167 +        size_t hdr_off;
     2168 +
     2169 +        /*
     2170 +         * In the x87/XMM case, the no device vs. initial state is different
     2171 +         * because the initial state case still wants us to copy the real xsave
     2172 +         * header. It's also worth calling out that the actual illumos default
     2173 +         * fxsave state is not the same as what Intel documents. The main
     2174 +         * difference is in what the x87 FPU control word is. This results in
     2175 +         * the following different cases that we need to think about:
     2176 +         *
     2177 +         *   o FPU_EN is not set. So we use the illumos default.
     2178 +         */
     2179 +        if ((fpu->fpu_flags & FPU_EN) == 0) {
     2180 +                bcopy(&avx_initial, prxsave, sizeof (*prxsave));
     2181 +                return;
     2182 +        }
     2183 +
     2184 +        /*
     2185 +         * Convert all the fxsave region while taking into account the validity
     2186 +         * of the xsave bits. The prxregset_xsave_t structure is identical in
     2187 +         * the first 512 bytes to the fxsave_state structure.
     2188 +         */
     2189 +        fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
     2190 +
     2191 +        /*
     2192 +         * Now that we've dealt with the x87 and XMM state, take care of the
     2193 +         * header.
     2194 +         */
     2195 +        hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
     2196 +        bcopy((const void *)((uintptr_t)xsave + hdr_off),
     2197 +            (void *)((uintptr_t)prxsave + hdr_off),
     2198 +            sizeof (struct xsave_header));
     2199 +}
     2200 +
     2201 +static void
     2202 +fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
     2203 +    void *datap)
     2204 +{
     2205 +        if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
     2206 +                size_t size, off;
     2207 +                const void *xsave_off;
     2208 +
     2209 +                cpuid_get_xsave_info(info->xi_bits, &size, &off);
     2210 +                ASSERT3U(size, ==, info->xi_size);
     2211 +                xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
     2212 +                    off);
     2213 +                bcopy(xsave_off, datap, info->xi_size);
     2214 +        }
     2215 +}
     2216 +
     2217 +/*
     2218 + * Users are not allowed to actually set the xcr information this way. However,
     2219 + * to make it easier for someone to just do a read, modify, write, of the xregs
     2220 + * data, if it is identical, then we will accept it (and do nothing).
     2221 + */
     2222 +static bool
     2223 +fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
     2224 +{
     2225 +        const prxregset_xcr_t *xcr = datap;
     2226 +
     2227 +        return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
     2228 +            xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
     2229 +}
     2230 +
     2231 +/*
     2232 + * To match traditional /proc semantics, we do not error if reserved bits of
     2233 + * MXCSR are set, they will be masked off when writing data. We do not allow
     2234 + * someone to indicate that they are asking for compressed xsave data, hence the
     2235 + * check that prx_xsh_comp_bv is zero. Finally, we will check that each
     2236 + * component that was indicated in the xstate_bv is present as another item as
     2237 + * part of the broader validation path.
     2238 + */
     2239 +static bool
     2240 +fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
     2241 +{
     2242 +        const prxregset_xsave_t *xsave = datap;
     2243 +        uint64_t rsvd[6] = { 0 };
     2244 +
     2245 +        if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
     2246 +            xsave->prx_xsh_xcomp_bv != 0) {
     2247 +                return (false);
     2248 +        }
     2249 +
     2250 +        if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
     2251 +                return (false);
     2252 +        }
     2253 +
     2254 +        return (true);
     2255 +}
     2256 +
     2257 +/*
     2258 + * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
     2259 + * on x86; however, when operating in ILP32, subsets are reserved. We basically
     2260 + * require that all reserved portions are set to zero as our way to accept them.
     2261 + */
     2262 +static bool
     2263 +fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
     2264 +{
     2265 +        upad128_t ymm_zero[8];
     2266 +        const prxregset_ymm_t *ymm = datap;
     2267 +
     2268 +        if (model == DATAMODEL_LP64) {
     2269 +                return (true);
     2270 +        }
     2271 +
     2272 +        bzero(&ymm_zero, sizeof (ymm_zero));
     2273 +        return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
     2274 +}
     2275 +
     2276 +static bool
     2277 +fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
     2278 +{
     2279 +        upad256_t zmm_zero[8];
     2280 +        const prxregset_zmm_t *zmm = datap;
     2281 +
     2282 +        if (model == DATAMODEL_LP64) {
     2283 +                return (true);
     2284 +        }
     2285 +
     2286 +        bzero(&zmm_zero, sizeof (zmm_zero));
     2287 +        return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
     2288 +}
     2289 +
     2290 +static bool
     2291 +fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
     2292 +{
     2293 +        prxregset_hi_zmm_t hi_zmm_zero;
     2294 +        const prxregset_hi_zmm_t *hi_zmm = datap;
     2295 +
     2296 +        if (model == DATAMODEL_LP64) {
     2297 +                return (true);
     2298 +        }
     2299 +
     2300 +        bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
     2301 +        return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
     2302 +}
     2303 +
     2304 +/*
     2305 + * The xsave state consists of the first 512 bytes of the XMM state and then the
     2306 + * xsave header itself. Because of the xsave header, this structure is marked
     2307 + * with xi_always, so we must always process and consider it.
     2308 + *
     2309 + * Semantically if either of the bits around SSE / x87 is set, then we will copy
     2310 + * the entire thing. This may mean that we end up copying a region that is not
     2311 + * valid into the save area; however, that should be OK as we still have the
     2312 + * specific bit flags that indicate what we should consider or not.
     2313 + *
     2314 + * There is one additional wrinkle we need to consider and honor here. The CPU
     2315 + * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
     2316 + * anything else. So if this is set and we do not have valid x87/XMM bits
     2317 + * set then we will set the MXCSR to its default state in case the processor
     2318 + * tries to load it. For reference see:
     2319 + *
     2320 + *   o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
     2321 + *   o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
     2322 + *
     2323 + * Note, the behavior around this changes depending on whether using the
     2324 + * compressed xrstor or not. We are not, but it's worth being aware of. We do
     2325 + * not worry about MXCSR_MASK because the instructions ignore it.
     2326 + */
     2327 +static void
     2328 +fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
     2329 +    uint64_t xsave_bv, const void *datap)
     2330 +{
     2331 +        const struct xsave_state *xs = datap;
     2332 +
     2333 +        if ((xsave_bv & info->xi_bits) != 0) {
     2334 +                bcopy(&xs->xs_fxsave, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave,
     2335 +                    sizeof (struct fxsave_state));
     2336 +        } else if ((xsave_bv & XFEATURE_AVX) != 0) {
     2337 +                fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr =
     2338 +                    SSE_MXCSR_INIT;
     2339 +        }
     2340 +
     2341 +        bcopy(&xs->xs_header, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header,
     2342 +            sizeof (struct xsave_header));
     2343 +        fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
     2344 +}
     2345 +
     2346 +static void
     2347 +fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
     2348 +    uint64_t xsave_bv, const void *datap)
     2349 +{
     2350 +        size_t size, off;
     2351 +        void *xsave_off;
     2352 +
     2353 +        cpuid_get_xsave_info(info->xi_bits, &size, &off);
     2354 +        xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
     2355 +            off);
     2356 +        bcopy(datap, xsave_off, size);
     2357 +}
     2358 +
     2359 +/*
     2360 + * Signal copyin for the legacy x87/XMM region. When UC_FPU is set, the
     2361 + * ucontext_t's fpregset_t already carries a copy of this region and that copy
     2362 + * must take priority over anything in the uc_xsave_t data. The copyout code
     2363 + * never marks these bits as something to copy, so finding them set in ucx_bv
     2364 + * is currently treated as an error.
     2365 + *
     2366 + * setcontext() has always cleaned up the reserved bits in the fxsave state, so
     2367 + * we do the same here, which is why the mxcsr is masked below.
     2368 + *
     2369 + * One last wrinkle: the fpregset_t has historically had two private words
     2370 + * that cache the status/exception information. To pass them around without
     2371 + * too much extra allocation we cheat and stash them in the fxsave area, as
     2372 + * Intel has left bytes 464 (0x1d0) through 511 (0x1ff) available for us to
     2373 + * use. The udatap and max_udata arguments are unused here.
     2374 + */
     2375 +static int
     2376 +fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
     2377 +    const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
     2378 +    const uintptr_t max_udata)
     2379 +{
     2380 +        struct xsave_state *xsave = fpup;
     2381 +        struct fxsave_state *fx = &xsave->xs_fxsave;
     2382 +
     2383 +        /* The copyout side never sets these bits; reject them if present. */
     2384 +        if ((ucx->ucx_bv & info->xi_bits) != 0)
     2385 +                return (EINVAL);
     2386 +
     2387 +        /* Without UC_FPU there is no fpregset_t state for us to pick up. */
     2388 +        if ((kuc->uc_flags & UC_FPU) == 0)
     2389 +                return (0);
     2390 +
     2391 +        bcopy(&kuc->uc_mcontext.fpregs, fx, sizeof (struct fxsave_state));
     2392 +        fx->__fx_ign2[3]._l[0] =
     2393 +            kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
     2394 +        fx->__fx_ign2[3]._l[1] =
     2395 +            kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
     2396 +        fx->fx_mxcsr &= sse_mxcsr_mask;
     2397 +        xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
     2398 +
     2399 +        return (0);
     2400 +}
     2401 +
     2402 +/* Signal copyin of one component from the user data cursor at *udatap. */
     2403 +static int
     2404 +fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
     2405 +    const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
     2406 +    const uintptr_t max_udata)
     2407 +{
     2408 +        struct xsave_state *xsave = fpup;
     2409 +        const uintptr_t src = *udatap;
     2410 +        size_t size, offset;
     2411 +
     2412 +        cpuid_get_xsave_info(info->xi_bits, &size, &offset);
     2413 +        if (src + size > max_udata)
     2414 +                return (EOVERFLOW);
     2415 +
     2416 +        if (ddi_copyin((void *)src, (void *)((uintptr_t)fpup + offset), size,
     2417 +            0) != 0) {
     2418 +                return (EFAULT);
     2419 +        }
     2420 +
     2421 +        /* Note the component as present and consume its bytes. */
     2422 +        xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
     2423 +        *udatap = src + size;
     2424 +        return (0);
     2425 +}
     2426 +
     2427 +/* Append one component to the user's signal xsave data and account for it. */
     2428 +static int
     2429 +fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
     2430 +    uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
     2431 +{
     2432 +        size_t len, xsave_off;
     2433 +        const void *copy_from;
     2434 +        void *copy_to;
     2435 +        int ret;
     2436 +
     2437 +        cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
     2438 +        /* Offset via uintptr_t: arithmetic on void * is a GNU extension. */
     2439 +        copy_from = (const void *)((uintptr_t)fpup + xsave_off);
     2440 +        copy_to = (void *)(udatap + ucx->ucx_len);
     2441 +        ret = copyfunc(copy_from, copy_to, len);
     2442 +        if (ret != 0)
     2443 +                return (ret);
     2444 +
     2445 +        ucx->ucx_len += len;
     2446 +        ucx->ucx_bv |= info->xi_bits;
     2447 +        return (0);
     2448 +}
     2449 +
     2450 +/*
     2451 + * This table describes each extended FPU state component, along with the
     2452 + * synthetic entries that we create, for the benefit of /proc, the ucontext_t,
     2453 + * and signal handling. How each member is used is described with the
     2454 + * definition of the xsave_proc_info_t.
     2455 + *
     2456 + * Entries are generally expected to follow the order of the xsave data
     2457 + * structure itself. Synthetic entries may go anywhere; add new ones at the
     2458 + * end. The /proc and signal handling logic is produced by walking this table
     2459 + * in order, so changing the order is meaningful and not to be done lightly.
     2460 + */
     2461 +static const xsave_proc_info_t fpu_xsave_info[] = { {
     2462 +        .xi_type = PRX_INFO_XCR,
     2463 +        .xi_bits = 0,
     2464 +        .xi_always = true,
     2465 +        .xi_size = sizeof (prxregset_xcr_t),
     2466 +        .xi_align = alignof (prxregset_xcr_t),
     2467 +        .xi_fill = fpu_proc_xregs_xcr_fill,
     2468 +        .xi_valid = fpu_proc_xregs_xcr_valid
     2469 +}, {
     2470 +        /*
     2471 +         * This entry spans the xsave header as well as the %xmm registers.
     2472 +         * It has no signal copyout function: the %xmm register data is
     2473 +         * expected to already be present in the fpregset_t.
     2474 +         */
     2475 +        .xi_type = PRX_INFO_XSAVE,
     2476 +        .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
     2477 +        .xi_always = true,
     2478 +        .xi_size = sizeof (prxregset_xsave_t),
     2479 +        .xi_align = FPU_ALIGN_XMM,
     2480 +        .xi_fill = fpu_proc_xregs_xsave_fill,
     2481 +        .xi_set = fpu_proc_xregs_xsave_set,
     2482 +        .xi_valid = fpu_proc_xregs_xsave_valid,
     2483 +        .xi_signal_in = fpu_signal_copyin_xmm
     2484 +}, {
     2485 +        .xi_type = PRX_INFO_YMM,
     2486 +        .xi_bits = XFEATURE_AVX,
     2487 +        .xi_always = false,
     2488 +        .xi_size = sizeof (prxregset_ymm_t),
     2489 +        .xi_align = FPU_ALIGN_YMM,
     2490 +        .xi_fill = fpu_proc_xregs_std_fill,
     2491 +        .xi_set = fpu_proc_xregs_std_set,
     2492 +        .xi_valid = fpu_proc_xregs_ymm_valid,
     2493 +        .xi_signal_in = fpu_signal_copyin_std,
     2494 +        .xi_signal_out = fpu_signal_copyout_std
     2495 +}, {
     2496 +        /*
     2497 +         * The mask registers have no /proc validation function: they are
     2498 +         * the same in ILP32 / LP64, which leaves us nothing to actually
     2499 +         * validate.
     2500 +         */
     2501 +        .xi_type = PRX_INFO_OPMASK,
     2502 +        .xi_bits = XFEATURE_AVX512_OPMASK,
     2503 +        .xi_always = false,
     2504 +        .xi_size = sizeof (prxregset_opmask_t),
     2505 +        .xi_align = alignof (prxregset_opmask_t),
     2506 +        .xi_fill = fpu_proc_xregs_std_fill,
     2507 +        .xi_set = fpu_proc_xregs_std_set,
     2508 +        .xi_signal_in = fpu_signal_copyin_std,
     2509 +        .xi_signal_out = fpu_signal_copyout_std
     2510 +}, {
     2511 +        .xi_type = PRX_INFO_ZMM,
     2512 +        .xi_bits = XFEATURE_AVX512_ZMM,
     2513 +        .xi_always = false,
     2514 +        .xi_size = sizeof (prxregset_zmm_t),
     2515 +        .xi_align = FPU_ALIGN_ZMM,
     2516 +        .xi_fill = fpu_proc_xregs_std_fill,
     2517 +        .xi_set = fpu_proc_xregs_std_set,
     2518 +        .xi_valid = fpu_proc_xregs_zmm_valid,
     2519 +        .xi_signal_in = fpu_signal_copyin_std,
     2520 +        .xi_signal_out = fpu_signal_copyout_std
     2521 +}, {
     2522 +        .xi_type = PRX_INFO_HI_ZMM,
     2523 +        .xi_bits = XFEATURE_AVX512_HI_ZMM,
     2524 +        .xi_always = false,
     2525 +        .xi_size = sizeof (prxregset_hi_zmm_t),
     2526 +        .xi_align = FPU_ALIGN_ZMM,
     2527 +        .xi_fill = fpu_proc_xregs_std_fill,
     2528 +        .xi_set = fpu_proc_xregs_std_set,
     2529 +        .xi_valid = fpu_proc_xregs_hi_zmm_valid,
     2530 +        .xi_signal_in = fpu_signal_copyin_std,
     2531 +        .xi_signal_out = fpu_signal_copyout_std
     2532 +} };
     2533 +
     2534 +/* Decide whether a table entry is reported in the /proc xregs data. */
     2535 +static bool
     2536 +fpu_proc_xregs_include(const xsave_proc_info_t *infop)
     2537 +{
     2538 +        return ((infop->xi_bits & xsave_bv_all) != 0 || infop->xi_always);
     2539 +}
     2539 +
     2540 +/*
     2541 + * Compute the /proc xregs layout: the number of info entries (ninfop), the
     2542 + * total size in bytes (sizep), and the offset of the first entry's data
     2543 + * (dstart). Any of the three output pointers may be NULL.
     2544 + */
     2545 +void
     2546 +fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
     2547 +    uint32_t *dstart)
     2548 +{
     2549 +        size_t ret = sizeof (prxregset_hdr_t);
     2550 +        uint32_t ninfo = 0;
     2551 +
     2552 +        ASSERT(fpu_xsave_enabled());
     2553 +
     2554 +        /*
     2555 +         * Right now the set of enabled FPU features is global: what may show
     2556 +         * up is dictated by %xcr0, which is stored in the global xsave_bv_all,
     2557 +         * rather than by the fpu_xsave_mask in the pcb's fpu_ctx_t. If we move
     2558 +         * to per-process FPU enablement, which is likely to come with AMX, we
     2559 +         * will need the proc_t, hence the unused argument above.
     2560 +         *
     2561 +         * We take two passes through the array. The first just counts how many
     2562 +         * informational entries we need.
     2563 +         */
     2564 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     2565 +                if (fpu_proc_xregs_include(&fpu_xsave_info[i]))
     2566 +                        ninfo++;
     2567 +        }
     2568 +
     2569 +        ASSERT3U(ninfo, >, 0);
     2570 +        ret += sizeof (prxregset_info_t) * ninfo;
     2571 +
     2572 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     2573 +                size_t curphase;
     2574 +                if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
     2575 +                        continue;
     2576 +
     2577 +                /* Pad up to the next multiple of the entry's alignment. */
     2578 +                curphase = ret % fpu_xsave_info[i].xi_align;
     2579 +                if (ret < fpu_xsave_info[i].xi_align) {
     2580 +                        ret = fpu_xsave_info[i].xi_align;
     2581 +                } else if (curphase != 0) {
     2582 +                        ret += fpu_xsave_info[i].xi_align - curphase;
     2583 +                }
     2584 +
     2585 +                if (i == 0 && dstart != NULL)
     2586 +                        *dstart = ret;
     2587 +
     2588 +                ret += fpu_xsave_info[i].xi_size;
     2589 +        }
     2590 +
     2591 +        VERIFY3U(ret, <=, UINT32_MAX);
     2592 +        if (sizep != NULL)
     2593 +                *sizep = ret;
     2594 +
     2595 +        if (ninfop != NULL)
     2596 +                *ninfop = ninfo;
     2597 +}
     2598 +
     2599 +/*
     2600 + * This function supports /proc. Because /proc does not have a process locked
     2601 + * while processing a PCSXREG, this tries to establish an upper bound that we
     2602 + * will validate later in fpu_proc_xregs_set(). We basically say that the
     2603 + * maximum xsave size plus 4 KiB (0x1000) is a good enough approximation for
     2604 + * the maximum size.
     2605 + */
     2606 +size_t
     2607 +fpu_proc_xregs_max_size(void)
     2608 +{
     2609 +        VERIFY(fpu_xsave_enabled());
     2610 +        return (cpuid_get_xsave_size() + 0x1000);
     2611 +}
     2612 +
     2613 +/*
     2614 + * This function supports /proc. In particular, it's meant to perform the
     2615 + * following:
     2616 + *
     2617 + *  o Potentially save the current thread's registers.
     2618 + *  o Write out the x86 xsave /proc xregs format data from the xsave data we
     2619 + *    actually have. Note, this can be a little weird for cases where the FPU is
     2620 + *    not actually enabled, which happens for system processes. It is an open
     2621 + *    question whether /proc should let us read this state in that case.
     2622 + */
     2623 +void
     2624 +fpu_proc_xregs_get(struct _klwp *lwp, void *buf)
     2625 +{
     2626 +        uint32_t size, ninfo, curinfo, dstart;
     2627 +        fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
     2628 +        prxregset_hdr_t *hdr = buf;
     2629 +
     2630 +        ASSERT(fpu_xsave_enabled());
     2631 +        fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
     2632 +
     2633 +        /*
     2634 +         * Before we get going, defensively zero out all the data buffer so that
     2635 +         * the rest of the fill functions can assume a specific base.
     2636 +         */
     2637 +        bzero(buf, size);
     2638 +
     2639 +        kpreempt_disable();
     2640 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     2641 +                /*
     2642 +                 * This case suggests that the thread in question doesn't have a
     2643 +                 * valid FPU save state, which should only happen when it is on
     2644 +                 * CPU. If so, we must save the current FPU state before
     2645 +                 * proceeding. We sanity check several things first, as using
     2646 +                 * /proc on yourself is always exciting. fp_save() will ensure
     2647 +                 * the thread is flagged to go back to being an eager FPU before
     2648 +                 * returning to userland.
     2649 +                 */
     2650 +                VERIFY3P(curthread, ==, lwptot(lwp));
     2651 +                VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     2652 +                fp_save(fpu);
     2653 +        }
     2654 +        kpreempt_enable();
     2655 +
     2656 +        hdr->pr_type = PR_TYPE_XSAVE;
     2657 +        hdr->pr_size = size;
     2658 +        hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
     2659 +            hdr->pr_pad[3] = 0;
     2660 +        hdr->pr_ninfo = ninfo;
     2661 +
     2662 +        curinfo = 0;
     2663 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     2664 +                void *startp;
     2665 +                uint32_t phase;
     2666 +
     2667 +                if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
     2668 +                        continue;
     2669 +
     2670 +                /* Pad exactly as fpu_proc_xregs_info() did to size buf. */
     2671 +                phase = dstart % fpu_xsave_info[i].xi_align;
     2672 +                if (dstart < fpu_xsave_info[i].xi_align) {
     2673 +                        ASSERT3U(i, !=, 0);
     2674 +                        dstart = fpu_xsave_info[i].xi_align;
     2675 +                } else if (phase != 0) {
     2676 +                        ASSERT3U(i, !=, 0);
     2677 +                        dstart += fpu_xsave_info[i].xi_align - phase;
     2678 +                }
     2679 +
     2680 +                ASSERT3U(curinfo, <, ninfo);
     2681 +                hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
     2682 +                hdr->pr_info[curinfo].pri_flags = 0;
     2683 +                hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
     2684 +                hdr->pr_info[curinfo].pri_offset = dstart;
     2685 +
     2686 +                startp = (void *)((uintptr_t)buf + dstart);
     2687 +                fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
     2688 +                dstart += fpu_xsave_info[i].xi_size;
     2689 +                curinfo++;
     2690 +        }
     2691 +}
     2692 +
     2693 +/*
     2694 + * We have been asked to set the data in the FPU for a given thread. Our
     2695 + * prmachdep code has already validated that the raw semantics of the data that
     2696 + * we have are valid (that is the appropriate sizes, offsets, and flags). We now
     2697 + * apply additional checking here:
     2698 + *
     2699 + *   o The xsave structure is present and only valid bits are set.
     2700 + *   o If the xsave component bit-vector is set, we have the corresponding proc
     2701 + *     info item.
     2702 + *   o Read-only items are ignored if and only if they actually match what we
     2703 + *     gave the user mostly as a courtesy to simplify things here.
     2704 + *   o ILP32 processes which can't support many of the regions are allowed to
     2705 + *     have the items here (as we likely gave them to them), but they must be
     2706 + *     zero if they are set.
     2707 + *
     2708 + * A first pass validates that all of the data makes sense for the FPU and
     2709 + * returns EINVAL if it does not. Only after that point do we ensure that we
     2710 + * have the FPU data in question and then clobber all of it: part of the
     2711 + * semantics of setting this is that we're setting the entire extended FPU.
     2712 + */
     2713 +int
     2714 +fpu_proc_xregs_set(struct _klwp *lwp, void *buf)
     2715 +{
     2716 +        prxregset_hdr_t *prx = buf;
     2717 +        model_t model = lwp_getdatamodel(lwp);
     2718 +        uint64_t bv_found = 0;
     2719 +        const prxregset_xsave_t *xsave = NULL;
     2720 +        fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
     2721 +
     2722 +        VERIFY(fpu_xsave_enabled());
     2723 +
     2724 +        /*
     2725 +         * First, walk each info header that we have from the user and
     2726 +         * proceed to validate it. The prmachdep code has already validated that
     2727 +         * the size, type, and offset information is valid, but it has not
     2728 +         * validated the semantic contents of this or if someone is trying to
     2729 +         * write something they shouldn't.
     2730 +         *
     2731 +         * While we walk this, we keep track of where the xsave header is. We
     2732 +         * also track all of the bits that we have found along the way so we can
     2733 +         * match up and ensure that everything that was set has a corresponding
     2734 +         * bit in the xsave bitmap. If we have something in the xsave bitmap,
     2735 +         * but not its corresponding data, then that is an error. However, we
     2736 +         * allow folks to write data regions without the bit set in the xsave
     2737 +         * data to make the read, modify, write process simpler.
     2738 +         */
     2739 +        for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
     2740 +                const prxregset_info_t *info = &prx->pr_info[i];
     2741 +                bool found = false;
     2742 +
     2743 +                for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
     2744 +                        void *data;
     2745 +                        if (info->pri_type != fpu_xsave_info[pt].xi_type)
     2746 +                                continue;
     2747 +
     2748 +                        found = true;
     2749 +                        data = (void *)((uintptr_t)buf + info->pri_offset);
     2750 +                        if (fpu_xsave_info[pt].xi_valid != NULL &&
     2751 +                            !fpu_xsave_info[pt].xi_valid(model, data)) {
     2752 +                                return (EINVAL);
     2753 +                        }
     2754 +
     2755 +                        if (info->pri_type == PRX_INFO_XSAVE) {
     2756 +                                xsave = data;
     2757 +                        }
     2758 +                        bv_found |= fpu_xsave_info[pt].xi_bits;
     2759 +                        break;
     2760 +                }
     2761 +
     2762 +                if (!found) {
     2763 +                        return (EINVAL);
     2764 +                }
     2765 +        }
     2766 +
     2767 +        /*
     2768 +         * No xsave data, no dice.
     2769 +         */
     2770 +        if (xsave == NULL) {
     2771 +                return (EINVAL);
     2772 +        }
     2773 +
     2774 +        /*
     2775 +         * If anything is set in the xsave header that was not found as we
     2776 +         * walked structures, then that is an error. The opposite is not true as
     2777 +         * discussed above.
     2778 +         */
     2779 +        if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
     2780 +                return (EINVAL);
     2781 +        }
     2782 +
     2783 +        /*
     2784 +         * At this point, we consider all the data actually valid. Now we must
     2785 +         * set up this information in the save area. If this is our own lwp, we
     2786 +         * must disable it first. Otherwise, we expect that it is already valid.
     2787 +         * To try to sanitize this, we will defensively zero the entire region
     2788 +         * as we are setting everything that will result in here.
     2789 +         */
     2790 +        kpreempt_disable();
     2791 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     2792 +                /*
     2793 +                 * This case suggests that the thread in question doesn't have a
     2794 +                 * valid FPU save state which should only happen when it is on
     2795 +                 * CPU. If this is the case, we explicitly disable the FPU, but
     2796 +                 * do not save it before proceeding. We also sanity check
     2797 +                 * several things here before doing this as using /proc on
     2798 +                 * yourself is always exciting. Unlike fp_save(), fp_free() does
     2799 +                 * not signal that an update is required, so we unconditionally
     2800 +                 * set that for all threads.
     2801 +                 */
     2802 +                VERIFY3P(curthread, ==, lwptot(lwp));
     2803 +                VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     2804 +                fp_free(fpu);
     2805 +        }
     2806 +        PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
     2807 +        bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
     2808 +            cpuid_get_xsave_size());
     2809 +
     2810 +        for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
     2811 +                const prxregset_info_t *info = &prx->pr_info[i];
     2812 +                bool found = false;
     2813 +
     2814 +                for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
     2815 +                        const void *data;
     2816 +                        if (info->pri_type != fpu_xsave_info[pt].xi_type)
     2817 +                                continue;
     2818 +
     2819 +                        /*
     2820 +                         * Check if we have a set function and if we should
     2821 +                         * include this. We may not if this is something like
     2822 +                         * PRX_INFO_XCR which is read-only.
     2823 +                         *
     2824 +                         * We may not include a given entry as it may not have
     2825 +                         * been set in the actual xsave state that we have been
     2826 +                         * asked to restore, in which case to not break the
     2827 +                         * xsaveopt logic, we must leave it in its initial
     2828 +                         * state, e.g. zeroed (generally). XMM data initial
     2829 +                         * state is not zeroed, but is marked with xi_always to
     2830 +                         * help account for this.
     2831 +                         */
     2832 +                        found = true;
     2833 +                        if (fpu_xsave_info[pt].xi_set == NULL)
     2834 +                                break;
     2835 +                        if (!fpu_xsave_info[pt].xi_always &&
     2836 +                            (xsave->prx_xsh_xstate_bv &
     2837 +                            fpu_xsave_info[pt].xi_bits) !=
     2838 +                            fpu_xsave_info[pt].xi_bits) {
     2839 +                                break;
     2840 +                        }
     2841 +
     2842 +                        data = (void *)((uintptr_t)buf + info->pri_offset);
     2843 +                        fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
     2844 +                            xsave->prx_xsh_xstate_bv, data);
     2845 +                }
     2846 +
     2847 +                VERIFY(found);
     2848 +        }
     2849 +        kpreempt_enable();
     2850 +
     2851 +        return (0);
     2852 +}
     2853 +
     2854 +/*
     2855 + * An entry takes part in signal copyout only when it has a copyout function
     2856 + * and all of its bits are present in xs_bv. xi_always is not consulted: it
     2857 + * describes what is always present for xsave logic, which isn't pertinent to
     2858 + * our custom format. See the big theory statement for more info.
     2859 + */
     2860 +static bool
     2861 +fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
     2862 +{
     2863 +        if (infop->xi_signal_out == NULL)
     2864 +                return (false);
     2865 +        return ((xs_bv & infop->xi_bits) == infop->xi_bits);
     2866 +}
     2867 +
     2868 +/*
     2869 + * We need to fill out the xsave related data into the ucontext_t that we've
     2870 + * been given. We should have a valid user pointer at this point in the uc_xsave
     2871 + * member. This is much simpler than the copyin that we have. Here are the
     2872 + * current assumptions:
     2873 + *
     2874 + *   o This is being called for the current thread. This is not meant to operate
     2875 + *     on an arbitrary thread's state.
     2876 + *   o We cannot assume whether the FPU is valid in the pcb or not. While most
     2877 + *     callers will have just called getfpregs() which saved the state, don't
     2878 + *     assume that.
     2879 + *   o We assume that the user address has the requisite space for this to be
     2880 + *     copied out.
     2881 + *   o We assume that copyfunc() will ensure we are not copying into a kernel
     2882 + *     address.
     2883 + *
     2884 + * For more information on the format of the data, see the 'Signal Handling and
     2885 + * the ucontext_t' portion of the big theory statement. We copy out all the
     2886 + * constituent parts and then come back and write out the actual final header
     2887 + * information. Returns 0 on success, ENOTSUP when xsave is not enabled, or the
     2888 + * first error returned while copying out.
     2889 + */
     2890 +int
     2891 +fpu_signal_copyout(struct _klwp *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
     2892 +{
     2893 +        struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
     2894 +        uint64_t xs_bv;
     2895 +        uc_xsave_t ucx;
     2896 +        int ret;
     2897 +
     2898 +        VERIFY3P(curthread, ==, lwptot(lwp));
     2899 +        VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     2900 +        ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
     2901 +
     2902 +        if (!fpu_xsave_enabled()) {
     2903 +                return (ENOTSUP);
     2904 +        }
     2905 +
     2906 +        /*
     2907 +         * Unlike when we're dealing with /proc, we can unconditionally call
     2908 +         * fp_save() because this is always called in the context that the lwp
     2909 +         * we're operating on is always the one on CPU (which is what fp_save()
     2910 +         * asserts).
     2911 +         */
     2912 +        fp_save(fpu);
     2913 +
     2914 +        bzero(&ucx, sizeof (ucx));
     2915 +        ucx.ucx_vers = UC_XSAVE_VERS;
     2916 +        ucx.ucx_len += sizeof (uc_xsave_t);
     2917 +
     2918 +        xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
     2919 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     2920 +                const xsave_proc_info_t *info = &fpu_xsave_info[i];
     2921 +
     2922 +                if (!fpu_signal_include(info, xs_bv))
     2923 +                        continue;
     2924 +                ret = info->xi_signal_out(info, copyfunc, &ucx,
     2925 +                    fpu->fpu_regs.kfpu_u.kfpu_generic, uaddr);
     2926 +                /* Nothing to undo: preemption is not disabled here. */
     2927 +                if (ret != 0) {
     2928 +                        return (ret);
     2929 +                }
     2930 +        }
     2931 +
     2932 +        /*
     2933 +         * Now that everything has been copied out, we should have an accurate
     2934 +         * value in the uc_xsave_t header and we can copy that out at the start
     2935 +         * of the user data.
     2936 +         */
     2937 +        ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
     2938 +        return (ret);
     2939 +}
     2940 +
     2941 +/*
     2942 + * Here we've been given a ucontext_t which potentially has a user pointer to
     2943 + * xsave state that we've copied out previously. In this case we need to do the
     2944 + * following, assuming UC_XSAVE is present:
     2945 + *
     2946 + *   o Copy in our header and validate it.
     2947 + *   o Allocate an fpu context to use as a holding ground for all this data.
     2948 + *   o If UC_FPU is set, override the xsave structure with the saved XMM state,
     2949 + *     clear UC_FPU, and make sure that the correct xsave_bv bits are set.
     2950 + *
     2951 + * Currently we always allocate the additional state as a holding ground for the
     2952 + * FPU. What we're copying in may not be valid and we don't want to clobber the
     2953 + * existing FPU state or deal with merging it until we believe it's reasonable
     2954 + * enough. The proc_t is here to set us up for when we have per-process settings
     2955 + * in the extended feature disable MSRs.
     2956 + */
     2957 +int
     2958 +fpu_signal_copyin(struct _klwp *lwp, ucontext_t *kuc)
     2959 +{
     2960 +        uc_xsave_t ucx;
     2961 +        uint64_t bv;
     2962 +        uintptr_t data, max_data;
     2963 +        void *fpu;
     2964 +        proc_t *p = lwp->lwp_procp;
     2965 +        size_t ksize;
     2966 +
     2967 +        /*
     2968 +         * Because this has been opaque filler and the kernel has never
     2969 +         * historically looked at it, we don't really care about the uc_xsave
     2970 +         * pointer being garbage in the case that the flag is not set. While
     2971 +         * this isn't perhaps the most sporting choice in some cases, this is on
     2972 +         * the other hand, pragmatic.
     2973 +         */
     2974 +        if ((kuc->uc_flags & UC_XSAVE) != 0) {
     2975 +                if (kuc->uc_xsave == 0) {
     2976 +                        return (EINVAL);
     2977 +                }
     2978 +
     2979 +                if (!fpu_xsave_enabled()) {
     2980 +                        return (ENOTSUP);
     2981 +                }
     2982 +        } else {
     2983 +                return (0);
     2984 +        }
     2985 +
     2986 +        if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
     2987 +            0) {
     2988 +                return (EFAULT);
     2989 +        }
     2990 +
     2991 +        ksize = cpuid_get_xsave_size();
     2992 +        if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
     2993 +            ucx.ucx_len > ksize ||
     2994 +            (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
     2995 +            (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
     2996 +            (uintptr_t)kuc->uc_xsave) {
     2997 +                return (EINVAL);
     2998 +        }
     2999 +
     3000 +        /*
     3001 +         * OK, our goal right now is to recreate a valid xsave_state structure
     3002 +         * that we'll ultimately end up having to merge with our existing one in
     3003 +         * the FPU save state. The reason we describe this as a merge is to help
     3004 +         * future us when we want to retain supervisor state which will never be
     3005 +         * part of userland signal state. The design of the userland signal
     3006 +         * state is basically to compress it as much as we can. This is done for
     3007 +         * two reasons:
     3008 +         *
     3009 +         *   1) We currently consider this a private interface.
     3010 +         *   2) We really want to minimize the actual amount of stack space we
     3011 +         *      use as much as possible. Most applications aren't using AVX-512
     3012 +         *      right now, so doing our own compression style is worthwhile. If
     3013 +         *      libc adopts AVX-512 routines, we may want to change this.
     3014 +         *
     3015 +         * On the allocation below, our assumption is that if a thread has taken
     3016 +         * a signal, then it is likely to take a signal again in the future (or
     3017 +         * be shortly headed to its demise). As such, when that happens we will
     3018 +         * leave the allocated signal stack around for the process. Most
     3019 +         * applications don't allow all threads to take signals, so this should
     3020 +         * hopefully help amortize the cost of the allocation.
     3021 +         */
     3022 +        max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
     3023 +        data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
     3024 +        bv = ucx.ucx_bv;
     3025 +        if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
     3026 +                lwp->lwp_pcb.pcb_fpu.fpu_signal =
     3027 +                    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
     3028 +        }
     3029 +        fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;
     3030 +
     3031 +        /*
     3032 +         * Unconditionally initialize the memory we get in here to ensure that
     3033 +         * it is in a reasonable state for ourselves. This ensures that unused
     3034 +         * regions are mostly left in their initial state (the main exception
     3035 +         * here is the x87/XMM state, but that should be OK). We don't fill in
     3036 +         * the initial xsave state as we expect that to happen as part of our
     3037 +         * processing.
     3038 +         */
     3039 +        bzero(fpu, ksize);
     3040 +
     3041 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     3042 +                int ret;
     3043 +                const xsave_proc_info_t *info = &fpu_xsave_info[i];
     3044 +                if (!info->xi_always && (info->xi_bits & bv) == 0)
     3045 +                        continue;
     3046 +                bv &= ~info->xi_bits;
     3047 +
     3048 +                if (info->xi_signal_in == NULL)
     3049 +                        continue;
     3050 +                ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
     3051 +                if (ret != 0) {
     3052 +                        return (ret);
     3053 +                }
     3054 +        }
     3055 +        ASSERT0(bv);
     3056 +
     3057 +        /*
     3058 +         * As described in the big theory statement section 'Signal Handling and
     3059 +         * the ucontext_t', we always remove UC_FPU from here as we've taken
     3060 +         * care of reassembling it ourselves.
     3061 +         */
     3062 +        kuc->uc_flags &= ~UC_FPU;
     3063 +        kuc->uc_xsave = (uintptr_t)fpu;
     3064 +
     3065 +        return (0);
     3066 +}
     3067 +
     3068 +/*
     3069 + * This determines the size of the signal stack that we need for our custom form
     3070 + * of the xsave state.
          + *
          + * The lwp must belong to the current thread (this is VERIFY'd), that thread
          + * must not be flagged T_KFPU, and the lwp's FPU is asserted to be enabled
          + * (FPU_EN).
          + *
          + * Returns 0 if fpu_xsave_enabled() is false. Otherwise the result is
          + * sizeof (uc_xsave_t) plus the size that cpuid_get_xsave_info() reports for
          + * each fpu_xsave_info[] entry that fpu_signal_include() selects based on the
          + * xsh_xstate_bv value found in the lwp's xsave header.
          + *
          + * If the FPU is enabled but its saved copy is not marked FPU_VALID, fp_save()
          + * is called before the header is read; the save, the header read and the
          + * sizing loop all run with preemption disabled.
     3071 + */
     3072 +size_t
     3073 +fpu_signal_size(struct _klwp *lwp)
     3074 +{
     3075 +        struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
     3076 +        size_t len = sizeof (uc_xsave_t); /* part of every non-zero result */
     3077 +        uint64_t xs_bv;
     3078 +
     3079 +        VERIFY3P(curthread, ==, lwptot(lwp));
     3080 +        VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     3081 +        ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
     3082 +
     3083 +        if (!fpu_xsave_enabled()) {
     3084 +                return (0); /* xsave is not in use */
     3085 +        }
     3086 +
     3087 +        kpreempt_disable();
     3088 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     3089 +                fp_save(fpu); /* enabled, but the saved copy is not valid */
     3090 +        }
     3091 +
     3092 +        xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
     3093 +        for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
     3094 +                size_t comp_size;
     3095 +
     3096 +                if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
     3097 +                        continue;
     3098 +
     3099 +                cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
     3100 +                    NULL); /* only the size is requested; last argument is NULL */
     3101 +                len += comp_size;
     3102 +        }
     3103 +
     3104 +        kpreempt_enable();
     3105 +        return (len);
     3106 +}
     3107 +
     3108 +/*
     3109 + * This function is used in service of restorecontext() to set the specified
     3110 + * thread's extended FPU state to the passed in data. Our assumptions at this
     3111 + * point from the system are:
     3112 + *
     3113 + *   o Someone has already verified that the actual xsave header is correct.
     3114 + *   o Any traditional XMM state that causes a #gp has been clamped.
     3115 + *   o That data is basically the correct sized xsave state structure. Right now
     3116 + *     that means it is not compressed and follows the CPUID-based rules for
     3117 + *     constructing and laying out data.
     3118 + *   o That the lwp argument does refer to the current thread.
     3119 + *
     3120 + * Our primary purpose here is to merge the current FPU state with what exists
     3121 + * here. Right now, "merge", strictly speaking is just "replace". We can get
     3122 + * away with just replacing everything because all we currently save are user
     3123 + * states. If we start saving kernel states in here, this will get more nuanced
     3124 + * and we will need to be more careful about how we store data here.
          + *
          + * In terms of mechanics: cpuid_get_xsave_size() bytes are copied from data
          + * over the lwp's save area. The values found at xs_fxsave.__fx_ign2[3]._l[0]
          + * and ._l[1] of the copied image are moved into kfpu_status and kfpu_xstatus
          + * respectively, and those two slots are then zeroed in the save area. Finally
          + * the state is marked FPU_VALID and PCB_SET_UPDATE_FPU() is applied to the
          + * lwp's pcb. Nothing is returned.
          + *
          + * The last assumption above is enforced with a VERIFY; the thread must also
          + * not be flagged T_KFPU, and both xsave being enabled and FPU_EN being set
          + * are asserted.
     3125 + */
     3126 +void
     3127 +fpu_set_xsave(struct _klwp *lwp, const void *data)
     3128 +{
     3129 +        struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
     3130 +        uint32_t status, xstatus;
     3131 +        struct xsave_state *dst_xsave;
     3132 +
     3133 +        ASSERT(fpu_xsave_enabled());
     3134 +        VERIFY3P(curthread, ==, lwptot(lwp));
     3135 +        VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     3136 +        ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
     3137 +
     3138 +        /*
     3139 +         * We use fp_save() here rather than a stock fpdisable() so we can
     3140 +         * attempt to honor our invariants that when the thread state has been
     3141 +         * saved, the valid flag is set, even though we're going to be
     3142 +         * overwriting it shortly. If we just called fpdisable() then we would
     3143 +         * basically be asking for trouble.
     3144 +         *
     3145 +         * Because we are modifying the state here and we don't want the system
     3146 +         * to end up in an odd state, we are being a little paranoid and
     3147 +         * disabling preemption across this operation. In particular, once the
     3148 +         * state is properly tagged with FPU_VALID, there should be no other way
     3149 +         * that this thread can return to userland and get cleared out because
     3150 +         * we're resetting its context; however, we let paranoia win out.
     3151 +         */
     3152 +        kpreempt_disable();
     3153 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     3154 +                fp_save(fpu);
     3155 +        }
     3156 +
     3157 +        bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
     3158 +            cpuid_get_xsave_size());
     3159 +        dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic; /* the buffer just filled */
     3160 +        status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
     3161 +        xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
     3162 +        dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0; /* scrub once captured */
     3163 +        dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;
     3164 +
     3165 +        /*
     3166 +         * These two status words are information that the kernel itself uses to
     3167 +         * track additional information and is part of the traditional fpregset,
     3168 +         * but is not part of our xregs information. Because we are setting this
     3169 +         * state, we leave it up to the rest of the kernel to determine whether
     3170 +         * this came from an fpregset_t or is being reset to the default of 0.
     3171 +         */
     3172 +        fpu->fpu_regs.kfpu_status = status;
     3173 +        fpu->fpu_regs.kfpu_xstatus = xstatus;
     3174 +
     3175 +        fpu->fpu_flags |= FPU_VALID;
     3176 +        PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); /* NOTE(review): presumably requests a reload of the FPU state before returning to userland - confirm */
     3177 +        kpreempt_enable();
     3178 +}
     3179 +
     3180 +/*
     3181 + * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
     3182 + * kernel, this is just an fxsave_state with additional values for the status
     3183 + * and xstatus members.
     3184 + *
     3185 + * This has the same nuance as the xregs cases discussed above, but is simpler
     3186 + * in that we only need to handle the fxsave state, but more complicated because
     3187 + * we need to check our save mechanism.
          + *
          + * lwp is the lwp whose state is read and fp is filled in with the result;
          + * nothing is returned. lwp need not belong to the current thread: fp_save()
          + * is only called when it does. The status and xstatus members of fp are
          + * first set from kfpu_status and kfpu_xstatus. If neither FPU_EN nor
          + * FPU_VALID is set, the leading sizeof (sse_initial) bytes of fp are filled
          + * from sse_initial instead of from the save area. An unrecognized
          + * fp_save_mech results in a panic. Preemption is disabled for the duration
          + * of the function.
     3188 + */
     3189 +void
     3190 +fpu_get_fpregset(struct _klwp *lwp, fpregset_t *fp)
     3191 +{
     3192 +        struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
     3193 +
     3194 +        kpreempt_disable();
     3195 +        fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
     3196 +        fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;
     3197 +
     3198 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     3199 +                /*
     3200 +                 * If we're requesting the fpregs of a thread that isn't
     3201 +                 * currently valid and isn't the one that we're executing, then
     3202 +                 * we consider getting this information to be a best-effort and
     3203 +                 * we will not stop the thread in question to serialize it,
     3204 +                 * which means possibly getting stale data. This is the
     3205 +                 * traditional semantics that the system has used to service
     3206 +                 * this for /proc.
     3207 +                 */
     3208 +                if (curthread == lwptot(lwp)) {
     3209 +                        VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     3210 +                        fp_save(fpu);
     3211 +                }
     3212 +        }
     3213 +
     3214 +        /*
     3215 +         * If the FPU is not enabled and the state isn't valid (due to someone
     3216 +         * else setting it), just copy the initial state.
     3217 +         */
     3218 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
     3219 +                bcopy(&sse_initial, fp, sizeof (sse_initial));
     3220 +                kpreempt_enable(); /* pairs with the disable at function entry */
     3221 +                return;
     3222 +        }
     3223 +
     3224 +        /*
     3225 +         * Given that we have an enabled FPU, we must look at the type of FPU
     3226 +         * save mechanism to clean this up. In particular, while we can just
     3227 +         * copy the save area with FXSAVE, with XSAVE we must carefully copy
     3228 +         * only the bits that are valid and reset the rest to their default
     3229 +         * state.
     3230 +         */
     3231 +        switch (fp_save_mech) {
     3232 +        case FP_FXSAVE:
     3233 +                bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
     3234 +                    sizeof (struct fxsave_state));
     3235 +                break;
     3236 +        case FP_XSAVE:
     3237 +                fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
     3238 +                    (struct fxsave_state *)fp); /* fp is handed over as an fxsave_state */
     3239 +                break;
     3240 +        default:
     3241 +                panic("Invalid fp_save_mech");
     3242 +        }
     3243 +
     3244 +        kpreempt_enable();
     3245 +}
     3246 +
     3247 +/*
     3248 + * This is a request to set the ABI fpregset_t into our actual hardware state.
     3249 + * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
     3250 + * 512-byte fxsave area.
          + *
          + * lwp is the target lwp and fp is the source register set; nothing is
          + * returned. If the FPU is enabled but the saved copy is not FPU_VALID, lwp
          + * must belong to the current thread (VERIFY'd, along with T_KFPU being clear)
          + * and the live state is saved first. kfpu_status and kfpu_xstatus are then
          + * taken from fp, and sizeof (struct fxsave_state) bytes of fp are copied over
          + * the start of the save area. When fp_save_mech is FP_XSAVE,
          + * XFEATURE_LEGACY_FP and XFEATURE_SSE are additionally OR'd into the header's
          + * xsh_xstate_bv; no other bits of it are changed here. An unrecognized
          + * fp_save_mech results in a panic. On the way out the state is marked
          + * FPU_VALID and PCB_SET_UPDATE_FPU() is applied to the lwp's pcb. Preemption
          + * is disabled for the duration of the function.
          + *
          + * NOTE(review): the OR into xsh_xstate_bv is presumably what makes a later
          + * restore treat the x87 and SSE components as present rather than as being
          + * in their initial state - confirm against the XSAVE header semantics.
     3251 + */
     3252 +void
     3253 +fpu_set_fpregset(struct _klwp *lwp, const fpregset_t *fp)
     3254 +{
     3255 +        struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
     3256 +
     3257 +        kpreempt_disable();
     3258 +        if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
     3259 +                /*
     3260 +                 * We always save the entire FPU. This is required if we're
     3261 +                 * using xsave. If we're using fxsave, we could skip the
     3262 +                 * 512-byte write and instead just disable the FPU since we'd be
     3263 +                 * replacing it all. For now we don't bother with more
     3264 +                 * conditional logic.
     3265 +                 */
     3266 +                VERIFY3P(curthread, ==, lwptot(lwp));
     3267 +                VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
     3268 +                fp_save(fpu);
     3269 +        }
     3270 +
     3271 +        fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
     3272 +        fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
     3273 +        switch (fp_save_mech) {
     3274 +        case FP_FXSAVE:
     3275 +                bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
     3276 +                    sizeof (struct fxsave_state));
     3277 +                break;
     3278 +        case FP_XSAVE:
     3279 +                bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
     3280 +                    sizeof (struct fxsave_state)); /* only the fxsave-sized prefix is replaced */
     3281 +                fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
     3282 +                    XFEATURE_LEGACY_FP | XFEATURE_SSE;
     3283 +                break;
     3284 +        default:
     3285 +                panic("Invalid fp_save_mech");
     3286 +        }
     3287 +
     3288 +        fpu->fpu_flags |= FPU_VALID;
     3289 +        PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
     3290 +        kpreempt_enable();
1526 3291  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX