base-ij-webrev Cdiff usr/src/uts/intel/os/fpu.c

Print this page

15254 %ymm registers not restored after signal handler
15367 x86 getfpregs() summons corrupting %xmm ghosts
15333 want x86 /proc xregs support (libc_db, libproc, mdb, etc.)
15336 want libc functions for extended ucontext_t
15334 want ps_lwphandle-specific reg routines
15328 FPU_CW_INIT mistreats reserved bit
15335 i86pc fpu_subr.c isn't really platform-specific
15332 setcontext(2) isn't actually noreturn
15331 need <sys/stdalign.h>
Change-Id: I7060aa86042dfb989f77fc3323c065ea2eafa9ad
Conflicts:
    usr/src/uts/common/fs/proc/prcontrol.c
    usr/src/uts/intel/os/archdep.c
    usr/src/uts/intel/sys/ucontext.h
    usr/src/uts/intel/syscall/getcontext.c


*** 20,30 ****
   */
  /*
   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2021 Joyent, Inc.
   * Copyright 2021 RackTop Systems, Inc.
!  * Copyright 2022 Oxide Computer Company
   */
  
  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  /*              All Rights Reserved                             */
--- 20,30 ----
   */
  /*
   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2021 Joyent, Inc.
   * Copyright 2021 RackTop Systems, Inc.
!  * Copyright 2023 Oxide Computer Company
   */
  
  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  /*              All Rights Reserved                             */
*** 61,70 ****
--- 61,74 ----
  #include <sys/debug.h>
  #include <sys/x86_archext.h>
  #include <sys/sysmacros.h>
  #include <sys/cmn_err.h>
  #include <sys/kfpu.h>
+ #include <sys/stdbool.h>
+ #include <sys/stdalign.h>
+ #include <sys/procfs_isa.h>
+ #include <sys/sunddi.h>
  
  /*
   * FPU Management Overview
   * -----------------------
   *
*** 73,93 ****
   *
   * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
   * While that state still exists, there is much more that is covered by the FPU.
   * Today, this includes not just traditional FPU state, but also supervisor only
   * state. The following state is currently managed and covered logically by the
!  * idea of the FPU registers:
   *
   *    o Traditional x87 FPU
   *    o Vector Registers (%xmm, %ymm, %zmm)
   *    o Memory Protection Extensions (MPX) Bounds Registers
   *    o Protected Key Rights Registers (PKRU)
   *    o Processor Trace data
   *
   * The rest of this covers how the FPU is managed and controlled, how state is
   * saved and restored between threads, interactions with hypervisors, and other
!  * information exported to user land through aux vectors. A lot of background
   * information is here to synthesize major parts of the Intel SDM, but
   * unfortunately, it is not a replacement for reading it.
   *
   * FPU Control Registers
   * ---------------------
--- 77,101 ----
   *
   * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
   * While that state still exists, there is much more that is covered by the FPU.
   * Today, this includes not just traditional FPU state, but also supervisor only
   * state. The following state is currently managed and covered logically by the
!  * idea of the FPU registers and more generally is called the Extended Processor
!  * States:
   *
   *    o Traditional x87 FPU
   *    o Vector Registers (%xmm, %ymm, %zmm)
   *    o Memory Protection Extensions (MPX) Bounds Registers
   *    o Protected Key Rights Registers (PKRU)
   *    o Processor Trace data
+  *    o Control-Flow Enforcement state
+  *    o Hardware Duty Cycle
+  *    o Hardware P-states
   *
   * The rest of this covers how the FPU is managed and controlled, how state is
   * saved and restored between threads, interactions with hypervisors, and other
!  * information exported to userland through aux vectors. A lot of background
   * information is here to synthesize major parts of the Intel SDM, but
   * unfortunately, it is not a replacement for reading it.
   *
   * FPU Control Registers
   * ---------------------
*** 331,341 ****
   * that all future threads that are descendants of the current one get the
   * thread context operations (unless they call exec).
   *
   * To deal with some things like the agent lwp, we double check the state of the
   * FPU in sys_rtt_common() to make sure that it has been enabled before
!  * returning to user land. In general, this path should be rare, but it's useful
   * for the odd lwp here and there.
   *
   * The FPU state will remain valid most of the time. There are times that
   * the state will be rewritten. For example in restorecontext, due to /proc, or
   * the lwp calls exec(). Whether the context is being freed or we are resetting
--- 339,349 ----
   * that all future threads that are descendants of the current one get the
   * thread context operations (unless they call exec).
   *
   * To deal with some things like the agent lwp, we double check the state of the
   * FPU in sys_rtt_common() to make sure that it has been enabled before
!  * returning to userland. In general, this path should be rare, but it's useful
   * for the odd lwp here and there.
   *
   * The FPU state will remain valid most of the time. There are times that
   * the state will be rewritten. For example in restorecontext, due to /proc, or
   * the lwp calls exec(). Whether the context is being freed or we are resetting
*** 360,380 ****
   *
   * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
   * employed. This meant that the FPU would be saved on a context switch and the
   * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
   * then take a #NM trap, at which point we would restore the FPU from the save
!  * area and return to user land. Given the frequency of use of the FPU alone by
!  * libc, there's no point returning to user land just to trap again.
   *
   * There are a few cases though where the FPU state may need to be changed for a
   * thread on its behalf. The most notable cases are in the case of processes
   * using /proc, restorecontext, forking, etc. In all of these cases the kernel
   * will force a threads FPU state to be saved into the PCB through the fp_save()
   * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
   * pcb. This indicates that the save state holds currently valid data. As a side
   * effect of this, CR0.TS will be set. To make sure that all of the state is
!  * updated before returning to user land, in these cases, we set a flag on the
   * PCB that says the FPU needs to be updated. This will make sure that we take
   * the slow path out of a system call to fix things up for the thread. Due to
   * the fact that this is a rather rare case, effectively setting the equivalent
   * of t_postsys is acceptable.
   *
--- 368,388 ----
   *
   * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
   * employed. This meant that the FPU would be saved on a context switch and the
   * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
   * then take a #NM trap, at which point we would restore the FPU from the save
!  * area and return to userland. Given the frequency of use of the FPU alone by
!  * libc, there's no point returning to userland just to trap again.
   *
   * There are a few cases though where the FPU state may need to be changed for a
   * thread on its behalf. The most notable cases are in the case of processes
   * using /proc, restorecontext, forking, etc. In all of these cases the kernel
   * will force a threads FPU state to be saved into the PCB through the fp_save()
   * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
   * pcb. This indicates that the save state holds currently valid data. As a side
   * effect of this, CR0.TS will be set. To make sure that all of the state is
!  * updated before returning to userland, in these cases, we set a flag on the
   * PCB that says the FPU needs to be updated. This will make sure that we take
   * the slow path out of a system call to fix things up for the thread. Due to
   * the fact that this is a rather rare case, effectively setting the equivalent
   * of t_postsys is acceptable.
   *
*** 386,396 ****
   *
   * The FPU_VALID flag on the currently executing thread's pcb is meant to track
   * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
   * However, because we eagerly restore, the only time that CR0.TS should be set
   * for a non-kernel thread is during operations where it will be cleared before
!  * returning to user land and importantly, the only data that is in it is its
   * own.
   *
   * Kernel FPU Usage
   * ----------------
   *
--- 394,404 ----
   *
   * The FPU_VALID flag on the currently executing thread's pcb is meant to track
   * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
   * However, because we eagerly restore, the only time that CR0.TS should be set
   * for a non-kernel thread is during operations where it will be cleared before
!  * returning to userland and importantly, the only data that is in it is its
   * own.
   *
   * Kernel FPU Usage
   * ----------------
   *
*** 439,449 ****
   * Certain operations can cause the kernel to take traps due to FPU activity.
   * Generally these events will cause a user process to receive a SIGFPU and if
   * the kernel receives it in kernel context, we will die. Traditionally the #NM
   * (Device Not Available / No Math) exception generated by CR0.TS would have
   * caused us to restore the FPU. Now it is a fatal event regardless of whether
!  * or not user land causes it.
   *
   * While there are some cases where the kernel uses the FPU, it is up to the
   * kernel to use the FPU in a way such that it cannot receive a trap or to use
   * the appropriate trap protection mechanisms.
   *
--- 447,457 ----
   * Certain operations can cause the kernel to take traps due to FPU activity.
   * Generally these events will cause a user process to receive a SIGFPU and if
   * the kernel receives it in kernel context, we will die. Traditionally the #NM
   * (Device Not Available / No Math) exception generated by CR0.TS would have
   * caused us to restore the FPU. Now it is a fatal event regardless of whether
!  * or not userland causes it.
   *
   * While there are some cases where the kernel uses the FPU, it is up to the
   * kernel to use the FPU in a way such that it cannot receive a trap or to use
   * the appropriate trap protection mechanisms.
   *
*** 471,482 ****
--- 479,933 ----
   * more information. As a result, we set up an aux vector that contains
   * information about what save and restore mechanisms it should be using and
   * the sizing thereof based on what the kernel supports. This is passed down in
   * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
   * initialized in fpu_subr.c.
+  *
+  * Signal Handling and the ucontext_t
+  * ----------------------------------
+  *
+  * One of the many gifts that signals give us is the twofold fact that when a
+  * signal occurs, the signal handler is allowed to change the CPU's state
+  * arbitrarily and when the signal handler is done executing, we must restore it
+  * back to the original state. However, the second part of this is that the
+  * signal handler is actually allowed to modify the state that the thread will
+  * return to! To create this facade, the kernel will create a full ucontext_t
+  * state, effectively calling getcontext(2) on the thread's behalf, and a
+  * pointer to that is given to the signal handler (the void * argument for the
+  * sa_sigaction function pointer in sigaction(2)). When libc is done with a
+  * signal, it will call setcontext(2) with that same ucontext_t.
+  *
+  * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
+  * it's often declared on the stack itself, with the signal handler spilling all
+  * this state to the stack. The ucontext_t machine portion was broken into the
+  * general purpose and floating point registers. In 64-bit code, the floating
+  * point registers were mostly the same as the results of the fxsave instruction
+  * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
+  * starting point for information, it is transformed into a different shape to
+  * deal with the history of the 32-bit SYS V ABI.
+  *
+  * While this worked, if you're reading this, you're aware that the x86 FPU and
+  * extended register states didn't stop at the initial 16 128-bit %xmm
+  * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
+  * opmask registers. None of these fit inside the standard ucontext_t; however,
+  * they must all be preserved and restored across a signal. While the various
+  * x86 platform-specific ABIs all suggest that these registers are not preserved
+  * across a function call, receiving a signal is not a function call and must be
+  * thought of like a process receiving an interrupt. In other words, this
+  * extended state must be preserved.
+  *
+  * To facilitate this, we have extended the ucontext_t structure with an
+  * additional flag, UC_XSAVE, which indicates that the traditional padding
+  * member, uc_xsave, actually is a pointer to the extended state. While this is
+  * accessible outside of a signal handling context through the combination of
+  * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
+  * state is focused on signal handling. Signal handling spills all this state to
+  * the stack and if we cannot spill the entire state to the stack then our
+  * inability to deliver the signal results in the process being killed! While
+  * there are separate efforts to ensure that the signal stack sizing that is
+  * used for the minimum and maximum signal sizes are sufficient, we still need
+  * to do our part to minimize the likelihood here.
+  *
+  * In designing this, we make the following observations which have helped us
+  * focus our design:
+  *
+  *   o While the start of an xsave area is the traditional 512-byte fxsave XMM
+  *     region, we already have that in the fpregs. Thus there is no reason to
+  *     duplicate it. This not only saves 512 bytes of additional stack space,
+  *     but it also means we don't have to ask which of the version of it to take
+  *     if they were to differ.
+  *
+  *   o Many applications out there aren't necessarily using the extended vectors
+  *     and even when we do make libc and others take advantage of it, it will
+  *     behoove us to ensure that they are put back into their initial state
+  *     after use. This leads us to expect that in a number of cases, the actual
+  *     extended register state will be in its initial state.
+  *
+  *   o While the signal handler does allow contents to be modified, we are
+  *     starting with making the interface private and thus allowing us to excise
+  *     components that are in their initial state.
+  *
+  *   o There are similarities to what we want to create with the compressed
+  *     xsave format; however, because we don't always have support for the
+  *     compressed format, we can't just arbitrarily say let's do a compressed
+  *     save to the user stack.
+  *
+  *   o Because we are not handing this state directly to and from hardware, we
+  *     don't need to meet some of the constraints of the compressed xsave format
+  *     around wanting alignment for the initial save or additional components.
+  *
+  * All of the above lead us to our own unique format for this data. When the
+  * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
+  * uc_xsave_t structure which has a magic version number, a 32-bit length of the
+  * overall structure, and the 64-bit state bit-vector to represent which
+  * components are valid. Following this 8-byte header, each component that is
+  * present in the bit vector is immediately written out in roughly ascending bit
+  * order (the order is determined based on the order of the fpu_xsave_info
+  * array).
+  *
+  * This makes the rough logic that we have here when taking a signal and writing
+  * out this state as:
+  *
+  *   1. Ensure that the FPU is saved and that the contents of the pcb save area
+  *      are valid. That is, call fp_save() if the state is not already flagged
+  *      with FPU_VALID.
+  *
+  *   2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
+  *      and XFEATURE_SSE bits as these will be placed in the xsave area.
+  *
+  *   3. Initialize the uc_xsave_t by setting our version field, initializing the
+  *      length to the length of the current structure, and then setting the
+  *      modified bit vector above.
+  *
+  *   4. Walk each remaining bit of the bit-vector. For each set bit, copy out
+  *      its extended state starting at the current length in the header and then
+  *      increase the header size by that length.
+  *
+  *   5. Finally write out the final uc_xsave_t structure.
+  *
+  * The above process is also used when someone manually calls getcontext_extd(2)
+  * to get this state. The main difference between the two is which copyout
+  * function we use. This deserves some explanation. Our main starting point for
+  * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
+  * the signal handling context to operate with a different copyout than we
+  * normally use in say getcontext_extd(2).
+  *
+  * When we've received a signal, we're at the intersection of several different
+  * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
+  * the watchpoints effectively set a copyout override function (t_copyops) that
+  * we end up vectoring to rather than a normal copyout. This allows the data to
+  * be modified and for the watchpoint to fire. While this is all well and good
+  * normally, it is problematic if we are trying to handle a signal. The signal
+  * deliver logic, sendsig(), goes through and disables the watchpoint for the
+  * region of the stack that we are copying out to. However, disabling
+  * watchpoints is not sufficient, we also need to use the copyout_noerr
+  * variants.
+  *
+  * These variants also require the use of on_fault() and no_fault() for error
+  * handling. While it is tempting to try and on_fault() the entire
+  * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
+  * The first is that we don't want to disable faults during the entire operation
+  * as if the kernel messes up we will treat that as a user error. That isn't
+  * theoretical and happened during development. The second and perhaps more
+  * important issue is that correctly bounding the on_fault() / no_fault() means
+  * being careful about state. For example, kernel pre-emption is often disabled
+  * during parts of these operations, but it needs to be re-enabled when we're
+  * done. This would require tracking in some volatile variable that this had
+  * been enabled and disabled and tracking that.
+  *
+  * Instead, this is why fpu_signal_copyout() takes a copy out function as an
+  * argument. When we're in signal handling context, the function will use
+  * coypout_noerr() and wrap it in the appropriate on_fault() mechanisms.
+  *
+  * RESTORING STATE
+  *
+  * Copying out our current state is the easier half of this problem. When the
+  * kernel is done with a signal it calls setcontext(2) with the ucontext_t we
+  * assembled for it as described above. setcontext(2) isn't just used for
+  * returning from signals.
+  *
+  * The process for this goes in two steps. The first step is to copy in,
+  * validate, and transform the ucontext_t UC_XSAVE that we created above into an
+  * equivalent xsave format that we can use the appropriate xrstor function on.
+  * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
+  * come back through a second phase that is driven out of restorecontext() and
+  * is implemented in fpu_set_xsave().
+  *
+  * Let's start by discussing the second part of this, which is more
+  * straightforward. In particular, the second phase assumes that all of the
+  * validation and error handling has been done by the first phase. This means
+  * here, we have a buffer that is already the appropriate size
+  * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
+  * replace the actual save state with the current one.
+  *
+  * The only piece of shenanigans we have to do is around the kernel provided
+  * notion of 'status' and 'xstatus', which are cached versions of the x87 and
+  * SSE exception vectors. These are part of the fpregset ABI and therefore we
+  * need to propagate them from the temporary storage that part 1 sets up in the
+  * ignored region of the fxsave data. We use that because it is not persisted by
+  * the CPU, so clobbering it is generally alright.
+  *
+  * Once that is done, we simply note that we need a PCB update to occur to
+  * refresh the FPU state before we return to userland. Given that someone has
+  * called setcontext(2), this was always going to happen because we have to
+  * update segment registers and related, so this isn't so bad. With that, let's
+  * move onto the more nuanced part (1).
+  *
+  * When we're handling a setcontext(2) we have, in userland, a data structure
+  * that should match one we serialized out, though we cannot assume that a user
+  * has not modified it either accidentally or maliciously. Our goal is to set up
+  * the appropriate xsave state that can be passed to the CPU's xrstor. The first
+  * problem we have to deal with is where do we actually put this state?
+  *
+  * While not many programs actually call setcontext(2) on their own volition,
+  * this is going to get hit every time we take a signal. The first thought was
+  * to re-use the existing thread's save area; however, that's a bit challenging
+  * for a few reasons. In particular, we would need to ensure that we don't go
+  * off-CPU for any reason, which we cannot assume with a copyin from a user
+  * address space. In particular, it is trivial for us to hit a case where the
+  * stack has been paged out for some reason, which eschews that path.
+  *
+  * Instead, whenever a thread first calls setcontext(2), generally from signal
+  * context, we will at that time allocate another entry from the 'fpsave_cachep'
+  * kmem cache, giving us a buffer of the appropriate space to handle this. Once
+  * this buffer has been allocated, we leave it assigned to the thread's pcb and
+  * only tear it down when the thread itself finally exits. We reason that a
+  * thread that takes a signal once is either going to have the process exit
+  * shortly thereafter or is much more likely to take a signal again in the
+  * future. Many daemons and other processes set things up so signals are
+  * dispatched via one location, masking signals in other thread, using
+  * sigsuspend(2), signalfd(3C), or something similar.
+  *
+  * With this buffer in hand, we begin our task of reassembling state. Note, all
+  * of this is conditional on UC_XSAVE being set in the uc_flags member of the
+  * ucontext_t. If it is not set, then we assume that there is no extended state
+  * and will use the traditional path of setting the fpregset_t into the system
+  * via setfpregs().
+  *
+  * We first will copyin and validate the uc_xsave_t. In particular, we need to
+  * make sure the version makes sense and that the xsave component bit-vector
+  * doesn't have anything unexpected and more importantly unsupported in it, and
+  * that the addresses we've been given are within the user address space. At
+  * this point we can walk through our table of implemented bits and process
+  * them.
+  *
+  * For most components in here, the processing is straightforward. We continue
+  * walking our cursor and copy data into the kernel and place it in the
+  * appropriate place in our xsave state. If a xsave state component bit-vector
+  * isn't set, then we must ensure that we have the item in the initial state,
+  * which for everything other than the x87/SSE state is the memory being zeroed.
+  *
+  * The most unique case in the copyin state is that of the x87/SSE state. You
+  * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
+  * but instead have opted to use the single definition in the fpregset_t. Thus
+  * here, we copy it out of the fpregset_t, which the kernel has helpfully
+  * already unified into the 64-bit fxsave version prior to calling us, and
+  * install that into the save area we're building up.
+  *
+  * As part of this, there are two important pieces to be aware of. The first is
+  * that because the fpregset_t has both the status and xstatus members
+  * mentioned earlier, we temporarily copy them to the software-usable ignored
+  * areas of the fxsave state so we can corral this extra state into part (2)
+  * without needing to allocate additional space. The second piece is that when
+  * we're done processing this we explicitly remove the UC_FPU flag that would
+  * tell the kernel to proceed with updating that region. The problem is that
+  * that goes directly into the pcb's save area and not to the intermediate
+  * buffer as it uses the same entry point as /proc, mainly setfpregs().
+  *
+  * We don't do much validation of the actual contents of the registers that are
+  * being set with the exception of ensuring that no reserved bits of the mxcsr
+  * are used. This is not as strict as /proc, but failure here means the process
+  * is likely going to die (returning from setcontext() in a signal handler is
+  * fatal).
+  *
+  * /proc xregs
+  * -----------
+  *
+  * Observability of the state of the extended registers is important for
+  * understanding the system. While on the surface this is similar to signal
+  * handling, it is crucially different in a number of ways:
+  *
+  *   o In signal handling, we're trying to conserve every byte of stack that we
+  *     can.
+  *   o The /proc xregs file will end up in core files, which means that we need
+  *     a way of knowing what components are present and not present in it,
+  *     because this will vary from CPU to CPU due to the addition of
+  *     architectural features. For example, some CPUs support AVX-512, but
+  *     others do not.
+  *   o The signal handling structure is private and we're not trying to have
+  *     software modify it, on the other hand, the /proc interfaces that we
+  *     support we do want software to be able to interrogate and manipulate.
+  *     These need to be something that we can introduce additional components
+  *     into and make other changes that still allow it to work.
+  *
+  * The x86 xregs format is documented in proc(5). The short form is that the
+  * prxregset_hdr_t has a number of information entries, which are of the type
+  * prxregset_info_t. Each of the information headers has a type, size, and
+  * offset which indicate where to find the additional data.
+  *
+  * Each entry is described as one of the entries in the fpu_xsave_info[]. These
+  * items either are a 1:1 correspondence with a xsave related feature (e.g.
+  * there is one entry for each of the three AVX-512 components) or it is
+  * something synthetic that we provide as additional information such as the
+  * PRX_INFO_XCR, which is a way of getting information about the system such as
+  * what is enabled in %xcr0 out there.
+  *
+  * Unlike signal handling, we are given the buffer to place everything that
+  * needs to be written out. This is partially the design of the /proc APIs. That
+  * is, we will always assemble everything into the entire buffer that /proc asks
+  * us to, and then it will use as much or as little of it as is required.
+  * Similarly, when setting things, we don't have to worry about copying in
+  * information in the same way as signal handling does, because /proc takes care
+  * of it and always hands us a full buffer. Sizing that is a little nuanced, but
+  * is all handled in prmachdep.c.
+  *
+  * When someone performs a read of the xregs and thus is asking us for the
+  * current state, there is a little bit of nuance that we need to deal with
+  * here. The first, is whether or not the FPU is enabled and the second is if
+  * the FPU is enabled, whether a given component is noted as being in its
+  * initial state. This basically gives us three possible states for a given
+  * component:
+  *
+  *   1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
+  *      the illumos FPU default for an item. More on that in a moment.
+  *   2. The saved xsave state indicates that the bit for a given component is
+  *      zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
+  *      In this case, we must take the CPU's default for an item. This is
+  *      usually the same as illumos, but not always.
+  *   3. The saved xsave state indicates that a given component's state bit is
+  *      valid. The simplest of our cases. We can just take what we have from the
+  *      xsave state.
+  *
+  * The CPU's default state for most components other than the x87/SSE state is
+  * to have it be zeroed. This is what we treat as our default state as well. The
+  * primary difference is in the initialization of the x87/SSE state. The SYS V
+  * ABI requires that we enable a different floating point control word then the
+  * hardware default. This means that when we're dealing with case (1) for
+  * x87/SSE we have to be more careful than the other components. Thankfully for
+  * everything else this is just keeping it zeroed.
+  *
+  * A reasonable question would be why not just skip components that aren't
+  * marked as present. There are a few reasons we take a different approach and
+  * always include it. Both of these are to make lives simpler for consumers. In
+  * the first case, when someone is performing a read and wants to reassemble and
+  * answer the question of 'what is the value of %ymm0 or %zmm15', they have
+  * to combine multiple disparate parts. If one knows that the data we put into
+  * there is always valid and represents what is in hardware and doesn't have to
+  * keep track of what are the defaults in different circumstances, then that
+  * greatly simplifies consumers lives. It also helps us for core files and other
+  * observability cases because the answer to what is the operating system's
+  * default may change over time.
+  *
+  * Similarly, including all the possible structures means that we have
+  * simplified someone who does a write. Writes are always setting the full state
+  * of a thread, meaning that if someone wants to modify only a single register
+  * they must do a read, modify, and write. By including everything that they
+  * might need, it makes it easier for consumers to do this and not have to cons
+  * up the whole structure on their own.
+  *
+  * When we're setting state, things change around a little bit. We have a few
+  * constraints that are laid out in proc(5). In particular, we require that the
+  * PRX_INFO_XSAVE component always be present to tell us which other components
+  * we expect to be here and which ones we don't. We also are much stricter about
+  * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
+  * and may not be modified by a calling process. In addition, when we have
+  * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
+  * segments, if they are being written to and have modifications, then we will
+  * indicate an error there.
+  *
+  * Because we are given the entire buffer from userland and don't need to have
+  * an intermediate place to copy it in, we will validate the entire thing in
+  * advance. Once it has been validated and we consider it legal, then we will
+  * translate each entry into its corresponding entry in pcb's normal floating
+  * point state. This is different from signal handling mostly because of the
+  * fact that we are not using copyin, and once we get to this point, there is
+  * no more validation, so we don't have the same concerns around blocking while
+  * pre-emption is disabled.
+  *
+  * The Wrinkle with fpregs
+  * -----------------------
+  *
+  * When we instead turn our attention to the fpregs, whether we're gathering
+  * them as part of the ucontext_t or as part of /proc, there are a few
+  * complications that we need to be aware of when we're operating on a kernel
+  * that is using xsave as the save mechanism. When we're using fxsave as the
+  * save mechanism, the CPU will always save the entire 512-byte fxsave region.
+  * The fpregs ABI that the kernel expects is basically this structure itself,
+  * which is transformed into a 32-bit compatible form in archdep.c.
+  *
+  * But xsave makes this much more complex and has been a source of historical
+  * bugs in the system. In particular, unlike fxsave, xsave has its component bit
+  * vector that is written out to indicate validity. This means that blindly
+  * copying the fxsave area without checking those bits will lead us to do the
+  * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
+  * while the x87 legacy fp flag covers the rest of the state. This is all good,
+  * aside from the MCXSR.
+  *
+  * One of the more complicated pieces of xsave state management is correctly
+  * answering the question of when the MXCSR is written out to xsave_state. In
+  * practice, this is rather convoluted and varies. If either the XMM or AVX
+  * feature bits are set then the CPU will write out the MXCSR and its mask
+  * register into the traditional fxsave state region. This behavior is dependent
+  * on the type of save function that we use. xsave and xsaveopt will look at the
+  * AVX feature bit; however, xsavec does not and only considers the SSE feature
+  * bit. This means that when we're retrieving things, we need to check both of
+  * those bits to determine if we should use the initial state or the value
+  * written out.
+  *
+  * When we come to someone trying to set the fpregs through /proc, the main
+  * question we have is what happens to the extended registers. We have opted to
+  * implement and document it such that a write to the fpregs only impacts the
+  * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
+  * copying the data into the save area, set the state bits for x87 and XMM
+  * state, and then set the FPU to be restored. All in all, this basically means
+  * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
+  * that we might have present.
+  *
+  * Forward Looking: Adding Intel AMX Support
+  * -----------------------------------------
+  *
+  * Nothing can stop the march of features being added into the FPU. One of the
+  * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
+  * Extensions (AMX), which add a large chunk of xsave state to each process.
+  * While things like AVX and AVX-512 have been enabled by default, the broader
+  * OS community has not been wanting to do this for AMX ,because of the size of
+  * the state which exceeds 8 KiB. While the signal handling state went out of
+  * its way to minimize the size it wrote to the stack, if this is used, it would
+  * need to be preserved.
+  *
+  * To deal with this reality and the fact that folks don't really want to
+  * enable it by default for all purposes when its use will be quite special
+  * purpose, Intel has also added a MSR around extended feature disable or xfd.
+  * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
+  * assumption, and the reason that so much of the /proc and signal logic ensures
+  * that we have the thread and process around, taking as an example the unused
+  * process argument in fpu_proc_xregs_info(), is that we will follow suit and
+  * default to having support disabled, but that a process will be able to opt
+  * into it, which will result in several different assumptions around signal
+  * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
+  *
+  * The following is a list of items to pay attention to for future folks who
+  * work on this:
+  *
+  *   o We will want to confirm whether other systems have opted to make this
+  *     process-wide or thread-wide. Assuming process-wide, we will need to do a
+  *     hold of all lwps while making a change. The interface for that probably
+  *     doesn't want to be /proc, as a process probably doesn't want to write to
+  *     its own control file. Changing it for another process could be done
+  *     through the agent-lwp.
+  *   o Opting into this should probably be a one-way street.
+  *   o Opting into this will need to evaluate all threads and in particular
+  *     stack sizes to confirm they adhere to the new minimum.
+  *   o We will need to make sure that setting and clearing the xfd MSR is part
+  *     of the FPU context ops and something we set by default on every CPU.
+  *   o We will need to add a new interface to allow opting into this feature.
+  *   o We will need to ensure that all subsequently created signal stacks adhere
+  *     to a required minimum size that we communicate through libc.
+  *   o We will need to make sure that both rtld and libc no longer rely on a
+  *     static value of the AT_SUN_FPSIZE, but rather realize that this can be
+  *     dynamic. At that time, we should evaluate if we can get away with not
+  *     needing to save this for rtld, even though signal handlers should assume
+  *     they will.
+  *   o The various components (because there is more than one) will want to be
+  *     added to the fpu_xsave_info[]. Consulting the processes's xfd will be
+  *     required and probably require logic changes.
+  *
+  * The above is not exhaustive. We'll probably have some other issues and fun
+  * while doing this.
   */
  
+ /*
+  * The kind of FPU we advertise to rtld so it knows what to do when working
+  * through the PLT.
+  */
+ int fp_elf = AT_386_FPINFO_FXSAVE;
+ 
+ /*
+  * Mechanism to save FPU state.
+  */
+ int fp_save_mech = FP_FXSAVE;
+ 
  kmem_cache_t *fpsave_cachep;
  
  /* Legacy fxsave layout + xsave header + ymm */
  #define AVX_XSAVE_SIZE          (512 + 64 + 256)
  
*** 487,496 ****
--- 938,954 ----
  CTASSERT(sizeof (struct fnsave_state) == 108);
  CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
  CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
  
  /*
+  * Basic architectural alignment information.
+  */
+ #define FPU_ALIGN_XMM   16
+ #define FPU_ALIGN_YMM   32
+ #define FPU_ALIGN_ZMM   64
+ 
+ /*
   * This structure is the x86 implementation of the kernel FPU that is defined in
   * uts/common/sys/kfpu.h.
   */
  
  typedef enum kfpu_flags {
*** 548,569 ****
   * MXCSR register
   */
  uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
  
  /*
-  * Initial kfpu state for x87 used by fpinit()
-  */
- const struct fnsave_state x87_initial = {
-         FPU_CW_INIT,    /* f_fcw */
-         0,              /* __f_ign0 */
-         0,              /* f_fsw */
-         0,              /* __f_ign1 */
-         0xffff,         /* f_ftw */
-         /* rest of structure is zero */
- };
- 
- /*
   * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
   * have an XSAVE-capable chip in fpu_probe.
   */
  void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
  void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
--- 1006,1015 ----
*** 659,669 ****
                  /*NOTREACHED*/
          }
  
          /*
           * Mark that both the parent and child need to have the FPU cleaned up
!          * before returning to user land.
           */
  
          ctxop_attach(ct, fp_ctxop_allocate(cfp));
  }
  
--- 1105,1115 ----
                  /*NOTREACHED*/
          }
  
          /*
           * Mark that both the parent and child need to have the FPU cleaned up
!          * before returning to userland.
           */
  
          ctxop_attach(ct, fp_ctxop_allocate(cfp));
  }
  
*** 864,873 ****
--- 1310,1320 ----
           * We keep a copy of the pointer in lwp_fpu so that we can restore the
           * value in forklwp() after we duplicate the parent's LWP state.
           */
          lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
              kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
+         fp->fpu_signal = NULL;
  
          if (fp_save_mech == FP_XSAVE) {
                  /*
                   *
                   * We bzero since the fpinit() code path will only
*** 886,895 ****
--- 1333,1347 ----
          if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
                  kmem_cache_free(fpsave_cachep,
                      fp->fpu_regs.kfpu_u.kfpu_generic);
                  lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
          }
+ 
+         if (fp->fpu_signal != NULL) {
+                 kmem_cache_free(fpsave_cachep, fp->fpu_signal);
+                 fp->fpu_signal = NULL;
+         }
  }
  
  /*
   * Called during the process of forklwp(). The kfpu_u pointer will have been
   * overwritten while copying the parent's LWP structure. We have a valid copy
*** 915,924 ****
--- 1367,1378 ----
  
          /* copy the parent's values into the new lwp's struct */
          bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
          /* now restore the pointer */
          lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
+         /* Ensure that we don't inherit our parent's signal state */
+         lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
  }
  
  /*
   * Handle a processor extension error fault
   * Returns non zero for error.
*** 1521,1526 ****
--- 1975,3291 ----
                  } else {
                          f = FPU_KERNEL;
                  }
                  curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
          }
+ }
+ 
+ /*
+  * Fill in FPU information that is required by exec.
+  */
+ void
+ fpu_auxv_info(int *typep, size_t *lenp)
+ {
+         *typep = fp_elf;
+         switch (fp_save_mech) {
+         case FP_FXSAVE:
+                 *lenp = sizeof (struct fxsave_state);
+                 break;
+         case FP_XSAVE:
+                 *lenp = cpuid_get_xsave_size();
+                 break;
+         default:
+                 *lenp = 0;
+                 break;
+         }
+ }
+ 
+ /*
+  * This function exists to transform an xsave_state into an fxsave_state. The
+  * way that we have to do this is nuanced. We assume that callers have already
+  * handled FPU_EN and thus we only need to consider the xsave_state and its
+  * component vector itself. This results in the following cases that we need to
+  * consider:
+  *
+  *   o Neither the x87 / XMM state bits are set. We use the hardware default and
+  *     need to ensure to copy the xsave header.
+  *   o Both x87 / XMM state bits are set. We can copy everything.
+  *   o Only the x87 bit is set. We need to copy the x87 state but make the XMM
+  *     state be in the initial case.
+  *   o Only the XMM bit is set. The reverse of the above case.
+  *
+  * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
+  * generally the same; however, the default floating point control word is
+  * different.
+  *
+  * Finally, we have the complication of the MXCSR and MCXSR_MASK registers.
+  * Because we are using xsave and xsaveopt in the kernel right now and not
+  * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
+  * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
+  * is set, we must also come back and copy out the MXCSR register. Sorry, we
+  * don't make the rules.
+  */
+ static void
+ fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
+ {
+         const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
+ 
+         switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
+         case XFEATURE_LEGACY_FP | XFEATURE_SSE:
+                 bcopy(xsave, fx, sizeof (*fx));
+                 return;
+         case XFEATURE_LEGACY_FP:
+                 bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
+                 fx->fx_mxcsr = SSE_MXCSR_INIT;
+                 fx->fx_mxcsr_mask = 0;
+                 break;
+         case XFEATURE_SSE:
+                 bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
+                     fx_mxcsr));
+ 
+                 fx->fx_fcw = FPU_CW_INIT_HW;
+                 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
+                 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
+                 bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
+                 break;
+         default:
+                 bcopy(&sse_initial, fx, sizeof (*fx));
+                 fx->fx_fcw = FPU_CW_INIT_HW;
+                 break;
+         }
+ 
+         /*
+          * Account for the AVX causing MXCSR to be valid.
+          */
+         if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
+             (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
+                 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
+                 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
+         }
+ }
+ 
+ /*
+  * This function is designed to answer the question of are we using any xsave
+  * family of instructions in context switch and therefore we have this state.
+  * This should still remain true if we are using xsavec or xsaves in the kernel
+  * in the future.
+  */
+ boolean_t
+ fpu_xsave_enabled(void)
+ {
+         return (fp_save_mech == FP_XSAVE);
+ }
+ 
+ /*
+  * The following structure is used to track and manage the programmatic
+  * construction of /proc and signal stack spilling of xsave information. All
+  * known xsave types that the kernel supports must be included here.
+  */
+ typedef struct xsave_proc_info {
+         /*
+          * This matches the /proc xregs type that this data represents. This s
+          * used for /proc only.
+          */
+         uint32_t xi_type;
+         /*
+          * This indicates the size of the /proc data that we're operating on.
+          * This is only used for /proc.
+          */
+         size_t  xi_size;
+         /*
+          * This indicates the alignment that we want to have for the member when
+          * we're writing out. This is not used when setting data. This is only
+          * used for /proc.
+          */
+         size_t  xi_align;
+         /*
+          * This indicates whether this member must always be considered or not.
+          * This is used in both /proc and context/signal handling.
+          */
+         bool    xi_always;
+         /*
+          * This contains the corresponding bits in the xsave bit vector that
+          * corresponds to this entry. This is used for both /proc and
+          * context/signal handling.
+          */
+         uint64_t xi_bits;
+         /*
+          * The xi_fill function pointer is used to write out the /proc regset
+          * data (e.g. when a user reads xregs). This is only used for the /proc
+          * handling. The xi_valid function pointer is used instead to validate a
+          * given set of data that we've read in, while the xi_set pointer is
+          * used to actually transform the data in the underlying fpu save area.
+          */
+         void    (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
+             void *);
+         bool    (*xi_valid)(model_t, const void *);
+         void    (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
+             uint64_t, const void *);
+         /*
+          * The xi_signal_in and xi_signal_out function pointers are used for
+          * extended context and signal handling information. They are used when
+          * reading in data from a ucontex_t and writing it out respectively.
+          * These are only used for context/signal handling.
+          */
+         int     (*xi_signal_in)(const struct xsave_proc_info *,
+             const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
+             const uintptr_t);
+         int     (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
+             uc_xsave_t *, const void *fpup, uintptr_t);
+ } xsave_proc_info_t;
+ 
+ static bool
+ fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
+ {
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
+                 return (B_TRUE);
+         }
+ 
+         return ((fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv &
+             feats) == 0);
+ }
+ 
+ static void
+ fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
+     void *datap)
+ {
+         prxregset_xcr_t *xcr = datap;
+ 
+         xcr->prx_xcr_xcr0 = xsave_bv_all;
+ }
+ 
+ /*
+  * Unlike other instruction portions, we treat the xsave header and the legacy
+  * XMM section together as both are somewhat tied at the instruction hip. Unlike
+  * the latter values, the initial state here is not quite the same.
+  */
+ static void
+ fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
+     void *datap)
+ {
+         prxregset_xsave_t *prxsave = datap;
+         const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
+         size_t hdr_off;
+ 
+         /*
+          * In the x87/XMM case, the no device vs. initial state is different
+          * because the initial state case still wants us to copy the real xsave
+          * header. It's also worth calling out that the actual illumos default
+          * fxsave state is not the same as what Intel documents. The main
+          * difference is in what the x87 FPU control word is. This results in
+          * the following different cases that we need to think about:
+          *
+          *   o FPU_EN is not set. So we use the illumos default.
+          */
+         if ((fpu->fpu_flags & FPU_EN) == 0) {
+                 bcopy(&avx_initial, prxsave, sizeof (*prxsave));
+                 return;
+         }
+ 
+         /*
+          * Convert all the fxsave region while taking into account the validity
+          * of the xsave bits. The prxregset_xsave_t structure is identical in
+          * the first 512-bits to the prxsave structure.
+          */
+         fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
+ 
+         /*
+          * Now that we've dealt with the x87 and XMM state, take care of the
+          * header.
+          */
+         hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
+         bcopy((const void *)((uintptr_t)xsave + hdr_off),
+             (void *)((uintptr_t)prxsave + hdr_off),
+             sizeof (struct xsave_header));
+ }
+ 
+ static void
+ fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
+     void *datap)
+ {
+         if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
+                 size_t size, off;
+                 const void *xsave_off;
+ 
+                 cpuid_get_xsave_info(info->xi_bits, &size, &off);
+                 ASSERT3U(size, ==, info->xi_size);
+                 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
+                     off);
+                 bcopy(xsave_off, datap, info->xi_size);
+         }
+ }
+ 
+ /*
+  * Users are not allowed to actually set the xcr information this way. However,
+  * to make it easier for someone to just do a read, modify, write, of the xregs
+  * data, if it is identical, then we will accept it (and do nothing).
+  */
+ static bool
+ fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
+ {
+         const prxregset_xcr_t *xcr = datap;
+ 
+         return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
+             xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
+ }
+ 
+ /*
+  * To match traditional /proc semantics, we do not error if reserved bits of
+  * MXCSR are set, they will be masked off when writing data. We do not allow
+  * someone to indicate that they are asking for compressed xsave data, hence the
+  * check that prx_xsh_comp_bv is zero. Finally, we will check that each
+  * component that was indicated in the xstate_bv is present as another item as
+  * part of the broader validation path.
+  */
+ static bool
+ fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
+ {
+         const prxregset_xsave_t *xsave = datap;
+         uint64_t rsvd[6] = { 0 };
+ 
+         if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
+             xsave->prx_xsh_xcomp_bv != 0) {
+                 return (false);
+         }
+ 
+         if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
+                 return (false);
+         }
+ 
+         return (true);
+ }
+ 
+ /*
+  * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
+  * on x86; however, when operating in ILP32, subsets are reserved. We basically
+  * require that all reserved portions are set to zero as our way to accept them.
+  */
+ static bool
+ fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
+ {
+         upad128_t ymm_zero[8];
+         const prxregset_ymm_t *ymm = datap;
+ 
+         if (model == DATAMODEL_LP64) {
+                 return (true);
+         }
+ 
+         bzero(&ymm_zero, sizeof (ymm_zero));
+         return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
+ }
+ 
+ static bool
+ fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
+ {
+         upad256_t zmm_zero[8];
+         const prxregset_zmm_t *zmm = datap;
+ 
+         if (model == DATAMODEL_LP64) {
+                 return (true);
+         }
+ 
+         bzero(&zmm_zero, sizeof (zmm_zero));
+         return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
+ }
+ 
+ static bool
+ fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
+ {
+         prxregset_hi_zmm_t hi_zmm_zero;
+         const prxregset_hi_zmm_t *hi_zmm = datap;
+ 
+         if (model == DATAMODEL_LP64) {
+                 return (true);
+         }
+ 
+         bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
+         return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
+ }
+ 
+ /*
+  * The xsave state consists of the first 512 byes of the XMM state and then the
+  * xsave header itself. Because of the xsave header, this structure is marked
+  * with xi_always, so we must always process and consider it.
+  *
+  * Semantically if either of the bits around SSE / x87 is set, then we will copy
+  * the entire thing. This may mean that we end up copying a region that is not
+  * valid into the save area; however, that should be OK as we still have the
+  * specific bit flags that indicate what we should consider or not.
+  *
+  * There is one additional wrinkle we need to consider and honor here. The CPU
+  * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
+  * anything else. So if if this is set and we do not have a valid x87/XMM bits
+  * set then we will set the MXCSR to its default state in case the processor
+  * tries to load it. For reference see:
+  *
+  *   o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
+  *   o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
+  *
+  * Note, the behavior around this changes depending on whether using the
+  * compressed xrstor or not. We are not, but it's worth being aware of. We do
+  * not worry about MXCSR_MASK because the instructions ignore it.
+  */
+ static void
+ fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
+     uint64_t xsave_bv, const void *datap)
+ {
+         const struct xsave_state *xs = datap;
+ 
+         if ((xsave_bv & info->xi_bits) != 0) {
+                 bcopy(&xs->xs_fxsave, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave,
+                     sizeof (struct fxsave_state));
+         } else if ((xsave_bv & XFEATURE_AVX) != 0) {
+                 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr =
+                     SSE_MXCSR_INIT;
+         }
+ 
+         bcopy(&xs->xs_header, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header,
+             sizeof (struct xsave_header));
+         fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
+ }
+ 
+ static void
+ fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
+     uint64_t xsave_bv, const void *datap)
+ {
+         size_t size, off;
+         void *xsave_off;
+ 
+         cpuid_get_xsave_info(info->xi_bits, &size, &off);
+         xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
+             off);
+         bcopy(datap, xsave_off, size);
+ }
+ 
+ /*
+  * Dealing with XMM data is a little more annoying here. If UC_FPU is set, it
+  * also contains a copy of the XMM region. That must take priority over anything
+  * we have here. In the copyout code we do not set the XMM bits here as
+  * something to copy, therefore if they are set, we currently treat that as an
+  * error.
+  *
+  * The system has always gone through and cleaned up the reserved bits in the
+  * fxsave state when someone calls setcontext(). Therefore we need to do the
+  * same thing which is why you see the masking of the mxcsr below.
+  *
+  * Finally, there is one last wrinkle here that we need to consider. The
+  * fpregset_t has historically had two private words that are used to convey the
+  * status which cache the status/exception information. Therefore, we well...
+  * cheat. Intel has left bytes 464 (0x1d0) through 511 (0x1ff) available for us
+  * to do what we want. So we will pass this through that for the moment to help
+  * us pass this state around without too much extra allocation.
+  */
+ static int
+ fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
+     const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
+     const uintptr_t max_udata)
+ {
+         struct xsave_state *xsave = fpup;
+ 
+         if ((ucx->ucx_bv & info->xi_bits) != 0) {
+                 return (EINVAL);
+         }
+ 
+         if ((kuc->uc_flags & UC_FPU) != 0) {
+                 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
+                     sizeof (struct fxsave_state));
+                 xsave->xs_fxsave.__fx_ign2[3]._l[0] =
+                     kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
+                 xsave->xs_fxsave.__fx_ign2[3]._l[1] =
+                     kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
+                 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
+                 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
+         }
+ 
+         return (0);
+ }
+ 
+ static int
+ fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
+     const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
+     const uintptr_t max_udata)
+ {
+         size_t len, xsave_off;
+         void *copy_to;
+         struct xsave_state *xsave = fpup;
+ 
+         cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
+         if (*udatap + len > max_udata) {
+                 return (EOVERFLOW);
+         }
+ 
+         copy_to = (void *)((uintptr_t)fpup + xsave_off);
+         if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
+                 return (EFAULT);
+         }
+ 
+         xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
+         *udatap = *udatap + len;
+ 
+         return (0);
+ }
+ 
+ static int
+ fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
+     uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
+ {
+         size_t len, xsave_off;
+         const void *copy_from;
+         void *copy_to;
+         int ret;
+ 
+         cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
+         copy_from = (void *)(uintptr_t)fpup + xsave_off;
+         copy_to = (void *)(udatap + ucx->ucx_len);
+ 
+         ret = copyfunc(copy_from, copy_to, len);
+         if (ret != 0) {
+                 return (ret);
+         }
+ 
+         ucx->ucx_len += len;
+         ucx->ucx_bv |= info->xi_bits;
+         return (0);
+ }
+ 
+ /*
+  * This table contains information about the extended FPU states and synthetic
+  * information we create for /proc, the ucontext_t, and signal handling. The
+  * definition of the xsave_proc_info_t describes how each member is used.
+  *
+  * In general, this table is expected to be in the order of the xsave data
+  * structure itself. Synthetic elements that we create can go anywhere and new
+  * ones should be inserted at the end. This structure is walked in order to
+  * produce the /proc and signal handling logic, so changing the order is
+  * meaningful for those and probably should not be done lightly.
+  */
+ static const xsave_proc_info_t fpu_xsave_info[] = { {
+         .xi_type = PRX_INFO_XCR,
+         .xi_size = sizeof (prxregset_xcr_t),
+         .xi_align = alignof (prxregset_xcr_t),
+         .xi_always = true,
+         .xi_bits = 0,
+         .xi_fill = fpu_proc_xregs_xcr_fill,
+         .xi_valid = fpu_proc_xregs_xcr_valid
+ }, {
+         /*
+          * The XSAVE entry covers both the xsave header and the %xmm registers.
+          * Note, there is no signal copyout information for the %xmm registers
+          * because it is expected that that data is already in the fpregset_t.
+          */
+         .xi_type = PRX_INFO_XSAVE,
+         .xi_size = sizeof (prxregset_xsave_t),
+         .xi_align = FPU_ALIGN_XMM,
+         .xi_always = true,
+         .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
+         .xi_fill = fpu_proc_xregs_xsave_fill,
+         .xi_set = fpu_proc_xregs_xsave_set,
+         .xi_valid = fpu_proc_xregs_xsave_valid,
+         .xi_signal_in = fpu_signal_copyin_xmm
+ }, {
+         .xi_type = PRX_INFO_YMM,
+         .xi_size = sizeof (prxregset_ymm_t),
+         .xi_align = FPU_ALIGN_YMM,
+         .xi_always = false,
+         .xi_bits = XFEATURE_AVX,
+         .xi_fill = fpu_proc_xregs_std_fill,
+         .xi_set = fpu_proc_xregs_std_set,
+         .xi_signal_in = fpu_signal_copyin_std,
+         .xi_valid = fpu_proc_xregs_ymm_valid,
+         .xi_signal_out = fpu_signal_copyout_std
+ }, {
+         /*
+          * There is no /proc validation function for the mask registers because
+          * they are the same in ILP32 / LP64 and there is nothing for us to
+          * actually validate.
+          */
+         .xi_type = PRX_INFO_OPMASK,
+         .xi_size = sizeof (prxregset_opmask_t),
+         .xi_align = alignof (prxregset_opmask_t),
+         .xi_always = false,
+         .xi_bits = XFEATURE_AVX512_OPMASK,
+         .xi_fill = fpu_proc_xregs_std_fill,
+         .xi_set = fpu_proc_xregs_std_set,
+         .xi_signal_in = fpu_signal_copyin_std,
+         .xi_signal_out = fpu_signal_copyout_std
+ }, {
+         .xi_type = PRX_INFO_ZMM,
+         .xi_size = sizeof (prxregset_zmm_t),
+         .xi_align = FPU_ALIGN_ZMM,
+         .xi_always = false,
+         .xi_bits = XFEATURE_AVX512_ZMM,
+         .xi_fill = fpu_proc_xregs_std_fill,
+         .xi_set = fpu_proc_xregs_std_set,
+         .xi_valid = fpu_proc_xregs_zmm_valid,
+         .xi_signal_in = fpu_signal_copyin_std,
+         .xi_signal_out = fpu_signal_copyout_std
+ }, {
+         .xi_type = PRX_INFO_HI_ZMM,
+         .xi_size = sizeof (prxregset_hi_zmm_t),
+         .xi_align = FPU_ALIGN_ZMM,
+         .xi_always = false,
+         .xi_bits = XFEATURE_AVX512_HI_ZMM,
+         .xi_fill = fpu_proc_xregs_std_fill,
+         .xi_set = fpu_proc_xregs_std_set,
+         .xi_valid = fpu_proc_xregs_hi_zmm_valid,
+         .xi_signal_in = fpu_signal_copyin_std,
+         .xi_signal_out = fpu_signal_copyout_std
+ } };
+ 
+ static bool
+ fpu_proc_xregs_include(const xsave_proc_info_t *infop)
+ {
+         return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
+ }
+ 
+ void
+ fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
+     uint32_t *dstart)
+ {
+         size_t ret = sizeof (prxregset_hdr_t);
+         uint32_t ninfo = 0;
+ 
+         ASSERT(fpu_xsave_enabled());
+ 
+         /*
+          * Right now the set of flags that are enabled in the FPU is global.
+          * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the
+          * actual things that might show up and we care about are all about what
+          * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
+          * move to per-process FPU enablement which is likely to come with AMX,
+          * then this will need the proc_t to look at, hence why we've set things
+          * up with the unused variable above.
+          *
+          * We take two passes through the array. The first is just to count up
+          * how many informational entries we need.
+          */
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
+                         continue;
+                 ninfo++;
+         }
+ 
+         ASSERT3U(ninfo, >, 0);
+         ret += sizeof (prxregset_info_t) * ninfo;
+ 
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 size_t curphase;
+                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
+                         continue;
+ 
+                 curphase = ret % fpu_xsave_info[i].xi_align;
+                 if (ret < fpu_xsave_info[i].xi_align) {
+                         ret = fpu_xsave_info[i].xi_align;
+                 } else if (curphase != 0) {
+                         ret += curphase;
+                 }
+ 
+                 if (i == 0 && dstart != NULL) {
+                         *dstart = ret;
+                 }
+ 
+                 ret += fpu_xsave_info[i].xi_size;
+         }
+ 
+         VERIFY3U(ret, <=, UINT32_MAX);
+         if (sizep != NULL) {
+                 *sizep = ret;
+         }
+ 
+         if (ninfop != NULL) {
+                 *ninfop = ninfo;
+         }
+ }
+ 
+ /*
+  * This function supports /proc. Because /proc does not have a process locked
+  * while processing a PCSXREG, so this tries to establish an upper bound that we
+  * will validate later in fpu_proc_xregs_set(). We basically say that if you
+  * take the maximum xsave size and add 1 KiB that is a good enough approximation
+  * for the maximum size.
+  */
+ size_t
+ fpu_proc_xregs_max_size(void)
+ {
+         VERIFY(fpu_xsave_enabled());
+         return (cpuid_get_xsave_size() + 0x1000);
+ }
+ 
+ /*
+  * This functions supports /proc. In particular, it's meant to perform the
+  * following:
+  *
+  *  o Potentially save the current thread's registers.
+  *  o Write out the x86 xsave /proc xregs format data from the xsave data we
+  *    actually have. Note, this can be a little weird for cases where the FPU is
+  *    not actually enabled, which happens for system processes.
+  *    /proc let us read this state?
+  */
+ void
+ fpu_proc_xregs_get(struct _klwp *lwp, void *buf)
+ {
+         uint32_t size, ninfo, curinfo, dstart;
+         fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
+         prxregset_hdr_t *hdr = buf;
+ 
+         ASSERT(fpu_xsave_enabled());
+         fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
+ 
+         /*
+          * Before we get going, defensively zero out all the data buffer so that
+          * the rest of the fill functions can assume a specific base.
+          */
+         bzero(buf, size);
+ 
+         kpreempt_disable();
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 /*
+                  * This case suggests that thread in question doesn't have a
+                  * valid FPU save state which should only happen when it is on
+                  * CPU. If this is the case, we must ensure that we save the
+                  * current FPU state before proceeding. We also sanity check
+                  * several things here before doing this as using /proc on
+                  * yourself is always exciting. fp_save() will ensure that the
+                  * thread is flagged to go back to being an eager FPU before
+                  * returning back to userland.
+                  */
+                 VERIFY3P(curthread, ==, lwptot(lwp));
+                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+                 fp_save(fpu);
+         }
+         kpreempt_enable();
+ 
+         hdr->pr_type = PR_TYPE_XSAVE;
+         hdr->pr_size = size;
+         hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
+             hdr->pr_pad[3] = 0;
+         hdr->pr_ninfo = ninfo;
+ 
+         curinfo = 0;
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 void *startp;
+                 uint32_t phase;
+ 
+                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
+                         continue;
+ 
+                 phase = dstart % fpu_xsave_info[i].xi_align;
+                 if (dstart < fpu_xsave_info[i].xi_align) {
+                         ASSERT3U(i, !=, 0);
+                         dstart = fpu_xsave_info[i].xi_align;
+                 } else if (phase != 0) {
+                         ASSERT3U(i, !=, 0);
+                         dstart += phase;
+                 }
+ 
+                 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
+                 hdr->pr_info[curinfo].pri_flags = 0;
+                 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
+                 hdr->pr_info[curinfo].pri_offset = dstart;
+ 
+                 startp = (void *)((uintptr_t)buf + dstart);
+                 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
+                 dstart += fpu_xsave_info[i].xi_size;
+                 ASSERT3U(curinfo, <=, ninfo);
+                 curinfo++;
+         }
+ }
+ 
+ /*
+  * We have been asked to set the data in the FPU for a given thread. Our
+  * prmachdep code has already validated that the raw semantics of the data that
+  * we have are valid (that is the appropriate sizes, offsets, and flags). We now
+  * apply additional checking here:
+  *
+  *   o The xsave structure is present and only valid bits are set.
+  *   o If the xsave component bit-vector is set, we have the corresponding proc
+  *     info item.
+  *   o Read-only items are ignored if and only if they actually match what we
+  *     gave the user mostly as a courtesy to simplify things here.
+  *   o ILP32 processes which can't support many of the regions are allowed to
+  *     have the items here (as we likely gave them to them), but they must be
+  *     zero if they are set.
+  *
+  * We take a first pass through all the data, validating it makes sense for the
+  * FPU. Only after that point do we ensure that we have the FPU data in question
+  * and then we clobber all the FPU data. Part of the semantics of setting this
+  * is that we're setting the entire extended FPU.
+  */
+ int
+ fpu_proc_xregs_set(struct _klwp *lwp, void *buf)
+ {
+         prxregset_hdr_t *prx = buf;
+         model_t model = lwp_getdatamodel(lwp);
+         uint64_t bv_found = 0;
+         const prxregset_xsave_t *xsave = NULL;
+         fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
+ 
+         VERIFY(fpu_xsave_enabled());
+ 
+         /*
+          * First, walk each note info header that we have from the user and
+          * proceed to validate it. The prmachdep code has already validated that
+          * the size, type, and offset information is valid, but it has not
+          * validated the semantic contents of this or if someone is trying to
+          * write something they shouldn't.
+          *
+          * While we walk this, we keep track of where the xsave header is. We
+          * also track all of the bits that we have found along the way so we can
+          * match up and ensure that everything that was set has a corresponding
+          * bit in the xsave bitmap. If we have something in the xsave bitmap,
+          * but not its corresponding data, then that is an error. However, we
+          * allow folks to write data regions without the bit set in the xsave
+          * data to make the read, modify, write process simpler.
+          */
+         for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
+                 const prxregset_info_t *info = &prx->pr_info[i];
+                 bool found = false;
+ 
+                 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
+                         void *data;
+                         if (info->pri_type != fpu_xsave_info[pt].xi_type)
+                                 continue;
+ 
+                         found = true;
+                         data = (void *)((uintptr_t)buf + info->pri_offset);
+                         if (fpu_xsave_info[pt].xi_valid != NULL &&
+                             !fpu_xsave_info[pt].xi_valid(model, data)) {
+                                 return (EINVAL);
+                         }
+ 
+                         if (info->pri_type == PRX_INFO_XSAVE) {
+                                 xsave = data;
+                         }
+                         bv_found |= fpu_xsave_info[pt].xi_bits;
+                         break;
+                 }
+ 
+                 if (!found) {
+                         return (EINVAL);
+                 }
+         }
+ 
+         /*
+          * No xsave data, no dice.
+          */
+         if (xsave == NULL) {
+                 return (EINVAL);
+         }
+ 
+         /*
+          * If anything is set in the xsave header that was not found as we
+          * walked structures, then that is an error. The opposite is not true as
+          * discussed above.
+          */
+         if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
+                 return (EINVAL);
+         }
+ 
+         /*
+          * At this point, we consider all the data actually valid. Now we must
+          * set up this information in the save area. If this is our own lwp, we
+          * must disable it first. Otherwise, we expect that it is already valid.
+          * To try to sanitize this, we will defensively zero the entire region
+          * as we are setting everything that will result in here.
+          */
+         kpreempt_disable();
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 /*
+                  * This case suggests that thread in question doesn't have a
+                  * valid FPU save state which should only happen when it is on
+                  * CPU. If this is the case, we explicitly disable the FPU, but
+                  * do not save it before proceeding. We also sanity check
+                  * several things here before doing this as using /proc on
+                  * yourself is always exciting. Unlike fp_save(), fp_free() does
+                  * not signal that an update is required, so we unconditionally
+                  * set that for all threads.
+                  */
+                 VERIFY3P(curthread, ==, lwptot(lwp));
+                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+                 fp_free(fpu);
+         }
+         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
+         bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
+             cpuid_get_xsave_size());
+ 
+         for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
+                 const prxregset_info_t *info = &prx->pr_info[i];
+                 bool found = false;
+ 
+                 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
+                         const void *data;
+                         if (info->pri_type != fpu_xsave_info[pt].xi_type)
+                                 continue;
+ 
+                         /*
+                          * Check if we have a set function and if we should
+                          * include this. We may not if this is something like
+                          * PRX_INFO_XCR which is read-only.
+                          *
+                          * We may not include a given entry as it may not have
+                          * been set in the actual xsave state that we have been
+                          * asked to restore, in which case to not break the
+                          * xsaveopt logic, we must leave it in its initial
+                          * state, e.g. zeroed (generally). XMM data initial
+                          * state is not zeroed, but is marked with xi_always to
+                          * help account for this.
+                          */
+                         found = true;
+                         if (fpu_xsave_info[pt].xi_set == NULL)
+                                 break;
+                         if (!fpu_xsave_info[pt].xi_always &&
+                             (xsave->prx_xsh_xstate_bv &
+                             fpu_xsave_info[pt].xi_bits) !=
+                             fpu_xsave_info[pt].xi_bits) {
+                                 break;
+                         }
+ 
+                         data = (void *)((uintptr_t)buf + info->pri_offset);
+                         fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
+                             xsave->prx_xsh_xstate_bv, data);
+                 }
+ 
+                 VERIFY(found);
+         }
+         kpreempt_enable();
+ 
+         return (0);
+ }
+ 
+ /*
+  * To be included in the signal copyout logic we must have a copy function and
+  * the bit in question must be included. Note, we don't consult xi_always here
+  * as that is really part of what is always present for xsave logic and
+  * therefore isn't really pertinent here because of our custom format. See the
+  * big theory statement for more info.
+  */
+ static bool
+ fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
+ {
+         return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
+             infop->xi_signal_out != NULL);
+ }
+ 
+ /*
+  * We need to fill out the xsave related data into the ucontext_t that we've
+  * been given. We should have a valid user pointer at this point in the uc_xsave
+  * member. This is much simpler than the copyin that we have. Here are the
+  * current assumptions:
+  *
+  *   o This is being called for the current thread. This is not meant to operate
+  *     on an arbitrary thread's state.
+  *   o We cannot assume whether the FPU is valid in the pcb or not. While most
+  *     callers will have just called getfpregs() which saved the state, don't
+  *     assume that.
+  *   o We assume that the user address has the requisite required space for this
+  *     to be copied out.
+  *   o We assume that copyfunc() will ensure we are not copying into a kernel
+  *     address.
+  *
+  * For more information on the format of the data, see the 'Signal Handling and
+  * the ucontext_t' portion of the big theory statement. We copy out all the
+  * constituent parts and then come back and write out the actual final header
+  * information.
+  */
+ int
+ fpu_signal_copyout(struct _klwp *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
+ {
+         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+         uint64_t xs_bv;
+         uc_xsave_t ucx;
+         int ret;
+ 
+         VERIFY3P(curthread, ==, lwptot(lwp));
+         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
+ 
+         if (!fpu_xsave_enabled()) {
+                 return (ENOTSUP);
+         }
+ 
+         /*
+          * Unlike when we're dealing with /proc, we can unconditionally call
+          * fp_save() because this is always called in the context that the lwp
+          * we're operating on is always the one on CPU (which is what fp_save()
+          * asserts).
+          */
+         fp_save(fpu);
+ 
+         bzero(&ucx, sizeof (ucx));
+         ucx.ucx_vers = UC_XSAVE_VERS;
+         ucx.ucx_len += sizeof (uc_xsave_t);
+ 
+         xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 const xsave_proc_info_t *info = &fpu_xsave_info[i];
+ 
+                 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
+                         continue;
+                 ret = info->xi_signal_out(info, copyfunc, &ucx,
+                     lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
+                     uaddr);
+                 if (ret != 0) {
+                         kpreempt_enable();
+                         return (ret);
+                 }
+         }
+ 
+         /*
+          * Now that everything has been copied out, we should have an accurate
+          * value in the uc_xsave_t header and we can copy that out at the start
+          * of the user data.
+          */
+         ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
+         return (ret);
+ }
+ 
+ /*
+  * Here we've been given a ucontext_t which potentially has a user pointer to
+  * xsave state that we've copied out previously. In this case we need to do the
+  * following, assuming UC_XSAVE is present:
+  *
+  *   o Copy in our header and validate it.
+  *   o Allocate an fpu context to use as a holding ground for all this data.
+  *   o If UC_FPU is set, override the xsave structure with the saved XMM state,
+  *     clear UC_FPU, and make sure that the correct xsave_bv bits are set.
+  *
+  * Currently we always allocate the additional state as a holding ground for the
+  * FPU. What we're copying in may not be valid and we don't want to clobber the
+  * existing FPU state or deal with merging it until we believe it's reasonable
+  * enough. The proc_t is here to set us up for when we have per-process settings
+  * in the extended feature disable MSRs.
+  */
+ int
+ fpu_signal_copyin(struct _klwp *lwp, ucontext_t *kuc)
+ {
+         uc_xsave_t ucx;
+         uint64_t bv;
+         uintptr_t data, max_data;
+         void *fpu;
+         proc_t *p = lwp->lwp_procp;
+         size_t ksize;
+ 
+         /*
+          * Because this has been opaque filler and the kernel has never
+          * historically looked at it, we don't really care about the uc_xsave
+          * pointer being garbage in the case that the flag is not set. While
+          * this isn't perhaps the most sporting choice in some cases, this is on
+          * the other hand, pragmatic.
+          */
+         if ((kuc->uc_flags & UC_XSAVE) != 0) {
+                 if (kuc->uc_xsave == 0) {
+                         return (EINVAL);
+                 }
+ 
+                 if (!fpu_xsave_enabled()) {
+                         return (ENOTSUP);
+                 }
+         } else {
+                 return (0);
+         }
+ 
+         if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
+             0) {
+                 return (EFAULT);
+         }
+ 
+         ksize = cpuid_get_xsave_size();
+         if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
+             ucx.ucx_len > ksize ||
+             (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
+             (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
+             (uintptr_t)kuc->uc_xsave) {
+                 return (EINVAL);
+         }
+ 
+         /*
+          * OK, our goal right now is to recreate a valid xsave_state structure
+          * that we'll ultimately end up having to merge with our existing one in
+          * the FPU save state. The reason we describe this as a merge is to help
+          * future us when we want to retain supervisor state which will never be
+          * part of userland signal state. The design of the userland signal
+          * state is basically to compress it as much as we can. This is done for
+          * two reasons:
+          *
+          *   1) We currently consider this a private interface.
+          *   2) We really want to minimize the actual amount of stack space we
+          *      use as much as possible. Most applications aren't using AVX-512
+          *      right now, so doing our own compression style is worthwhile. If
+          *      libc adopts AVX-512 routines, we may want to change this.
+          *
+          * On the allocation below, our assumption is that if a thread has taken
+          * a signal, then it is likely to take a signal again in the future (or
+          * be shortly headed to its demise). As such, when that happens we will
+          * leave the allocated signal stack around for the process. Most
+          * applications don't allow all threads to take signals, so this should
+          * hopefully help amortize the cost of the allocation.
+          */
+         max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
+         data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
+         bv = ucx.ucx_bv;
+         if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
+                 lwp->lwp_pcb.pcb_fpu.fpu_signal =
+                     kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
+         }
+         fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;
+ 
+         /*
+          * Unconditionally initialize the memory we get in here to ensure that
+          * it is in a reasonable state for ourselves. This ensures that unused
+          * regions are mostly left in their initial state (the main exception
+          * here is the x87/XMM state, but that should be OK). We don't fill in
+          * the initial xsave state as we expect that to happen as part of our
+          * processing.
+          */
+         bzero(fpu, ksize);
+ 
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 int ret;
+                 const xsave_proc_info_t *info = &fpu_xsave_info[i];
+                 if (!info->xi_always && (info->xi_bits & bv) == 0)
+                         continue;
+                 bv &= ~info->xi_bits;
+ 
+                 if (info->xi_signal_in == NULL)
+                         continue;
+                 ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
+                 if (ret != 0) {
+                         return (ret);
+                 }
+         }
+         ASSERT0(bv);
+ 
+         /*
+          * As described in the big theory statement section 'Signal Handling and
+          * the ucontext_t', we always remove UC_FPU from here as we've taken
+          * care of reassembling it ourselves.
+          */
+         kuc->uc_flags &= ~UC_FPU;
+         kuc->uc_xsave = (uintptr_t)fpu;
+ 
+         return (0);
+ }
+ 
+ /*
+  * This determines the size of the signal stack that we need for our custom form
+  * of the xsave state.
+  */
+ size_t
+ fpu_signal_size(struct _klwp *lwp)
+ {
+         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+         size_t len = sizeof (uc_xsave_t);
+         uint64_t xs_bv;
+ 
+         VERIFY3P(curthread, ==, lwptot(lwp));
+         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
+ 
+         if (!fpu_xsave_enabled()) {
+                 return (0);
+         }
+ 
+         kpreempt_disable();
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 fp_save(fpu);
+         }
+ 
+         xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
+         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
+                 size_t comp_size;
+ 
+                 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
+                         continue;
+ 
+                 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
+                     NULL);
+                 len += comp_size;
+         }
+ 
+         kpreempt_enable();
+         return (len);
+ }
+ 
+ /*
+  * This function is used in service of restorecontext() to set the specified
+  * thread's extended FPU state to the passed in data. Our assumptions at this
+  * point from the system are:
+  *
+  *   o Someone has already verified that the actual xsave header is correct.
+  *   o Any traditional XMM state that causes a #gp has been clamped.
+  *   o That data is basically the correct sized xsave state structure. Right now
+  *     that means it is not compressed and follows the CPUID-based rules for
+  *     constructing and laying out data.
+  *   o That the lwp argument does refer to the current thread.
+  *
+  * Our primary purpose here is to merge the current FPU state with what exists
+  * here. Right now, "merge", strictly speaking is just "replace". We can get
+  * away with just replacing everything because all we currently save are user
+  * states. If we start saving kernel states in here, this will get more nuanced
+  * and we will need to be more careful about how we store data here.
+  */
+ void
+ fpu_set_xsave(struct _klwp *lwp, const void *data)
+ {
+         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+         uint32_t status, xstatus;
+         struct xsave_state *dst_xsave;
+ 
+         ASSERT(fpu_xsave_enabled());
+         VERIFY3P(curthread, ==, lwptot(lwp));
+         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
+ 
+         /*
+          * We use fp_save() here rather than a stock fpdisable() so we can
+          * attempt to honor our invariants that when the thread state has been
+          * saved, the valid flag is set, even though we're going to be
+          * overwriting it shortly. If we just called fpdisable() then we would
+          * basically be asking for trouble.
+          *
+          * Because we are modifying the state here and we don't want the system
+          * to end up in an odd state, we are being a little paranoid and
+          * disabling preemption across this operation. In particular, once the
+          * state is properly tagged with FPU_VALID, there should be no other way
+          * that this thread can return to userland and get cleared out because
+          * we're resetting its context; however, we let paranoia win out.
+          */
+         kpreempt_disable();
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 fp_save(fpu);
+         }
+ 
+         bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
+             cpuid_get_xsave_size());
+         dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
+         status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
+         xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
+         dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
+         dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;
+ 
+         /*
+          * These two status words are information that the kernel itself uses to
+          * track additional information and is part of the traditional fpregset,
+          * but is not part of our xregs information. Because we are setting this
+          * state, we leave it up to the rest of the kernel to determine whether
+          * this came from an fpregset_t or is being reset to the default of 0.
+          */
+         fpu->fpu_regs.kfpu_status = status;
+         fpu->fpu_regs.kfpu_xstatus = xstatus;
+ 
+         fpu->fpu_flags |= FPU_VALID;
+         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
+         kpreempt_enable();
+ }
+ 
+ /*
+  * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
+  * kernel, this is just an fxsave_state with additional values for the status
+  * and xstatus members.
+  *
+  * This has the same nuance as the xregs cases discussed above, but is simpler
+  * in that we only need to handle the fxsave state, but more complicated because
+  * we need to check our save mechanism.
+  */
+ void
+ fpu_get_fpregset(struct _klwp *lwp, fpregset_t *fp)
+ {
+         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+ 
+         kpreempt_disable();
+         fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
+         fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;
+ 
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 /*
+                  * If we're requesting the fpregs of a thread that isn't
+                  * currently valid and isn't the one that we're executing, then
+                  * we consider getting this information to be a best-effort and
+                  * we will not stop the thread in question to serialize it,
+                  * which means possibly getting stale data. This is the
+                  * traditional semantics that the system has used to service
+                  * this for /proc.
+                  */
+                 if (curthread == lwptot(lwp)) {
+                         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+                         fp_save(fpu);
+                 }
+         }
+ 
+         /*
+          * If the FPU is not enabled and the state isn't valid (due to someone
+          * else setting it), just copy the initial state.
+          */
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
+                 bcopy(&sse_initial, fp, sizeof (sse_initial));
+                 kpreempt_enable();
+                 return;
+         }
+ 
+         /*
+          * Given that we have an enabled FPU, we must look at the type of FPU
+          * save mechanism to clean this up. In particular, while we can just
+          * copy the save area with FXSAVE, with XSAVE we must carefully copy
+          * only the bits that are valid and reset the rest to their default
+          * state.
+          */
+         switch (fp_save_mech) {
+         case FP_FXSAVE:
+                 bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
+                     sizeof (struct fxsave_state));
+                 break;
+         case FP_XSAVE:
+                 fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
+                     (struct fxsave_state *)fp);
+                 break;
+         default:
+                 panic("Invalid fp_save_mech");
+         }
+ 
+         kpreempt_enable();
+ }
+ 
+ /*
+  * This is a request to set the ABI fpregset_t into our actual hardware state.
+  * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
+  * 512-byte fxsave area.
+  */
+ void
+ fpu_set_fpregset(struct _klwp *lwp, const fpregset_t *fp)
+ {
+         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+ 
+         kpreempt_disable();
+         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
+                 /*
+                  * We always save the entire FPU. This is required if we're
+                  * using xsave. If we're using fxsave, we could skip the
+                  * 512-byte write and instead just disable the FPU since we'd be
+                  * replacing it all. For now we don't bother with more
+                  * conditional logic.
+                  */
+                 VERIFY3P(curthread, ==, lwptot(lwp));
+                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
+                 fp_save(fpu);
+         }
+ 
+         fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
+         fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
+         switch (fp_save_mech) {
+         case FP_FXSAVE:
+                 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
+                     sizeof (struct fxsave_state));
+                 break;
+         case FP_XSAVE:
+                 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
+                     sizeof (struct fxsave_state));
+                 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
+                     XFEATURE_LEGACY_FP | XFEATURE_SSE;
+                 break;
+         default:
+                 panic("Invalid fp_save_mech");
+         }
+ 
+         fpu->fpu_flags |= FPU_VALID;
+         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
+         kpreempt_enable();
  }