/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 * Copyright 2021 RackTop Systems, Inc.
 * Copyright 2022 Oxide Computer Company
 */

/*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
/*              All Rights Reserved                             */

/*      Copyright (c) 1987, 1988 Microsoft Corporation          */
/*              All Rights Reserved                             */

/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/trap.h>
#include <sys/fault.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/pcb.h>
#include <sys/lwp.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/fp.h>
#include <sys/siginfo.h>
#include <sys/archsystm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/kfpu.h>

/*
 * FPU Management Overview
 * -----------------------
 *
 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
 * however, many aspects of its life as a coprocessor are still around in x86.
 *
 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
 * While that state still exists, there is much more that is covered by the FPU.
 * Today, this includes not just traditional FPU state, but also supervisor only
 * state. The following state is currently managed and covered logically by the
 * idea of the FPU registers:
 *
 *    o Traditional x87 FPU
 *    o Vector Registers (%xmm, %ymm, %zmm)
 *    o Memory Protection Extensions (MPX) Bounds Registers
 *    o Protection Key Rights Register (PKRU)
 *    o Processor Trace data
 *
 * The rest of this covers how the FPU is managed and controlled, how state is
 * saved and restored between threads, interactions with hypervisors, and other
 * information exported to user land through aux vectors. A lot of background
 * information is here to synthesize major parts of the Intel SDM, but
 * unfortunately, it is not a replacement for reading it.
 *
 * FPU Control Registers
 * ---------------------
 *
 * Because the x87 FPU began its life as a co-processor and the FPU was
 * optional there are several bits that show up in %cr0 that we have to
 * manipulate when dealing with the FPU. These are:
 *
 *   o CR0.ET   The 'extension type' bit. This was used originally to indicate
 *              that the FPU co-processor was present. Now it is forced on for
 *              compatibility. This is often used to verify whether or not the
 *              FPU is present.
 *
 *   o CR0.NE   The 'native error' bit. Used to indicate that native error
 *              mode should be enabled. This indicates that we should take traps
 *              on FPU errors. The OS enables this early in boot.
 *
 *   o CR0.MP   The 'Monitor Coprocessor' bit. Used to control whether or not
 *              wait/fwait instructions generate a #NM if CR0.TS is set.
 *
 *   o CR0.EM   The 'Emulation' bit. This is used to cause floating point
 *              operations (x87 through SSE4) to trap with a #UD so they can be
 *              emulated. The system never sets this bit, but makes sure it is
 *              clear on processor start up.
 *
 *   o CR0.TS   The 'Task Switched' bit. When this is turned on, a floating
 *              point operation will generate a #NM. An fwait will as well,
 *              depending on the value in CR0.MP.
 *
 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
 * complicated role. Historically it has been used to allow running systems to
 * restore the FPU registers lazily. This will be discussed in greater depth
 * later on.
 *
 * %cr4 is also used as part of the FPU control. Specifically we need to worry
 * about the following bits in the system:
 *
 *   o CR4.OSFXSR       This bit is used to indicate that the OS understands and
 *                      supports the execution of the fxsave and fxrstor
 *                      instructions. This bit is required to be set to enable
 *                      the use of the SSE->SSE4 instructions.
 *
 *   o CR4.OSXMMEXCPT   This bit is used to indicate that the OS can understand
 *                      and take a SIMD floating point exception (#XM). This bit
 *                      is always enabled by the system.
 *
 *   o CR4.OSXSAVE      This bit is used to indicate that the OS understands and
 *                      supports the execution of the xsave and xrstor family of
 *                      instructions. This bit is required to use any of the AVX
 *                      and newer feature sets.
 *
 * Because all supported processors are 64-bit, they'll always support the XMM
 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
 *
 * %xcr0 is used to manage the behavior of the xsave feature set and is only
 * present on the system if xsave is supported. %xcr0 is read and written via
 * the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a
 * different component of the xsave state and controls whether or not that
 * information is saved and restored. For newer feature sets like AVX and MPX,
 * it also controls whether or not the corresponding instructions can be
 * executed (much like CR4.OSFXSR does for the SSE feature sets).
 *
 * Everything in %xcr0 concerns features available to user land. There is also
 * the IA32_XSS MSR which is used to control supervisor-only features that are
 * still part of the xsave state. Bits that can be set in %xcr0 are reserved in
 * IA32_XSS and vice versa. This is an important property that is particularly
 * relevant to how the xsave instructions operate.
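 *
 * As a brief, hedged sketch using a helper that appears later in this file:
 * the kernel can inspect %xcr0 via get_xcr(XFEATURE_ENABLED_MASK), where
 * XFEATURE_ENABLED_MASK names XCR register 0 (i.e. %xcr0) and the
 * XFEATURE_AVX bit definition is assumed from <sys/x86_archext.h>:
 *
 *        uint64_t xcr0 = get_xcr(XFEATURE_ENABLED_MASK);
 *        boolean_t avx_usable = (xcr0 & XFEATURE_AVX) != 0;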
 *
 * Save Mechanisms
 * ---------------
 *
 * When switching between running threads the FPU state needs to be saved and
 * restored by the OS. If this state was not saved, users would rightfully
 * complain about corrupt state. There are three mechanisms that exist on the
 * processor for saving and restoring these state images:
 *
 *   o fsave
 *   o fxsave
 *   o xsave
 *
 * fsave saves and restores only the x87 FPU and is the oldest of these
 * mechanisms. This mechanism is never used in the kernel today because we are
 * always running on systems that support fxsave.
 *
 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
 * state to be saved and restored to and from a struct fxsave_state. This is the
 * default mechanism that is used to save and restore the FPU on amd64. An
 * important aspect of fxsave that was different from the original i386 fsave
 * mechanism is that restoring FPU state with pending exceptions will not
 * generate an exception; handling is deferred to the next use of the FPU.
 *
 * The final and by far the most complex mechanism is that of the xsave set.
 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
 * registers.
 *
 * Data is saved and restored into and out of a struct xsave_state. The first
 * part of the struct xsave_state is equivalent to the struct fxsave_state.
 * After that, there is a header which is used to describe the remaining
 * portions of the state. The header is a 64-byte value of which the first two
 * uint64_t values are defined and the rest are reserved and must be zero. The
 * first uint64_t is the xstate_bv member. This describes which values in the
 * xsave_state are actually valid and present. This is updated on a save and
 * used on restore. The second member is the xcomp_bv member. Its last bit
 * determines whether or not a compressed version of the structure is used.
 *
 * When the uncompressed structure is used (currently the only format we
 * support), then each state component is at a fixed offset in the structure,
 * even if it is not being used. For example, if you only saved the AVX related
 * state, but did not save the MPX related state, the offset would not change
 * for any component. With the compressed format, components that aren't used
 * are all elided (though the x87 and SSE state are always there).
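 *
 * Concretely, in terms of the member names used later in this file, the
 * uncompressed layout looks roughly like the sketch below (the header struct
 * name here is illustrative and the extended components are elided):
 *
 *        struct xsave_state {
 *                struct fxsave_state xs_fxsave;    legacy x87 and SSE area
 *                struct xsave_header xs_header;    xsh_xstate_bv, xsh_xcomp_bv
 *                ... extended state components (%ymm, etc.) ...
 *        };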
 *
 * Unlike fxsave which saves all state, the xsave family does not always save
 * and restore all the state that could be covered by the xsave_state. The
 * instructions all take an argument which is a mask of what to consider. This
 * is the same mask that will be used in the xstate_bv vector and these are
 * also the same values that are present in %xcr0 and IA32_XSS, though IA32_XSS
 * is only consulted by the xsaves and xrstors instructions.
 *
 * When a save or restore is requested, a bitwise and is performed between the
 * requested bits and those that have been enabled in %xcr0. Only the bits that
 * match that are then saved or restored. Others will be silently ignored by
 * the processor. This idea is used often in the OS. We will always request that
 * we save and restore all of the state, but only those portions that are
 * actually enabled in %xcr0 will be touched.
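 *
 * As a sketch of that policy, using code that appears later in this file:
 * the kernel requests everything and lets the hardware intersect the mask
 * with what is enabled in %xcr0:
 *
 *        fp->fpu_xsave_mask = XFEATURE_FP_ALL;
 *        xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);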
 *
 * If a feature has been asked to be restored that is not set in the xstate_bv
 * feature vector of the save state, then it will be set to its initial state by
 * the processor (usually zeros). Also, when asked to save state, the processor
 * may not write out data that is in its initial state as an optimization. This
 * optimization only applies to saving data and not to restoring data.
 *
 * There are a few different variants of the xsave and xrstor instruction. They
 * are:
 *
 *   o xsave    This is the original save instruction. It will save all of the
 *              requested data in the xsave state structure. It only saves data
 *              in the uncompressed (xcomp_bv[63] is zero) format. It may be
 *              executed at all privilege levels.
 *
 *   o xrstor   This is the original restore instruction. It will restore all of
 *              the requested data. The xrstor function can handle both the
 *              compressed and uncompressed formats. It may be executed at all
 *              privilege levels.
 *
 *   o xsaveopt This is a variant of the xsave instruction that employs
 *              optimizations to try and only write out state that has been
 *              modified since the last time an xrstor instruction was called.
 *              The processor tracks a tuple of information about the last
 *              xrstor and tries to ensure that the same buffer is being used
 *              when this optimization is applied. However, because it tracks
 *              the xrstor buffer by its address, it is not suitable for use
 *              if that buffer can be easily reused. The most common case is
 *              trying to save data to the stack in rtld. It may be executed
 *              at all privilege levels.
 *
 *   o xsavec   This is a variant of the xsave instruction that writes out the
 *              compressed form of the xsave_state. Otherwise it behaves as
 *              xsave. It may be executed at all privilege levels.
 *
 *   o xsaves   This is a variant of the xsave instruction. It is similar to
 *              xsavec in that it always writes the compressed form of the
 *              buffer. Unlike all the other forms, this instruction looks at
 *              both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
 *              what to save and restore. xsaves also implements the same
 *              optimization that xsaveopt does around modified pieces. User
 *              land may not execute the instruction.
 *
 *   o xrstors  This is a variant of the xrstor instruction. Similar to xsaves
 *              it can save and restore both the user and privileged states.
 *              Unlike xrstor it can only operate on the compressed form.
 *              User land may not execute the instruction.
 *
 * Based on all of these, the kernel has a precedence for what it will use.
 * Basically, xsaves (which we do not currently support) would be preferred to
 * xsaveopt, which is preferred to xsave. A similar scheme is used when
 * informing rtld (more on this later) about what it should use: xsavec is
 * preferred to xsave, and xsaveopt is not recommended because its
 * modified-state optimization is not appropriate for that use.
 *
 * Finally, there is one last gotcha with the xsave state. Importantly, some
 * AMD processors did not always save and restore some of the FPU exception
 * state in the way that Intel processors did. In those cases the OS makes up
 * for this itself.
 *
 * FPU Initialization
 * ------------------
 *
 * One difference with the FPU registers is that not all threads have FPU state;
 * only those that have an lwp do. Generally this means kernel threads, which
 * all share p0 and its lwp, do not have FPU state, though there are exceptions
 * such as kcfpoold. In the rest of this discussion we'll use thread and lwp
 * interchangeably; just think of thread as meaning a thread that has an lwp.
 *
 * Each lwp has its FPU state allocated in its pcb (process control block). The
 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
 * dynamically at start up based on the save mechanism that we're using and the
 * amount of memory required for it. This is dynamic because the xsave_state
 * size varies based on the supported feature set.
 *
 * The hardware side of the FPU is initialized early in boot before we mount
 * the root file system. This is effectively done in fpu_probe(). This is where
 * we make the final decision about which save and restore mechanisms we should
 * use, create the fpsave_cachep kmem cache, and initialize a number of
 * function pointers that implement the save and restore logic.
 *
 * The thread/lwp side is a little more involved. There are two different
 * things that we need to concern ourselves with. The first is how the FPU
 * resources are allocated and the second is how the FPU state is initialized
 * for a given lwp.
 *
 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
 * This is called unconditionally by the system as part of creating an LWP.
 *
 * There are three different initialization paths that we deal with. The first
 * is when we are executing a new process. As part of exec all of the register
 * state is reset. The exec case is particularly important because init is born
 * like Athena, sprouting from the head of the kernel, without any true parent
 * to fork from. The second is used whenever we fork or create a new lwp.  The
 * third is to deal with special lwps like the agent lwp.
 *
 * During exec, we will call fp_exec() which will initialize and set up the FPU
 * state for the process. That will fill in the initial state for the FPU and
 * also set that state in the FPU itself. As part of fp_exec() we also install a
 * thread context operations vector that takes care of dealing with the saving
 * and restoring of the FPU. These context handlers will also be called whenever
 * an lwp is created or forked. In those cases, to initialize the FPU we will
 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
 * operations vector for the new thread.
 *
 * Next we'll end up in the context operation fp_new_lwp(). This saves the
 * current thread's state, initializes the new thread's state, and copies over
 * the relevant parts of the originating thread's state. It's at this point that
 * we also install the FPU context operations into the new thread, which ensures
 * that all future threads that are descendants of the current one get the
 * thread context operations (unless they call exec).
 *
 * To deal with some things like the agent lwp, we double check the state of the
 * FPU in sys_rtt_common() to make sure that it has been enabled before
 * returning to user land. In general, this path should be rare, but it's useful
 * for the odd lwp here and there.
 *
 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten: for example, in restorecontext(), due to /proc,
 * or when the lwp calls exec(). Whether the context is being freed or we are
 * resetting the state, we will call fp_free() to disable the FPU and our
 * context.
 *
 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
 * state by calling fp_lwp_cleanup().
 *
 * Kernel FPU Multiplexing
 * -----------------------
 *
 * Just as the kernel has to maintain all of the general purpose registers when
 * switching between scheduled threads, the same is true of the FPU registers.
 *
 * When a thread has FPU state, it also has a set of context operations
 * installed. These context operations take care of making sure that the FPU is
 * properly saved and restored during a context switch (fpsave_ctxt and
 * fprestore_ctxt respectively). This means that the current implementation of
 * the FPU is 'eager': when a thread is running, the CPU will have its FPU
 * state loaded. While this is always true when executing in userland, there
 * are a few cases where this is not true in the kernel.
 *
 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
 * employed. This meant that the FPU would be saved on a context switch and the
 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
 * then take a #NM trap, at which point we would restore the FPU from the save
 * area and return to user land. Given the frequency of FPU use by libc alone,
 * there's no point returning to user land just to trap again.
 *
 * There are a few cases though where the FPU state may need to be changed for a
 * thread on its behalf. The most notable cases are processes using /proc,
 * restorecontext(), forking, etc. In all of these cases the kernel will force
 * a thread's FPU state to be saved into the PCB through the fp_save()
 * function. Whenever the FPU is saved, the FPU_VALID flag is set on the
 * pcb. This indicates that the save state holds currently valid data. As a side
 * effect of this, CR0.TS will be set. To make sure that all of the state is
 * updated before returning to user land, in these cases, we set a flag on the
 * PCB that says the FPU needs to be updated. This will make sure that we take
 * the slow path out of a system call to fix things up for the thread. Because
 * this is a rather rare case, effectively setting the equivalent of t_postsys
 * is acceptable.
 *
 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 * Generally this means it will be cleared immediately by the new thread that is
 * running in a context switch. However, this isn't the case for kernel threads.
 * They currently operate with CR0.TS set as no kernel state is restored for
 * them. This means that using the FPU will cause a #NM and panic.
 *
 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
 * However, because we eagerly restore, the only time that CR0.TS should be set
 * for a non-kernel thread is during operations where it will be cleared before
 * returning to user land; importantly, the only data in the FPU at that point
 * is the thread's own.
 *
 * Kernel FPU Usage
 * ----------------
 *
 * Traditionally the kernel never used the FPU since it had no need for
 * floating point operations. However, modern FPU hardware supports a variety
 * of SIMD extensions which can speed up code such as parity calculations or
 * encryption.
 *
 * To allow the kernel to take advantage of these features, the
 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
 * around any usage of the FPU by the kernel to ensure that user-level context
 * is properly saved/restored, as well as to properly set up the FPU for use
 * by the kernel. There are a variety of ways this wrapping can be used, as
 * discussed below.
 *
 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
 * operations, the kernel_fpu_alloc() function should be used to allocate a
 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
 * state. This structure is not tied to any thread. That is, different threads
 * can reuse the same kfpu_state_t structure, although not concurrently. A
 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
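 *
 * A hedged sketch of that lifecycle (a flags value of 0 is assumed for the
 * plain kfpu_state_t case, and error handling is elided):
 *
 *        kfpu_state_t *kfpu = kernel_fpu_alloc(KM_SLEEP);
 *        kernel_fpu_begin(kfpu, 0);
 *        ... extended SIMD work here ...
 *        kernel_fpu_end(kfpu, 0);
 *        kernel_fpu_free(kfpu);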
 *
 * In some cases, the kernel may need to use the FPU for a short operation
 * without the overhead of managing a kfpu_state_t structure and without
 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
 * parameter. This indicates that there is no kfpu_state_t. When used this way,
 * kernel preemption should be disabled by the caller (kpreempt_disable) before
 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
 * For this usage, it is important to limit the kernel's FPU use to short
 * operations. The tradeoff between using the FPU without a kfpu_state_t
 * structure vs. the overhead of allowing a context switch while using the FPU
 * should be carefully considered on a case by case basis.
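 *
 * A minimal sketch of the KFPU_NO_STATE pattern described above, where the
 * caller owns preemption control:
 *
 *        kpreempt_disable();
 *        kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *        ... short FPU operation here ...
 *        kernel_fpu_end(NULL, KFPU_NO_STATE);
 *        kpreempt_enable();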
 *
 * In other cases, kernel threads have an LWP, but never execute in user space.
 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
 * kernel's FPU state if the thread is context switched, instead of having to
 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
 * enable this behavior. It is the caller's responsibility to ensure that this
 * is only used for a kernel thread which never executes in user space.
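 *
 * A hedged sketch of the KFPU_USE_LWP variant for such a thread (the NULL
 * kfpu_state_t argument is an assumption based on the LWP state being used
 * instead):
 *
 *        kernel_fpu_begin(NULL, KFPU_USE_LWP);
 *        ... FPU usage; context switches are tolerated here ...
 *        kernel_fpu_end(NULL, KFPU_USE_LWP);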
 *
 * FPU Exceptions
 * --------------
 *
 * Certain operations can cause the kernel to take traps due to FPU activity.
 * Generally these events will cause a user process to receive a SIGFPE and if
 * the kernel receives it in kernel context, we will die. Traditionally the #NM
 * (Device Not Available / No Math) exception generated by CR0.TS would have
 * caused us to restore the FPU. Now it is a fatal event regardless of whether
 * or not user land causes it.
 *
 * While there are some cases where the kernel uses the FPU, it is up to the
 * kernel to use the FPU in a way such that it cannot receive a trap or to use
 * the appropriate trap protection mechanisms.
 *
 * Hypervisors
 * -----------
 *
 * When providing support for hypervisors things are a little bit more
 * complicated because the FPU is not virtualized at all. This means that they
 * need to save and restore the FPU and %xcr0 across entry and exit to the
 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
 * allow us to use the full native state to make sure that we are always saving
 * and restoring the full FPU that the host sees, even when the guest is using a
 * subset.
 *
 * One tricky aspect of this is that the guest may be using a subset of %xcr0
 * and therefore changing our %xcr0 on the fly. It is vital that when we're
 * saving and restoring the FPU we always use the largest %xcr0 contents;
 * otherwise we will end up leaving data behind in it.
 *
 * ELF PLT Support
 * ---------------
 *
 * rtld has to preserve a subset of the FPU when it is saving and restoring
 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
 * more information. As a result, we set up an aux vector that contains
 * information about what save and restore mechanisms it should be using and
 * the sizing thereof based on what the kernel supports. This is passed down in
 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
 * initialized in fpu_subr.c.
 */

kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout + xsave header + ymm */
#define AVX_XSAVE_SIZE          (512 + 64 + 256)

/*
 * Various sanity checks.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
        /*
         * This indicates that the save state has initial FPU data.
         */
        KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
        fpu_ctx_t       kfpu_ctx;
        kfpu_flags_t    kfpu_flags;
        kthread_t       *kfpu_curthread;
};

/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 */
const struct fxsave_state sse_initial = {
        FPU_CW_INIT,    /* fx_fcw */
        0,              /* fx_fsw */
        0,              /* fx_fctw */
        0,              /* fx_fop */
        0,              /* fx_rip */
        0,              /* fx_rdp */
        SSE_MXCSR_INIT  /* fx_mxcsr */
        /* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
        /*
         * The definition below needs to be identical with sse_initial
         * defined above.
         */
        .xs_fxsave = {
                .fx_fcw = FPU_CW_INIT,
                .fx_mxcsr = SSE_MXCSR_INIT,
        },
        .xs_header = {
                /*
                 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
                 * valid, and CPU should initialize XMM/YMM.
                 */
                .xsh_xstate_bv = 1,
                .xsh_xcomp_bv = 0,
        },
};

/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * Initial kfpu state for x87 used by fpinit()
 */
const struct fnsave_state x87_initial = {
        FPU_CW_INIT,    /* f_fcw */
        0,              /* __f_ign0 */
        0,              /* f_fsw */
        0,              /* __f_ign1 */
        0xffff,         /* f_ftw */
        /* rest of structure is zero */
};

/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);
static void fp_new_lwp(void *, void *);
static void fp_free_ctx(void *, int);

static struct ctxop *
fp_ctxop_allocate(struct fpu_ctx *fp)
{
        const struct ctxop_template tpl = {
                .ct_rev         = CTXOP_TPL_REV,
                .ct_save        = fpsave_ctxt,
                .ct_restore     = fprestore_ctxt,
                .ct_fork        = fp_new_lwp,
                .ct_lwp_create  = fp_new_lwp,
                .ct_free        = fp_free_ctx,
        };
        return (ctxop_allocate(&tpl, fp));
}

/*
 * Copy the state of parent lwp's floating point context into the new lwp.
 * Invoked for both fork() and lwp_create().
 *
 * Note that we inherit -only- the control state (e.g. exception masks,
 * rounding, precision control, etc.); the FPU registers are otherwise
 * reset to their initial state.
 */
static void
fp_new_lwp(void *parent, void *child)
{
        kthread_id_t t = parent, ct = child;
        struct fpu_ctx *fp;             /* parent fpu context */
        struct fpu_ctx *cfp;            /* new fpu context */
        struct fxsave_state *fx, *cfx;
        struct xsave_state *cxs;

        ASSERT(fp_kind != FP_NO);

        fp = &t->t_lwp->lwp_pcb.pcb_fpu;
        cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;

        /*
         * If the parent FPU state is still in the FPU hw then save it;
         * conveniently, fp_save() already does this for us nicely.
         */
        fp_save(fp);

        cfp->fpu_flags = FPU_EN | FPU_VALID;
        cfp->fpu_regs.kfpu_status = 0;
        cfp->fpu_regs.kfpu_xstatus = 0;

        /*
         * Make sure that the child's FPU is cleaned up and made ready for user
         * land.
         */
        PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);

        switch (fp_save_mech) {
        case FP_FXSAVE:
                fx = fp->fpu_regs.kfpu_u.kfpu_fx;
                cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
                bcopy(&sse_initial, cfx, sizeof (*cfx));
                cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
                cfx->fx_fcw = fx->fx_fcw;
                break;

        case FP_XSAVE:
                cfp->fpu_xsave_mask = fp->fpu_xsave_mask;

                VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

                fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
                cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
                cfx = &cxs->xs_fxsave;

                bcopy(&avx_initial, cxs, sizeof (*cxs));
                cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
                cfx->fx_fcw = fx->fx_fcw;
                cxs->xs_header.xsh_xstate_bv |=
                    (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }

        /*
         * Mark that both the parent and child need to have the FPU cleaned up
         * before returning to user land.
         */

        ctxop_attach(ct, fp_ctxop_allocate(cfp));
}

/*
 * Free any state associated with floating point context.
 * Fp_free can be called in three cases:
 * 1) from reaper -> thread_free -> freectx-> fp_free
 *      fp context belongs to a thread on deathrow
 *      nothing to do,  thread will never be resumed
 *      thread calling ctxfree is reaper
 *
 * 2) from exec -> freectx -> fp_free
 *      fp context belongs to the current thread
 *      must disable fpu, thread calling ctxfree is curthread
 *
 * 3) from restorecontext -> setfpregs -> fp_free
 *      we have a modified context in the memory (lwp->pcb_fpu)
 *      disable fpu and release the fp context for the CPU
 *
 */
void
fp_free(struct fpu_ctx *fp)
{
        ASSERT(fp_kind != FP_NO);

        if (fp->fpu_flags & FPU_VALID)
                return;

        kpreempt_disable();
        /*
         * We want to do fpsave rather than fpdisable so that we can
         * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
         */
        fp->fpu_flags |= FPU_VALID;
        /* If for current thread disable FP to track FPU_VALID */
        if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
                /* Clear errors if any to prevent frstor from complaining */
                (void) fperr_reset();
                if (fp_kind & __FP_SSE)
                        (void) fpxerr_reset();
                fpdisable();
        }
        kpreempt_enable();
}

/*
 * Wrapper for freectx to make the types line up for fp_free()
 */
static void
fp_free_ctx(void *arg, int isexec __unused)
{
        fp_free((struct fpu_ctx *)arg);
}

/*
 * Store the floating point state and disable the floating point unit.
 */
void
fp_save(struct fpu_ctx *fp)
{
        ASSERT(fp_kind != FP_NO);

        kpreempt_disable();
        if (!fp || fp->fpu_flags & FPU_VALID ||
            (fp->fpu_flags & FPU_EN) == 0) {
                kpreempt_enable();
                return;
        }
        ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);

        switch (fp_save_mech) {
        case FP_FXSAVE:
                fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
                break;

        case FP_XSAVE:
                xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }

        fp->fpu_flags |= FPU_VALID;

        /*
         * We save the FPU as part of forking, execing, modifications via /proc,
         * restorecontext, etc. As such, we need to make sure that we return to
         * userland with valid state in the FPU. If we're context switched out
         * before we hit sys_rtt_common() we'll end up having restored the FPU
         * as part of the context ops operations. The restore logic always makes
         * sure that FPU_VALID is set before doing a restore so we don't restore
         * it a second time.
         */
        PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);

        kpreempt_enable();
}

/*
 * Restore the FPU context for the thread:
 * The possibilities are:
 *      1. No active FPU context: Load the new context into the FPU hw
 *         and enable the FPU.
 */
void
fp_restore(struct fpu_ctx *fp)
{
        switch (fp_save_mech) {
        case FP_FXSAVE:
                fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
                break;

        case FP_XSAVE:
                xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }

        fp->fpu_flags &= ~FPU_VALID;
}

/*
 * Reset the FPU such that it is in a valid state for a new thread that is
 * coming out of exec. The FPU will be in a usable state at this point. At this
 * point we know that the FPU state has already been allocated and if this
 * wasn't an init process, then it will have had fp_free() previously called.
 */
void
fp_exec(void)
{
        struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

        if (fp_save_mech == FP_XSAVE) {
                fp->fpu_xsave_mask = XFEATURE_FP_ALL;
        }

        struct ctxop *ctx = fp_ctxop_allocate(fp);
        /*
         * Make sure that we're not preempted in the middle of initializing the
         * FPU on CPU.
         */
        kpreempt_disable();
        ctxop_attach(curthread, ctx);
        fpinit();
        fp->fpu_flags = FPU_EN;
        kpreempt_enable();
}


/*
 * Seeds the initial state for the current thread.  The possibilities are:
 *      1. Another process has modified the FPU state before we have done any
 *         initialization: Load the FPU state from the LWP state.
 *      2. The FPU state has not been externally modified:  Load a clean state.
 */
void
fp_seed(void)
{
        struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

        ASSERT(curthread->t_preempt >= 1);
        ASSERT((fp->fpu_flags & FPU_EN) == 0);

        /*
         * Always initialize a new context and initialize the hardware.
         */
        if (fp_save_mech == FP_XSAVE) {
                fp->fpu_xsave_mask = XFEATURE_FP_ALL;
        }

        ctxop_attach(curthread, fp_ctxop_allocate(fp));
        fpinit();

        /*
         * If FPU_VALID is set, it means someone has modified registers via
         * /proc.  In this case, restore the current lwp's state.
         */
        if (fp->fpu_flags & FPU_VALID)
                fp_restore(fp);

        ASSERT((fp->fpu_flags & FPU_VALID) == 0);
        fp->fpu_flags = FPU_EN;
}

/*
 * When using xsave/xrstor, these three functions are used by the lwp code to
 * manage the memory for the xsave area.
 */
void
fp_lwp_init(struct _klwp *lwp)
{
        struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

        /*
         * We keep a copy of the pointer in lwp_fpu so that we can restore the
         * value in forklwp() after we duplicate the parent's LWP state.
         */
        lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
            kmem_cache_alloc(fpsave_cachep, KM_SLEEP);

        if (fp_save_mech == FP_XSAVE) {
                /*
                 * We bzero since the fpinit() code path will only
                 * partially initialize the xsave area using avx_initial.
                 */
                ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
                bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
        }
}

void
fp_lwp_cleanup(struct _klwp *lwp)
{
        struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

        if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
                kmem_cache_free(fpsave_cachep,
                    fp->fpu_regs.kfpu_u.kfpu_generic);
                lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
        }
}

/*
 * Called during the process of forklwp(). The kfpu_u pointer will have been
 * overwritten while copying the parent's LWP structure. We have a valid copy
 * stashed in the child's lwp_fpu which we use to restore the correct value.
 */
void
fp_lwp_dup(struct _klwp *lwp)
{
        void *xp = lwp->lwp_fpu;
        size_t sz;

        switch (fp_save_mech) {
        case FP_FXSAVE:
                sz = sizeof (struct fxsave_state);
                break;
        case FP_XSAVE:
                sz = cpuid_get_xsave_size();
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }

        /* copy the parent's values into the new lwp's struct */
        bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
        /* now restore the pointer */
        lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
}

/*
 * Handle a processor extension error fault.
 * Returns non-zero for error.
 */

/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
        uint32_t fpcw, fpsw;
        fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

        ASSERT(fp_kind != FP_NO);

        /*
         * Now we can enable the interrupts.
         * (NOTE: x87 fp exceptions come thru interrupt gate)
         */
        sti();

        if (!fpu_exists)
                return (FPE_FLTINV);

        /*
         * Do an unconditional save of the FP state.  If it's dirty (TS=0),
         * it'll be saved into the fpu context area passed in (that of the
         * current thread).  If it's not dirty (it may not be, due to
         * an intervening save caused by a context switch between the sti()
         * above and here), then it's safe to just use the stored values in
         * the context save area to determine the cause of the fault.
         */
        fp_save(fp);

        /* clear exception flags in saved state, as if by fnclex */
        switch (fp_save_mech) {
        case FP_FXSAVE:
                fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
                fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
                fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
                break;

        case FP_XSAVE:
                fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
                fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
                fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
                /*
                 * Always set LEGACY_FP as it may have been cleared by XSAVE
                 * instruction
                 */
                fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
                    XFEATURE_LEGACY_FP;
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }

        fp->fpu_regs.kfpu_status = fpsw;

        if ((fpsw & FPS_ES) == 0)
                return (0);             /* No exception */

        /*
         * "and" the exception flags with the complement of the mask
         * bits to determine which exception occurred
         */
        return (fpe_sicode(fpsw & ~fpcw & 0x3f));
}

/*
 * Handle an SSE/SSE2 precise exception.
 * Returns a non-zero sicode for error.
 */
/*ARGSUSED*/
int
fpsimderrflt(struct regs *rp)
{
        uint32_t mxcsr, xmask;
        fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

        ASSERT(fp_kind & __FP_SSE);

        /*
         * NOTE: Interrupts are disabled during execution of this
         * function.  They are enabled by the caller in trap.c.
         */

        /*
         * If there is no FP unit, the only way we could have gotten here
         * is via a user executing an INT $19 instruction, so there is
         * no fault in that case.
         */
        if (!fpu_exists)
                return (0);

        /*
         * Do an unconditional save of the FP state.  If it's dirty (TS=0),
         * it'll be saved into the fpu context area passed in (that of the
         * current thread).  If it's not dirty, then it's safe to just use
         * the stored values in the context save area to determine the
         * cause of the fault.
         */
        fp_save(fp);            /* save the FPU state */

        if (fp_save_mech == FP_XSAVE) {
                mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
                fp->fpu_regs.kfpu_status =
                    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
        } else {
                mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
                fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
        }
        fp->fpu_regs.kfpu_xstatus = mxcsr;

        /*
         * compute the mask that determines which conditions can cause
         * a #xm exception, and use this to clean the status bits so that
         * we can identify the true cause of this one.
         */
        xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
        return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
}

/*
 * In the unlikely event that someone is relying on this subcode being
 * FPE_FLTILL for denormalize exceptions, it can always be patched back
 * again to restore old behaviour.
 */
int fpe_fltden = FPE_FLTDEN;

/*
 * Map from the FPU status word to the FP exception si_code.
 */
static int
fpe_sicode(uint_t sw)
{
        if (sw & FPS_IE)
                return (FPE_FLTINV);
        if (sw & FPS_ZE)
                return (FPE_FLTDIV);
        if (sw & FPS_DE)
                return (fpe_fltden);
        if (sw & FPS_OE)
                return (FPE_FLTOVF);
        if (sw & FPS_UE)
                return (FPE_FLTUND);
        if (sw & FPS_PE)
                return (FPE_FLTRES);
        return (FPE_FLTINV);    /* default si_code for other exceptions */
}

/*
 * Map from the SSE status word to the FP exception si_code.
 */
static int
fpe_simd_sicode(uint_t sw)
{
        if (sw & SSE_IE)
                return (FPE_FLTINV);
        if (sw & SSE_ZE)
                return (FPE_FLTDIV);
        if (sw & SSE_DE)
                return (FPE_FLTDEN);
        if (sw & SSE_OE)
                return (FPE_FLTOVF);
        if (sw & SSE_UE)
                return (FPE_FLTUND);
        if (sw & SSE_PE)
                return (FPE_FLTRES);
        return (FPE_FLTINV);    /* default si_code for other exceptions */
}

/*
 * This routine is invoked as part of libc's __fpstart implementation
 * via sysi86(2).
 *
 * It may be called -before- any context has been assigned in which case
 * we try and avoid touching the hardware.  Or it may be invoked well
 * after the context has been assigned and fiddled with, in which case
 * just tweak it directly.
 */
void
fpsetcw(uint16_t fcw, uint32_t mxcsr)
{
        struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
        struct fxsave_state *fx;

        if (!fpu_exists || fp_kind == FP_NO)
                return;

        if ((fp->fpu_flags & FPU_EN) == 0) {
                if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
                        /*
                         * Common case.  Floating point unit not yet
                         * enabled, and kernel already intends to initialize
                         * the hardware the way the caller wants.
                         */
                        return;
                }
                /*
                 * Hmm.  Userland wants a different default.
                 * Do a fake "first trap" to establish the context, then
                 * handle as if we already had a context before we came in.
                 */
                kpreempt_disable();
                fp_seed();
                kpreempt_enable();
        }

        /*
         * Ensure that the current hardware state is flushed back to the
         * pcb, then modify that copy.  Next use of the fp will
         * restore the context.
         */
        fp_save(fp);

        switch (fp_save_mech) {
        case FP_FXSAVE:
                fx = fp->fpu_regs.kfpu_u.kfpu_fx;
                fx->fx_fcw = fcw;
                fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
                break;

        case FP_XSAVE:
                fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
                fx->fx_fcw = fcw;
                fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
                /*
                 * Always set LEGACY_FP as it may have been cleared by XSAVE
                 * instruction
                 */
                fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
                    XFEATURE_LEGACY_FP;
                break;
        default:
                panic("Invalid fp_save_mech");
                /*NOTREACHED*/
        }
}

static void
kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
{
        struct xsave_state *xs;

        switch (fp_save_mech) {
        case FP_FXSAVE:
                bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
                    sizeof (struct fxsave_state));
                kfpu->kfpu_ctx.fpu_xsave_mask = 0;
                break;
        case FP_XSAVE:
                xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
                bzero(xs, cpuid_get_xsave_size());
                bcopy(&avx_initial, xs, sizeof (*xs));
                xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
                kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
                break;
        default:
                panic("invalid fp_save_mech");
        }

        /*
         * Set the corresponding flags that the system expects on the FPU state
         * to indicate that this is our state. The FPU_EN flag is required to
         * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
         * not set below as it represents that this state is being suppressed
         * by the kernel.
         */
        kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
        kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
}

kfpu_state_t *
kernel_fpu_alloc(int kmflags)
{
        kfpu_state_t *kfpu;

        if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
                return (NULL);
        }

        kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
            kmem_cache_alloc(fpsave_cachep, kmflags);
        if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
                kmem_free(kfpu, sizeof (kfpu_state_t));
                return (NULL);
        }

        kernel_fpu_fpstate_init(kfpu);

        return (kfpu);
}

void
kernel_fpu_free(kfpu_state_t *kfpu)
{
        kmem_cache_free(fpsave_cachep,
            kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
        kmem_free(kfpu, sizeof (kfpu_state_t));
}

static void
kernel_fpu_ctx_save(void *arg)
{
        kfpu_state_t *kfpu = arg;
        fpu_ctx_t *pf;

        if (kfpu == NULL) {
                /*
                 * A NULL kfpu implies this is a kernel thread with an LWP and
                 * no user-level FPU usage. Use the lwp fpu save area.
                 */
                pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

                ASSERT(curthread->t_procp->p_flag & SSYS);
                ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

                fp_save(pf);
        } else {
                pf = &kfpu->kfpu_ctx;

                ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
                ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

                /*
                 * Note, we can't use fp_save because it assumes that we're
                 * saving to the thread's PCB and not somewhere else. Because
                 * this is a different FPU context, we instead have to do this
                 * ourselves.
                 */
                switch (fp_save_mech) {
                case FP_FXSAVE:
                        fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
                        break;
                case FP_XSAVE:
                        xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
                        break;
                default:
                        panic("Invalid fp_save_mech");
                }

                /*
                 * Because we have saved context here, our save state is no
                 * longer valid and therefore needs to be reinitialized.
                 */
                kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
        }

        pf->fpu_flags |= FPU_VALID;

        /*
         * Clear KFPU flag. This allows swtch to check for improper kernel
         * usage of the FPU (i.e. switching to a new thread while the old
         * thread was in the kernel and using the FPU, but did not perform a
         * context save).
         */
        curthread->t_flag &= ~T_KFPU;
}
1281 
1282 static void
1283 kernel_fpu_ctx_restore(void *arg)
1284 {
1285         kfpu_state_t *kfpu = arg;
1286         fpu_ctx_t *pf;
1287 
1288         if (kfpu == NULL) {
1289                 /*
1290                  * A NULL kfpu implies this is a kernel thread with an LWP and
1291                  * no user-level FPU usage. Use the lwp fpu save area.
1292                  */
1293                 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1294 
1295                 ASSERT(curthread->t_procp->p_flag & SSYS);
1296                 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1297         } else {
1298                 pf = &kfpu->kfpu_ctx;
1299 
1300                 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1301                 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1302         }
1303 
1304         fp_restore(pf);
1305         curthread->t_flag |= T_KFPU;
1306 }
1307 
1308 /*
1309  * Validate that the thread is not switching off-cpu while actively using the
1310  * FPU within the kernel.
1311  */
1312 void
1313 kernel_fpu_no_swtch(void)
1314 {
1315         if ((curthread->t_flag & T_KFPU) != 0) {
1316                 panic("curthread swtch-ing while the kernel is using the FPU");
1317         }
1318 }
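
     /*
      * For illustration only: a dispatcher-side caller (hypothetical
      * placement) would run this check in the context of the thread that is
      * about to go off-CPU, e.g.:
      *
      *         kernel_fpu_no_swtch();
      *
      * which panics if T_KFPU is still set on curthread.
      */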
1319 
1320 static const struct ctxop_template kfpu_ctxop_tpl = {
1321         .ct_rev         = CTXOP_TPL_REV,
1322         .ct_save        = kernel_fpu_ctx_save,
1323         .ct_restore     = kernel_fpu_ctx_restore,
1324 };
1325 
1326 void
1327 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1328 {
1329         klwp_t *pl = curthread->t_lwp;
1330         struct ctxop *ctx;
1331 
1332         if ((curthread->t_flag & T_KFPU) != 0) {
1333                 panic("curthread attempting to nest kernel FPU states");
1334         }
1335 
1336         /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1337         ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1338             (KFPU_USE_LWP | KFPU_NO_STATE));
1339 
1340         if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1341                 /*
1342                  * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1343                  * hold our kernel FPU context, we depend on the caller doing
1344                  * kpreempt_disable for the duration of our FPU usage. This
1345                  * should only be done for very short periods of time.
1346                  */
1347                 ASSERT(curthread->t_preempt > 0);
1348                 ASSERT(kfpu == NULL);
1349 
1350                 if (pl != NULL) {
1351                         /*
1352                          * We might have already saved once so FPU_VALID could
1353                          * be set. This is handled in fp_save.
1354                          */
1355                         fp_save(&pl->lwp_pcb.pcb_fpu);
1356                         pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1357                 }
1358 
1359                 curthread->t_flag |= T_KFPU;
1360 
1361                 /* Always restore the fpu to the initial state. */
1362                 fpinit();
1363 
1364                 return;
1365         }
1366 
1367         /*
1368          * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1369          */
1370 
1371         if ((flags & KFPU_USE_LWP) == 0) {
1372                 if (kfpu->kfpu_curthread != NULL)
1373                         panic("attempting to reuse kernel FPU state at %p "
1374                             "when another thread is already using it", kfpu);
1375 
1376                 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1377                         kernel_fpu_fpstate_init(kfpu);
1378 
1379                 kfpu->kfpu_curthread = curthread;
1380         }
1381 
1382         /*
1383          * Not all threads may have an active LWP. If they do and we're not
1384          * going to re-use the LWP, then we should go ahead and save the state.
1385          * We must also note that the fpu is now being used by the kernel and
1386          * therefore we do not want to manage the fpu state via the user-level
1387          * thread's context handlers.
1388          *
1389          * We might have already saved once (due to a prior use of the kernel
1390          * FPU or another code path) so FPU_VALID could be set. This is handled
1391          * by fp_save, as is the FPU_EN check.
1392          */
1393         ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1394         kpreempt_disable();
1395         if (pl != NULL) {
1396                 if ((flags & KFPU_USE_LWP) == 0)
1397                         fp_save(&pl->lwp_pcb.pcb_fpu);
1398                 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1399         }
1400 
1401         /*
1402          * Set the context operations for kernel FPU usage.  Because kernel FPU
1403          * setup and ctxop attachment need to happen under the protection of
1404          * kpreempt_disable(), the ctxop was allocated above, outside the guard,
1405          * so that a sleeping allocation could not cause a voluntary swtch()
1406          * while we hold partially initialized state. With the state saved and
1407          * flagged, it is now safe to attach the handlers.
1408          */
1409         ctxop_attach(curthread, ctx);
1410         curthread->t_flag |= T_KFPU;
1411 
1412         if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1413                 /*
1414                  * For pure kernel threads with an LWP, we can use the LWP's
1415                  * pcb_fpu to save/restore context.
1416                  */
1417                 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1418 
1419                 VERIFY(curthread->t_procp->p_flag & SSYS);
1420                 VERIFY(kfpu == NULL);
1421                 ASSERT((pf->fpu_flags & FPU_EN) == 0);
1422 
1423                 /* Always restore the fpu to the initial state. */
1424                 if (fp_save_mech == FP_XSAVE)
1425                         pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1426                 fpinit();
1427                 pf->fpu_flags = FPU_EN | FPU_KERNEL;
1428         } else {
1429                 /* load the initialized kfpu state onto the CPU */
1430                 kernel_fpu_ctx_restore(kfpu);
1431         }
1432         kpreempt_enable();
1433 }
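
     /*
      * Example (an illustrative sketch): with KFPU_NO_STATE there is no save
      * area, so the caller must hold kpreempt_disable() across the entire FPU
      * usage window and keep that window short. do_brief_fpu_work() is a
      * hypothetical, non-blocking FPU-using routine.
      *
      *         static void
      *         example_short_fpu_burst(void)
      *         {
      *                 kpreempt_disable();
      *                 kernel_fpu_begin(NULL, KFPU_NO_STATE);
      *                 do_brief_fpu_work();
      *                 kernel_fpu_end(NULL, KFPU_NO_STATE);
      *                 kpreempt_enable();
      *         }
      */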
1434 
1435 void
1436 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1437 {
1438         if ((curthread->t_flag & T_KFPU) == 0) {
1439                 panic("curthread attempting to clear kernel FPU state "
1440                     "without using it");
1441         }
1442 
1443         /*
1444          * General comments on why the rest of this function is structured the
1445          * way it is. Be aware that there is a lot of subtlety here.
1446          *
1447          * If a user-level thread ever uses the fpu while in the kernel, then
1448          * we cannot call fpdisable since that does STTS. That will set the
1449          * ts bit in %cr0 which will cause an exception if anything touches the
1450          * fpu. However, the user-level context switch handler (fpsave_ctxt)
1451          * needs to access the fpu to save the registers into the pcb.
1452          * fpsave_ctxt relies on fprestore_ctxt having executed CLTS to clear
1453          * the ts bit when the thread was context switched onto the CPU.
1454          *
1455          * Calling fpdisable only affects the current CPU's %cr0 register.
1456          *
1457          * During ctxop_remove and kpreempt_enable, we can voluntarily context
1458          * switch, so the CPU we were on when we entered this function might
1459          * not be the same one we're on when we return from ctxop_remove or end
1460          * the function. Note there can be user-level context switch handlers
1461          * still installed if this is a user-level thread.
1462          *
1463          * We must also be careful in the unlikely case we're running in an
1464          * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1465          * incorrectly for the "real" thread to resume on this CPU.
1466          */
1467 
1468         if ((flags & KFPU_NO_STATE) == 0) {
1469                 kpreempt_disable();
1470         } else {
1471                 ASSERT(curthread->t_preempt > 0);
1472         }
1473 
1474         curthread->t_flag &= ~T_KFPU;
1475 
1476         /*
1477          * When we are ending things, we explicitly don't save the current
1478          * kernel FPU state back to the temporary state. The kfpu API is not
1479          * intended to be a permanent save location.
1480          *
1481          * If this is a user-level thread and we were to context switch
1482          * before returning to user-land, fpsave_ctxt will be a no-op since we
1483          * already saved the user-level FPU state the first time we ran
1484          * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1485          * the user-level fpu state). The fpsave_ctxt functions only save if
1486          * FPU_VALID is not already set. fp_save also sets PCB_SET_UPDATE_FPU so
1487          * fprestore_ctxt will be done in sys_rtt_common when the thread
1488          * finally returns to user-land.
1489          */
1490 
1491         if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1492             curthread->t_intr == NULL) {
1493                 /*
1494                  * This is a kernel thread that is not an interrupt thread,
1495                  * so we can safely STTS now.
1496                  */
1497                 fpdisable();
1498         }
1499 
1500         if ((flags & KFPU_NO_STATE) == 0) {
1501                 ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1502 
1503                 if (kfpu != NULL) {
1504                         if (kfpu->kfpu_curthread != curthread) {
1505                                 panic("attempting to end kernel FPU state "
1506                                     "for %p, but active thread is not "
1507                                     "curthread", kfpu);
1508                         } else {
1509                                 kfpu->kfpu_curthread = NULL;
1510                         }
1511                 }
1512 
1513                 kpreempt_enable();
1514         }
1515 
1516         if (curthread->t_lwp != NULL) {
1517                 uint_t f;
1518 
1519                 if (flags & KFPU_USE_LWP) {
1520                         f = FPU_EN | FPU_KERNEL;
1521                 } else {
1522                         f = FPU_KERNEL;
1523                 }
1524                 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1525         }
1526 }
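
     /*
      * Example (an illustrative sketch): a system-process (SSYS) kernel
      * thread that has an LWP can pass KFPU_USE_LWP to borrow the LWP's
      * pcb_fpu save area instead of allocating a separate kfpu_state_t; the
      * ctxop handlers then preserve that state across swtch(). The names
      * example_lwp_backed_fpu() and do_fpu_work() are hypothetical.
      *
      *         static void
      *         example_lwp_backed_fpu(void)
      *         {
      *                 kernel_fpu_begin(NULL, KFPU_USE_LWP);
      *                 do_fpu_work();
      *                 kernel_fpu_end(NULL, KFPU_USE_LWP);
      *         }
      */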