--- old/usr/src/uts/intel/os/fpu.c
+++ new/usr/src/uts/intel/os/fpu.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2021 Joyent, Inc.
24 24 * Copyright 2021 RackTop Systems, Inc.
25 25 * Copyright 2023 Oxide Computer Company
26 26 */
27 27
28 28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
30 30 /* All Rights Reserved */
31 31
32 32 /* Copyright (c) 1987, 1988 Microsoft Corporation */
33 33 /* All Rights Reserved */
34 34
35 35 /*
36 36 * Copyright (c) 2009, Intel Corporation.
37 37 * All rights reserved.
38 38 */
39 39
40 40 #include <sys/types.h>
41 41 #include <sys/param.h>
42 42 #include <sys/signal.h>
43 43 #include <sys/regset.h>
44 44 #include <sys/privregs.h>
45 45 #include <sys/psw.h>
46 46 #include <sys/trap.h>
47 47 #include <sys/fault.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/user.h>
50 50 #include <sys/file.h>
51 51 #include <sys/proc.h>
52 52 #include <sys/pcb.h>
53 53 #include <sys/lwp.h>
54 54 #include <sys/cpuvar.h>
55 55 #include <sys/thread.h>
56 56 #include <sys/disp.h>
57 57 #include <sys/fp.h>
58 58 #include <sys/siginfo.h>
59 59 #include <sys/archsystm.h>
60 60 #include <sys/kmem.h>
61 61 #include <sys/debug.h>
62 62 #include <sys/x86_archext.h>
63 63 #include <sys/sysmacros.h>
64 64 #include <sys/cmn_err.h>
65 65 #include <sys/kfpu.h>
66 66 #include <sys/stdbool.h>
67 67 #include <sys/stdalign.h>
68 68 #include <sys/procfs_isa.h>
69 69 #include <sys/sunddi.h>
70 70
71 71 /*
72 72 * FPU Management Overview
73 73 * -----------------------
74 74 *
75 75 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
76 76 * however, many aspects of its life as a coprocessor are still around in x86.
77 77 *
78 78 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
79 79 * While that state still exists, there is much more that is covered by the FPU.
80 80 * Today, this includes not just traditional FPU state, but also supervisor only
81 81 * state. The following state is currently managed and covered logically by the
82 82 * idea of the FPU registers and more generally is called the Extended Processor
83 83 * States:
84 84 *
85 85 * o Traditional x87 FPU
86 86 * o Vector Registers (%xmm, %ymm, %zmm)
87 87 * o Memory Protection Extensions (MPX) Bounds Registers
88 88 * o Protected Key Rights Registers (PKRU)
89 89 * o Processor Trace data
90 90 * o Control-Flow Enforcement state
91 91 * o Hardware Duty Cycle
92 92 * o Hardware P-states
93 93 *
94 94 * The rest of this covers how the FPU is managed and controlled, how state is
95 95 * saved and restored between threads, interactions with hypervisors, and other
96 96 * information exported to userland through aux vectors. A lot of background
97 97 * information is here to synthesize major parts of the Intel SDM, but
98 98 * unfortunately, it is not a replacement for reading it.
99 99 *
100 100 * FPU Control Registers
101 101 * ---------------------
102 102 *
103 103 * Because the x87 FPU began its life as a co-processor and the FPU was
104 104 * optional there are several bits that show up in %cr0 that we have to
105 105 * manipulate when dealing with the FPU. These are:
106 106 *
107 107 * o CR0.ET The 'extension type' bit. This was used originally to indicate
108 108 * that the FPU co-processor was present. Now it is forced on for
109 109 * compatibility. This is often used to verify whether or not the
110 110 * FPU is present.
111 111 *
112 112 * o CR0.NE The 'native error' bit. Used to indicate that native error
113 113 * mode should be enabled. This indicates that we should take traps
114 114 * on FPU errors. The OS enables this early in boot.
115 115 *
116 116 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
117 117 * wait/fwait instructions generate a #NM if CR0.TS is set.
118 118 *
119 119 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
120 120 * operations (x87 through SSE4) to trap with a #UD so they can be
121 121 * emulated. The system never sets this bit, but makes sure it is
122 122 * clear on processor start up.
123 123 *
124 124 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
125 125 * point operation will generate a #NM. An fwait will as well,
126 126 * depending on the value in CR0.MP.
127 127 *
128 128 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
129 129 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
130 130 * complicated role. Historically it has been used to allow running systems to
131 131 * restore the FPU registers lazily. This will be discussed in greater depth
132 132 * later on.
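 *
 * As a rough sketch (not the literal boot code, and assuming the usual
 * getcr0()/setcr0() accessors and CR0_* bit definitions), that policy
 * amounts to:
 *
 *     ulong_t cr0 = getcr0();
 *     cr0 |= CR0_ET | CR0_NE | CR0_MP;   /* always set by the system */
 *     cr0 &= ~CR0_EM;                    /* never emulate the FPU */
 *     setcr0(cr0);                       /* CR0.TS is handled separately */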
133 133 *
134 134 * %cr4 is also used as part of the FPU control. Specifically we need to worry
135 135 * about the following bits in the system:
136 136 *
137 137 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
138 138 * supports the execution of the fxsave and fxrstor
139 139 * instructions. This bit is required to be set to enable
140 140 * the use of the SSE->SSE4 instructions.
141 141 *
142 142 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
143 143 * and take a SIMD floating point exception (#XM). This bit
144 144 * is always enabled by the system.
145 145 *
146 146 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
147 147 * supports the execution of the xsave and xrstor family of
148 148 * instructions. This bit is required to use any of the AVX
149 149 * and newer feature sets.
150 150 *
151 151 * Because all supported processors are 64-bit, they'll always support the XMM
152 152  * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
153 153 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
154 154 *
155 155 * %xcr0 is used to manage the behavior of the xsave feature set and is only
156 156  * present on the system if xsave is supported. %xcr0 is read and written
157 157  * through the xgetbv and xsetbv instructions. This register is present
158 158 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
159 159 * different component of the xsave state and controls whether or not that
160 160 * information is saved and restored. For newer feature sets like AVX and MPX,
161 161 * it also controls whether or not the corresponding instructions can be
162 162  * executed (much like CR4.OSFXSR does for the SSE feature sets).
163 163 *
164 164 * Everything in %xcr0 is around features available to users. There is also the
165 165 * IA32_XSS MSR which is used to control supervisor-only features that are still
166 166 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
167 167 * IA32_XSS and vice versa. This is an important property that is particularly
168 168 * relevant to how the xsave instructions operate.
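 *
 * For illustration only (the helper name below is made up and is not an
 * interface this file provides), %xcr0 can be read with xgetbv, which
 * returns the XCR selected by %ecx in %edx:%eax:
 *
 *     static uint64_t
 *     read_xcr0(void)
 *     {
 *             uint32_t lo, hi;
 *
 *             __asm__ __volatile__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
 *             return (((uint64_t)hi << 32) | lo);
 *     }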
169 169 *
170 170 * Save Mechanisms
171 171 * ---------------
172 172 *
173 173 * When switching between running threads the FPU state needs to be saved and
174 174 * restored by the OS. If this state was not saved, users would rightfully
175 175 * complain about corrupt state. There are three mechanisms that exist on the
176 176 * processor for saving and restoring these state images:
177 177 *
178 178 * o fsave
179 179 * o fxsave
180 180 * o xsave
181 181 *
182 182 * fsave saves and restores only the x87 FPU and is the oldest of these
183 183 * mechanisms. This mechanism is never used in the kernel today because we are
184 184 * always running on systems that support fxsave.
185 185 *
186 186 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
187 187 * state to be saved and restored to and from a struct fxsave_state. This is the
188 188 * default mechanism that is used to save and restore the FPU on amd64. An
189 189 * important aspect of fxsave that was different from the original i386 fsave
190 190 * mechanism is that the restoring of FPU state with pending exceptions will not
191 191 * generate an exception, it will be deferred to the next use of the FPU.
192 192 *
193 193 * The final and by far the most complex mechanism is that of the xsave set.
194 194 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
195 195 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
196 196 * registers.
197 197 *
198 198 * Data is saved and restored into and out of a struct xsave_state. The first
199 199 * part of the struct xsave_state is equivalent to the struct fxsave_state.
200 200 * After that, there is a header which is used to describe the remaining
201 201 * portions of the state. The header is a 64-byte value of which the first two
202 202 * uint64_t values are defined and the rest are reserved and must be zero. The
203 203 * first uint64_t is the xstate_bv member. This describes which values in the
204 204 * xsave_state are actually valid and present. This is updated on a save and
205 205 * used on restore. The second member is the xcomp_bv member. Its last bit
206 206 * determines whether or not a compressed version of the structure is used.
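 *
 * Roughly, the uncompressed layout looks like the following. The
 * authoritative definition comes from the included headers; the anonymous
 * header struct and the reserved member name below are sketched here for
 * illustration:
 *
 *     struct xsave_state {
 *             struct fxsave_state xs_fxsave;  /* legacy x87/SSE, 512 bytes */
 *             struct {
 *                     uint64_t xsh_xstate_bv; /* which components are valid */
 *                     uint64_t xsh_xcomp_bv;  /* bit 63: compressed format */
 *                     uint8_t  xsh_rsvd[48];  /* reserved, must be zero */
 *             } xs_header;                    /* the 64-byte header */
 *             /* extended components (%ymm, etc.) follow at fixed offsets */
 *     };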
207 207 *
208 208 * When the uncompressed structure is used (currently the only format we
209 209 * support), then each state component is at a fixed offset in the structure,
210 210 * even if it is not being used. For example, if you only saved the AVX related
211 211 * state, but did not save the MPX related state, the offset would not change
212 212 * for any component. With the compressed format, components that aren't used
213 213 * are all elided (though the x87 and SSE state are always there).
214 214 *
215 215 * Unlike fxsave which saves all state, the xsave family does not always save
216 216 * and restore all the state that could be covered by the xsave_state. The
217 217 * instructions all take an argument which is a mask of what to consider. This
218 218 * is the same mask that will be used in the xstate_bv vector and it is also the
219 219 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
220 220 * considered with the xsaves and xrstors instructions.
221 221 *
222 222 * When a save or restore is requested, a bitwise and is performed between the
223 223 * requested bits and those that have been enabled in %xcr0. Only the bits that
224 224 * match that are then saved or restored. Others will be silently ignored by
225 225 * the processor. This idea is used often in the OS. We will always request that
226 226 * we save and restore all of the state, but only those portions that are
227 227 * actually enabled in %xcr0 will be touched.
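 *
 * For example (a conceptual sketch rather than a quote of the context switch
 * path, using the xsavep function pointer declared later in this file and a
 * placeholder xsave area xs), requesting everything is safe because the
 * hardware intersects the request with %xcr0:
 *
 *     xsavep(xs, (uint64_t)-1);   /* hardware saves ((uint64_t)-1 & %xcr0) */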
228 228 *
229 229 * If a feature has been asked to be restored that is not set in the xstate_bv
230 230 * feature vector of the save state, then it will be set to its initial state by
231 231 * the processor (usually zeros). Also, when asked to save state, the processor
232 232 * may not write out data that is in its initial state as an optimization. This
233 233 * optimization only applies to saving data and not to restoring data.
234 234 *
235 235 * There are a few different variants of the xsave and xrstor instruction. They
236 236 * are:
237 237 *
238 238 * o xsave This is the original save instruction. It will save all of the
239 239 * requested data in the xsave state structure. It only saves data
240 240 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
241 241 * executed at all privilege levels.
242 242 *
243 243 * o xrstor This is the original restore instruction. It will restore all of
244 244 * the requested data. The xrstor function can handle both the
245 245 * compressed and uncompressed formats. It may be executed at all
246 246 * privilege levels.
247 247 *
248 248 * o xsaveopt This is a variant of the xsave instruction that employs
249 249 * optimizations to try and only write out state that has been
250 250 * modified since the last time an xrstor instruction was called.
251 251 * The processor tracks a tuple of information about the last
252 252 * xrstor and tries to ensure that the same buffer is being used
253 253 * when this optimization is being used. However, because of the
254 254 * way that it tracks the xrstor buffer based on the address of it,
255 255 * it is not suitable for use if that buffer can be easily reused.
256 256 * The most common case is trying to save data to the stack in
257 257 * rtld. It may be executed at all privilege levels.
258 258 *
259 259 * o xsavec This is a variant of the xsave instruction that writes out the
260 260 * compressed form of the xsave_state. Otherwise it behaves as
261 261 * xsave. It may be executed at all privilege levels.
262 262 *
263 263 * o xsaves This is a variant of the xsave instruction. It is similar to
264 264 * xsavec in that it always writes the compressed form of the
265 265 * buffer. Unlike all the other forms, this instruction looks at
266 266 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
267 267 * what to save and restore. xsaves also implements the same
268 268 * optimization that xsaveopt does around modified pieces. User
269 269 * land may not execute the instruction.
270 270 *
271 271 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
272 272 * it can save and restore both the user and privileged states.
273 273 * Unlike xrstor it can only operate on the compressed form.
274 274 * User land may not execute the instruction.
275 275 *
276 276 * Based on all of these, the kernel has a precedence for what it will use.
277 277 * Basically, xsaves (not supported) is preferred to xsaveopt, which is
278 278 * preferred to xsave. A similar scheme is used when informing rtld (more later)
279 279 * about what it should use. xsavec is preferred to xsave. xsaveopt is not
280 280 * recommended due to the modified optimization not being appropriate for this
281 281 * use.
282 282 *
283 283 * Finally, there is one last gotcha with the xsave state. Importantly some AMD
284 284 * processors did not always save and restore some of the FPU exception state in
285 285  * the same cases that Intel processors did. In those cases the OS will make up for this fact
286 286 * itself.
287 287 *
288 288 * FPU Initialization
289 289 * ------------------
290 290 *
291 291 * One difference with the FPU registers is that not all threads have FPU state,
292 292 * only those that have an lwp. Generally this means kernel threads, which all
293 293 * share p0 and its lwp, do not have FPU state. Though there are definitely
294 294 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
295 295 * and lwp interchangeably, just think of thread meaning a thread that has a
296 296 * lwp.
297 297 *
298 298 * Each lwp has its FPU state allocated in its pcb (process control block). The
299 299 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
300 300 * dynamically at start up based on the save mechanism that we're using and the
301 301 * amount of memory required for it. This is dynamic because the xsave_state
302 302 * size varies based on the supported feature set.
303 303 *
304 304 * The hardware side of the FPU is initialized early in boot before we mount the
305 305 * root file system. This is effectively done in fpu_probe(). This is where we
306 306 * make the final decision about what the save and restore mechanisms we should
307 307 * use are, create the fpsave_cachep kmem cache, and initialize a number of
308 308  * function pointers that implement the save and restore logic.
309 309 *
310 310  * The thread/lwp side is a little more involved. There are two different
311 311 * things that we need to concern ourselves with. The first is how the FPU
312 312 * resources are allocated and the second is how the FPU state is initialized
313 313 * for a given lwp.
314 314 *
315 315 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
316 316 * This is always called unconditionally by the system as part of creating an
317 317 * LWP.
318 318 *
319 319 * There are three different initialization paths that we deal with. The first
320 320 * is when we are executing a new process. As part of exec all of the register
321 321 * state is reset. The exec case is particularly important because init is born
322 322 * like Athena, sprouting from the head of the kernel, without any true parent
323 323 * to fork from. The second is used whenever we fork or create a new lwp. The
324 324 * third is to deal with special lwps like the agent lwp.
325 325 *
326 326 * During exec, we will call fp_exec() which will initialize and set up the FPU
327 327 * state for the process. That will fill in the initial state for the FPU and
328 328 * also set that state in the FPU itself. As part of fp_exec() we also install a
329 329 * thread context operations vector that takes care of dealing with the saving
330 330 * and restoring of the FPU. These context handlers will also be called whenever
331 331 * an lwp is created or forked. In those cases, to initialize the FPU we will
332 332 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
333 333 * operations vector for the new thread.
334 334 *
335 335 * Next we'll end up in the context operation fp_new_lwp(). This saves the
336 336 * current thread's state, initializes the new thread's state, and copies over
337 337  * the relevant parts of the originating thread's state. It's at this point that
338 338 * we also install the FPU context operations into the new thread, which ensures
339 339 * that all future threads that are descendants of the current one get the
340 340 * thread context operations (unless they call exec).
341 341 *
342 342 * To deal with some things like the agent lwp, we double check the state of the
343 343 * FPU in sys_rtt_common() to make sure that it has been enabled before
344 344 * returning to userland. In general, this path should be rare, but it's useful
345 345 * for the odd lwp here and there.
346 346 *
347 347 * The FPU state will remain valid most of the time. There are times that
348 348  * the state will be rewritten. For example in restorecontext(), due to /proc, or when
349 349  * the lwp calls exec(). Whether the context is being freed or we are resetting
350 350 * the state, we will call fp_free() to disable the FPU and our context.
351 351 *
352 352 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
353 353 * state by calling fp_lwp_cleanup().
354 354 *
355 355 * Kernel FPU Multiplexing
356 356 * -----------------------
357 357 *
358 358 * Just as the kernel has to maintain all of the general purpose registers when
359 359 * switching between scheduled threads, the same is true of the FPU registers.
360 360 *
361 361 * When a thread has FPU state, it also has a set of context operations
362 362 * installed. These context operations take care of making sure that the FPU is
363 363 * properly saved and restored during a context switch (fpsave_ctxt and
364 364 * fprestore_ctxt respectively). This means that the current implementation of
365 365 * the FPU is 'eager', when a thread is running the CPU will have its FPU state
366 366 * loaded. While this is always true when executing in userland, there are a few
367 367 * cases where this is not true in the kernel.
368 368 *
369 369 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
370 370 * employed. This meant that the FPU would be saved on a context switch and the
371 371 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
372 372 * then take a #NM trap, at which point we would restore the FPU from the save
373 373 * area and return to userland. Given the frequency of use of the FPU alone by
374 374 * libc, there's no point returning to userland just to trap again.
375 375 *
376 376 * There are a few cases though where the FPU state may need to be changed for a
377 377 * thread on its behalf. The most notable cases are in the case of processes
378 378 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
379 379  * will force a thread's FPU state to be saved into the PCB through the fp_save()
380 380 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
381 381 * pcb. This indicates that the save state holds currently valid data. As a side
382 382 * effect of this, CR0.TS will be set. To make sure that all of the state is
383 383 * updated before returning to userland, in these cases, we set a flag on the
384 384 * PCB that says the FPU needs to be updated. This will make sure that we take
385 385 * the slow path out of a system call to fix things up for the thread. Due to
386 386 * the fact that this is a rather rare case, effectively setting the equivalent
387 387 * of t_postsys is acceptable.
388 388 *
389 389 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
390 390 * Generally this means it will be cleared immediately by the new thread that is
391 391 * running in a context switch. However, this isn't the case for kernel threads.
392 392 * They currently operate with CR0.TS set as no kernel state is restored for
393 393 * them. This means that using the FPU will cause a #NM and panic.
394 394 *
395 395 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
396 396 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
397 397 * However, because we eagerly restore, the only time that CR0.TS should be set
398 398 * for a non-kernel thread is during operations where it will be cleared before
399 399 * returning to userland and importantly, the only data that is in it is its
400 400 * own.
401 401 *
402 402 * Kernel FPU Usage
403 403 * ----------------
404 404 *
405 405 * Traditionally the kernel never used the FPU since it had no need for
406 406 * floating point operations. However, modern FPU hardware supports a variety
407 407 * of SIMD extensions which can speed up code such as parity calculations or
408 408 * encryption.
409 409 *
410 410 * To allow the kernel to take advantage of these features, the
411 411 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
412 412 * around any usage of the FPU by the kernel to ensure that user-level context
413 413  * is properly saved/restored, as well as to properly set up the FPU for use by
414 414 * the kernel. There are a variety of ways this wrapping can be used, as
415 415 * discussed in this section below.
416 416 *
417 417 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
418 418 * operations, the kernel_fpu_alloc() function should be used to allocate a
419 419 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
420 420 * state. This structure is not tied to any thread. That is, different threads
421 421 * can reuse the same kfpu_state_t structure, although not concurrently. A
422 422 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
423 423 *
424 424 * In some cases, the kernel may need to use the FPU for a short operation
425 425 * without the overhead to manage a kfpu_state_t structure and without
426 426 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
427 427 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
428 428 * parameter. This indicates that there is no kfpu_state_t. When used this way,
429 429 * kernel preemption should be disabled by the caller (kpreempt_disable) before
430 430 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
431 431 * For this usage, it is important to limit the kernel's FPU use to short
432 432 * operations. The tradeoff between using the FPU without a kfpu_state_t
433 433 * structure vs. the overhead of allowing a context switch while using the FPU
434 434 * should be carefully considered on a case by case basis.
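 *
 * A minimal sketch of the KFPU_NO_STATE pattern just described (passing NULL
 * for the absent kfpu_state_t is an assumption of this sketch):
 *
 *     kpreempt_disable();
 *     kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *     /* short, non-blocking SIMD work goes here */
 *     kernel_fpu_end(NULL, KFPU_NO_STATE);
 *     kpreempt_enable();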
435 435 *
436 436 * In other cases, kernel threads have an LWP, but never execute in user space.
437 437 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
438 438 * kernel's FPU state if the thread is context switched, instead of having to
439 439 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
440 440 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
441 441 * enable this behavior. It is the caller's responsibility to ensure that this
442 442 * is only used for a kernel thread which never executes in user space.
443 443 *
444 444 * FPU Exceptions
445 445 * --------------
446 446 *
447 447 * Certain operations can cause the kernel to take traps due to FPU activity.
448 448  * Generally these events will cause a user process to receive a SIGFPE and if
449 449 * the kernel receives it in kernel context, we will die. Traditionally the #NM
450 450 * (Device Not Available / No Math) exception generated by CR0.TS would have
451 451 * caused us to restore the FPU. Now it is a fatal event regardless of whether
452 452 * or not userland causes it.
453 453 *
454 454 * While there are some cases where the kernel uses the FPU, it is up to the
455 455 * kernel to use the FPU in a way such that it cannot receive a trap or to use
456 456 * the appropriate trap protection mechanisms.
457 457 *
458 458 * Hypervisors
459 459 * -----------
460 460 *
461 461 * When providing support for hypervisors things are a little bit more
462 462 * complicated because the FPU is not virtualized at all. This means that they
463 463 * need to save and restore the FPU and %xcr0 across entry and exit to the
464 464 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
465 465 * allow us to use the full native state to make sure that we are always saving
466 466 * and restoring the full FPU that the host sees, even when the guest is using a
467 467 * subset.
468 468 *
469 469 * One tricky aspect of this is that the guest may be using a subset of %xcr0
470 470 * and therefore changing our %xcr0 on the fly. It is vital that when we're
471 471 * saving and restoring the FPU that we always use the largest %xcr0 contents
472 472 * otherwise we will end up leaving behind data in it.
473 473 *
474 474 * ELF PLT Support
475 475 * ---------------
476 476 *
477 477 * rtld has to preserve a subset of the FPU when it is saving and restoring
478 478 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
479 479 * more information. As a result, we set up an aux vector that contains
480 480 * information about what save and restore mechanisms it should be using and
481 481 * the sizing thereof based on what the kernel supports. This is passed down in
482 482 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
483 483 * initialized in fpu_subr.c.
484 484 *
485 485 * Signal Handling and the ucontext_t
486 486 * ----------------------------------
487 487 *
488 488 * One of the many gifts that signals give us is the twofold fact that when a
489 489 * signal occurs, the signal handler is allowed to change the CPU's state
490 490 * arbitrarily and when the signal handler is done executing, we must restore it
491 491 * back to the original state. However, the second part of this is that the
492 492 * signal handler is actually allowed to modify the state that the thread will
493 493 * return to! To create this facade, the kernel will create a full ucontext_t
494 494 * state, effectively calling getcontext(2) on the thread's behalf, and a
495 495 * pointer to that is given to the signal handler (the void * argument for the
496 496 * sa_sigaction function pointer in sigaction(2)). When libc is done with a
497 497 * signal, it will call setcontext(2) with that same ucontext_t.
498 498 *
499 499 * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
500 500 * it's often declared on the stack itself, with the signal handler spilling all
501 501 * this state to the stack. The ucontext_t machine portion was broken into the
502 502 * general purpose and floating point registers. In 64-bit code, the floating
503 503 * point registers were mostly the same as the results of the fxsave instruction
504 504 * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
505 505 * starting point for information, it is transformed into a different shape to
506 506 * deal with the history of the 32-bit SYS V ABI.
507 507 *
508 508 * While this worked, if you're reading this, you're aware that the x86 FPU and
509 509 * extended register states didn't stop at the initial 16 128-bit %xmm
510 510 * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
511 511 * opmask registers. None of these fit inside the standard ucontext_t; however,
512 512 * they must all be preserved and restored across a signal. While the various
513 513 * x86 platform-specific ABIs all suggest that these registers are not preserved
514 514 * across a function call, receiving a signal is not a function call and must be
515 515 * thought of like a process receiving an interrupt. In other words, this
516 516 * extended state must be preserved.
517 517 *
518 518 * To facilitate this, we have extended the ucontext_t structure with an
519 519 * additional flag, UC_XSAVE, which indicates that the traditional padding
520 520 * member, uc_xsave, actually is a pointer to the extended state. While this is
521 521 * accessible outside of a signal handling context through the combination of
522 522 * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
523 523 * state is focused on signal handling. Signal handling spills all this state to
524 524 * the stack and if we cannot spill the entire state to the stack then our
525 525 * inability to deliver the signal results in the process being killed! While
526 526 * there are separate efforts to ensure that the signal stack sizing that is
527 527  * used for the minimum and maximum signal sizes is sufficient, we still need
528 528 * to do our part to minimize the likelihood here.
529 529 *
530 530 * In designing this, we make the following observations which have helped us
531 531 * focus our design:
532 532 *
533 533 * o While the start of an xsave area is the traditional 512-byte fxsave XMM
534 534 * region, we already have that in the fpregs. Thus there is no reason to
535 535 * duplicate it. This not only saves 512 bytes of additional stack space,
536 536  * but it also means we don't have to ask which version of it to take
537 537 * if they were to differ.
538 538 *
539 539 * o Many applications out there aren't necessarily using the extended vectors
540 540 * and even when we do make libc and others take advantage of it, it will
541 541 * behoove us to ensure that they are put back into their initial state
542 542 * after use. This leads us to expect that in a number of cases, the actual
543 543 * extended register state will be in its initial state.
544 544 *
545 545 * o While the signal handler does allow contents to be modified, we are
546 546 * starting with making the interface private and thus allowing us to excise
547 547 * components that are in their initial state.
548 548 *
549 549 * o There are similarities to what we want to create with the compressed
550 550 * xsave format; however, because we don't always have support for the
551 551 * compressed format, we can't just arbitrarily say let's do a compressed
552 552 * save to the user stack.
553 553 *
554 554 * o Because we are not handing this state directly to and from hardware, we
555 555 * don't need to meet some of the constraints of the compressed xsave format
556 556 * around wanting alignment for the initial save or additional components.
557 557 *
558 558 * All of the above lead us to our own unique format for this data. When the
559 559 * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
560 560 * uc_xsave_t structure which has a magic version number, a 32-bit length of the
561 561 * overall structure, and the 64-bit state bit-vector to represent which
562 562  * components are valid. Following this header, each component that is
563 563 * present in the bit vector is immediately written out in roughly ascending bit
564 564 * order (the order is determined based on the order of the fpu_xsave_info
565 565 * array).
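 *
 * Conceptually the structure starts like the following; the member names
 * here are illustrative and the system headers hold the authoritative
 * uc_xsave_t definition:
 *
 *     typedef struct uc_xsave {
 *             uint32_t ucx_vers;  /* magic version number */
 *             uint32_t ucx_len;   /* total length, header included */
 *             uint64_t ucx_bv;    /* components present after the header */
 *     } uc_xsave_t;
 *     /* per-component data follows, in fpu_xsave_info[] order */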
566 566 *
567 567 * This makes the rough logic that we have here when taking a signal and writing
568 568 * out this state as:
569 569 *
570 570 * 1. Ensure that the FPU is saved and that the contents of the pcb save area
571 571 * are valid. That is, call fp_save() if the state is not already flagged
572 572 * with FPU_VALID.
573 573 *
574 574 * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
575 575 * and XFEATURE_SSE bits as these will be placed in the xsave area.
576 576 *
577 577 * 3. Initialize the uc_xsave_t by setting our version field, initializing the
578 578 * length to the length of the current structure, and then setting the
579 579 * modified bit vector above.
580 580 *
581 581 * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out
582 582 * its extended state starting at the current length in the header and then
583 583 * increase the header size by that length.
584 584 *
585 585 * 5. Finally write out the final uc_xsave_t structure.
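 *
 * Condensed into hypothetical pseudo-code, with copyfunc standing in for
 * whichever copyout variant the caller selected and uaddr for the user
 * destination:
 *
 *     fp_save() unless FPU_VALID is already set;               /* step 1 */
 *     bv = xstate_bv & ~(XFEATURE_LEGACY_FP | XFEATURE_SSE);   /* step 2 */
 *     ucx = { version, length = sizeof (uc_xsave_t), bv };     /* step 3 */
 *     for each set bit in bv, in ascending order:              /* step 4 */
 *             copyfunc(component data, uaddr + ucx.length);
 *             ucx.length += component size;
 *     copyfunc(&ucx, uaddr, sizeof (ucx));                     /* step 5 */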
586 586 *
587 587 * The above process is also used when someone manually calls getcontext_extd(2)
588 588 * to get this state. The main difference between the two is which copyout
589 589 * function we use. This deserves some explanation. Our main starting point for
590 590 * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
591 591 * the signal handling context to operate with a different copyout than we
592 592 * normally use in say getcontext_extd(2).
593 593 *
594 594 * When we've received a signal, we're at the intersection of several different
595 595 * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
596 596 * the watchpoints effectively set a copyout override function (t_copyops) that
597 597 * we end up vectoring to rather than a normal copyout. This allows the data to
598 598 * be modified and for the watchpoint to fire. While this is all well and good
599 599 * normally, it is problematic if we are trying to handle a signal. The signal
600 600  * delivery logic, sendsig(), goes through and disables the watchpoint for the
601 601 * region of the stack that we are copying out to. However, disabling
602 602  * watchpoints is not sufficient; we also need to use the copyout_noerr
603 603 * variants.
604 604 *
605 605 * These variants also require the use of on_fault() and no_fault() for error
606 606 * handling. While it is tempting to try and on_fault() the entire
607 607 * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
608 608 * The first is that we don't want to disable faults during the entire operation
609 609 * as if the kernel messes up we will treat that as a user error. That isn't
610 610 * theoretical and happened during development. The second and perhaps more
611 611 * important issue is that correctly bounding the on_fault() / no_fault() means
612 612 * being careful about state. For example, kernel pre-emption is often disabled
613 613 * during parts of these operations, but it needs to be re-enabled when we're
614 614  * done. This would require tracking in some volatile variable whether
615 615  * pre-emption had been disabled so that it could be correctly re-enabled on a fault.
616 616 *
617 617 * Instead, this is why fpu_signal_copyout() takes a copy out function as an
618 618 * argument. When we're in signal handling context, the function will use
619 619  * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
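 *
 * A minimal sketch of such a copy function for the signal path; the function
 * name and the exact error handling are illustrative rather than the code in
 * this file:
 *
 *     static int
 *     fpu_sig_copyfunc(const void *kaddr, void *uaddr, size_t len)
 *     {
 *             label_t ljb;
 *
 *             if (on_fault(&ljb)) {
 *                     no_fault();
 *                     return (EFAULT);
 *             }
 *             copyout_noerr(kaddr, uaddr, len);
 *             no_fault();
 *             return (0);
 *     }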
620 620 *
621 621 * RESTORING STATE
622 622 *
623 623 * Copying out our current state is the easier half of this problem. When the
624 624  * thread is done with a signal it calls setcontext(2) with the ucontext_t we
625 625 * assembled for it as described above. setcontext(2) isn't just used for
626 626 * returning from signals.
627 627 *
628 628 * The process for this goes in two steps. The first step is to copy in,
629 629 * validate, and transform the ucontext_t UC_XSAVE that we created above into an
630 630 * equivalent xsave format that we can use the appropriate xrstor function on.
631 631 * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
632 632 * come back through a second phase that is driven out of restorecontext() and
633 633 * is implemented in fpu_set_xsave().
634 634 *
635 635 * Let's start by discussing the second part of this, which is more
636 636 * straightforward. In particular, the second phase assumes that all of the
637 637 * validation and error handling has been done by the first phase. This means
638 638 * here, we have a buffer that is already the appropriate size
639 639 * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
640 640 * replace the actual save state with the current one.
641 641 *
642 642 * The only piece of shenanigans we have to do is around the kernel provided
643 643 * notion of 'status' and 'xstatus', which are cached versions of the x87 and
644 644 * SSE exception vectors. These are part of the fpregset ABI and therefore we
645 645 * need to propagate them from the temporary storage that part 1 sets up in the
646 646 * ignored region of the fxsave data. We use that because it is not persisted by
647 647 * the CPU, so clobbering it is generally alright.
648 648 *
649 649 * Once that is done, we simply note that we need a PCB update to occur to
650 650 * refresh the FPU state before we return to userland. Given that someone has
651 651 * called setcontext(2), this was always going to happen because we have to
652 652 * update segment registers and related, so this isn't so bad. With that, let's
653 653 * move onto the more nuanced part (1).
654 654 *
655 655 * When we're handling a setcontext(2) we have, in userland, a data structure
656 656 * that should match one we serialized out, though we cannot assume that a user
657 657 * has not modified it either accidentally or maliciously. Our goal is to set up
658 658 * the appropriate xsave state that can be passed to the CPU's xrstor. The first
659 659 * problem we have to deal with is where do we actually put this state?
660 660 *
661 661 * While not many programs actually call setcontext(2) on their own volition,
662 662 * this is going to get hit every time we take a signal. The first thought was
663 663 * to re-use the existing thread's save area; however, that's a bit challenging
664 664 * for a few reasons. In particular, we would need to ensure that we don't go
665 665 * off-CPU for any reason, which we cannot assume with a copyin from a user
666 666 * address space. In particular, it is trivial for us to hit a case where the
667 667  * stack has been paged out for some reason, which rules out that path.
668 668 *
669 669 * Instead, whenever a thread first calls setcontext(2), generally from signal
670 670 * context, we will at that time allocate another entry from the 'fpsave_cachep'
671 671 * kmem cache, giving us a buffer of the appropriate space to handle this. Once
672 672 * this buffer has been allocated, we leave it assigned to the thread's pcb and
673 673 * only tear it down when the thread itself finally exits. We reason that a
674 674 * thread that takes a signal once is either going to have the process exit
675 675 * shortly thereafter or is much more likely to take a signal again in the
676 676 * future. Many daemons and other processes set things up so signals are
677 677  * dispatched via one location, masking signals in other threads, using
678 678 * sigsuspend(2), signalfd(3C), or something similar.
679 679 *
680 680 * With this buffer in hand, we begin our task of reassembling state. Note, all
681 681 * of this is conditional on UC_XSAVE being set in the uc_flags member of the
682 682 * ucontext_t. If it is not set, then we assume that there is no extended state
683 683 * and will use the traditional path of setting the fpregset_t into the system
684 684 * via setfpregs().
685 685 *
686 686 * We first will copyin and validate the uc_xsave_t. In particular, we need to
687 687 * make sure the version makes sense and that the xsave component bit-vector
688 688 * doesn't have anything unexpected and more importantly unsupported in it, and
689 689 * that the addresses we've been given are within the user address space. At
690 690 * this point we can walk through our table of implemented bits and process
691 691 * them.
692 692 *
693 693 * For most components in here, the processing is straightforward. We continue
694 694 * walking our cursor and copy data into the kernel and place it in the
695 695  * appropriate place in our xsave state. If a component's bit isn't set in the
696 696  * xsave state bit-vector, then we must ensure that we have the item in the initial state,
697 697 * which for everything other than the x87/SSE state is the memory being zeroed.
698 698 *
699 699 * The most unique case in the copyin state is that of the x87/SSE state. You
700 700 * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
701 701 * but instead have opted to use the single definition in the fpregset_t. Thus
702 702 * here, we copy it out of the fpregset_t, which the kernel has helpfully
703 703 * already unified into the 64-bit fxsave version prior to calling us, and
704 704 * install that into the save area we're building up.
705 705 *
706 706 * As part of this, there are two important pieces to be aware of. The first is
707 707 * that because the fpregset_t has both the status and xstatus members
708 708 * mentioned earlier, we temporarily copy them to the software-usable ignored
709 709 * areas of the fxsave state so we can corral this extra state into part (2)
710 710 * without needing to allocate additional space. The second piece is that when
711 711 * we're done processing this we explicitly remove the UC_FPU flag that would
712 712 * tell the kernel to proceed with updating that region. The problem is that
713 713 * that goes directly into the pcb's save area and not to the intermediate
714 714  * buffer as it uses the same entry point as /proc, namely setfpregs().
715 715 *
716 716 * We don't do much validation of the actual contents of the registers that are
717 717 * being set with the exception of ensuring that no reserved bits of the mxcsr
718 718 * are used. This is not as strict as /proc, but failure here means the process
719 719 * is likely going to die (returning from setcontext() in a signal handler is
720 720 * fatal).
721 721 *
722 722 * /proc xregs
723 723 * -----------
724 724 *
725 725 * Observability of the state of the extended registers is important for
726 726 * understanding the system. While on the surface this is similar to signal
727 727 * handling, it is crucially different in a number of ways:
728 728 *
729 729 * o In signal handling, we're trying to conserve every byte of stack that we
730 730 * can.
731 731 * o The /proc xregs file will end up in core files, which means that we need
732 732 * a way of knowing what components are present and not present in it,
733 733 * because this will vary from CPU to CPU due to the addition of
734 734 * architectural features. For example, some CPUs support AVX-512, but
735 735 * others do not.
736 736 * o The signal handling structure is private and we're not trying to have
737 737  * software modify it; on the other hand, the /proc interfaces that we
738 738  * support are ones we do want software to be able to interrogate and manipulate.
739 739 * These need to be something that we can introduce additional components
740 740 * into and make other changes that still allow it to work.
741 741 *
742 742 * The x86 xregs format is documented in proc(5). The short form is that the
743 743 * prxregset_hdr_t has a number of information entries, which are of the type
744 744 * prxregset_info_t. Each of the information headers has a type, size, and
745 745 * offset which indicate where to find the additional data.
746 746 *
747 747 * Each entry is described as one of the entries in the fpu_xsave_info[]. These
748 748 * items either are a 1:1 correspondence with a xsave related feature (e.g.
749 749 * there is one entry for each of the three AVX-512 components) or it is
750 750 * something synthetic that we provide as additional information such as the
751 751 * PRX_INFO_XCR, which is a way of getting information about the system such as
752 752 * what is enabled in %xcr0 out there.
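 *
 * Sketched from the description above (member names are illustrative; see
 * proc(5) for the authoritative definitions), a consumer walks the buffer
 * roughly like:
 *
 *     prxregset_hdr_t *hdr = buf;
 *     for (uint32_t i = 0; i < hdr->pr_ninfo; i++) {
 *             const prxregset_info_t *info = &hdr->pr_info[i];
 *             const void *data = (const char *)buf + info->pri_offset;
 *             /* info->pri_type says what this is (e.g. PRX_INFO_XCR), */
 *             /* and info->pri_size says how large it is */
 *     }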
753 753 *
754 754 * Unlike signal handling, we are given the buffer to place everything that
755 755 * needs to be written out. This is partially the design of the /proc APIs. That
756 756 * is, we will always assemble everything into the entire buffer that /proc asks
757 757 * us to, and then it will use as much or as little of it as is required.
758 758 * Similarly, when setting things, we don't have to worry about copying in
759 759 * information in the same way as signal handling does, because /proc takes care
760 760 * of it and always hands us a full buffer. Sizing that is a little nuanced, but
761 761 * is all handled in prmachdep.c.
762 762 *
763 763 * When someone performs a read of the xregs and thus is asking us for the
764 764 * current state, there is a little bit of nuance that we need to deal with
765 765  * here. The first is whether or not the FPU is enabled; the second is, if the
766 766  * FPU is enabled, whether a given component is noted as being in its
767 767 * initial state. This basically gives us three possible states for a given
768 768 * component:
769 769 *
770 770 * 1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
771 771 * the illumos FPU default for an item. More on that in a moment.
772 772 * 2. The saved xsave state indicates that the bit for a given component is
773 773 * zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
774 774 * In this case, we must take the CPU's default for an item. This is
775 775 * usually the same as illumos, but not always.
776 776 * 3. The saved xsave state indicates that a given component's state bit is
777 777 * valid. The simplest of our cases. We can just take what we have from the
778 778 * xsave state.
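 *
 * A sketch of that decision for a single component; fp, xs, and comp_bit are
 * placeholders for the thread's fpu_ctx_t, its xsave area, and the
 * component's bit, and the flag check is illustrative rather than a quote of
 * the code:
 *
 *     if ((fp->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
 *             /* 1: take the illumos default for the component */
 *     } else if ((xs->xs_header.xsh_xstate_bv & comp_bit) == 0) {
 *             /* 2: take the CPU's default (initial) state */
 *     } else {
 *             /* 3: copy the component out of the xsave area */
 *     }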
779 779 *
780 780 * The CPU's default state for most components other than the x87/SSE state is
781 781 * to have it be zeroed. This is what we treat as our default state as well. The
782 782 * primary difference is in the initialization of the x87/SSE state. The SYS V
783 783  * ABI requires that we enable a different floating point control word than the
784 784 * hardware default. This means that when we're dealing with case (1) for
785 785 * x87/SSE we have to be more careful than the other components. Thankfully for
786 786 * everything else this is just keeping it zeroed.
787 787 *
788 788 * A reasonable question would be why not just skip components that aren't
789 789  * marked as present. There are two reasons we take a different approach and
790 790  * always include every component; both are to make consumers' lives simpler. In
791 791 * the first case, when someone is performing a read and wants to reassemble and
792 792 * answer the question of 'what is the value of %ymm0 or %zmm15', they have
793 793 * to combine multiple disparate parts. If one knows that the data we put into
794 794 * there is always valid and represents what is in hardware and doesn't have to
795 795 * keep track of what are the defaults in different circumstances, then that
796 796  * greatly simplifies consumers' lives. It also helps us for core files and other
797 797 * observability cases because the answer to what is the operating system's
798 798 * default may change over time.
799 799 *
800 800 * Similarly, including all the possible structures means that we have
801 801  * simplified things for someone who does a write. Writes are always setting the full state
802 802 * of a thread, meaning that if someone wants to modify only a single register
803 803 * they must do a read, modify, and write. By including everything that they
804 804 * might need, it makes it easier for consumers to do this and not have to cons
805 805 * up the whole structure on their own.
806 806 *
807 807 * When we're setting state, things change around a little bit. We have a few
808 808 * constraints that are laid out in proc(5). In particular, we require that the
809 809 * PRX_INFO_XSAVE component always be present to tell us which other components
810 810 * we expect to be here and which ones we don't. We also are much stricter about
811 811 * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
812 812 * and may not be modified by a calling process. In addition, when we have
813 813 * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
814 814 * segments, if they are being written to and have modifications, then we will
815 815 * indicate an error there.
816 816 *
817 817 * Because we are given the entire buffer from userland and don't need to have
818 818 * an intermediate place to copy it in, we will validate the entire thing in
819 819 * advance. Once it has been validated and we consider it legal, then we will
820 820 * translate each entry into its corresponding entry in pcb's normal floating
821 821 * point state. This is different from signal handling mostly because of the
822 822 * fact that we are not using copyin, and once we get to this point, there is
823 823 * no more validation, so we don't have the same concerns around blocking while
824 824 * pre-emption is disabled.
825 825 *
826 826 * The Wrinkle with fpregs
827 827 * -----------------------
828 828 *
829 829 * When we instead turn our attention to the fpregs, whether we're gathering
830 830 * them as part of the ucontext_t or as part of /proc, there are a few
831 831 * complications that we need to be aware of when we're operating on a kernel
832 832 * that is using xsave as the save mechanism. When we're using fxsave as the
833 833 * save mechanism, the CPU will always save the entire 512-byte fxsave region.
834 834 * The fpregs ABI that the kernel expects is basically this structure itself,
835 835 * which is transformed into a 32-bit compatible form in archdep.c.
836 836 *
837 837 * But xsave makes this much more complex and has been a source of historical
838 838 * bugs in the system. In particular, unlike fxsave, xsave has its component bit
839 839 * vector that is written out to indicate validity. This means that blindly
840 840 * copying the fxsave area without checking those bits will lead us to do the
841 841 * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
842 842 * while the x87 legacy fp flag covers the rest of the state. This is all good,
843 843  * aside from the MXCSR.
844 844 *
845 845 * One of the more complicated pieces of xsave state management is correctly
846 846 * answering the question of when the MXCSR is written out to xsave_state. In
847 847 * practice, this is rather convoluted and varies. If either the XMM or AVX
848 848 * feature bits are set then the CPU will write out the MXCSR and its mask
849 849 * register into the traditional fxsave state region. This behavior is dependent
850 850 * on the type of save function that we use. xsave and xsaveopt will look at the
851 851 * AVX feature bit; however, xsavec does not and only considers the SSE feature
852 852 * bit. This means that when we're retrieving things, we need to check both of
853 853 * those bits to determine if we should use the initial state or the value
854 854 * written out.
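 *
 * That check looks roughly like the following; XFEATURE_AVX is assumed here
 * as the name of the AVX component bit, and xs is a placeholder for the
 * saved xsave area:
 *
 *     if ((xs->xs_header.xsh_xstate_bv &
 *         (XFEATURE_SSE | XFEATURE_AVX)) != 0) {
 *             mxcsr = xs->xs_fxsave.fx_mxcsr; /* value the CPU wrote out */
 *     } else {
 *             mxcsr = SSE_MXCSR_INIT;         /* the initial state */
 *     }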
855 855 *
856 856 * When we come to someone trying to set the fpregs through /proc, the main
857 857 * question we have is what happens to the extended registers. We have opted to
858 858 * implement and document it such that a write to the fpregs only impacts the
859 859 * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
860 860 * copying the data into the save area, set the state bits for x87 and XMM
861 861 * state, and then set the FPU to be restored. All in all, this basically means
862 862 * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
863 863 * that we might have present.
864 864 *
865 865 * Forward Looking: Adding Intel AMX Support
866 866 * -----------------------------------------
867 867 *
868 868 * Nothing can stop the march of features being added into the FPU. One of the
869 869 * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
870 870 * Extensions (AMX), which add a large chunk of xsave state to each process.
871 871 * While things like AVX and AVX-512 have been enabled by default, the broader
872 872  * OS community has not been wanting to do this for AMX, because of the size of
873 873 * the state which exceeds 8 KiB. While the signal handling state went out of
874 874 * its way to minimize the size it wrote to the stack, if this is used, it would
875 875 * need to be preserved.
876 876 *
877 877 * To deal with this reality and the fact that folks don't really want to
878 878 * enable it by default for all purposes when its use will be quite special
879 879 * purpose, Intel has also added a MSR around extended feature disable or xfd.
880 880 * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
881 881 * assumption, and the reason that so much of the /proc and signal logic ensures
882 882 * that we have the thread and process around, taking as an example the unused
883 883 * process argument in fpu_proc_xregs_info(), is that we will follow suit and
884 884 * default to having support disabled, but that a process will be able to opt
885 885 * into it, which will result in several different assumptions around signal
886 886 * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
887 887 *
888 888 * The following is a list of items to pay attention to for future folks who
889 889 * work on this:
890 890 *
891 891 * o We will want to confirm whether other systems have opted to make this
892 892 * process-wide or thread-wide. Assuming process-wide, we will need to do a
893 893 * hold of all lwps while making a change. The interface for that probably
894 894 * doesn't want to be /proc, as a process probably doesn't want to write to
895 895 * its own control file. Changing it for another process could be done
896 896 * through the agent-lwp.
897 897 * o Opting into this should probably be a one-way street.
898 898 * o Opting into this will need to evaluate all threads and in particular
899 899 * stack sizes to confirm they adhere to the new minimum.
900 900 * o We will need to make sure that setting and clearing the xfd MSR is part
901 901 * of the FPU context ops and something we set by default on every CPU.
902 902 * o We will need to add a new interface to allow opting into this feature.
903 903 * o We will need to ensure that all subsequently created signal stacks adhere
904 904 * to a required minimum size that we communicate through libc.
905 905 * o We will need to make sure that both rtld and libc no longer rely on a
906 906 * static value of the AT_SUN_FPSIZE, but rather realize that this can be
907 907 * dynamic. At that time, we should evaluate if we can get away with not
908 908 * needing to save this for rtld, even though signal handlers should assume
909 909 * they will.
910 910 * o The various components (because there is more than one) will want to be
911 911  * added to the fpu_xsave_info[]. Consulting the process's xfd will be
912 912 * required and probably require logic changes.
913 913 *
914 914 * The above is not exhaustive. We'll probably have some other issues and fun
915 915 * while doing this.
916 916 */
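
/*
 * To make the shape of that work a bit more concrete, here is a purely
 * illustrative sketch. Every name in it that does not already appear in this
 * file (the opt-in entry point, p_fpu_xfd, fpu_xfd_update(), XFEATURE_AMX) is
 * hypothetical and does not exist today:
 *
 *	// process-wide opt-in; the caller has already held all lwps
 *	p->p_fpu_xfd &= ~XFEATURE_AMX;	// clear the disable bit (hypothetical)
 *	// grow each lwp's pcb FPU save area to the AMX-inclusive size and
 *	// re-derive the minimum signal stack size advertised through libc;
 *	// then have the FPU context ops program the xfd MSR with
 *	// p->p_fpu_xfd whenever one of this process's lwps runs:
 *	fpu_xfd_update(p->p_fpu_xfd);	// hypothetical wrmsr wrapper
 *
 * None of this is implemented; it only restates the items enumerated above.
 */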
917 917
918 918 /*
919 919 * The kind of FPU we advertise to rtld so it knows what to do when working
920 920 * through the PLT.
921 921 */
922 922 int fp_elf = AT_386_FPINFO_FXSAVE;
923 923
924 924 /*
925 925 * Mechanism to save FPU state.
926 926 */
927 927 int fp_save_mech = FP_FXSAVE;
928 928
929 929 kmem_cache_t *fpsave_cachep;
930 930
931 931 /* Legacy fxsave layout (512) + xsave header (64) + %ymm high halves (256) */
932 932 #define AVX_XSAVE_SIZE (512 + 64 + 256)
933 933
934 934 /*
935 935 * Various sanity checks.
936 936 */
937 937 CTASSERT(sizeof (struct fxsave_state) == 512);
938 938 CTASSERT(sizeof (struct fnsave_state) == 108);
939 939 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
940 940 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
941 941
942 942 /*
943 943 * Basic architectural alignment information.
944 944 */
945 945 #define FPU_ALIGN_XMM 16
946 946 #define FPU_ALIGN_YMM 32
947 947 #define FPU_ALIGN_ZMM 64
948 948
949 949 /*
950 950 * This structure is the x86 implementation of the kernel FPU that is defined in
951 951 * uts/common/sys/kfpu.h.
952 952 */
953 953
954 954 typedef enum kfpu_flags {
955 955 /*
956 956 * This indicates that the save state has initial FPU data.
957 957 */
958 958 KFPU_F_INITIALIZED = 0x01
959 959 } kfpu_flags_t;
960 960
961 961 struct kfpu_state {
962 962 fpu_ctx_t kfpu_ctx;
963 963 kfpu_flags_t kfpu_flags;
964 964 kthread_t *kfpu_curthread;
965 965 };
966 966
967 967 /*
968 968 * Initial kfpu state for SSE/SSE2 used by fpinit()
969 969 */
970 970 const struct fxsave_state sse_initial = {
971 971 FPU_CW_INIT, /* fx_fcw */
972 972 0, /* fx_fsw */
973 973 0, /* fx_fctw */
974 974 0, /* fx_fop */
975 975 0, /* fx_rip */
976 976 0, /* fx_rdp */
977 977 SSE_MXCSR_INIT /* fx_mxcsr */
978 978 /* rest of structure is zero */
979 979 };
980 980
981 981 /*
982 982 * Initial kfpu state for AVX used by fpinit()
983 983 */
984 984 const struct xsave_state avx_initial = {
985 985 /*
986 986 * The definition below needs to be identical to sse_initial,
987 987 * defined above.
988 988 */
989 989 .xs_fxsave = {
990 990 .fx_fcw = FPU_CW_INIT,
991 991 .fx_mxcsr = SSE_MXCSR_INIT,
992 992 },
993 993 .xs_header = {
994 994 /*
995 995 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
996 996 * valid, and CPU should initialize XMM/YMM.
997 997 */
998 998 .xsh_xstate_bv = 1,
999 999 .xsh_xcomp_bv = 0,
1000 1000 },
1001 1001 };
1002 1002
1003 1003 /*
1004 1004 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
1005 1005 * the #gp exception caused by setting unsupported bits in the
1006 1006 * MXCSR register
1007 1007 */
1008 1008 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
1009 1009
1010 1010 /*
1011 1011 * These vectors are patched to their xsave equivalents (e.g. xsave_ctxt() or
1012 1012 * xsaveopt_ctxt()) if we discover we have an XSAVE-capable chip in fpu_probe().
1013 1013 */
1014 1014 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
1015 1015 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
1016 1016
1017 1017 /*
1018 1018 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
1019 1019 */
1020 1020 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
1021 1021
1022 1022 static int fpe_sicode(uint_t);
1023 1023 static int fpe_simd_sicode(uint_t);
1024 1024 static void fp_new_lwp(void *, void *);
1025 1025 static void fp_free_ctx(void *, int);
1026 1026
1027 1027 static struct ctxop *
1028 1028 fp_ctxop_allocate(struct fpu_ctx *fp)
1029 1029 {
1030 1030 const struct ctxop_template tpl = {
1031 1031 .ct_rev = CTXOP_TPL_REV,
1032 1032 .ct_save = fpsave_ctxt,
1033 1033 .ct_restore = fprestore_ctxt,
1034 1034 .ct_fork = fp_new_lwp,
1035 1035 .ct_lwp_create = fp_new_lwp,
1036 1036 .ct_free = fp_free_ctx,
1037 1037 };
1038 1038 return (ctxop_allocate(&tpl, fp));
1039 1039 }
1040 1040
1041 1041 /*
1042 1042 * Copy the state of parent lwp's floating point context into the new lwp.
1043 1043 * Invoked for both fork() and lwp_create().
1044 1044 *
1045 1045 * Note that we inherit -only- the control state (e.g. exception masks,
1046 1046 * rounding, precision control, etc.); the FPU registers are otherwise
1047 1047 * reset to their initial state.
1048 1048 */
1049 1049 static void
1050 1050 fp_new_lwp(void *parent, void *child)
1051 1051 {
1052 1052 kthread_id_t t = parent, ct = child;
1053 1053 struct fpu_ctx *fp; /* parent fpu context */
1054 1054 struct fpu_ctx *cfp; /* new fpu context */
1055 1055 struct fxsave_state *fx, *cfx;
1056 1056 struct xsave_state *cxs;
1057 1057
1058 1058 ASSERT(fp_kind != FP_NO);
1059 1059
1060 1060 fp = &t->t_lwp->lwp_pcb.pcb_fpu;
1061 1061 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
1062 1062
1063 1063 /*
1064 1064 * If the parent FPU state is still in the FPU hw then save it;
1065 1065 * conveniently, fp_save() already does this for us nicely.
1066 1066 */
1067 1067 fp_save(fp);
1068 1068
1069 1069 cfp->fpu_flags = FPU_EN | FPU_VALID;
1070 1070 cfp->fpu_regs.kfpu_status = 0;
1071 1071 cfp->fpu_regs.kfpu_xstatus = 0;
1072 1072
1073 1073 /*
1074 1074 * Make sure that the child's FPU is cleaned up and made ready for user
1075 1075 * land.
1076 1076 */
1077 1077 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
1078 1078
1079 1079 switch (fp_save_mech) {
1080 1080 case FP_FXSAVE:
1081 1081 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1082 1082 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
1083 1083 bcopy(&sse_initial, cfx, sizeof (*cfx));
1084 1084 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
1085 1085 cfx->fx_fcw = fx->fx_fcw;
1086 1086 break;
1087 1087
1088 1088 case FP_XSAVE:
1089 1089 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
1090 1090
1091 1091 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
1092 1092
1093 1093 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1094 1094 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
1095 1095 cfx = &cxs->xs_fxsave;
1096 1096
1097 1097 bcopy(&avx_initial, cxs, sizeof (*cxs));
1098 1098 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
1099 1099 cfx->fx_fcw = fx->fx_fcw;
1100 1100 cxs->xs_header.xsh_xstate_bv |=
1101 1101 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
1102 1102 break;
1103 1103 default:
1104 1104 panic("Invalid fp_save_mech");
1105 1105 /*NOTREACHED*/
1106 1106 }
1107 1107
1108 1108 /*
1109 1109 * Mark that both the parent and child need to have the FPU cleaned up
1110 1110 * before returning to userland.
1111 1111 */
1112 1112
1113 1113 ctxop_attach(ct, fp_ctxop_allocate(cfp));
1114 1114 }
1115 1115
1116 1116 /*
1117 1117 * Free any state associated with floating point context.
1118 1118 * Fp_free can be called in three cases:
1119 1119 * 1) from reaper -> thread_free -> freectx -> fp_free
1120 1120 * fp context belongs to a thread on deathrow
1121 1121 * nothing to do, thread will never be resumed
1122 1122 * thread calling ctxfree is reaper
1123 1123 *
1124 1124 * 2) from exec -> freectx -> fp_free
1125 1125 * fp context belongs to the current thread
1126 1126 * must disable fpu, thread calling ctxfree is curthread
1127 1127 *
1128 1128 * 3) from restorecontext -> setfpregs -> fp_free
1129 1129 * we have a modified context in the memory (lwp->pcb_fpu)
1130 1130 * disable fpu and release the fp context for the CPU
1131 1131 *
1132 1132 */
1133 1133 void
1134 1134 fp_free(struct fpu_ctx *fp)
1135 1135 {
1136 1136 ASSERT(fp_kind != FP_NO);
1137 1137
1138 1138 if (fp->fpu_flags & FPU_VALID)
1139 1139 return;
1140 1140
1141 1141 kpreempt_disable();
1142 1142 /*
1143 1143 * We want to do fpsave rather than fpdisable so that we can
1144 1144 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
1145 1145 */
1146 1146 fp->fpu_flags |= FPU_VALID;
1147 1147 /* If for current thread disable FP to track FPU_VALID */
1148 1148 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
1149 1149 /* Clear errors if any to prevent frstor from complaining */
1150 1150 (void) fperr_reset();
1151 1151 if (fp_kind & __FP_SSE)
1152 1152 (void) fpxerr_reset();
1153 1153 fpdisable();
1154 1154 }
1155 1155 kpreempt_enable();
1156 1156 }
1157 1157
1158 1158 /*
1159 1159 * Wrapper for freectx to make the types line up for fp_free()
1160 1160 */
1161 1161 static void
1162 1162 fp_free_ctx(void *arg, int isexec __unused)
1163 1163 {
1164 1164 fp_free((struct fpu_ctx *)arg);
1165 1165 }
1166 1166
1167 1167 /*
1168 1168 * Store the floating point state and disable the floating point unit.
1169 1169 */
1170 1170 void
1171 1171 fp_save(struct fpu_ctx *fp)
1172 1172 {
1173 1173 ASSERT(fp_kind != FP_NO);
1174 1174
1175 1175 kpreempt_disable();
1176 1176 if (!fp || fp->fpu_flags & FPU_VALID ||
1177 1177 (fp->fpu_flags & FPU_EN) == 0) {
1178 1178 kpreempt_enable();
1179 1179 return;
1180 1180 }
1181 1181 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
1182 1182
1183 1183 switch (fp_save_mech) {
1184 1184 case FP_FXSAVE:
1185 1185 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
1186 1186 break;
1187 1187
1188 1188 case FP_XSAVE:
1189 1189 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
1190 1190 break;
1191 1191 default:
1192 1192 panic("Invalid fp_save_mech");
1193 1193 /*NOTREACHED*/
1194 1194 }
1195 1195
1196 1196 fp->fpu_flags |= FPU_VALID;
1197 1197
1198 1198 /*
1199 1199 * We save the FPU as part of forking, execing, modifications via /proc,
1200 1200 * restorecontext, etc. As such, we need to make sure that we return to
1201 1201 * userland with valid state in the FPU. If we're context switched out
1202 1202 * before we hit sys_rtt_common() we'll end up having restored the FPU
1203 1203 * as part of the context ops. The restore logic always makes
1204 1204 * sure that FPU_VALID is set before doing a restore so we don't restore
1205 1205 * it a second time.
1206 1206 */
1207 1207 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
1208 1208
1209 1209 kpreempt_enable();
1210 1210 }
1211 1211
1212 1212 /*
1213 1213 * Restore the FPU context for the thread:
1214 1214 * The case handled here is that there is no active FPU context: load the
1215 1215 * new context into the FPU hw and enable the FPU.
1217 1217 */
1218 1218 void
1219 1219 fp_restore(struct fpu_ctx *fp)
1220 1220 {
1221 1221 switch (fp_save_mech) {
1222 1222 case FP_FXSAVE:
1223 1223 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
1224 1224 break;
1225 1225
1226 1226 case FP_XSAVE:
1227 1227 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
1228 1228 break;
1229 1229 default:
1230 1230 panic("Invalid fp_save_mech");
1231 1231 /*NOTREACHED*/
1232 1232 }
1233 1233
1234 1234 fp->fpu_flags &= ~FPU_VALID;
1235 1235 }
1236 1236
1237 1237 /*
1238 1238 * Reset the FPU such that it is in a valid state for a new thread that is
1239 1239 * coming out of exec. The FPU will be in a usable state at this point. At this
1240 1240 * point we know that the FPU state has already been allocated and if this
1241 1241 * wasn't an init process, then it will have had fp_free() previously called.
1242 1242 */
1243 1243 void
1244 1244 fp_exec(void)
1245 1245 {
1246 1246 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1247 1247
1248 1248 if (fp_save_mech == FP_XSAVE) {
1249 1249 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1250 1250 }
1251 1251
1252 1252 struct ctxop *ctx = fp_ctxop_allocate(fp);
1253 1253 /*
1254 1254 * Make sure that we're not preempted in the middle of initializing the
1255 1255 * FPU on this CPU.
1256 1256 */
1257 1257 kpreempt_disable();
1258 1258 ctxop_attach(curthread, ctx);
1259 1259 fpinit();
1260 1260 fp->fpu_flags = FPU_EN;
1261 1261 kpreempt_enable();
1262 1262 }
1263 1263
1264 1264
1265 1265 /*
1266 1266 * Seeds the initial state for the current thread. The possibilities are:
1267 1267 * 1. Another process has modified the FPU state before we have done any
1268 1268 * initialization: Load the FPU state from the LWP state.
1269 1269 * 2. The FPU state has not been externally modified: Load a clean state.
1270 1270 */
1271 1271 void
1272 1272 fp_seed(void)
1273 1273 {
1274 1274 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1275 1275
1276 1276 ASSERT(curthread->t_preempt >= 1);
1277 1277 ASSERT((fp->fpu_flags & FPU_EN) == 0);
1278 1278
1279 1279 /*
1280 1280 * Always initialize a new context and initialize the hardware.
1281 1281 */
1282 1282 if (fp_save_mech == FP_XSAVE) {
1283 1283 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1284 1284 }
1285 1285
1286 1286 ctxop_attach(curthread, fp_ctxop_allocate(fp));
1287 1287 fpinit();
1288 1288
1289 1289 /*
1290 1290 * If FPU_VALID is set, it means someone has modified registers via
1291 1291 * /proc. In this case, restore the current lwp's state.
1292 1292 */
1293 1293 if (fp->fpu_flags & FPU_VALID)
1294 1294 fp_restore(fp);
1295 1295
1296 1296 ASSERT((fp->fpu_flags & FPU_VALID) == 0);
1297 1297 fp->fpu_flags = FPU_EN;
1298 1298 }
1299 1299
1300 1300 /*
1301 1301 * When using xsave/xrstor, these three functions are used by the lwp code to
1302 1302 * manage the memory for the xsave area.
1303 1303 */
1304 1304 void
1305 1305 fp_lwp_init(struct _klwp *lwp)
1306 1306 {
1307 1307 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1308 1308
1309 1309 /*
1310 1310 * We keep a copy of the pointer in lwp_fpu so that we can restore the
1311 1311 * value in forklwp() after we duplicate the parent's LWP state.
1312 1312 */
1313 1313 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
1314 1314 kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
1315 1315 fp->fpu_signal = NULL;
1316 1316
1317 1317 if (fp_save_mech == FP_XSAVE) {
1318 1318 /*
1319 1319 * We bzero() since the fpinit() code path will only
1320 1320 * partially initialize the xsave area using avx_initial.
1321 1321 */
1323 1323 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
1324 1324 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
1325 1325 }
1326 1326 }
1327 1327
1328 1328 void
1329 1329 fp_lwp_cleanup(struct _klwp *lwp)
1330 1330 {
1331 1331 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1332 1332
1333 1333 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
1334 1334 kmem_cache_free(fpsave_cachep,
1335 1335 fp->fpu_regs.kfpu_u.kfpu_generic);
1336 1336 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
1337 1337 }
1338 1338
1339 1339 if (fp->fpu_signal != NULL) {
1340 1340 kmem_cache_free(fpsave_cachep, fp->fpu_signal);
1341 1341 fp->fpu_signal = NULL;
1342 1342 }
1343 1343 }
1344 1344
1345 1345 /*
1346 1346 * Called during the process of forklwp(). The kfpu_u pointer will have been
1347 1347 * overwritten while copying the parent's LWP structure. We have a valid copy
1348 1348 * stashed in the child's lwp_fpu which we use to restore the correct value.
1349 1349 */
1350 1350 void
1351 1351 fp_lwp_dup(struct _klwp *lwp)
1352 1352 {
1353 1353 void *xp = lwp->lwp_fpu;
1354 1354 size_t sz;
1355 1355
1356 1356 switch (fp_save_mech) {
1357 1357 case FP_FXSAVE:
1358 1358 sz = sizeof (struct fxsave_state);
1359 1359 break;
1360 1360 case FP_XSAVE:
1361 1361 sz = cpuid_get_xsave_size();
1362 1362 break;
1363 1363 default:
1364 1364 panic("Invalid fp_save_mech");
1365 1365 /*NOTREACHED*/
1366 1366 }
1367 1367
1368 1368 /* copy the parent's values into the new lwp's struct */
1369 1369 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
1370 1370 /* now restore the pointer */
1371 1371 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1372 1372 /* Ensure that we don't inherit our parent's signal state */
1373 1373 lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
1374 1374 }
1375 1375
1376 1376 /*
1377 1377 * Handle a processor extension error fault.
1378 1378 * Returns non-zero for error.
1379 1379 */
1380 1380
1381 1381 /*ARGSUSED*/
1382 1382 int
1383 1383 fpexterrflt(struct regs *rp)
1384 1384 {
1385 1385 uint32_t fpcw, fpsw;
1386 1386 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1387 1387
1388 1388 ASSERT(fp_kind != FP_NO);
1389 1389
1390 1390 /*
1391 1391 * Now we can enable the interrupts.
1392 1392 * (NOTE: x87 fp exceptions come thru interrupt gate)
1393 1393 */
1394 1394 sti();
1395 1395
1396 1396 if (!fpu_exists)
1397 1397 return (FPE_FLTINV);
1398 1398
1399 1399 /*
1400 1400 * Do an unconditional save of the FP state. If it's dirty (TS=0),
1401 1401 * it'll be saved into the fpu context area passed in (that of the
1402 1402 * current thread). If it's not dirty (it may not be, due to an
1403 1403 * intervening save caused by a context switch between the sti()
1404 1404 * above and here), then it's safe to just use the stored values in
1405 1405 * the context save area to determine the cause of the fault.
1406 1406 */
1407 1407 fp_save(fp);
1408 1408
1409 1409 /* clear exception flags in saved state, as if by fnclex */
1410 1410 switch (fp_save_mech) {
1411 1411 case FP_FXSAVE:
1412 1412 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1413 1413 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
1414 1414 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
1415 1415 break;
1416 1416
1417 1417 case FP_XSAVE:
1418 1418 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1419 1419 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
1420 1420 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
1421 1421 /*
1422 1422 * Always set LEGACY_FP as it may have been cleared by XSAVE
1423 1423 * instruction
1424 1424 */
1425 1425 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1426 1426 XFEATURE_LEGACY_FP;
1427 1427 break;
1428 1428 default:
1429 1429 panic("Invalid fp_save_mech");
1430 1430 /*NOTREACHED*/
1431 1431 }
1432 1432
1433 1433 fp->fpu_regs.kfpu_status = fpsw;
1434 1434
1435 1435 if ((fpsw & FPS_ES) == 0)
1436 1436 return (0); /* No exception */
1437 1437
1438 1438 /*
1439 1439 * "and" the exception flags with the complement of the mask
1440 1440 * bits to determine which exception occurred
1441 1441 */
1442 1442 return (fpe_sicode(fpsw & ~fpcw & 0x3f));
1443 1443 }
1444 1444
1445 1445 /*
1446 1446 * Handle an SSE/SSE2 precise exception.
1447 1447 * Returns a non-zero sicode for error.
1448 1448 */
1449 1449 /*ARGSUSED*/
1450 1450 int
1451 1451 fpsimderrflt(struct regs *rp)
1452 1452 {
1453 1453 uint32_t mxcsr, xmask;
1454 1454 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1455 1455
1456 1456 ASSERT(fp_kind & __FP_SSE);
1457 1457
1458 1458 /*
1459 1459 * NOTE: Interrupts are disabled during execution of this
1460 1460 * function. They are enabled by the caller in trap.c.
1461 1461 */
1462 1462
1463 1463 /*
1464 1464 * If there is no FP unit, the only way we could have gotten here
1465 1465 * is via a user executing an INT $19 instruction, so there is
1466 1466 * no fault in that case.
1467 1467 */
1468 1468 if (!fpu_exists)
1469 1469 return (0);
1470 1470
1471 1471 /*
1472 1472 * Do an unconditional save of the FP state. If it's dirty (TS=0),
1473 1473 * it'll be saved into the fpu context area passed in (that of the
1474 1474 * current thread). If it's not dirty, then it's safe to just use
1475 1475 * the stored values in the context save area to determine the
1476 1476 * cause of the fault.
1477 1477 */
1478 1478 fp_save(fp); /* save the FPU state */
1479 1479
1480 1480 if (fp_save_mech == FP_XSAVE) {
1481 1481 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1482 1482 fp->fpu_regs.kfpu_status =
1483 1483 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1484 1484 } else {
1485 1485 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1486 1486 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1487 1487 }
1488 1488 fp->fpu_regs.kfpu_xstatus = mxcsr;
1489 1489
1490 1490 /*
1491 1491 * Compute the mask of conditions that can cause a #xm exception (MXCSR mask
1492 1492 * bits 7-12 sit seven bits above status flags 0-5, hence the shift by 7), and
1493 1493 * use it to clear masked conditions, identifying the true cause of this one.
1494 1494 */
1495 1495 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1496 1496 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1497 1497 }
1498 1498
1499 1499 /*
1500 1500 * In the unlikely event that someone is relying on this subcode being
1501 1501 * FPE_FLTILL for denormalize exceptions, it can always be patched back
1502 1502 * again to restore old behaviour.
1503 1503 */
1504 1504 int fpe_fltden = FPE_FLTDEN;
1505 1505
1506 1506 /*
1507 1507 * Map from the FPU status word to the FP exception si_code.
1508 1508 */
1509 1509 static int
1510 1510 fpe_sicode(uint_t sw)
1511 1511 {
1512 1512 if (sw & FPS_IE)
1513 1513 return (FPE_FLTINV);
1514 1514 if (sw & FPS_ZE)
1515 1515 return (FPE_FLTDIV);
1516 1516 if (sw & FPS_DE)
1517 1517 return (fpe_fltden);
1518 1518 if (sw & FPS_OE)
1519 1519 return (FPE_FLTOVF);
1520 1520 if (sw & FPS_UE)
1521 1521 return (FPE_FLTUND);
1522 1522 if (sw & FPS_PE)
1523 1523 return (FPE_FLTRES);
1524 1524 return (FPE_FLTINV); /* default si_code for other exceptions */
1525 1525 }
1526 1526
1527 1527 /*
1528 1528 * Map from the SSE status word to the FP exception si_code.
1529 1529 */
1530 1530 static int
1531 1531 fpe_simd_sicode(uint_t sw)
1532 1532 {
1533 1533 if (sw & SSE_IE)
1534 1534 return (FPE_FLTINV);
1535 1535 if (sw & SSE_ZE)
1536 1536 return (FPE_FLTDIV);
1537 1537 if (sw & SSE_DE)
1538 1538 return (FPE_FLTDEN);
1539 1539 if (sw & SSE_OE)
1540 1540 return (FPE_FLTOVF);
1541 1541 if (sw & SSE_UE)
1542 1542 return (FPE_FLTUND);
1543 1543 if (sw & SSE_PE)
1544 1544 return (FPE_FLTRES);
1545 1545 return (FPE_FLTINV); /* default si_code for other exceptions */
1546 1546 }
1547 1547
1548 1548 /*
1549 1549 * This routine is invoked as part of libc's __fpstart implementation
1550 1550 * via sysi86(2).
1551 1551 *
1552 1552 * It may be called -before- any context has been assigned, in which case
1553 1553 * we try to avoid touching the hardware. Or it may be invoked well
1554 1554 * after the context has been assigned and fiddled with, in which case
1555 1555 * just tweak it directly.
1556 1556 */
1557 1557 void
1558 1558 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1559 1559 {
1560 1560 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1561 1561 struct fxsave_state *fx;
1562 1562
1563 1563 if (!fpu_exists || fp_kind == FP_NO)
1564 1564 return;
1565 1565
1566 1566 if ((fp->fpu_flags & FPU_EN) == 0) {
1567 1567 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1568 1568 /*
1569 1569 * Common case. Floating point unit not yet
1570 1570 * enabled, and kernel already intends to initialize
1571 1571 * the hardware the way the caller wants.
1572 1572 */
1573 1573 return;
1574 1574 }
1575 1575 /*
1576 1576 * Hmm. Userland wants a different default.
1577 1577 * Do a fake "first trap" to establish the context, then
1578 1578 * handle as if we already had a context before we came in.
1579 1579 */
1580 1580 kpreempt_disable();
1581 1581 fp_seed();
1582 1582 kpreempt_enable();
1583 1583 }
1584 1584
1585 1585 /*
1586 1586 * Ensure that the current hardware state is flushed back to the
1587 1587 * pcb, then modify that copy. Next use of the fp will
1588 1588 * restore the context.
1589 1589 */
1590 1590 fp_save(fp);
1591 1591
1592 1592 switch (fp_save_mech) {
1593 1593 case FP_FXSAVE:
1594 1594 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1595 1595 fx->fx_fcw = fcw;
1596 1596 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1597 1597 break;
1598 1598
1599 1599 case FP_XSAVE:
1600 1600 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1601 1601 fx->fx_fcw = fcw;
1602 1602 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1603 1603 /*
1604 1604 * Always set LEGACY_FP as it may have been cleared by XSAVE
1605 1605 * instruction
1606 1606 */
1607 1607 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1608 1608 XFEATURE_LEGACY_FP;
1609 1609 break;
1610 1610 default:
1611 1611 panic("Invalid fp_save_mech");
1612 1612 /*NOTREACHED*/
1613 1613 }
1614 1614 }
1615 1615
1616 1616 static void
1617 1617 kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1618 1618 {
1619 1619 struct xsave_state *xs;
1620 1620
1621 1621 switch (fp_save_mech) {
1622 1622 case FP_FXSAVE:
1623 1623 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1624 1624 sizeof (struct fxsave_state));
1625 1625 kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1626 1626 break;
1627 1627 case FP_XSAVE:
1628 1628 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1629 1629 bzero(xs, cpuid_get_xsave_size());
1630 1630 bcopy(&avx_initial, xs, sizeof (*xs));
1631 1631 xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1632 1632 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1633 1633 break;
1634 1634 default:
1635 1635 panic("invalid fp_save_mech");
1636 1636 }
1637 1637
1638 1638 /*
1639 1639 * Set the corresponding flags that the system expects on the FPU state
1640 1640 * to indicate that this is our state. The FPU_EN flag is required to
1641 1641 * indicate that FPU usage is allowed. The FPU_KERNEL flag is explicitly
1642 1642 * not set below as it represents that this state is being suppressed
1643 1643 * by the kernel.
1644 1644 */
1645 1645 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1646 1646 kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1647 1647 }
1648 1648
1649 1649 kfpu_state_t *
1650 1650 kernel_fpu_alloc(int kmflags)
1651 1651 {
1652 1652 kfpu_state_t *kfpu;
1653 1653
1654 1654 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1655 1655 return (NULL);
1656 1656 }
1657 1657
1658 1658 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1659 1659 kmem_cache_alloc(fpsave_cachep, kmflags);
1660 1660 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1661 1661 kmem_free(kfpu, sizeof (kfpu_state_t));
1662 1662 return (NULL);
1663 1663 }
1664 1664
1665 1665 kernel_fpu_fpstate_init(kfpu);
1666 1666
1667 1667 return (kfpu);
1668 1668 }
1669 1669
1670 1670 void
1671 1671 kernel_fpu_free(kfpu_state_t *kfpu)
1672 1672 {
1673 1673 kmem_cache_free(fpsave_cachep,
1674 1674 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1675 1675 kmem_free(kfpu, sizeof (kfpu_state_t));
1676 1676 }
1677 1677
1678 1678 static void
1679 1679 kernel_fpu_ctx_save(void *arg)
1680 1680 {
1681 1681 kfpu_state_t *kfpu = arg;
1682 1682 fpu_ctx_t *pf;
1683 1683
1684 1684 if (kfpu == NULL) {
1685 1685 /*
1686 1686 * A NULL kfpu implies this is a kernel thread with an LWP and
1687 1687 * no user-level FPU usage. Use the lwp fpu save area.
1688 1688 */
1689 1689 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1690 1690
1691 1691 ASSERT(curthread->t_procp->p_flag & SSYS);
1692 1692 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1693 1693
1694 1694 fp_save(pf);
1695 1695 } else {
1696 1696 pf = &kfpu->kfpu_ctx;
1697 1697
1698 1698 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1699 1699 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1700 1700
1701 1701 /*
1702 1702 * Note, we can't use fp_save because it assumes that we're
1703 1703 * saving to the thread's PCB and not somewhere else. Because
1704 1704 * this is a different FPU context, we instead have to do this
1705 1705 * ourselves.
1706 1706 */
1707 1707 switch (fp_save_mech) {
1708 1708 case FP_FXSAVE:
1709 1709 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1710 1710 break;
1711 1711 case FP_XSAVE:
1712 1712 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1713 1713 break;
1714 1714 default:
1715 1715 panic("Invalid fp_save_mech");
1716 1716 }
1717 1717
1718 1718 /*
1719 1719 * Because we have saved context here, our save state is no
1720 1720 * longer valid and therefore needs to be reinitialized.
1721 1721 */
1722 1722 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1723 1723 }
1724 1724
1725 1725 pf->fpu_flags |= FPU_VALID;
1726 1726
1727 1727 /*
1728 1728 * Clear the T_KFPU flag. This allows swtch to check for improper kernel
1729 1729 * usage of the FPU (i.e. switching to a new thread while the old
1730 1730 * thread was in the kernel and using the FPU, but did not perform a
1731 1731 * context save).
1732 1732 */
1733 1733 curthread->t_flag &= ~T_KFPU;
1734 1734 }
1735 1735
1736 1736 static void
1737 1737 kernel_fpu_ctx_restore(void *arg)
1738 1738 {
1739 1739 kfpu_state_t *kfpu = arg;
1740 1740 fpu_ctx_t *pf;
1741 1741
1742 1742 if (kfpu == NULL) {
1743 1743 /*
1744 1744 * A NULL kfpu implies this is a kernel thread with an LWP and
1745 1745 * no user-level FPU usage. Use the lwp fpu save area.
1746 1746 */
1747 1747 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1748 1748
1749 1749 ASSERT(curthread->t_procp->p_flag & SSYS);
1750 1750 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1751 1751 } else {
1752 1752 pf = &kfpu->kfpu_ctx;
1753 1753
1754 1754 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1755 1755 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1756 1756 }
1757 1757
1758 1758 fp_restore(pf);
1759 1759 curthread->t_flag |= T_KFPU;
1760 1760 }
1761 1761
1762 1762 /*
1763 1763 * Validate that the thread is not switching off-cpu while actively using the
1764 1764 * FPU within the kernel.
1765 1765 */
1766 1766 void
1767 1767 kernel_fpu_no_swtch(void)
1768 1768 {
1769 1769 if ((curthread->t_flag & T_KFPU) != 0) {
1770 1770 panic("curthread swtch-ing while the kernel is using the FPU");
1771 1771 }
1772 1772 }
1773 1773
1774 1774 static const struct ctxop_template kfpu_ctxop_tpl = {
1775 1775 .ct_rev = CTXOP_TPL_REV,
1776 1776 .ct_save = kernel_fpu_ctx_save,
1777 1777 .ct_restore = kernel_fpu_ctx_restore,
1778 1778 };
1779 1779
1780 1780 void
1781 1781 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1782 1782 {
1783 1783 klwp_t *pl = curthread->t_lwp;
1784 1784 struct ctxop *ctx;
1785 1785
1786 1786 if ((curthread->t_flag & T_KFPU) != 0) {
1787 1787 panic("curthread attempting to nest kernel FPU states");
1788 1788 }
1789 1789
1790 1790 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1791 1791 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1792 1792 (KFPU_USE_LWP | KFPU_NO_STATE));
1793 1793
1794 1794 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1795 1795 /*
1796 1796 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1797 1797 * hold our kernel FPU context, we depend on the caller doing
1798 1798 * kpreempt_disable for the duration of our FPU usage. This
1799 1799 * should only be done for very short periods of time.
1800 1800 */
1801 1801 ASSERT(curthread->t_preempt > 0);
1802 1802 ASSERT(kfpu == NULL);
1803 1803
1804 1804 if (pl != NULL) {
1805 1805 /*
1806 1806 * We might have already saved once so FPU_VALID could
1807 1807 * be set. This is handled in fp_save.
1808 1808 */
1809 1809 fp_save(&pl->lwp_pcb.pcb_fpu);
1810 1810 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1811 1811 }
1812 1812
1813 1813 curthread->t_flag |= T_KFPU;
1814 1814
1815 1815 /* Always restore the fpu to the initial state. */
1816 1816 fpinit();
1817 1817
1818 1818 return;
1819 1819 }
1820 1820
1821 1821 /*
1822 1822 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1823 1823 */
1824 1824
1825 1825 if ((flags & KFPU_USE_LWP) == 0) {
1826 1826 if (kfpu->kfpu_curthread != NULL)
1827 1827 panic("attempting to reuse kernel FPU state at %p when "
1828 1828 "another thread already is using", kfpu);
1829 1829
1830 1830 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1831 1831 kernel_fpu_fpstate_init(kfpu);
1832 1832
1833 1833 kfpu->kfpu_curthread = curthread;
1834 1834 }
1835 1835
1836 1836 /*
1837 1837 * Not all threads may have an active LWP. If they do and we're not
1838 1838 * going to re-use the LWP, then we should go ahead and save the state.
1839 1839 * We must also note that the fpu is now being used by the kernel and
1840 1840 * therefore we do not want to manage the fpu state via the user-level
1841 1841 * thread's context handlers.
1842 1842 *
1843 1843 * We might have already saved once (due to a prior use of the kernel
1844 1844 * FPU or another code path) so FPU_VALID could be set. This is handled
1845 1845 * by fp_save, as is the FPU_EN check.
1846 1846 */
1847 1847 ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1848 1848 kpreempt_disable();
1849 1849 if (pl != NULL) {
1850 1850 if ((flags & KFPU_USE_LWP) == 0)
1851 1851 fp_save(&pl->lwp_pcb.pcb_fpu);
1852 1852 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1853 1853 }
1854 1854
1855 1855 /*
1856 1856 * Set the context operations for kernel FPU usage. Because kernel FPU
1857 1857 * setup and ctxop attachment need to happen under the protection of
1858 1858 * kpreempt_disable(), we allocate the ctxop outside the guard so its
1859 1859 * sleeping allocation will not cause a voluntary swtch(). This allows
1860 1860 * the rest of the initialization to proceed, ensuring valid state for
1861 1861 * the ctxop handlers.
1862 1862 */
1863 1863 ctxop_attach(curthread, ctx);
1864 1864 curthread->t_flag |= T_KFPU;
1865 1865
1866 1866 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1867 1867 /*
1868 1868 * For pure kernel threads with an LWP, we can use the LWP's
1869 1869 * pcb_fpu to save/restore context.
1870 1870 */
1871 1871 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1872 1872
1873 1873 VERIFY(curthread->t_procp->p_flag & SSYS);
1874 1874 VERIFY(kfpu == NULL);
1875 1875 ASSERT((pf->fpu_flags & FPU_EN) == 0);
1876 1876
1877 1877 /* Always restore the fpu to the initial state. */
1878 1878 if (fp_save_mech == FP_XSAVE)
1879 1879 pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1880 1880 fpinit();
1881 1881 pf->fpu_flags = FPU_EN | FPU_KERNEL;
1882 1882 } else {
1883 1883 /* initialize the kfpu state */
1884 1884 kernel_fpu_ctx_restore(kfpu);
1885 1885 }
1886 1886 kpreempt_enable();
1887 1887 }
1888 1888
1889 1889 void
1890 1890 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1891 1891 {
1892 1892 if ((curthread->t_flag & T_KFPU) == 0) {
1893 1893 panic("curthread attempting to clear kernel FPU state "
1894 1894 "without using it");
1895 1895 }
1896 1896
1897 1897 /*
1898 1898 * General comments on why the rest of this function is structured the
1899 1899 * way it is. Be aware that there is a lot of subtlety here.
1900 1900 *
1901 1901 * If a user-level thread ever uses the fpu while in the kernel, then
1902 1902 * we cannot call fpdisable since that does STTS. That will set the
1903 1903 * ts bit in %cr0 which will cause an exception if anything touches the
1904 1904 * fpu. However, the user-level context switch handler (fpsave_ctxt)
1905 1905 * needs to access the fpu to save the registers into the pcb.
1906 1906 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
1907 1907 * fprestore_ctxt when the thread context switched onto the CPU.
1908 1908 *
1909 1909 * Calling fpdisable only affects the current CPU's %cr0 register.
1910 1910 *
1911 1911 * During ctxop_remove and kpreempt_enable, we can voluntarily context
1912 1912 * switch, so the CPU we were on when we entered this function might
1913 1913 * not be the same one we're on when we return from ctxop_remove or end
1914 1914 * the function. Note there can be user-level context switch handlers
1915 1915 * still installed if this is a user-level thread.
1916 1916 *
1917 1917 * We also must be careful in the unlikely chance we're running in an
1918 1918 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1919 1919 * incorrectly for the "real" thread to resume on this CPU.
1920 1920 */
1921 1921
1922 1922 if ((flags & KFPU_NO_STATE) == 0) {
1923 1923 kpreempt_disable();
1924 1924 } else {
1925 1925 ASSERT(curthread->t_preempt > 0);
1926 1926 }
1927 1927
1928 1928 curthread->t_flag &= ~T_KFPU;
1929 1929
1930 1930 /*
1931 1931 * When we are ending things, we explicitly don't save the current
1932 1932 * kernel FPU state back to the temporary state. The kfpu API is not
1933 1933 * intended to be a permanent save location.
1934 1934 *
1935 1935 * If this is a user-level thread and we were to context switch
1936 1936 * before returning to user-land, fpsave_ctxt will be a no-op since we
1937 1937 * already saved the user-level FPU state the first time we run
1938 1938 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1939 1939 * the user-level fpu state). The fpsave_ctxt functions only save if
1940 1940 * FPU_VALID is not already set. fp_save also calls PCB_SET_UPDATE_FPU so
1941 1941 * fprestore_ctxt will be done in sys_rtt_common when the thread
1942 1942 * finally returns to user-land.
1943 1943 */
1944 1944
1945 1945 if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1946 1946 curthread->t_intr == NULL) {
1947 1947 /*
1948 1948 * A kernel thread which is not an interrupt thread, so we
1949 1949 * STTS now.
1950 1950 */
1951 1951 fpdisable();
1952 1952 }
1953 1953
1954 1954 if ((flags & KFPU_NO_STATE) == 0) {
1955 1955 ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1956 1956
1957 1957 if (kfpu != NULL) {
1958 1958 if (kfpu->kfpu_curthread != curthread) {
1959 1959 panic("attempting to end kernel FPU state "
1960 1960 "for %p, but active thread is not "
1961 1961 "curthread", kfpu);
1962 1962 } else {
1963 1963 kfpu->kfpu_curthread = NULL;
1964 1964 }
1965 1965 }
1966 1966
1967 1967 kpreempt_enable();
1968 1968 }
1969 1969
1970 1970 if (curthread->t_lwp != NULL) {
1971 1971 uint_t f;
1972 1972
1973 1973 if (flags & KFPU_USE_LWP) {
1974 1974 f = FPU_EN | FPU_KERNEL;
1975 1975 } else {
1976 1976 f = FPU_KERNEL;
1977 1977 }
1978 1978 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1979 1979 }
1980 1980 }
1981 1981
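/*
 * For illustration, the two usage patterns implemented above, as a sketch of
 * a hypothetical caller (this block is not part of the kernel FPU API itself).
 * The first is the common case of a private kfpu_state_t; the second is the
 * KFPU_NO_STATE pattern, where the caller provides preemption protection and
 * no state is preserved across the critical section:
 *
 *	kfpu_state_t *kfpu = kernel_fpu_alloc(KM_SLEEP);
 *	kernel_fpu_begin(kfpu, 0);
 *	// ... use SIMD/FPU instructions ...
 *	kernel_fpu_end(kfpu, 0);
 *	kernel_fpu_free(kfpu);
 *
 *	kpreempt_disable();
 *	kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *	// ... very short FPU usage, no voluntary context switch ...
 *	kernel_fpu_end(NULL, KFPU_NO_STATE);
 *	kpreempt_enable();
 */
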
1982 1982 /*
1983 1983 * Fill in FPU information that is required by exec.
1984 1984 */
1985 1985 void
1986 1986 fpu_auxv_info(int *typep, size_t *lenp)
1987 1987 {
1988 1988 *typep = fp_elf;
1989 1989 switch (fp_save_mech) {
1990 1990 case FP_FXSAVE:
1991 1991 *lenp = sizeof (struct fxsave_state);
1992 1992 break;
1993 1993 case FP_XSAVE:
1994 1994 *lenp = cpuid_get_xsave_size();
1995 1995 break;
1996 1996 default:
1997 1997 *lenp = 0;
1998 1998 break;
1999 1999 }
2000 2000 }
2001 2001
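/*
 * For illustration only: the consumer of this information is user level
 * (rtld/libc), which walks the aux vector for the corresponding entries. A
 * sketch of that side, assuming the AT_SUN_FPSIZE entry mentioned in the AMX
 * discussion above and its AT_SUN_FPTYPE counterpart:
 *
 *	for (auxv_t *ap = auxv; ap->a_type != AT_NULL; ap++) {
 *		if (ap->a_type == AT_SUN_FPSIZE)
 *			fpsize = ap->a_un.a_val;
 *	}
 *
 * As noted in the AMX discussion, such consumers should not assume that this
 * value is static for the life of the process.
 */
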
2002 2002 /*
2003 2003 * This function exists to transform an xsave_state into an fxsave_state. The
2004 2004 * way that we have to do this is nuanced. We assume that callers have already
2005 2005 * handled FPU_EN and thus we only need to consider the xsave_state and its
2006 2006 * component vector itself. This results in the following cases that we need to
2007 2007 * consider:
2008 2008 *
2009 2009 * o Neither of the x87 / XMM state bits is set. We use the hardware default
2010 2010 * and need to ensure we copy the xsave header.
2011 2011 * o Both x87 / XMM state bits are set. We can copy everything.
2012 2012 * o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2013 2013 * state be in the initial case.
2014 2014 * o Only the XMM bit is set. The reverse of the above case.
2015 2015 *
2016 2016 * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2017 2017 * generally the same; however, the default floating point control word is
2018 2018 * different.
2019 2019 *
2020 2020 * Finally, we have the complication of the MXCSR and MXCSR_MASK registers.
2021 2021 * Because we are using xsave and xsaveopt in the kernel right now and not
2022 2022 * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2023 2023 * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2024 2024 * is set, we must also come back and copy out the MXCSR register. Sorry, we
2025 2025 * don't make the rules.
2026 2026 */
2027 2027 static void
2028 2028 fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
2029 2029 {
2030 2030 const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
2031 2031
2032 2032 switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
2033 2033 case XFEATURE_LEGACY_FP | XFEATURE_SSE:
2034 2034 bcopy(xsave, fx, sizeof (*fx));
2035 2035 return;
2036 2036 case XFEATURE_LEGACY_FP:
2037 2037 bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
2038 2038 fx->fx_mxcsr = SSE_MXCSR_INIT;
2039 2039 fx->fx_mxcsr_mask = 0;
2040 2040 break;
2041 2041 case XFEATURE_SSE:
2042 2042 bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
2043 2043 fx_mxcsr));
2044 2044
2045 2045 fx->fx_fcw = FPU_CW_INIT_HW;
2046 2046 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2047 2047 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2048 2048 bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
2049 2049 break;
2050 2050 default:
2051 2051 bcopy(&sse_initial, fx, sizeof (*fx));
2052 2052 fx->fx_fcw = FPU_CW_INIT_HW;
2053 2053 break;
2054 2054 }
2055 2055
2056 2056 /*
2057 2057 * Account for the AVX causing MXCSR to be valid.
2058 2058 */
2059 2059 if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
2060 2060 (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
2061 2061 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2062 2062 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2063 2063 }
2064 2064 }
2065 2065
2066 2066 /*
2067 2067 * This function answers the question: are we using any of the xsave family of
2068 2068 * instructions for context switching, and therefore do we have this state?
2069 2069 * This should remain true if we start using xsavec or xsaves in the kernel
2070 2070 * in the future.
2071 2071 */
2072 2072 boolean_t
2073 2073 fpu_xsave_enabled(void)
2074 2074 {
2075 2075 return (fp_save_mech == FP_XSAVE);
2076 2076 }
2077 2077
2078 2078 /*
2079 2079 * The following structure tracks and manages both the programmatic construction
2080 2080 * of /proc xregs data and the spilling of xsave information to the signal
2081 2081 * stack. All known xsave types that the kernel supports must be included here.
2082 2082 */
2083 2083 typedef struct xsave_proc_info {
2084 2084 /*
2085 2085 * This matches the /proc xregs type that this data represents. This is
2086 2086 * used for /proc only.
2087 2087 */
2088 2088 uint32_t xi_type;
2089 2089 /*
2090 2090 * This indicates the size of the /proc data that we're operating on.
2091 2091 * This is only used for /proc.
2092 2092 */
2093 2093 size_t xi_size;
2094 2094 /*
2095 2095 * This indicates the alignment that we want to have for the member when
2096 2096 * we're writing out. This is not used when setting data. This is only
2097 2097 * used for /proc.
2098 2098 */
2099 2099 size_t xi_align;
2100 2100 /*
2101 2101 * This indicates whether this member must always be considered or not.
2102 2102 * This is used in both /proc and context/signal handling.
2103 2103 */
2104 2104 bool xi_always;
2105 2105 /*
2106 2106 * This contains the bits in the xsave bit vector that correspond to this
2107 2107 * entry. This is used for both /proc and
2108 2108 * context/signal handling.
2109 2109 */
2110 2110 uint64_t xi_bits;
2111 2111 /*
2112 2112 * The xi_fill function pointer is used to write out the /proc regset
2113 2113 * data (e.g. when a user reads xregs). This is only used for the /proc
2114 2114 * handling. The xi_valid function pointer is used instead to validate a
2115 2115 * given set of data that we've read in, while the xi_set pointer is
2116 2116 * used to actually transform the data in the underlying fpu save area.
2117 2117 */
2118 2118 void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
2119 2119 void *);
2120 2120 bool (*xi_valid)(model_t, const void *);
2121 2121 void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
2122 2122 uint64_t, const void *);
2123 2123 /*
2124 2124 * The xi_signal_in and xi_signal_out function pointers are used for
2125 2125 * extended context and signal handling information. They are used when
2126 2126 * reading in data from a ucontext_t and writing it out, respectively.
2127 2127 * These are only used for context/signal handling.
2128 2128 */
2129 2129 int (*xi_signal_in)(const struct xsave_proc_info *,
2130 2130 const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
2131 2131 const uintptr_t);
2132 2132 int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
2133 2133 uc_xsave_t *, const void *fpup, uintptr_t);
2134 2134 } xsave_proc_info_t;
2135 2135
2136 2136 static bool
2137 2137 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2138 2138 {
2139 2139 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
2140 2140 return (B_TRUE);
2141 2141 }
2142 2142
2143 2143 return ((fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv &
2144 2144 feats) == 0);
2145 2145 }
2146 2146
2147 2147 static void
2148 2148 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2149 2149 void *datap)
2150 2150 {
2151 2151 prxregset_xcr_t *xcr = datap;
2152 2152
2153 2153 xcr->prx_xcr_xcr0 = xsave_bv_all;
2154 2154 }
2155 2155
2156 2156 /*
2157 2157 * Unlike the other register groups, we treat the xsave header and the legacy
2158 2158 * XMM section together, as both are somewhat tied at the instruction's hip.
2159 2159 * Also unlike the other groups, the initial state here is not quite the same.
2160 2160 */
2161 2161 static void
2162 2162 fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2163 2163 void *datap)
2164 2164 {
2165 2165 prxregset_xsave_t *prxsave = datap;
2166 2166 const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
2167 2167 size_t hdr_off;
2168 2168
2169 2169 /*
2170 2170 * In the x87/XMM case, the no device vs. initial state is different
2171 2171 * because the initial state case still wants us to copy the real xsave
2172 2172 * header. It's also worth calling out that the actual illumos default
2173 2173 * fxsave state is not the same as what Intel documents. The main
2174 2174 * difference is in what the x87 FPU control word is. This results in
2175 2175 * the following case that we need to handle explicitly up front:
2176 2176 *
2177 2177 * o FPU_EN is not set: we use the illumos default.
2178 2178 */
2179 2179 if ((fpu->fpu_flags & FPU_EN) == 0) {
2180 2180 bcopy(&avx_initial, prxsave, sizeof (*prxsave));
2181 2181 return;
2182 2182 }
2183 2183
2184 2184 /*
2185 2185 * Convert the whole fxsave region while taking into account the validity
2186 2186 * of the xsave bits. The prxregset_xsave_t structure is identical in
2187 2187 * its first 512 bytes to the fxsave_state structure.
2188 2188 */
2189 2189 fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
2190 2190
2191 2191 /*
2192 2192 * Now that we've dealt with the x87 and XMM state, take care of the
2193 2193 * header.
2194 2194 */
2195 2195 hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
2196 2196 bcopy((const void *)((uintptr_t)xsave + hdr_off),
2197 2197 (void *)((uintptr_t)prxsave + hdr_off),
2198 2198 sizeof (struct xsave_header));
2199 2199 }
2200 2200
2201 2201 static void
2202 2202 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2203 2203 void *datap)
2204 2204 {
2205 2205 if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2206 2206 size_t size, off;
2207 2207 const void *xsave_off;
2208 2208
2209 2209 cpuid_get_xsave_info(info->xi_bits, &size, &off);
2210 2210 ASSERT3U(size, ==, info->xi_size);
2211 2211 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2212 2212 off);
2213 2213 bcopy(xsave_off, datap, info->xi_size);
2214 2214 }
2215 2215 }
2216 2216
2217 2217 /*
2218 2218 * Users are not allowed to actually set the xcr information this way. However,
2219 2219 * to make it easier for someone to just do a read, modify, write of the xregs
2220 2220 * data, if it is identical, then we will accept it (and do nothing).
2221 2221 */
2222 2222 static bool
2223 2223 fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2224 2224 {
2225 2225 const prxregset_xcr_t *xcr = datap;
2226 2226
2227 2227 return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2228 2228 xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2229 2229 }
2230 2230
2231 2231 /*
2232 2232 * To match traditional /proc semantics, we do not error if reserved bits of
2233 2233 * MXCSR are set; they will be masked off when writing data. We do not allow
2234 2234 * someone to indicate that they are asking for compressed xsave data, hence the
2235 2235 * check that prx_xsh_xcomp_bv is zero. Finally, we will check that each
2236 2236 * component that was indicated in the xstate_bv is present as another item as
2237 2237 * part of the broader validation path.
2238 2238 */
2239 2239 static bool
2240 2240 fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2241 2241 {
2242 2242 const prxregset_xsave_t *xsave = datap;
2243 2243 uint64_t rsvd[6] = { 0 };
2244 2244
2245 2245 if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2246 2246 xsave->prx_xsh_xcomp_bv != 0) {
2247 2247 return (false);
2248 2248 }
2249 2249
2250 2250 if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2251 2251 return (false);
2252 2252 }
2253 2253
2254 2254 return (true);
2255 2255 }
2256 2256
2257 2257 /*
2258 2258 * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2259 2259 * on x86; however, when operating in ILP32, subsets are reserved. We basically
2260 2260 * require that all reserved portions are set to zero as our way to accept them.
2261 2261 */
2262 2262 static bool
2263 2263 fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2264 2264 {
2265 2265 upad128_t ymm_zero[8];
2266 2266 const prxregset_ymm_t *ymm = datap;
2267 2267
2268 2268 if (model == DATAMODEL_LP64) {
2269 2269 return (true);
2270 2270 }
2271 2271
2272 2272 bzero(&ymm_zero, sizeof (ymm_zero));
2273 2273 return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2274 2274 }
2275 2275
2276 2276 static bool
2277 2277 fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2278 2278 {
2279 2279 upad256_t zmm_zero[8];
2280 2280 const prxregset_zmm_t *zmm = datap;
2281 2281
2282 2282 if (model == DATAMODEL_LP64) {
2283 2283 return (true);
2284 2284 }
2285 2285
2286 2286 bzero(&zmm_zero, sizeof (zmm_zero));
2287 2287 return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2288 2288 }
2289 2289
2290 2290 static bool
2291 2291 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2292 2292 {
2293 2293 prxregset_hi_zmm_t hi_zmm_zero;
2294 2294 const prxregset_hi_zmm_t *hi_zmm = datap;
2295 2295
2296 2296 if (model == DATAMODEL_LP64) {
2297 2297 return (true);
2298 2298 }
2299 2299
2300 2300 bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2301 2301 return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2302 2302 }
2303 2303
2304 2304 /*
2305 2305 * The xsave state consists of the first 512 bytes of the XMM state and then the
2306 2306 * xsave header itself. Because of the xsave header, this structure is marked
2307 2307 * with xi_always, so we must always process and consider it.
2308 2308 *
2309 2309 * Semantically if either of the bits around SSE / x87 is set, then we will copy
2310 2310 * the entire thing. This may mean that we end up copying a region that is not
2311 2311 * valid into the save area; however, that should be OK as we still have the
2312 2312 * specific bit flags that indicate what we should consider or not.
2313 2313 *
2314 2314 * There is one additional wrinkle we need to consider and honor here. The CPU
2315 2315 * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
2316 2316 * anything else. So if this is set and we do not have valid x87/XMM bits
2317 2317 * set then we will set the MXCSR to its default state in case the processor
2318 2318 * tries to load it. For reference see:
2319 2319 *
2320 2320 * o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
2321 2321 * o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
2322 2322 *
2323 2323 * Note, the behavior around this changes depending on whether we are using the
2324 2324 * compressed xrstor or not. We are not, but it's worth being aware of. We do
2325 2325 * not worry about MXCSR_MASK because the instructions ignore it.
2326 2326 */
2327 2327 static void
2328 2328 fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2329 2329 uint64_t xsave_bv, const void *datap)
2330 2330 {
2331 2331 const struct xsave_state *xs = datap;
2332 2332
2333 2333 if ((xsave_bv & info->xi_bits) != 0) {
2334 2334 bcopy(&xs->xs_fxsave, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave,
2335 2335 sizeof (struct fxsave_state));
2336 2336 } else if ((xsave_bv & XFEATURE_AVX) != 0) {
2337 2337 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr =
2338 2338 SSE_MXCSR_INIT;
2339 2339 }
2340 2340
2341 2341 bcopy(&xs->xs_header, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header,
2342 2342 sizeof (struct xsave_header));
2343 2343 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2344 2344 }
2345 2345
2346 2346 static void
2347 2347 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2348 2348 uint64_t xsave_bv, const void *datap)
2349 2349 {
2350 2350 size_t size, off;
2351 2351 void *xsave_off;
2352 2352
2353 2353 cpuid_get_xsave_info(info->xi_bits, &size, &off);
2354 2354 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2355 2355 off);
2356 2356 bcopy(datap, xsave_off, size);
2357 2357 }
2358 2358
2359 2359 /*
2360 2360 * Dealing with XMM data is a little more annoying here. If UC_FPU is set, it
2361 2361 * also contains a copy of the XMM region. That must take priority over anything
2362 2362 * we have here. In the copyout code we do not set the XMM bits here as
2363 2363 * something to copy; therefore if they are set, we currently treat that as an
2364 2364 * error.
2365 2365 *
2366 2366 * The system has always gone through and cleaned up the reserved bits in the
2367 2367 * fxsave state when someone calls setcontext(). Therefore we need to do the
2368 2368 * same thing, which is why you see the masking of the mxcsr below.
2369 2369 *
2370 2370 * Finally, there is one last wrinkle here that we need to consider. The
2371 2371 * fpregset_t has historically had two private words that are used to cache
2372 2372 * the status/exception information. Therefore, we well...
2373 2373 * cheat. Intel has left bytes 464 (0x1d0) through 511 (0x1ff) available for us
2374 2374 * to do what we want. So for the moment we pass the status words through that
2375 2375 * region, to help us move this state around without too much extra allocation.
2376 2376 */
2377 2377 static int
2378 2378 fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
2379 2379 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2380 2380 const uintptr_t max_udata)
2381 2381 {
2382 2382 struct xsave_state *xsave = fpup;
2383 2383
2384 2384 if ((ucx->ucx_bv & info->xi_bits) != 0) {
2385 2385 return (EINVAL);
2386 2386 }
2387 2387
2388 2388 if ((kuc->uc_flags & UC_FPU) != 0) {
2389 2389 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
2390 2390 sizeof (struct fxsave_state));
2391 2391 xsave->xs_fxsave.__fx_ign2[3]._l[0] =
2392 2392 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
2393 2393 xsave->xs_fxsave.__fx_ign2[3]._l[1] =
2394 2394 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
2395 2395 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2396 2396 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2397 2397 }
2398 2398
2399 2399 return (0);
2400 2400 }
2401 2401
2402 2402 static int
2403 2403 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2404 2404 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2405 2405 const uintptr_t max_udata)
2406 2406 {
2407 2407 size_t len, xsave_off;
2408 2408 void *copy_to;
2409 2409 struct xsave_state *xsave = fpup;
2410 2410
2411 2411 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2412 2412 if (*udatap + len > max_udata) {
2413 2413 return (EOVERFLOW);
2414 2414 }
2415 2415
2416 2416 copy_to = (void *)((uintptr_t)fpup + xsave_off);
2417 2417 if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2418 2418 return (EFAULT);
2419 2419 }
2420 2420
2421 2421 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2422 2422 *udatap = *udatap + len;
2423 2423
2424 2424 return (0);
2425 2425 }
2426 2426
2427 2427 static int
2428 2428 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2429 2429 uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2430 2430 {
2431 2431 size_t len, xsave_off;
2432 2432 const void *copy_from;
2433 2433 void *copy_to;
2434 2434 int ret;
2435 2435
2436 2436 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2437 2437 copy_from = (void *)(uintptr_t)fpup + xsave_off;
2438 2438 copy_to = (void *)(udatap + ucx->ucx_len);
2439 2439
2440 2440 ret = copyfunc(copy_from, copy_to, len);
2441 2441 if (ret != 0) {
2442 2442 return (ret);
2443 2443 }
2444 2444
2445 2445 ucx->ucx_len += len;
2446 2446 ucx->ucx_bv |= info->xi_bits;
2447 2447 return (0);
2448 2448 }
2449 2449
2450 2450 /*
2451 2451 * This table contains information about the extended FPU states and synthetic
2452 2452 * information we create for /proc, the ucontext_t, and signal handling. The
2453 2453 * definition of the xsave_proc_info_t describes how each member is used.
2454 2454 *
2455 2455 * In general, this table is expected to be in the order of the xsave data
2456 2456 * structure itself. Synthetic elements that we create can go anywhere and new
2457 2457 * ones should be inserted at the end. This structure is walked in order to
2458 2458 * produce the /proc and signal handling logic, so changing the order is
2459 2459 * meaningful for those and probably should not be done lightly.
2460 2460 */
2461 2461 static const xsave_proc_info_t fpu_xsave_info[] = { {
2462 2462 .xi_type = PRX_INFO_XCR,
2463 2463 .xi_size = sizeof (prxregset_xcr_t),
2464 2464 .xi_align = alignof (prxregset_xcr_t),
2465 2465 .xi_always = true,
2466 2466 .xi_bits = 0,
2467 2467 .xi_fill = fpu_proc_xregs_xcr_fill,
2468 2468 .xi_valid = fpu_proc_xregs_xcr_valid
2469 2469 }, {
2470 2470 /*
2471 2471 * The XSAVE entry covers both the xsave header and the %xmm registers.
2472 2472 * Note, there is no signal copyout information for the %xmm registers
2473 2473 * because it is expected that that data is already in the fpregset_t.
2474 2474 */
2475 2475 .xi_type = PRX_INFO_XSAVE,
2476 2476 .xi_size = sizeof (prxregset_xsave_t),
2477 2477 .xi_align = FPU_ALIGN_XMM,
2478 2478 .xi_always = true,
2479 2479 .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
2480 2480 .xi_fill = fpu_proc_xregs_xsave_fill,
2481 2481 .xi_set = fpu_proc_xregs_xsave_set,
2482 2482 .xi_valid = fpu_proc_xregs_xsave_valid,
2483 2483 .xi_signal_in = fpu_signal_copyin_xmm
2484 2484 }, {
2485 2485 .xi_type = PRX_INFO_YMM,
2486 2486 .xi_size = sizeof (prxregset_ymm_t),
2487 2487 .xi_align = FPU_ALIGN_YMM,
2488 2488 .xi_always = false,
2489 2489 .xi_bits = XFEATURE_AVX,
2490 2490 .xi_fill = fpu_proc_xregs_std_fill,
2491 2491 .xi_set = fpu_proc_xregs_std_set,
2492 2492 .xi_signal_in = fpu_signal_copyin_std,
2493 2493 .xi_valid = fpu_proc_xregs_ymm_valid,
2494 2494 .xi_signal_out = fpu_signal_copyout_std
2495 2495 }, {
2496 2496 /*
2497 2497 * There is no /proc validation function for the mask registers because
2498 2498 * they are the same in ILP32 / LP64 and there is nothing for us to
2499 2499 * actually validate.
2500 2500 */
2501 2501 .xi_type = PRX_INFO_OPMASK,
2502 2502 .xi_size = sizeof (prxregset_opmask_t),
2503 2503 .xi_align = alignof (prxregset_opmask_t),
2504 2504 .xi_always = false,
2505 2505 .xi_bits = XFEATURE_AVX512_OPMASK,
2506 2506 .xi_fill = fpu_proc_xregs_std_fill,
2507 2507 .xi_set = fpu_proc_xregs_std_set,
2508 2508 .xi_signal_in = fpu_signal_copyin_std,
2509 2509 .xi_signal_out = fpu_signal_copyout_std
2510 2510 }, {
2511 2511 .xi_type = PRX_INFO_ZMM,
2512 2512 .xi_size = sizeof (prxregset_zmm_t),
2513 2513 .xi_align = FPU_ALIGN_ZMM,
2514 2514 .xi_always = false,
2515 2515 .xi_bits = XFEATURE_AVX512_ZMM,
2516 2516 .xi_fill = fpu_proc_xregs_std_fill,
2517 2517 .xi_set = fpu_proc_xregs_std_set,
2518 2518 .xi_valid = fpu_proc_xregs_zmm_valid,
2519 2519 .xi_signal_in = fpu_signal_copyin_std,
2520 2520 .xi_signal_out = fpu_signal_copyout_std
2521 2521 }, {
2522 2522 .xi_type = PRX_INFO_HI_ZMM,
2523 2523 .xi_size = sizeof (prxregset_hi_zmm_t),
2524 2524 .xi_align = FPU_ALIGN_ZMM,
2525 2525 .xi_always = false,
2526 2526 .xi_bits = XFEATURE_AVX512_HI_ZMM,
2527 2527 .xi_fill = fpu_proc_xregs_std_fill,
2528 2528 .xi_set = fpu_proc_xregs_std_set,
2529 2529 .xi_valid = fpu_proc_xregs_hi_zmm_valid,
2530 2530 .xi_signal_in = fpu_signal_copyin_std,
2531 2531 .xi_signal_out = fpu_signal_copyout_std
2532 2532 } };
2533 2533
2534 2534 static bool
2535 2535 fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2536 2536 {
2537 2537 return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2538 2538 }
2539 2539
2540 2540 void
2541 2541 fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
2542 2542 uint32_t *dstart)
2543 2543 {
2544 2544 size_t ret = sizeof (prxregset_hdr_t);
2545 2545 uint32_t ninfo = 0;
2546 2546
2547 2547 ASSERT(fpu_xsave_enabled());
2548 2548
2549 2549 /*
2550 2550 * Right now the set of flags that are enabled in the FPU is global.
2551 2551 	 * That is, while the pcb's fpu_ctx_t has the fpu_xsave_mask, the
2552 2552 	 * actual things that might show up and that we care about are
2553 2553 	 * determined by what is set in %xcr0, which is stored in the global
2554 2554 	 * xsave_bv_all. If we move to per-process FPU enablement, which is
2555 2555 	 * likely to come with AMX, then this will need to look at the proc_t,
2556 2556 	 * hence why we've set things up with the unused argument above.
2557 2557 *
2558 2558 * We take two passes through the array. The first is just to count up
2559 2559 * how many informational entries we need.
2560 2560 */
2561 2561 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2562 2562 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2563 2563 continue;
2564 2564 ninfo++;
2565 2565 }
2566 2566
2567 2567 ASSERT3U(ninfo, >, 0);
2568 2568 ret += sizeof (prxregset_info_t) * ninfo;
2569 2569
2570 2570 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2571 2571 size_t curphase;
2572 2572 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2573 2573 continue;
2574 2574
2575 2575 curphase = ret % fpu_xsave_info[i].xi_align;
2576 2576 if (ret < fpu_xsave_info[i].xi_align) {
2577 2577 ret = fpu_xsave_info[i].xi_align;
2578 2578 } else if (curphase != 0) {
2579 2579 			ret += fpu_xsave_info[i].xi_align - curphase;
2580 2580 }
2581 2581
2582 2582 if (i == 0 && dstart != NULL) {
2583 2583 *dstart = ret;
2584 2584 }
2585 2585
2586 2586 ret += fpu_xsave_info[i].xi_size;
2587 2587 }
2588 2588
2589 2589 VERIFY3U(ret, <=, UINT32_MAX);
2590 2590 if (sizep != NULL) {
2591 2591 *sizep = ret;
2592 2592 }
2593 2593
2594 2594 if (ninfop != NULL) {
2595 2595 *ninfop = ninfo;
2596 2596 }
2597 2597 }
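
/*
 * For reference, the running-offset adjustment in the loop above is intended
 * to align the offset up to each included entry's xi_align. A minimal sketch
 * of that align-up using the P2ROUNDUP() macro from <sys/sysmacros.h>; the
 * helper below is purely illustrative and is not used anywhere:
 */
static inline size_t
fpu_proc_xregs_align_up(size_t off, size_t align)
{
	/* Round a non-zero offset up to the next multiple of align. */
	return (P2ROUNDUP(off, align));
}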
2598 2598
2599 2599 /*
2600 2600  * This function supports /proc. Because /proc does not have the process locked
2601 2601  * while processing a PCSXREG, this tries to establish an upper bound that we
2602 2602  * will validate later in fpu_proc_xregs_set(). We basically say that if you
2603 2603  * take the maximum xsave size and add 4 KiB (0x1000), that is a good enough
2604 2604  * approximation of the maximum size.
2605 2605 */
2606 2606 size_t
2607 2607 fpu_proc_xregs_max_size(void)
2608 2608 {
2609 2609 VERIFY(fpu_xsave_enabled());
2610 2610 return (cpuid_get_xsave_size() + 0x1000);
2611 2611 }
2612 2612
2613 2613 /*
2614 2614  * This function supports /proc. In particular, it's meant to perform the
2615 2615 * following:
2616 2616 *
2617 2617 * o Potentially save the current thread's registers.
2618 2618 * o Write out the x86 xsave /proc xregs format data from the xsave data we
2619 2619 * actually have. Note, this can be a little weird for cases where the FPU is
2620 2620  *   not actually enabled, which happens for system processes. Should
2621 2621 * /proc let us read this state?
2622 2622 */
2623 2623 void
2624 2624 fpu_proc_xregs_get(struct _klwp *lwp, void *buf)
2625 2625 {
2626 2626 uint32_t size, ninfo, curinfo, dstart;
2627 2627 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2628 2628 prxregset_hdr_t *hdr = buf;
2629 2629
2630 2630 ASSERT(fpu_xsave_enabled());
2631 2631 fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
2632 2632
2633 2633 /*
2634 2634 	 * Before we get going, defensively zero out the entire data buffer so
2635 2635 	 * that the rest of the fill functions can assume a known, zeroed base.
2636 2636 */
2637 2637 bzero(buf, size);
2638 2638
2639 2639 kpreempt_disable();
2640 2640 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2641 2641 /*
2642 2642 		 * This case suggests that the thread in question doesn't have a
2643 2643 		 * valid FPU save state, which should only happen when it is on
2644 2644 * CPU. If this is the case, we must ensure that we save the
2645 2645 * current FPU state before proceeding. We also sanity check
2646 2646 * several things here before doing this as using /proc on
2647 2647 * yourself is always exciting. fp_save() will ensure that the
2648 2648 * thread is flagged to go back to being an eager FPU before
2649 2649 		 * returning to userland.
2650 2650 */
2651 2651 VERIFY3P(curthread, ==, lwptot(lwp));
2652 2652 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2653 2653 fp_save(fpu);
2654 2654 }
2655 2655 kpreempt_enable();
2656 2656
2657 2657 hdr->pr_type = PR_TYPE_XSAVE;
2658 2658 hdr->pr_size = size;
2659 2659 hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
2660 2660 hdr->pr_pad[3] = 0;
2661 2661 hdr->pr_ninfo = ninfo;
2662 2662
2663 2663 curinfo = 0;
2664 2664 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2665 2665 void *startp;
2666 2666 uint32_t phase;
2667 2667
2668 2668 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2669 2669 continue;
2670 2670
2671 2671 phase = dstart % fpu_xsave_info[i].xi_align;
2672 2672 if (dstart < fpu_xsave_info[i].xi_align) {
2673 2673 ASSERT3U(i, !=, 0);
2674 2674 dstart = fpu_xsave_info[i].xi_align;
2675 2675 } else if (phase != 0) {
2676 2676 ASSERT3U(i, !=, 0);
2677 2677 			dstart += fpu_xsave_info[i].xi_align - phase;
2678 2678 }
2679 2679
2680 2680 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
2681 2681 hdr->pr_info[curinfo].pri_flags = 0;
2682 2682 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
2683 2683 hdr->pr_info[curinfo].pri_offset = dstart;
2684 2684
2685 2685 startp = (void *)((uintptr_t)buf + dstart);
2686 2686 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
2687 2687 dstart += fpu_xsave_info[i].xi_size;
2688 2688 		ASSERT3U(curinfo, <, ninfo);
2689 2689 curinfo++;
2690 2690 }
2691 2691 }
2692 2692
2693 2693 /*
2694 2694 * We have been asked to set the data in the FPU for a given thread. Our
2695 2695 * prmachdep code has already validated that the raw semantics of the data that
2696 2696  * we have are valid (that is, the appropriate sizes, offsets, and flags). We now
2697 2697 * apply additional checking here:
2698 2698 *
2699 2699 * o The xsave structure is present and only valid bits are set.
2700 2700 * o If the xsave component bit-vector is set, we have the corresponding proc
2701 2701 * info item.
2702 2702  * o Read-only items are accepted, but ignored, only if they match what we
2703 2703  *   originally gave the user; mostly a courtesy to simplify read-modify-write.
2704 2704  * o ILP32 processes, which can't support many of the regions, are allowed to
2705 2705  *   have the items here (as we likely gave the items to them), but the data
2706 2706  *   must be zero if the items are present.
2707 2707 *
2708 2708 * We take a first pass through all the data, validating it makes sense for the
2709 2709 * FPU. Only after that point do we ensure that we have the FPU data in question
2710 2710 * and then we clobber all the FPU data. Part of the semantics of setting this
2711 2711 * is that we're setting the entire extended FPU.
2712 2712 */
2713 2713 int
2714 2714 fpu_proc_xregs_set(struct _klwp *lwp, void *buf)
2715 2715 {
2716 2716 prxregset_hdr_t *prx = buf;
2717 2717 model_t model = lwp_getdatamodel(lwp);
2718 2718 uint64_t bv_found = 0;
2719 2719 const prxregset_xsave_t *xsave = NULL;
2720 2720 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2721 2721
2722 2722 VERIFY(fpu_xsave_enabled());
2723 2723
2724 2724 /*
2725 2725 * First, walk each note info header that we have from the user and
2726 2726 * proceed to validate it. The prmachdep code has already validated that
2727 2727 * the size, type, and offset information is valid, but it has not
2728 2728 	 * validated the semantic contents of this or whether someone is trying to
2729 2729 * write something they shouldn't.
2730 2730 *
2731 2731 * While we walk this, we keep track of where the xsave header is. We
2732 2732 * also track all of the bits that we have found along the way so we can
2733 2733 * match up and ensure that everything that was set has a corresponding
2734 2734 * bit in the xsave bitmap. If we have something in the xsave bitmap,
2735 2735 * but not its corresponding data, then that is an error. However, we
2736 2736 * allow folks to write data regions without the bit set in the xsave
2737 2737 * data to make the read, modify, write process simpler.
2738 2738 */
2739 2739 for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2740 2740 const prxregset_info_t *info = &prx->pr_info[i];
2741 2741 bool found = false;
2742 2742
2743 2743 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2744 2744 void *data;
2745 2745 if (info->pri_type != fpu_xsave_info[pt].xi_type)
2746 2746 continue;
2747 2747
2748 2748 found = true;
2749 2749 data = (void *)((uintptr_t)buf + info->pri_offset);
2750 2750 if (fpu_xsave_info[pt].xi_valid != NULL &&
2751 2751 !fpu_xsave_info[pt].xi_valid(model, data)) {
2752 2752 return (EINVAL);
2753 2753 }
2754 2754
2755 2755 if (info->pri_type == PRX_INFO_XSAVE) {
2756 2756 xsave = data;
2757 2757 }
2758 2758 bv_found |= fpu_xsave_info[pt].xi_bits;
2759 2759 break;
2760 2760 }
2761 2761
2762 2762 if (!found) {
2763 2763 return (EINVAL);
2764 2764 }
2765 2765 }
2766 2766
2767 2767 /*
2768 2768 * No xsave data, no dice.
2769 2769 */
2770 2770 if (xsave == NULL) {
2771 2771 return (EINVAL);
2772 2772 }
2773 2773
2774 2774 /*
2775 2775 * If anything is set in the xsave header that was not found as we
2776 2776 	 * walked the structures, then that is an error. The opposite is not true, as
2777 2777 * discussed above.
2778 2778 */
2779 2779 if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
2780 2780 return (EINVAL);
2781 2781 }
2782 2782
2783 2783 /*
2784 2784 * At this point, we consider all the data actually valid. Now we must
2785 2785 * set up this information in the save area. If this is our own lwp, we
2786 2786 	 * must disable the FPU first. Otherwise, we expect that it is already valid.
2787 2787 	 * To try to sanitize this, we will defensively zero the entire region,
2788 2788 	 * since we are setting everything that will end up in it.
2789 2789 */
2790 2790 kpreempt_disable();
2791 2791 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2792 2792 /*
2793 2793 		 * This case suggests that the thread in question doesn't have a
2794 2794 		 * valid FPU save state, which should only happen when it is on
2795 2795 * CPU. If this is the case, we explicitly disable the FPU, but
2796 2796 * do not save it before proceeding. We also sanity check
2797 2797 * several things here before doing this as using /proc on
2798 2798 * yourself is always exciting. Unlike fp_save(), fp_free() does
2799 2799 * not signal that an update is required, so we unconditionally
2800 2800 * set that for all threads.
2801 2801 */
2802 2802 VERIFY3P(curthread, ==, lwptot(lwp));
2803 2803 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2804 2804 fp_free(fpu);
2805 2805 }
2806 2806 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
2807 2807 bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2808 2808 cpuid_get_xsave_size());
2809 2809
2810 2810 for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2811 2811 const prxregset_info_t *info = &prx->pr_info[i];
2812 2812 bool found = false;
2813 2813
2814 2814 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2815 2815 const void *data;
2816 2816 if (info->pri_type != fpu_xsave_info[pt].xi_type)
2817 2817 continue;
2818 2818
2819 2819 /*
2820 2820 * Check if we have a set function and if we should
2821 2821 * include this. We may not if this is something like
2822 2822 * PRX_INFO_XCR which is read-only.
2823 2823 *
2824 2824 * We may not include a given entry as it may not have
2825 2825 * been set in the actual xsave state that we have been
2826 2826 			 * asked to restore, in which case, to not break the
2827 2827 			 * xsaveopt logic, we must leave it in its initial
2828 2828 			 * state (generally all-zero). The XMM initial state
2829 2829 			 * is not all-zero, but that entry is marked with
2830 2830 			 * xi_always to help account for this.
2831 2831 */
2832 2832 found = true;
2833 2833 if (fpu_xsave_info[pt].xi_set == NULL)
2834 2834 break;
2835 2835 if (!fpu_xsave_info[pt].xi_always &&
2836 2836 (xsave->prx_xsh_xstate_bv &
2837 2837 fpu_xsave_info[pt].xi_bits) !=
2838 2838 fpu_xsave_info[pt].xi_bits) {
2839 2839 break;
2840 2840 }
2841 2841
2842 2842 data = (void *)((uintptr_t)buf + info->pri_offset);
2843 2843 fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
2844 2844 xsave->prx_xsh_xstate_bv, data);
2845 2845 }
2846 2846
2847 2847 VERIFY(found);
2848 2848 }
2849 2849 kpreempt_enable();
2850 2850
2851 2851 return (0);
2852 2852 }
2853 2853
2854 2854 /*
2855 2855 * To be included in the signal copyout logic we must have a copy function and
2856 2856  * the bit in question must be set. Note, we don't consult xi_always here
2857 2857  * as that covers what is always present in the xsave logic and therefore
2858 2858  * isn't pertinent to our custom signal format. See the
2859 2859 * big theory statement for more info.
2860 2860 */
2861 2861 static bool
2862 2862 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2863 2863 {
2864 2864 return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2865 2865 infop->xi_signal_out != NULL);
2866 2866 }
2867 2867
2868 2868 /*
2869 2869 * We need to fill out the xsave related data into the ucontext_t that we've
2870 2870 * been given. We should have a valid user pointer at this point in the uc_xsave
2871 2871 * member. This is much simpler than the copyin that we have. Here are the
2872 2872 * current assumptions:
2873 2873 *
2874 2874 * o This is being called for the current thread. This is not meant to operate
2875 2875 * on an arbitrary thread's state.
2876 2876 * o We cannot assume whether the FPU is valid in the pcb or not. While most
2877 2877 * callers will have just called getfpregs() which saved the state, don't
2878 2878 * assume that.
2879 2879  * o We assume that the user address has the required space for this
2880 2880 * to be copied out.
2881 2881 * o We assume that copyfunc() will ensure we are not copying into a kernel
2882 2882 * address.
2883 2883 *
2884 2884 * For more information on the format of the data, see the 'Signal Handling and
2885 2885 * the ucontext_t' portion of the big theory statement. We copy out all the
2886 2886 * constituent parts and then come back and write out the actual final header
2887 2887 * information.
2888 2888 */
2889 2889 int
2890 2890 fpu_signal_copyout(struct _klwp *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2891 2891 {
2892 2892 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2893 2893 uint64_t xs_bv;
2894 2894 uc_xsave_t ucx;
2895 2895 int ret;
2896 2896
2897 2897 VERIFY3P(curthread, ==, lwptot(lwp));
2898 2898 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2899 2899 ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2900 2900
2901 2901 if (!fpu_xsave_enabled()) {
2902 2902 return (ENOTSUP);
2903 2903 }
2904 2904
2905 2905 /*
2906 2906 * Unlike when we're dealing with /proc, we can unconditionally call
2907 2907 	 * fp_save() because this is always called in a context where the lwp
2908 2908 	 * we're operating on is the one on CPU (which is what fp_save()
2909 2909 * asserts).
2910 2910 */
	kpreempt_disable();
2911 2911 	fp_save(fpu);
2912 2912
2913 2913 bzero(&ucx, sizeof (ucx));
2914 2914 ucx.ucx_vers = UC_XSAVE_VERS;
2915 2915 ucx.ucx_len += sizeof (uc_xsave_t);
2916 2916
2917 2917 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2918 2918 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2919 2919 const xsave_proc_info_t *info = &fpu_xsave_info[i];
2920 2920
2921 2921 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
2922 2922 continue;
2923 2923 ret = info->xi_signal_out(info, copyfunc, &ucx,
2924 2924 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2925 2925 uaddr);
2926 2926 if (ret != 0) {
2927 2927 kpreempt_enable();
2928 2928 return (ret);
2929 2929 }
2930 2930 }
2931 2931
2932 2932 /*
2933 2933 * Now that everything has been copied out, we should have an accurate
2934 2934 * value in the uc_xsave_t header and we can copy that out at the start
2935 2935 * of the user data.
2936 2936 */
2937 2937 	ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
	kpreempt_enable();
2938 2938 	return (ret);
2939 2939 }
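
/*
 * To make the layout fpu_signal_copyout() produces concrete, the following is
 * a purely hypothetical userland-side walker, kept under #if 0 so it is never
 * built here. The idea: a uc_xsave_t-style header is followed by the raw data
 * of each component whose bit is set in ucx_bv, appended in ascending
 * feature-bit order. The header layout shown and the use of CPUID leaf 0xD to
 * size each component are assumptions for illustration, not definitions that
 * this file provides.
 */
#if 0
#include <cpuid.h>
#include <stdint.h>
#include <stddef.h>

typedef struct {
	uint32_t ucx_vers;	/* assumed to match UC_XSAVE_VERS */
	uint32_t ucx_len;	/* total length, header included */
	uint64_t ucx_bv;	/* xsave feature bits present in the data */
} ucx_sketch_hdr_t;

static const uint8_t *
ucx_sketch_find(const void *buf, int feature)
{
	const ucx_sketch_hdr_t *hdr = buf;
	const uint8_t *cur = (const uint8_t *)buf + sizeof (*hdr);

	/* Bits 0 and 1 (x87/SSE) travel in the fpregset_t, not here. */
	for (int bit = 2; bit < 64; bit++) {
		unsigned int eax, ebx, ecx, edx;

		if ((hdr->ucx_bv & (1ULL << bit)) == 0)
			continue;
		if (bit == feature)
			return (cur);
		/* CPUID.(EAX=0xD, ECX=bit): %eax is the component size. */
		if (__get_cpuid_count(0xd, bit, &eax, &ebx, &ecx, &edx) == 0)
			return (NULL);
		cur += eax;
	}
	return (NULL);
}
#endif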
2940 2940
2941 2941 /*
2942 2942 * Here we've been given a ucontext_t which potentially has a user pointer to
2943 2943 * xsave state that we've copied out previously. In this case we need to do the
2944 2944 * following, assuming UC_XSAVE is present:
2945 2945 *
2946 2946 * o Copy in our header and validate it.
2947 2947 * o Allocate an fpu context to use as a holding ground for all this data.
2948 2948 * o If UC_FPU is set, override the xsave structure with the saved XMM state,
2949 2949 * clear UC_FPU, and make sure that the correct xsave_bv bits are set.
2950 2950 *
2951 2951 * Currently we always allocate the additional state as a holding ground for the
2952 2952 * FPU. What we're copying in may not be valid and we don't want to clobber the
2953 2953 * existing FPU state or deal with merging it until we believe it's reasonable
2954 2954 * enough. The proc_t is here to set us up for when we have per-process settings
2955 2955 * in the extended feature disable MSRs.
2956 2956 */
2957 2957 int
2958 2958 fpu_signal_copyin(struct _klwp *lwp, ucontext_t *kuc)
2959 2959 {
2960 2960 uc_xsave_t ucx;
2961 2961 uint64_t bv;
2962 2962 uintptr_t data, max_data;
2963 2963 void *fpu;
2964 2964 proc_t *p = lwp->lwp_procp;
2965 2965 size_t ksize;
2966 2966
2967 2967 /*
2968 2968 * Because this has been opaque filler and the kernel has never
2969 2969 * historically looked at it, we don't really care about the uc_xsave
2970 2970 * pointer being garbage in the case that the flag is not set. While
2971 2971 	 * this perhaps isn't the most sporting choice in some cases, it is, on
2972 2972 	 * the other hand, pragmatic.
2973 2973 */
2974 2974 if ((kuc->uc_flags & UC_XSAVE) != 0) {
2975 2975 if (kuc->uc_xsave == 0) {
2976 2976 return (EINVAL);
2977 2977 }
2978 2978
2979 2979 if (!fpu_xsave_enabled()) {
2980 2980 return (ENOTSUP);
2981 2981 }
2982 2982 } else {
2983 2983 return (0);
2984 2984 }
2985 2985
2986 2986 if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
2987 2987 0) {
2988 2988 return (EFAULT);
2989 2989 }
2990 2990
2991 2991 ksize = cpuid_get_xsave_size();
2992 2992 if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
2993 2993 ucx.ucx_len > ksize ||
2994 2994 (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
2995 2995 (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
2996 2996 (uintptr_t)kuc->uc_xsave) {
2997 2997 return (EINVAL);
2998 2998 }
2999 2999
3000 3000 /*
3001 3001 * OK, our goal right now is to recreate a valid xsave_state structure
3002 3002 * that we'll ultimately end up having to merge with our existing one in
3003 3003 * the FPU save state. The reason we describe this as a merge is to help
3004 3004 * future us when we want to retain supervisor state which will never be
3005 3005 * part of userland signal state. The design of the userland signal
3006 3006 * state is basically to compress it as much as we can. This is done for
3007 3007 * two reasons:
3008 3008 *
3009 3009 * 1) We currently consider this a private interface.
3010 3010 * 2) We really want to minimize the actual amount of stack space we
3011 3011 * use as much as possible. Most applications aren't using AVX-512
3012 3012 * right now, so doing our own compression style is worthwhile. If
3013 3013 * libc adopts AVX-512 routines, we may want to change this.
3014 3014 *
3015 3015 * On the allocation below, our assumption is that if a thread has taken
3016 3016 * a signal, then it is likely to take a signal again in the future (or
3017 3017 * be shortly headed to its demise). As such, when that happens we will
3018 3018 * leave the allocated signal stack around for the process. Most
3019 3019 * applications don't allow all threads to take signals, so this should
3020 3020 * hopefully help amortize the cost of the allocation.
3021 3021 */
3022 3022 max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
3023 3023 data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
3024 3024 bv = ucx.ucx_bv;
3025 3025 if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
3026 3026 lwp->lwp_pcb.pcb_fpu.fpu_signal =
3027 3027 kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
3028 3028 }
3029 3029 fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;
3030 3030
3031 3031 /*
3032 3032 * Unconditionally initialize the memory we get in here to ensure that
3033 3033 * it is in a reasonable state for ourselves. This ensures that unused
3034 3034 * regions are mostly left in their initial state (the main exception
3035 3035 * here is the x87/XMM state, but that should be OK). We don't fill in
3036 3036 * the initial xsave state as we expect that to happen as part of our
3037 3037 * processing.
3038 3038 */
3039 3039 bzero(fpu, ksize);
3040 3040
3041 3041 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3042 3042 int ret;
3043 3043 const xsave_proc_info_t *info = &fpu_xsave_info[i];
3044 3044 if (!info->xi_always && (info->xi_bits & bv) == 0)
3045 3045 continue;
3046 3046 bv &= ~info->xi_bits;
3047 3047
3048 3048 if (info->xi_signal_in == NULL)
3049 3049 continue;
3050 3050 ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
3051 3051 if (ret != 0) {
3052 3052 return (ret);
3053 3053 }
3054 3054 }
3055 3055 ASSERT0(bv);
3056 3056
3057 3057 /*
3058 3058 * As described in the big theory statement section 'Signal Handling and
3059 3059 * the ucontext_t', we always remove UC_FPU from here as we've taken
3060 3060 * care of reassembling it ourselves.
3061 3061 */
3062 3062 kuc->uc_flags &= ~UC_FPU;
3063 3063 kuc->uc_xsave = (uintptr_t)fpu;
3064 3064
3065 3065 return (0);
3066 3066 }
3067 3067
3068 3068 /*
3069 3069 * This determines the size of the signal stack that we need for our custom form
3070 3070 * of the xsave state.
3071 3071 */
3072 3072 size_t
3073 3073 fpu_signal_size(struct _klwp *lwp)
3074 3074 {
3075 3075 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3076 3076 size_t len = sizeof (uc_xsave_t);
3077 3077 uint64_t xs_bv;
3078 3078
3079 3079 VERIFY3P(curthread, ==, lwptot(lwp));
3080 3080 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3081 3081 ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3082 3082
3083 3083 if (!fpu_xsave_enabled()) {
3084 3084 return (0);
3085 3085 }
3086 3086
3087 3087 kpreempt_disable();
3088 3088 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3089 3089 fp_save(fpu);
3090 3090 }
3091 3091
3092 3092 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3093 3093 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3094 3094 size_t comp_size;
3095 3095
3096 3096 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3097 3097 continue;
3098 3098
3099 3099 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3100 3100 NULL);
3101 3101 len += comp_size;
3102 3102 }
3103 3103
3104 3104 kpreempt_enable();
3105 3105 return (len);
3106 3106 }
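
/*
 * As a worked example of the computation above (the component size comes from
 * the architectural CPUID leaf 0xD, not from anything this file defines): for
 * a thread whose xsh_xstate_bv has only the x87, SSE, and AVX bits set, the
 * x87/SSE table entries contribute nothing here (they have no xi_signal_out),
 * and the AVX component is 256 bytes, so this returns
 * sizeof (uc_xsave_t) + 256 bytes.
 */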
3107 3107
3108 3108 /*
3109 3109 * This function is used in service of restorecontext() to set the specified
3110 3110 * thread's extended FPU state to the passed in data. Our assumptions at this
3111 3111 * point from the system are:
3112 3112 *
3113 3113 * o Someone has already verified that the actual xsave header is correct.
3114 3114  * o Any traditional XMM state that would cause a #gp has been clamped.
3115 3115  * o The data is basically a correctly sized xsave state structure. Right now
3116 3116 * that means it is not compressed and follows the CPUID-based rules for
3117 3117 * constructing and laying out data.
3118 3118 * o That the lwp argument does refer to the current thread.
3119 3119 *
3120 3120 * Our primary purpose here is to merge the current FPU state with what exists
3121 3121 * here. Right now, "merge", strictly speaking is just "replace". We can get
3122 3122 * away with just replacing everything because all we currently save are user
3123 3123 * states. If we start saving kernel states in here, this will get more nuanced
3124 3124 * and we will need to be more careful about how we store data here.
3125 3125 */
3126 3126 void
3127 3127 fpu_set_xsave(struct _klwp *lwp, const void *data)
3128 3128 {
3129 3129 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3130 3130 uint32_t status, xstatus;
3131 3131 struct xsave_state *dst_xsave;
3132 3132
3133 3133 ASSERT(fpu_xsave_enabled());
3134 3134 VERIFY3P(curthread, ==, lwptot(lwp));
3135 3135 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3136 3136 ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3137 3137
3138 3138 /*
3139 3139 * We use fp_save() here rather than a stock fpdisable() so we can
3140 3140 	 * attempt to honor our invariant that when the thread state has been
3141 3141 * saved, the valid flag is set, even though we're going to be
3142 3142 * overwriting it shortly. If we just called fpdisable() then we would
3143 3143 * basically be asking for trouble.
3144 3144 *
3145 3145 * Because we are modifying the state here and we don't want the system
3146 3146 * to end up in an odd state, we are being a little paranoid and
3147 3147 * disabling preemption across this operation. In particular, once the
3148 3148 * state is properly tagged with FPU_VALID, there should be no other way
3149 3149 * that this thread can return to userland and get cleared out because
3150 3150 * we're resetting its context; however, we let paranoia win out.
3151 3151 */
3152 3152 kpreempt_disable();
3153 3153 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3154 3154 fp_save(fpu);
3155 3155 }
3156 3156
3157 3157 bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
3158 3158 cpuid_get_xsave_size());
3159 3159 dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
3160 3160 status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
3161 3161 xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
3162 3162 dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
3163 3163 dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;
3164 3164
3165 3165 /*
3166 3166 	 * These two status words are values that the kernel itself uses to
3167 3167 	 * track additional information; they are part of the traditional fpregset,
3168 3168 	 * but not part of our xregs information. Because we are setting this
3169 3169 * state, we leave it up to the rest of the kernel to determine whether
3170 3170 * this came from an fpregset_t or is being reset to the default of 0.
3171 3171 */
3172 3172 fpu->fpu_regs.kfpu_status = status;
3173 3173 fpu->fpu_regs.kfpu_xstatus = xstatus;
3174 3174
3175 3175 fpu->fpu_flags |= FPU_VALID;
3176 3176 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3177 3177 kpreempt_enable();
3178 3178 }
3179 3179
3180 3180 /*
3181 3181 * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
3182 3182 * kernel, this is just an fxsave_state with additional values for the status
3183 3183 * and xstatus members.
3184 3184 *
3185 3185 * This has the same nuance as the xregs cases discussed above, but is simpler
3186 3186  * in that we only need to handle the fxsave state, yet more complicated because
3187 3187 * we need to check our save mechanism.
3188 3188 */
3189 3189 void
3190 3190 fpu_get_fpregset(struct _klwp *lwp, fpregset_t *fp)
3191 3191 {
3192 3192 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3193 3193
3194 3194 kpreempt_disable();
3195 3195 fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
3196 3196 fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;
3197 3197
3198 3198 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3199 3199 /*
3200 3200 		 * If we're requesting the fpregs of a thread whose state isn't
3201 3201 		 * currently valid and that isn't the one we're executing, then
3202 3202 * we consider getting this information to be a best-effort and
3203 3203 * we will not stop the thread in question to serialize it,
3204 3204 * which means possibly getting stale data. This is the
3205 3205 * traditional semantics that the system has used to service
3206 3206 * this for /proc.
3207 3207 */
3208 3208 if (curthread == lwptot(lwp)) {
3209 3209 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3210 3210 fp_save(fpu);
3211 3211 }
3212 3212 }
3213 3213
3214 3214 /*
3215 3215 * If the FPU is not enabled and the state isn't valid (due to someone
3216 3216 * else setting it), just copy the initial state.
3217 3217 */
3218 3218 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
3219 3219 bcopy(&sse_initial, fp, sizeof (sse_initial));
3220 3220 kpreempt_enable();
3221 3221 return;
3222 3222 }
3223 3223
3224 3224 /*
3225 3225 * Given that we have an enabled FPU, we must look at the type of FPU
3226 3226 * save mechanism to clean this up. In particular, while we can just
3227 3227 * copy the save area with FXSAVE, with XSAVE we must carefully copy
3228 3228 * only the bits that are valid and reset the rest to their default
3229 3229 * state.
3230 3230 */
3231 3231 switch (fp_save_mech) {
3232 3232 case FP_FXSAVE:
3233 3233 bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
3234 3234 sizeof (struct fxsave_state));
3235 3235 break;
3236 3236 case FP_XSAVE:
3237 3237 fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
3238 3238 (struct fxsave_state *)fp);
3239 3239 break;
3240 3240 default:
3241 3241 panic("Invalid fp_save_mech");
3242 3242 }
3243 3243
3244 3244 kpreempt_enable();
3245 3245 }
3246 3246
3247 3247 /*
3248 3248 * This is a request to set the ABI fpregset_t into our actual hardware state.
3249 3249 * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
3250 3250 * 512-byte fxsave area.
3251 3251 */
3252 3252 void
3253 3253 fpu_set_fpregset(struct _klwp *lwp, const fpregset_t *fp)
3254 3254 {
3255 3255 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3256 3256
3257 3257 kpreempt_disable();
3258 3258 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3259 3259 /*
3260 3260 * We always save the entire FPU. This is required if we're
3261 3261 * using xsave. If we're using fxsave, we could skip the
3262 3262 * 512-byte write and instead just disable the FPU since we'd be
3263 3263 * replacing it all. For now we don't bother with more
3264 3264 * conditional logic.
3265 3265 */
3266 3266 VERIFY3P(curthread, ==, lwptot(lwp));
3267 3267 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3268 3268 fp_save(fpu);
3269 3269 }
3270 3270
3271 3271 fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
3272 3272 fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
3273 3273 switch (fp_save_mech) {
3274 3274 case FP_FXSAVE:
3275 3275 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
3276 3276 sizeof (struct fxsave_state));
3277 3277 break;
3278 3278 case FP_XSAVE:
3279 3279 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
3280 3280 sizeof (struct fxsave_state));
3281 3281 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
3282 3282 XFEATURE_LEGACY_FP | XFEATURE_SSE;
3283 3283 break;
3284 3284 default:
3285 3285 panic("Invalid fp_save_mech");
3286 3286 }
3287 3287
3288 3288 fpu->fpu_flags |= FPU_VALID;
3289 3289 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3290 3290 kpreempt_enable();
3291 3291 }