 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 * Copyright 2021 RackTop Systems, Inc.
 * Copyright 2023 Oxide Computer Company
 */

/*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
/*              All Rights Reserved                             */

/*      Copyright (c) 1987, 1988 Microsoft Corporation          */
/*              All Rights Reserved                             */

/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/trap.h>
#include <sys/fault.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/pcb.h>
#include <sys/lwp.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/fp.h>
#include <sys/siginfo.h>
#include <sys/archsystm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/kfpu.h>
#include <sys/stdbool.h>
#include <sys/stdalign.h>
#include <sys/procfs_isa.h>
#include <sys/sunddi.h>

/*
 * FPU Management Overview
 * -----------------------
 *
 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
 * however, many aspects of its life as a coprocessor are still around in x86.
 *
 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
 * While that state still exists, there is much more that is covered by the FPU.
 * Today, this includes not just traditional FPU state, but also supervisor-only
 * state. The following state is currently managed and covered logically by the
 * idea of the FPU registers and, more generally, is called the Extended
 * Processor States:
 *
 *    o Traditional x87 FPU
 *    o Vector Registers (%xmm, %ymm, %zmm)
 *    o Memory Protection Extensions (MPX) Bounds Registers
 *    o Protected Key Rights Registers (PKRU)
 *    o Processor Trace data
 *    o Control-Flow Enforcement state
 *    o Hardware Duty Cycle
 *    o Hardware P-states
 *
 * The rest of this covers how the FPU is managed and controlled, how state is
 * saved and restored between threads, interactions with hypervisors, and other
 * information exported to userland through aux vectors. A lot of background
 * information is here to synthesize major parts of the Intel SDM, but
 * unfortunately, it is not a replacement for reading it.
 *
 * FPU Control Registers
 * ---------------------
 *
 * Because the x87 FPU began its life as a co-processor and the FPU was
 * optional, there are several bits that show up in %cr0 that we have to
 * manipulate when dealing with the FPU. These are:
 *
 *   o CR0.ET   The 'extension type' bit. This was used originally to indicate
 *              that the FPU co-processor was present. Now it is forced on for
 *              compatibility. This is often used to verify whether or not the
 *              FPU is present.
 *
 *   o CR0.NE   The 'native error' bit. Used to indicate that native error
 *              mode should be enabled. This indicates that we should take traps
 *              on FPU errors. The OS enables this early in boot.
 *
 *   o CR0.MP   The 'Monitor Coprocessor' bit. Used to control whether or not
 
 
 * third is to deal with special lwps like the agent lwp.
 *
 * During exec, we will call fp_exec() which will initialize and set up the FPU
 * state for the process. That will fill in the initial state for the FPU and
 * also set that state in the FPU itself. As part of fp_exec() we also install a
 * thread context operations vector that takes care of dealing with the saving
 * and restoring of the FPU. These context handlers will also be called whenever
 * an lwp is created or forked. In those cases, to initialize the FPU we will
 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
 * operations vector for the new thread.
 *
 * Next we'll end up in the context operation fp_new_lwp(). This saves the
 * current thread's state, initializes the new thread's state, and copies over
 * the relevant parts of the originating thread's state. It's at this point that
 * we also install the FPU context operations into the new thread, which ensures
 * that all future threads that are descendants of the current one get the
 * thread context operations (unless they call exec). A sketch of those
 * operations follows this section.
 *
 * To deal with some things like the agent lwp, we double check the state of the
 * FPU in sys_rtt_common() to make sure that it has been enabled before
 * returning to userland. In general, this path should be rare, but it's useful
 * for the odd lwp here and there.
 *
 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten, for example in restorecontext, due to /proc, or
 * when the lwp calls exec(). Whether the context is being freed or we are
 * resetting the state, we will call fp_free() to disable the FPU and our
 * context.
 *
 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
 * state by calling fp_lwp_cleanup().
 *
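 * As a rough sketch of those operations (member names here are hedged; the
 * real template is built in fp_ctxop_allocate(), later in this file), the
 * context operations tuple pairs the save/restore handlers with the fork and
 * free hooks:
 *
 *	static const struct ctxop_template fp_ctxop_tpl = {
 *		.ct_save = fpsave_ctxt,		(thread going off-CPU)
 *		.ct_restore = fprestore_ctxt,	(thread coming on-CPU)
 *		.ct_fork = fp_new_lwp,		(forklwp()/lwp creation)
 *		.ct_free = fp_free_ctx,		(thread teardown)
 *	};
 *	ctxop_attach(t, fp_ctxop_allocate(fp));
 *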
 * Kernel FPU Multiplexing
 * -----------------------
 *
 * Just as the kernel has to maintain all of the general purpose registers when
 * switching between scheduled threads, the same is true of the FPU registers.
 *
 * When a thread has FPU state, it also has a set of context operations
 * installed. These context operations take care of making sure that the FPU is
 * properly saved and restored during a context switch (fpsave_ctxt and
 * fprestore_ctxt respectively). This means that the current implementation of
 * the FPU is 'eager': when a thread is running, the CPU will have its FPU state
 * loaded. While this is always true when executing in userland, there are a few
 * cases where this is not true in the kernel.
 *
 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
 * employed. This meant that the FPU would be saved on a context switch and the
 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
 * then take a #NM trap, at which point we would restore the FPU from the save
 * area and return to userland. Given the frequency of use of the FPU by libc
 * alone, there's no point returning to userland just to trap again.
 *
 * There are a few cases though where the FPU state may need to be changed for a
 * thread on its behalf. The most notable cases are processes using /proc,
 * restorecontext, forking, etc. In all of these cases the kernel will force a
 * thread's FPU state to be saved into the PCB through the fp_save() function.
 * Whenever the FPU is saved, the FPU_VALID flag is set on the pcb. This
 * indicates that the save state holds currently valid data. As a side effect
 * of this, CR0.TS will be set. To make sure that all of the state is updated
 * before returning to userland, in these cases, we set a flag on the PCB that
 * says the FPU needs to be updated. This will make sure that we take the slow
 * path out of a system call to fix things up for the thread. Because this is a
 * rather rare case, effectively setting the equivalent of t_postsys is
 * acceptable.
 *
 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 * Generally this means it will be cleared immediately by the new thread that is
 * running in a context switch. However, this isn't the case for kernel threads.
 * They currently operate with CR0.TS set as no kernel state is restored for
 * them. This means that using the FPU will cause a #NM and panic.
 *
 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
 * However, because we eagerly restore, the only time that CR0.TS should be set
 * for a non-kernel thread is during operations where it will be cleared before
 * returning to userland and, importantly, the only data that is in it is its
 * own.
 *
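 * Restating that invariant as a sketch (pseudo-code, not an excerpt from the
 * switch path):
 *
 *	fp_save(fp);				(spill live state to the PCB)
 *	ASSERT(fp->fpu_flags & FPU_VALID);	(save area is authoritative)
 *	(CR0.TS is now set and stays set until the state is restored,
 *	which happens before the thread next returns to userland)
 *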
 * Kernel FPU Usage
 * ----------------
 *
 * Traditionally the kernel never used the FPU since it had no need for
 * floating point operations. However, modern FPU hardware supports a variety
 * of SIMD extensions which can speed up code such as parity calculations or
 * encryption.
 *
 * To allow the kernel to take advantage of these features, the
 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
 * around any usage of the FPU by the kernel to ensure that user-level context
 * is properly saved/restored, as well as to properly set up the FPU for use by
 * the kernel. There are a variety of ways this wrapping can be used, as
 * discussed below.
 *
 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
 * operations, the kernel_fpu_alloc() function should be used to allocate a
 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU


 * operations. The tradeoff between using the FPU without a kfpu_state_t
 * structure vs. the overhead of allowing a context switch while using the FPU
 * should be carefully considered on a case by case basis.
 *
 * In other cases, kernel threads have an LWP, but never execute in user space.
 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
 * kernel's FPU state if the thread is context switched, instead of having to
 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
 * enable this behavior. It is the caller's responsibility to ensure that this
 * is only used for a kernel thread which never executes in user space. An
 * illustrative sketch of both patterns follows.
 *
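 * As an illustrative sketch (kernel_fpu_free() is assumed here as the
 * counterpart to kernel_fpu_alloc(); it is not part of this excerpt), an
 * extended kernel FPU consumer looks roughly like:
 *
 *	kfpu_state_t *kfpu = kernel_fpu_alloc(KM_SLEEP);
 *	kernel_fpu_begin(kfpu, 0);
 *	(... SIMD work: parity, crypto, and the like ...)
 *	kernel_fpu_end(kfpu, 0);
 *	kernel_fpu_free(kfpu);
 *
 * A kernel thread with an LWP that never returns to userland can instead pass
 * the KFPU_USE_LWP flag, using the LWP's pcb_fpu area rather than an allocated
 * kfpu_state_t.
 *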
 * FPU Exceptions
 * --------------
 *
 * Certain operations can cause the kernel to take traps due to FPU activity.
 * Generally these events will cause a user process to receive a SIGFPE; if the
 * kernel takes one in kernel context, we will panic. Traditionally the #NM
 * (Device Not Available / No Math) exception generated by CR0.TS would have
 * caused us to restore the FPU. Now it is a fatal event regardless of whether
 * or not userland causes it.
 *
 * While there are some cases where the kernel uses the FPU, it is up to the
 * kernel to use the FPU in a way such that it cannot receive a trap or to use
 * the appropriate trap protection mechanisms.
 *
 * Hypervisors
 * -----------
 *
 * When providing support for hypervisors, things are a little bit more
 * complicated because the FPU is not virtualized at all. This means that they
 * need to save and restore the FPU and %xcr0 across entry and exit to the
 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
 * allow us to use the full native state to make sure that we are always saving
 * and restoring the full FPU that the host sees, even when the guest is using a
 * subset.
 *
 * One tricky aspect of this is that the guest may be using a subset of %xcr0
 * and therefore changing our %xcr0 on the fly. It is vital that when we're
 * saving and restoring the FPU that we always use the largest %xcr0 contents;
 * otherwise we will end up leaving behind data in it.
 *
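 * As a hedged sketch of the save half of that dance (host_xcr0_mask and
 * host_save_area are stand-in names for whatever the HMA layer tracks, and
 * set_xcr() is assumed to be the counterpart of get_xcr()):
 *
 *	uint64_t guest_xcr0 = get_xcr(XFEATURE_ENABLED_MASK);
 *	set_xcr(XFEATURE_ENABLED_MASK, host_xcr0_mask);
 *	xsavep(host_save_area, host_xcr0_mask);
 *	set_xcr(XFEATURE_ENABLED_MASK, guest_xcr0);
 *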
 * ELF PLT Support
 * ---------------
 *
 * rtld has to preserve a subset of the FPU when it is saving and restoring
 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
 * more information. As a result, we set up an aux vector that contains
 * information about what save and restore mechanisms it should be using and
 * the sizing thereof based on what the kernel supports. This is passed down in
 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
 * initialized in fpu_subr.c.
 */

kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout + xsave header + ymm */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)

/*
 * Various sanity checks.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	fpu_ctx_t	kfpu_ctx;
	kfpu_flags_t	kfpu_flags;
	kthread_t	*kfpu_curthread;
};

/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 */
 
		.fx_mxcsr = SSE_MXCSR_INIT,
	},
	.xs_header = {
		/*
		 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
		 * valid, and CPU should initialize XMM/YMM.
		 */
		.xsh_xstate_bv = 1,
		.xsh_xcomp_bv = 0,
	},
};

/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * Initial kfpu state for x87 used by fpinit()
 */
const struct fnsave_state x87_initial = {
	FPU_CW_INIT,	/* f_fcw */
	0,		/* __f_ign0 */
	0,		/* f_fsw */
	0,		/* __f_ign1 */
	0xffff,		/* f_ftw */
	/* rest of structure is zero */
};

/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);
static void fp_new_lwp(void *, void *);
static void fp_free_ctx(void *, int);

static struct ctxop *
fp_ctxop_allocate(struct fpu_ctx *fp)
{
	const struct ctxop_template tpl = {
 

		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
		cfx = &cxs->xs_fxsave;

		bcopy(&avx_initial, cxs, sizeof (*cxs));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		cxs->xs_header.xsh_xstate_bv |=
		    (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/*
	 * Mark that both the parent and child need to have the FPU cleaned up
	 * before returning to userland.
	 */

	ctxop_attach(ct, fp_ctxop_allocate(cfp));
}

/*
 * Free any state associated with floating point context.
 * Fp_free can be called in three cases:
 * 1) from reaper -> thread_free -> freectx -> fp_free
 *	fp context belongs to a thread on deathrow
 *	nothing to do, thread will never be resumed
 *	thread calling ctxfree is reaper
 *
 * 2) from exec -> freectx -> fp_free
 *	fp context belongs to the current thread
 *	must disable fpu, thread calling ctxfree is curthread
 *
 * 3) from restorecontext -> setfpregs -> fp_free
 *	we have a modified context in the memory (lwp->pcb_fpu)
 *	disable fpu and release the fp context for the CPU
 

	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
	fp->fpu_flags = FPU_EN;
}

/*
 * When using xsave/xrstor, these three functions are used by the lwp code to
 * manage the memory for the xsave area.
 */
void
fp_lwp_init(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	/*
	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
	 * value in forklwp() after we duplicate the parent's LWP state.
	 */
	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);

	if (fp_save_mech == FP_XSAVE) {
		/*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
		 */
		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
	}
}

void
fp_lwp_cleanup(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
		kmem_cache_free(fpsave_cachep,
		    fp->fpu_regs.kfpu_u.kfpu_generic);
		lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
	}
}

/*
 * Called during the process of forklwp(). The kfpu_u pointer will have been
 * overwritten while copying the parent's LWP structure. We have a valid copy
 * stashed in the child's lwp_fpu which we use to restore the correct value.
 */
void
fp_lwp_dup(struct _klwp *lwp)
{
	void *xp = lwp->lwp_fpu;
	size_t sz;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		sz = sizeof (struct fxsave_state);
		break;
	case FP_XSAVE:
		sz = cpuid_get_xsave_size();
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/* copy the parent's values into the new lwp's struct */
	bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
	/* now restore the pointer */
	lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
}

/*
 * Handle a processor extension error fault.
 * Returns non-zero for error.
 */

/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
	uint32_t fpcw, fpsw;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind != FP_NO);

	/*
	 * Now we can enable the interrupts.
	 * (NOTE: x87 fp exceptions come thru interrupt gate)
	 */
 
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	if (curthread->t_lwp != NULL) {
		uint_t f;

		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}
/*
 * Signal Handling and the ucontext_t
 * ----------------------------------
 *
 * One of the many gifts that signals give us is the twofold fact that when a
 * signal occurs, the signal handler is allowed to change the CPU's state
 * arbitrarily and, when the signal handler is done executing, we must restore
 * it back to the original state. However, the second part of this is that the
 * signal handler is actually allowed to modify the state that the thread will
 * return to! To create this facade, the kernel will create a full ucontext_t
 * state, effectively calling getcontext(2) on the thread's behalf, and a
 * pointer to that is given to the signal handler (the void * argument for the
 * sa_sigaction function pointer in sigaction(2)). When libc is done with a
 * signal, it will call setcontext(2) with that same ucontext_t.
 *
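 * From the handler's perspective, that looks like a standard sigaction(2)
 * sketch (nothing here is specific to this file):
 *
 *	void
 *	handler(int sig, siginfo_t *sip, void *arg)
 *	{
 *		ucontext_t *ucp = arg;
 *		(inspect or modify ucp; libc restores it via setcontext(2)
 *		when the handler returns)
 *	}
 *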
 * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
 * it's often declared on the stack itself, with the signal handler spilling all
 * this state to the stack. The ucontext_t machine portion was broken into the
 * general purpose and floating point registers. In 64-bit code, the floating
 * point registers were mostly the same as the results of the fxsave instruction
 * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
 * starting point for information, it is transformed into a different shape to
 * deal with the history of the 32-bit SYS V ABI.
 *
 * While this worked, if you're reading this, you're aware that the x86 FPU and
 * extended register states didn't stop at the initial 16 128-bit %xmm
 * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
 * opmask registers. None of these fit inside the standard ucontext_t; however,
 * they must all be preserved and restored across a signal. While the various
 * x86 platform-specific ABIs all suggest that these registers are not preserved
 * across a function call, receiving a signal is not a function call and must be
 * thought of like a process receiving an interrupt. In other words, this
 * extended state must be preserved.
 *
 * To facilitate this, we have extended the ucontext_t structure with an
 * additional flag, UC_XSAVE, which indicates that the traditional padding
 * member, uc_xsave, is actually a pointer to the extended state. While this is
 * accessible outside of a signal handling context through the combination of
 * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
 * state is focused on signal handling. Signal handling spills all this state to
 * the stack and if we cannot spill the entire state to the stack then our
 * inability to deliver the signal results in the process being killed! While
 * there are separate efforts to ensure that the minimum and maximum signal
 * stack sizes are sufficient, we still need to do our part to minimize the
 * likelihood here.
 *
 * In designing this, we make the following observations which have helped us
 * focus our design:
 *
 *   o While the start of an xsave area is the traditional 512-byte fxsave XMM
 *     region, we already have that in the fpregs. Thus there is no reason to
 *     duplicate it. This not only saves 512 bytes of additional stack space,
 *     but it also means we don't have to ask which version of it to take if
 *     they were to differ.
 *
 *   o Many applications out there aren't necessarily using the extended vectors
 *     and even when we do make libc and others take advantage of it, it will
 *     behoove us to ensure that they are put back into their initial state
 *     after use. This leads us to expect that in a number of cases, the actual
 *     extended register state will be in its initial state.
 *
 *   o While the signal handler does allow contents to be modified, we are
 *     starting with making the interface private and thus allowing us to excise
 *     components that are in their initial state.
 *
 *   o There are similarities to what we want to create with the compressed
 *     xsave format; however, because we don't always have support for the
 *     compressed format, we can't just arbitrarily say let's do a compressed
 *     save to the user stack.
 *
 *   o Because we are not handing this state directly to and from hardware, we
 *     don't need to meet some of the constraints of the compressed xsave format
 *     around wanting alignment for the initial save or additional components.
 *
 * All of the above lead us to our own unique format for this data. When the
 * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
 * uc_xsave_t structure which has a magic version number, a 32-bit length of the
 * overall structure, and the 64-bit state bit-vector to represent which
 * components are valid. Following this 8-byte header, each component that is
 * present in the bit vector is immediately written out in roughly ascending bit
 * order (the order is determined based on the order of the fpu_xsave_info
 * array).
 *
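 * In struct form, that header looks roughly like the following (member names
 * here are illustrative; the authoritative definition lives in the ucontext
 * headers):
 *
 *	typedef struct uc_xsave {
 *		uint32_t ucx_vers;	(magic version number)
 *		uint32_t ucx_len;	(32-bit overall length)
 *		uint64_t ucx_bv;	(xsave component bit-vector)
 *		(packed component data follows, in fpu_xsave_info order)
 *	} uc_xsave_t;
 *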
 * This makes the rough logic that we have here when taking a signal and writing
 * out this state as:
 *
 *   1. Ensure that the FPU is saved and that the contents of the pcb save area
 *      are valid. That is, call fp_save() if the state is not already flagged
 *      with FPU_VALID.
 *
 *   2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
 *      and XFEATURE_SSE bits as these will be placed in the xsave area.
 *
 *   3. Initialize the uc_xsave_t by setting our version field, initializing the
 *      length to the length of the current structure, and then setting the
 *      modified bit vector above.
 *
 *   4. Walk each remaining bit of the bit-vector. For each set bit, copy out
 *      its extended state starting at the current length in the header and then
 *      increase the header size by that length.
 *
 *   5. Finally write out the final uc_xsave_t structure.
 *
 * The above process is also used when someone manually calls getcontext_extd(2)
 * to get this state. The main difference between the two is which copyout
 * function we use. This deserves some explanation. Our main starting point for
 * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
 * the signal handling context to operate with a different copyout than we
 * normally use in, say, getcontext_extd(2).
 *
 * When we've received a signal, we're at the intersection of several different
 * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
 * the watchpoints effectively set a copyout override function (t_copyops) that
 * we end up vectoring to rather than a normal copyout. This allows the data to
 * be modified and for the watchpoint to fire. While this is all well and good
 * normally, it is problematic if we are trying to handle a signal. The signal
 * delivery logic, sendsig(), goes through and disables the watchpoint for the
 * region of the stack that we are copying out to. However, disabling
 * watchpoints is not sufficient; we also need to use the copyout_noerr
 * variants.
 *
 * These variants also require the use of on_fault() and no_fault() for error
 * handling. While it is tempting to try and on_fault() the entire
 * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
 * The first is that we don't want to disable faults during the entire operation
 * as if the kernel messes up we will treat that as a user error. That isn't
 * theoretical and happened during development. The second and perhaps more
 * important issue is that correctly bounding the on_fault() / no_fault() means
 * being careful about state. For example, kernel pre-emption is often disabled
 * during parts of these operations, but it needs to be re-enabled when we're
 * done. This would require tracking, in some volatile variable, whether
 * pre-emption had been disabled so that it could be re-enabled on the fault
 * path.
 *
 * Instead, this is why fpu_signal_copyout() takes a copy out function as an
 * argument. When we're in signal handling context, the function will use
 * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
 *
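 * The classic shape of that wrapper (a hedged sketch of the standard
 * on_fault()/no_fault() idiom, not a quote of the actual copyfunc):
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	(the user address went bad)
 *	}
 *	copyout_noerr(kaddr, uaddr, len);
 *	no_fault();
 *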
 * RESTORING STATE
 *
 * Copying out our current state is the easier half of this problem. When the
 * signal handler is done, libc calls setcontext(2) with the ucontext_t we
 * assembled for it as described above. setcontext(2) isn't just used for
 * returning from signals.
 *
 * The process for this goes in two steps. The first step is to copy in,
 * validate, and transform the ucontext_t UC_XSAVE that we created above into an
 * equivalent xsave format that we can use the appropriate xrstor function on.
 * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
 * come back through a second phase that is driven out of restorecontext() and
 * is implemented in fpu_set_xsave().
 *
 * Let's start by discussing the second part of this, which is more
 * straightforward. In particular, the second phase assumes that all of the
 * validation and error handling has been done by the first phase. This means
 * here, we have a buffer that is already the appropriate size
 * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
 * replace the actual save state with the current one.
 *
 * The only piece of shenanigans we have to do is around the kernel provided
 * notion of 'status' and 'xstatus', which are cached versions of the x87 and
 * SSE exception vectors. These are part of the fpregset ABI and therefore we
 * need to propagate them from the temporary storage that part 1 sets up in the
 * ignored region of the fxsave data. We use that because it is not persisted by
 * the CPU, so clobbering it is generally alright.
 *
 * Once that is done, we simply note that we need a PCB update to occur to
 * refresh the FPU state before we return to userland. Given that someone has
 * called setcontext(2), this was always going to happen because we have to
 * update segment registers and related, so this isn't so bad. With that, let's
 * move on to the more nuanced part (1).
 *
 * When we're handling a setcontext(2) we have, in userland, a data structure
 * that should match one we serialized out, though we cannot assume that a user
 * has not modified it either accidentally or maliciously. Our goal is to set up
 * the appropriate xsave state that can be passed to the CPU's xrstor. The first
 * problem we have to deal with is where do we actually put this state?
 *
 * While not many programs actually call setcontext(2) of their own volition,
 * this is going to get hit every time we take a signal. The first thought was
 * to re-use the existing thread's save area; however, that's a bit challenging
 * for a few reasons. In particular, we would need to ensure that we don't go
 * off-CPU for any reason, which we cannot assume with a copyin from a user
 * address space. In particular, it is trivial for us to hit a case where the
 * stack has been paged out for some reason, which rules out that path.
 *
 * Instead, whenever a thread first calls setcontext(2), generally from signal
 * context, we will at that time allocate another entry from the 'fpsave_cachep'
 * kmem cache, giving us a buffer of the appropriate space to handle this. Once
 * this buffer has been allocated, we leave it assigned to the thread's pcb and
 * only tear it down when the thread itself finally exits. We reason that a
 * thread that takes a signal once is either going to have the process exit
 * shortly thereafter or is much more likely to take a signal again in the
 * future. Many daemons and other processes set things up so signals are
 * dispatched via one location, masking signals in other threads, using
 * sigsuspend(2), signalfd(3C), or something similar.
 *
 * With this buffer in hand, we begin our task of reassembling state. Note, all
 * of this is conditional on UC_XSAVE being set in the uc_flags member of the
 * ucontext_t. If it is not set, then we assume that there is no extended state
 * and will use the traditional path of setting the fpregset_t into the system
 * via setfpregs().
 *
 * We first will copyin and validate the uc_xsave_t. In particular, we need to
 * make sure the version makes sense and that the xsave component bit-vector
 * doesn't have anything unexpected or, more importantly, unsupported in it, and
 * that the addresses we've been given are within the user address space. At
 * this point we can walk through our table of implemented bits and process
 * them.
 *
 * For most components in here, the processing is straightforward. We continue
 * walking our cursor and copy data into the kernel and place it in the
 * appropriate place in our xsave state. If an xsave state component bit-vector
 * isn't set, then we must ensure that we have the item in the initial state,
 * which for everything other than the x87/SSE state is the memory being zeroed.
 *
 * The most unique case in the copyin state is that of the x87/SSE state. You
 * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
 * but instead have opted to use the single definition in the fpregset_t. Thus
 * here, we copy it out of the fpregset_t, which the kernel has helpfully
 * already unified into the 64-bit fxsave version prior to calling us, and
 * install that into the save area we're building up.
 *
 * As part of this, there are two important pieces to be aware of. The first is
 * that because the fpregset_t has both the status and xstatus members
 * mentioned earlier, we temporarily copy them to the software-usable ignored
 * areas of the fxsave state so we can corral this extra state into part (2)
 * without needing to allocate additional space. The second piece is that when
 * we're done processing this we explicitly remove the UC_FPU flag that would
 * tell the kernel to proceed with updating that region. The problem is that
 * the data would otherwise go directly into the pcb's save area and not to the
 * intermediate buffer, as it uses the same entry point as /proc, mainly
 * setfpregs().
 *
 * We don't do much validation of the actual contents of the registers that are
 * being set, with the exception of ensuring that no reserved bits of the mxcsr
 * are used. This is not as strict as /proc, but failure here means the process
 * is likely going to die (returning from setcontext() in a signal handler is
 * fatal).
 *
 * /proc xregs
 * -----------
 *
 * Observability of the state of the extended registers is important for
 * understanding the system. While on the surface this is similar to signal
 * handling, it is crucially different in a number of ways:
 *
 *   o In signal handling, we're trying to conserve every byte of stack that we
 *     can.
 *   o The /proc xregs file will end up in core files, which means that we need
 *     a way of knowing what components are present and not present in it,
 *     because this will vary from CPU to CPU due to the addition of
 *     architectural features. For example, some CPUs support AVX-512, but
 *     others do not.
 *   o The signal handling structure is private and we're not trying to have
 *     software modify it; the /proc interfaces, on the other hand, are ones we
 *     do want software to be able to interrogate and manipulate. These need to
 *     be something that we can introduce additional components into and make
 *     other changes that still allow it to work.
 *
 * The x86 xregs format is documented in proc(5). The short form is that the
 * prxregset_hdr_t has a number of information entries, which are of the type
 * prxregset_info_t. Each of the information headers has a type, size, and
 * offset which indicate where to find the additional data.
 *
 * Each entry is described as one of the entries in the fpu_xsave_info[]. These
 * items either are a 1:1 correspondence with an xsave-related feature (e.g.
 * there is one entry for each of the three AVX-512 components) or they are
 * something synthetic that we provide as additional information, such as
 * PRX_INFO_XCR, which is a way of getting information about the system such as
 * what is enabled in %xcr0 out there.
 *
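 * A consumer would walk those headers roughly as follows (field names are
 * illustrative; proc(5) has the authoritative layout):
 *
 *	const prxregset_hdr_t *hdr = buf;
 *	for (uint32_t i = 0; i < hdr->pr_ninfo; i++) {
 *		const prxregset_info_t *info = &hdr->pr_info[i];
 *		const void *data = (const char *)buf + info->pri_offset;
 *		(interpret info->pri_type, e.g. PRX_INFO_XCR, with
 *		info->pri_size bytes of data)
 *	}
 *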
 * Unlike signal handling, we are given the buffer to place everything that
 * needs to be written out. This is partially the design of the /proc APIs. That
 * is, we will always assemble everything into the entire buffer that /proc asks
 * us to, and then it will use as much or as little of it as is required.
 * Similarly, when setting things, we don't have to worry about copying in
 * information in the same way as signal handling does, because /proc takes care
 * of it and always hands us a full buffer. Sizing that is a little nuanced, but
 * is all handled in prmachdep.c.
 *
 * When someone performs a read of the xregs and thus is asking us for the
 * current state, there is a little bit of nuance that we need to deal with
 * here. The first is whether or not the FPU is enabled and the second is, if
 * the FPU is enabled, whether a given component is noted as being in its
 * initial state. This basically gives us three possible states for a given
 * component:
 *
 *   1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
 *      the illumos FPU default for an item. More on that in a moment.
 *   2. The saved xsave state indicates that the bit for a given component is
 *      zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
 *      In this case, we must take the CPU's default for an item. This is
 *      usually the same as illumos, but not always.
 *   3. The saved xsave state indicates that a given component's state bit is
 *      valid. The simplest of our cases. We can just take what we have from the
 *      xsave state.
 *
 * The CPU's default state for most components other than the x87/SSE state is
 * to have it be zeroed. This is what we treat as our default state as well. The
 * primary difference is in the initialization of the x87/SSE state. The SYS V
 * ABI requires that we use a different floating point control word than the
 * hardware default. This means that when we're dealing with case (1) for
 * x87/SSE we have to be more careful than with the other components. Thankfully
 * for everything else this is just keeping it zeroed. A sketch of this
 * selection follows.
 *
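 * Roughly (pseudo-code; FPU_CW_INIT and SSE_MXCSR_INIT are the illumos initial
 * values used elsewhere in this file):
 *
 *	if ((fp->fpu_flags & FPU_EN) == 0) {
 *		(use the illumos default: zeros, except the x87/SSE
 *		portion, which gets FPU_CW_INIT and SSE_MXCSR_INIT)
 *	} else if ((xs->xs_header.xsh_xstate_bv & comp_bit) == 0) {
 *		(use the CPU's initial state for the component)
 *	} else {
 *		(copy the component straight from the xsave area)
 *	}
 *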
 788  * A reasonable question would be why not just skip components that aren't
 789  * marked as present. There are a few reasons we take a different approach and
 790  * always include it. Both of these are to make lives simpler for consumers. In
 791  * the first case, when someone is performing a read and wants to reassemble and
 792  * answer the question of 'what is the value of %ymm0 or %zmm15', they have
 793  * to combine multiple disparate parts. If one knows that the data we put into
 794  * there is always valid and represents what is in hardware and doesn't have to
 795  * keep track of what are the defaults in different circumstances, then that
 796  * greatly simplifies consumers lives. It also helps us for core files and other
 797  * observability cases because the answer to what is the operating system's
 798  * default may change over time.
 799  *
 800  * Similarly, including all the possible structures means that we have
 801  * simplified someone who does a write. Writes are always setting the full state
 802  * of a thread, meaning that if someone wants to modify only a single register
 803  * they must do a read, modify, and write. By including everything that they
 804  * might need, it makes it easier for consumers to do this and not have to cons
 805  * up the whole structure on their own.
 806  *
 807  * When we're setting state, things change around a little bit. We have a few
 808  * constraints that are laid out in proc(5). In particular, we require that the
 809  * PRX_INFO_XSAVE component always be present to tell us which other components
 810  * we expect to be here and which ones we don't. We also are much stricter about
 811  * writes in several ways. Of all the components, PRX_INFO_XCR is read-only
 812  * and may not be modified by a calling process. In addition, 32-bit
 813  * applications have reserved portions in the %ymm, %zmm, etc.
 814  * registers; if a write contains modifications to those reserved portions,
 815  * then we will indicate an error.
 816  *
 817  * Because we are given the entire buffer from userland and don't need to have
 818  * an intermediate place to copy it in, we will validate the entire thing in
 819  * advance. Once it has been validated and we consider it legal, then we will
 820  * translate each entry into its corresponding entry in pcb's normal floating
 821  * point state. This is different from signal handling mostly because of the
 822  * fact that we are not using copyin, and once we get to this point, there is
 823  * no more validation, so we don't have the same concerns around blocking while
 824  * pre-emption is disabled.
 825  *
 826  * The Wrinkle with fpregs
 827  * -----------------------
 828  *
 829  * When we instead turn our attention to the fpregs, whether we're gathering
 830  * them as part of the ucontext_t or as part of /proc, there are a few
 831  * complications that we need to be aware of when we're operating on a kernel
 832  * that is using xsave as the save mechanism. When we're using fxsave as the
 833  * save mechanism, the CPU will always save the entire 512-byte fxsave region.
 834  * The fpregs ABI that the kernel expects is basically this structure itself,
 835  * which is transformed into a 32-bit compatible form in archdep.c.
 836  *
 837  * But xsave makes this much more complex and has been a source of historical
 838  * bugs in the system. In particular, unlike fxsave, xsave has its component bit
 839  * vector that is written out to indicate validity. This means that blindly
 840  * copying the fxsave area without checking those bits will lead us to do the
 841  * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
 842  * while the x87 legacy fp flag covers the rest of the state. This is all good,
 843  * aside from the MXCSR.
 844  *
 845  * One of the more complicated pieces of xsave state management is correctly
 846  * answering the question of when the MXCSR is written out to xsave_state. In
 847  * practice, this is rather convoluted and varies. If either the XMM or AVX
 848  * feature bits are set then the CPU will write out the MXCSR and its mask
 849  * register into the traditional fxsave state region. This behavior is dependent
 850  * on the type of save function that we use. xsave and xsaveopt will look at the
 851  * AVX feature bit; however, xsavec does not and only considers the SSE feature
 852  * bit. This means that when we're retrieving things, we need to check both of
 853  * those bits to determine if we should use the initial state or the value
 854  * written out.
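      *
      * Concretely, the retrieval check reduces to the following sketch
      * (mirroring what fpu_xsave_to_fxsave() does later in this file):
      *
      *     uint64_t bv = xsave->xs_header.xsh_xstate_bv;
      *     if ((bv & (XFEATURE_SSE | XFEATURE_AVX)) != 0) {
      *             mxcsr = xsave->xs_fxsave.fx_mxcsr;
      *     } else {
      *             mxcsr = SSE_MXCSR_INIT;
      *     }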
 855  *
 856  * When we come to someone trying to set the fpregs through /proc, the main
 857  * question we have is what happens to the extended registers. We have opted to
 858  * implement and document it such that a write to the fpregs only impacts the
 859  * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
 860  * copying the data into the save area, set the state bits for x87 and XMM
 861  * state, and then set the FPU to be restored. All in all, this basically means
 862  * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
 863  * that we might have present.
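      *
      * As a rough sketch of that sequence (glossing over the fpregset_t to
      * fxsave translation; the real work happens in the fpregs set path,
      * which is not shown here):
      *
      *     fp_save(fpu);
      *     bcopy(fpregs, &xs->xs_fxsave, sizeof (struct fxsave_state));
      *     xs->xs_header.xsh_xstate_bv |= XFEATURE_LEGACY_FP | XFEATURE_SSE;
      *     PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);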
 864  *
 865  * Forward Looking: Adding Intel AMX Support
 866  * -----------------------------------------
 867  *
 868  * Nothing can stop the march of features being added into the FPU. One of the
 869  * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
 870  * Extensions (AMX), which add a large chunk of xsave state to each process.
 871  * While things like AVX and AVX-512 have been enabled by default, the broader
 872  * OS community has not wanted to do this for AMX, because of the size of
 873  * the state which exceeds 8 KiB. While the signal handling state went out of
 874  * its way to minimize the size it wrote to the stack, if this is used, it would
 875  * need to be preserved.
 876  *
 877  * To deal with this reality, and the fact that folks don't really want to
 878  * enable it by default when its use will be quite special
 879  * purpose, Intel has also added an MSR for extended feature disable, or xfd.
 880  * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
 881  * assumption, and the reason that so much of the /proc and signal logic ensures
 882  * that we have the thread and process around, taking as an example the unused
 883  * process argument in fpu_proc_xregs_info(), is that we will follow suit and
 884  * default to having support disabled, but that a process will be able to opt
 885  * into it, which will result in several different assumptions around signal
 886  * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
 887  *
 888  * The following is a list of items to pay attention to for future folks who
 889  * work on this:
 890  *
 891  *   o We will want to confirm whether other systems have opted to make this
 892  *     process-wide or thread-wide. Assuming process-wide, we will need to do a
 893  *     hold of all lwps while making a change. The interface for that probably
 894  *     doesn't want to be /proc, as a process probably doesn't want to write to
 895  *     its own control file. Changing it for another process could be done
 896  *     through the agent-lwp.
 897  *   o Opting into this should probably be a one-way street.
 898  *   o Opting into this will need to evaluate all threads and in particular
 899  *     stack sizes to confirm they adhere to the new minimum.
 900  *   o We will need to make sure that setting and clearing the xfd MSR is part
 901  *     of the FPU context ops and something we set by default on every CPU.
 902  *   o We will need to add a new interface to allow opting into this feature.
 903  *   o We will need to ensure that all subsequently created signal stacks adhere
 904  *     to a required minimum size that we communicate through libc.
 905  *   o We will need to make sure that both rtld and libc no longer rely on a
 906  *     static value of the AT_SUN_FPSIZE, but rather realize that this can be
 907  *     dynamic. At that time, we should evaluate if we can get away with not
 908  *     needing to save this for rtld, even though signal handlers should assume
 909  *     they will.
 910  *   o The various components (because there is more than one) will want to be
 911  *     added to the fpu_xsave_info[]. Consulting the process's xfd will be
 912  *     required and will probably require logic changes.
 913  *
 914  * The above is not exhaustive. We'll probably have some other issues and fun
 915  * while doing this.
 916  */
 917 
 918 /*
 919  * The kind of FPU we advertise to rtld so it knows what to do when working
 920  * through the PLT.
 921  */
 922 int fp_elf = AT_386_FPINFO_FXSAVE;
 923 
 924 /*
 925  * Mechanism to save FPU state.
 926  */
 927 int fp_save_mech = FP_FXSAVE;
 928 
 929 kmem_cache_t *fpsave_cachep;
 930 
 931 /* Legacy fxsave layout + xsave header + ymm */
 932 #define AVX_XSAVE_SIZE          (512 + 64 + 256)
 933 
 934 /*
 935  * Various sanity checks.
 936  */
 937 CTASSERT(sizeof (struct fxsave_state) == 512);
 938 CTASSERT(sizeof (struct fnsave_state) == 108);
 939 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
 940 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
 941 
 942 /*
 943  * Basic architectural alignment information.
 944  */
 945 #define FPU_ALIGN_XMM   16
 946 #define FPU_ALIGN_YMM   32
 947 #define FPU_ALIGN_ZMM   64
 948 
 949 /*
 950  * This structure is the x86 implementation of the kernel FPU that is defined in
 951  * uts/common/sys/kfpu.h.
 952  */
 953 
 954 typedef enum kfpu_flags {
 955         /*
 956          * This indicates that the save state has initial FPU data.
 957          */
 958         KFPU_F_INITIALIZED = 0x01
 959 } kfpu_flags_t;
 960 
 961 struct kfpu_state {
 962         fpu_ctx_t       kfpu_ctx;
 963         kfpu_flags_t    kfpu_flags;
 964         kthread_t       *kfpu_curthread;
 965 };
 966 
 967 /*
 968  * Initial kfpu state for SSE/SSE2 used by fpinit()
 969  */
 
 991                 .fx_mxcsr = SSE_MXCSR_INIT,
 992         },
 993         .xs_header = {
 994                 /*
 995                  * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
 996                  * valid, and CPU should initialize XMM/YMM.
 997                  */
 998                 .xsh_xstate_bv = 1,
 999                 .xsh_xcomp_bv = 0,
1000         },
1001 };
1002 
1003 /*
1004  * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
1005  * the #gp exception caused by setting unsupported bits in the
1006  * MXCSR register
1007  */
1008 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
1009 
1010 /*
1011  * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
1012  * have an XSAVE-capable chip in fpu_probe.
1013  */
1014 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
1015 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
1016 
1017 /*
1018  * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
1019  */
1020 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
1021 
1022 static int fpe_sicode(uint_t);
1023 static int fpe_simd_sicode(uint_t);
1024 static void fp_new_lwp(void *, void *);
1025 static void fp_free_ctx(void *, int);
1026 
1027 static struct ctxop *
1028 fp_ctxop_allocate(struct fpu_ctx *fp)
1029 {
1030         const struct ctxop_template tpl = {
 
1090 
1091                 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
1092 
1093                 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1094                 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
1095                 cfx = &cxs->xs_fxsave;
1096 
1097                 bcopy(&avx_initial, cxs, sizeof (*cxs));
1098                 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
1099                 cfx->fx_fcw = fx->fx_fcw;
1100                 cxs->xs_header.xsh_xstate_bv |=
1101                     (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
1102                 break;
1103         default:
1104                 panic("Invalid fp_save_mech");
1105                 /*NOTREACHED*/
1106         }
1107 
1108         /*
1109          * Mark that both the parent and child need to have the FPU cleaned up
1110          * before returning to userland.
1111          */
1112 
1113         ctxop_attach(ct, fp_ctxop_allocate(cfp));
1114 }
1115 
1116 /*
1117  * Free any state associated with floating point context.
1118  * Fp_free can be called in three cases:
1119  * 1) from reaper -> thread_free -> freectx-> fp_free
1120  *      fp context belongs to a thread on deathrow
1121  *      nothing to do,  thread will never be resumed
1122  *      thread calling ctxfree is reaper
1123  *
1124  * 2) from exec -> freectx -> fp_free
1125  *      fp context belongs to the current thread
1126  *      must disable fpu, thread calling ctxfree is curthread
1127  *
1128  * 3) from restorecontext -> setfpregs -> fp_free
1129  *      we have a modified context in the memory (lwp->pcb_fpu)
1130  *      disable fpu and release the fp context for the CPU
 
1295 
1296         ASSERT((fp->fpu_flags & FPU_VALID) == 0);
1297         fp->fpu_flags = FPU_EN;
1298 }
1299 
1300 /*
1301  * When using xsave/xrstor, these three functions are used by the lwp code to
1302  * manage the memory for the xsave area.
1303  */
1304 void
1305 fp_lwp_init(struct _klwp *lwp)
1306 {
1307         struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1308 
1309         /*
1310          * We keep a copy of the pointer in lwp_fpu so that we can restore the
1311          * value in forklwp() after we duplicate the parent's LWP state.
1312          */
1313         lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
1314             kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
1315         fp->fpu_signal = NULL;
1316 
1317         if (fp_save_mech == FP_XSAVE) {
1318                 /*
1319                  * We bzero since the fpinit() code path will only
1320                  * partially initialize the xsave area using avx_initial.
1321                  */
1323                 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
1324                 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
1325         }
1326 }
1327 
1328 void
1329 fp_lwp_cleanup(struct _klwp *lwp)
1330 {
1331         struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1332 
1333         if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
1334                 kmem_cache_free(fpsave_cachep,
1335                     fp->fpu_regs.kfpu_u.kfpu_generic);
1336                 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
1337         }
1338 
1339         if (fp->fpu_signal != NULL) {
1340                 kmem_cache_free(fpsave_cachep, fp->fpu_signal);
1341                 fp->fpu_signal = NULL;
1342         }
1343 }
1344 
1345 /*
1346  * Called during the process of forklwp(). The kfpu_u pointer will have been
1347  * overwritten while copying the parent's LWP structure. We have a valid copy
1348  * stashed in the child's lwp_fpu which we use to restore the correct value.
1349  */
1350 void
1351 fp_lwp_dup(struct _klwp *lwp)
1352 {
1353         void *xp = lwp->lwp_fpu;
1354         size_t sz;
1355 
1356         switch (fp_save_mech) {
1357         case FP_FXSAVE:
1358                 sz = sizeof (struct fxsave_state);
1359                 break;
1360         case FP_XSAVE:
1361                 sz = cpuid_get_xsave_size();
1362                 break;
1363         default:
1364                 panic("Invalid fp_save_mech");
1365                 /*NOTREACHED*/
1366         }
1367 
1368         /* copy the parent's values into the new lwp's struct */
1369         bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
1370         /* now restore the pointer */
1371         lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1372         /* Ensure that we don't inherit our parent's signal state */
1373         lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
1374 }
1375 
1376 /*
1377  * Handle a processor extension error fault
 1378  * Returns non-zero for error.
1379  */
1380 
1381 /*ARGSUSED*/
1382 int
1383 fpexterrflt(struct regs *rp)
1384 {
1385         uint32_t fpcw, fpsw;
1386         fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1387 
1388         ASSERT(fp_kind != FP_NO);
1389 
1390         /*
1391          * Now we can enable the interrupts.
1392          * (NOTE: x87 fp exceptions come thru interrupt gate)
1393          */
 
1960                                     "for %p, but active thread is not "
1961                                     "curthread", kfpu);
1962                         } else {
1963                                 kfpu->kfpu_curthread = NULL;
1964                         }
1965                 }
1966 
1967                 kpreempt_enable();
1968         }
1969 
1970         if (curthread->t_lwp != NULL) {
1971                 uint_t f;
1972 
1973                 if (flags & KFPU_USE_LWP) {
1974                         f = FPU_EN | FPU_KERNEL;
1975                 } else {
1976                         f = FPU_KERNEL;
1977                 }
1978                 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1979         }
1980 }
1981 
1982 /*
1983  * Fill in FPU information that is required by exec.
1984  */
1985 void
1986 fpu_auxv_info(int *typep, size_t *lenp)
1987 {
1988         *typep = fp_elf;
1989         switch (fp_save_mech) {
1990         case FP_FXSAVE:
1991                 *lenp = sizeof (struct fxsave_state);
1992                 break;
1993         case FP_XSAVE:
1994                 *lenp = cpuid_get_xsave_size();
1995                 break;
1996         default:
1997                 *lenp = 0;
1998                 break;
1999         }
2000 }
2001 
2002 /*
2003  * This function exists to transform an xsave_state into an fxsave_state. The
2004  * way that we have to do this is nuanced. We assume that callers have already
2005  * handled FPU_EN and thus we only need to consider the xsave_state and its
2006  * component vector itself. This results in the following cases that we need to
2007  * consider:
2008  *
2009  *   o Neither the x87 / XMM state bits are set. We use the hardware default and
2010  *     need to ensure to copy the xsave header.
2011  *   o Both x87 / XMM state bits are set. We can copy everything.
2012  *   o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2013  *     state be in the initial case.
2014  *   o Only the XMM bit is set. The reverse of the above case.
2015  *
2016  * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2017  * generally the same; however, the default floating point control word is
2018  * different.
2019  *
 2020  * Finally, we have the complication of the MXCSR and MXCSR_MASK registers.
2021  * Because we are using xsave and xsaveopt in the kernel right now and not
2022  * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2023  * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2024  * is set, we must also come back and copy out the MXCSR register. Sorry, we
2025  * don't make the rules.
2026  */
2027 static void
2028 fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
2029 {
2030         const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
2031 
2032         switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
2033         case XFEATURE_LEGACY_FP | XFEATURE_SSE:
2034                 bcopy(xsave, fx, sizeof (*fx));
2035                 return;
2036         case XFEATURE_LEGACY_FP:
2037                 bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
2038                 fx->fx_mxcsr = SSE_MXCSR_INIT;
2039                 fx->fx_mxcsr_mask = 0;
2040                 break;
2041         case XFEATURE_SSE:
2042                 bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
2043                     fx_mxcsr));
2044 
2045                 fx->fx_fcw = FPU_CW_INIT_HW;
2046                 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2047                 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2048                 bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
2049                 break;
2050         default:
2051                 bcopy(&sse_initial, fx, sizeof (*fx));
2052                 fx->fx_fcw = FPU_CW_INIT_HW;
2053                 break;
2054         }
2055 
2056         /*
2057          * Account for the AVX causing MXCSR to be valid.
2058          */
2059         if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
2060             (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
2061                 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2062                 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2063         }
2064 }
2065 
2066 /*
 2067  * This function is designed to answer the question of whether we are using any
 2068  * of the xsave family of instructions in context switch, and therefore have
 2069  * this state. This should still remain true if we are using xsavec or xsaves
 2070  * in the kernel in the future.
2071  */
2072 boolean_t
2073 fpu_xsave_enabled(void)
2074 {
2075         return (fp_save_mech == FP_XSAVE);
2076 }
2077 
2078 /*
2079  * The following structure is used to track and manage the programmatic
2080  * construction of /proc and signal stack spilling of xsave information. All
2081  * known xsave types that the kernel supports must be included here.
2082  */
2083 typedef struct xsave_proc_info {
2084         /*
 2085  * This matches the /proc xregs type that this data represents. This is
2086          * used for /proc only.
2087          */
2088         uint32_t xi_type;
2089         /*
2090          * This indicates the size of the /proc data that we're operating on.
2091          * This is only used for /proc.
2092          */
2093         size_t  xi_size;
2094         /*
2095          * This indicates the alignment that we want to have for the member when
2096          * we're writing out. This is not used when setting data. This is only
2097          * used for /proc.
2098          */
2099         size_t  xi_align;
2100         /*
2101          * This indicates whether this member must always be considered or not.
2102          * This is used in both /proc and context/signal handling.
2103          */
2104         bool    xi_always;
2105         /*
 2106  * This contains the bits in the xsave bit vector that correspond
 2107  * to this entry. This is used for both /proc and
2108          * context/signal handling.
2109          */
2110         uint64_t xi_bits;
2111         /*
2112          * The xi_fill function pointer is used to write out the /proc regset
2113          * data (e.g. when a user reads xregs). This is only used for the /proc
2114          * handling. The xi_valid function pointer is used instead to validate a
2115          * given set of data that we've read in, while the xi_set pointer is
2116          * used to actually transform the data in the underlying fpu save area.
2117          */
2118         void    (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
2119             void *);
2120         bool    (*xi_valid)(model_t, const void *);
2121         void    (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
2122             uint64_t, const void *);
2123         /*
2124          * The xi_signal_in and xi_signal_out function pointers are used for
2125          * extended context and signal handling information. They are used when
 2126  * reading in data from a ucontext_t and writing it out respectively.
2127          * These are only used for context/signal handling.
2128          */
2129         int     (*xi_signal_in)(const struct xsave_proc_info *,
2130             const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
2131             const uintptr_t);
2132         int     (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
2133             uc_xsave_t *, const void *fpup, uintptr_t);
2134 } xsave_proc_info_t;
2135 
2136 static bool
2137 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2138 {
2139         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
 2140                 return (true);
2141         }
2142 
2143         return ((fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv &
2144             feats) == 0);
2145 }
2146 
2147 static void
2148 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2149     void *datap)
2150 {
2151         prxregset_xcr_t *xcr = datap;
2152 
2153         xcr->prx_xcr_xcr0 = xsave_bv_all;
2154 }
2155 
2156 /*
2157  * Unlike other instruction portions, we treat the xsave header and the legacy
2158  * XMM section together as both are somewhat tied at the instruction hip. Unlike
 2159  * the other components, the initial state here is not quite the same.
2160  */
2161 static void
2162 fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2163     void *datap)
2164 {
2165         prxregset_xsave_t *prxsave = datap;
2166         const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
2167         size_t hdr_off;
2168 
2169         /*
2170          * In the x87/XMM case, the no device vs. initial state is different
2171          * because the initial state case still wants us to copy the real xsave
2172          * header. It's also worth calling out that the actual illumos default
2173          * fxsave state is not the same as what Intel documents. The main
2174          * difference is in what the x87 FPU control word is. This results in
2175          * the following different cases that we need to think about:
2176          *
 2177          *   o FPU_EN is not set. So we use the illumos default.
               *   o FPU_EN is set. We must instead consult the xsave component
               *     bit vector, which fpu_xsave_to_fxsave() handles for us.
2178          */
2179         if ((fpu->fpu_flags & FPU_EN) == 0) {
2180                 bcopy(&avx_initial, prxsave, sizeof (*prxsave));
2181                 return;
2182         }
2183 
2184         /*
 2185          * Convert the fxsave region while taking into account the validity
 2186          * of the xsave bits. The prxregset_xsave_t structure is identical in
 2187          * its first 512 bytes to the struct fxsave_state.
2188          */
2189         fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
2190 
2191         /*
2192          * Now that we've dealt with the x87 and XMM state, take care of the
2193          * header.
2194          */
2195         hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
2196         bcopy((const void *)((uintptr_t)xsave + hdr_off),
2197             (void *)((uintptr_t)prxsave + hdr_off),
2198             sizeof (struct xsave_header));
2199 }
2200 
2201 static void
2202 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2203     void *datap)
2204 {
2205         if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2206                 size_t size, off;
2207                 const void *xsave_off;
2208 
2209                 cpuid_get_xsave_info(info->xi_bits, &size, &off);
2210                 ASSERT3U(size, ==, info->xi_size);
2211                 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2212                     off);
2213                 bcopy(xsave_off, datap, info->xi_size);
2214         }
2215 }
2216 
2217 /*
2218  * Users are not allowed to actually set the xcr information this way. However,
2219  * to make it easier for someone to just do a read, modify, write, of the xregs
2220  * data, if it is identical, then we will accept it (and do nothing).
2221  */
2222 static bool
2223 fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2224 {
2225         const prxregset_xcr_t *xcr = datap;
2226 
2227         return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2228             xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2229 }
2230 
2231 /*
2232  * To match traditional /proc semantics, we do not error if reserved bits of
 2233  * MXCSR are set; they will be masked off when writing data. We do not allow
2234  * someone to indicate that they are asking for compressed xsave data, hence the
2235  * check that prx_xsh_comp_bv is zero. Finally, we will check that each
2236  * component that was indicated in the xstate_bv is present as another item as
2237  * part of the broader validation path.
2238  */
2239 static bool
2240 fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2241 {
2242         const prxregset_xsave_t *xsave = datap;
2243         uint64_t rsvd[6] = { 0 };
2244 
2245         if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2246             xsave->prx_xsh_xcomp_bv != 0) {
2247                 return (false);
2248         }
2249 
2250         if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2251                 return (false);
2252         }
2253 
2254         return (true);
2255 }
2256 
2257 /*
2258  * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2259  * on x86; however, when operating in ILP32, subsets are reserved. We basically
2260  * require that all reserved portions are set to zero as our way to accept them.
2261  */
2262 static bool
2263 fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2264 {
2265         upad128_t ymm_zero[8];
2266         const prxregset_ymm_t *ymm = datap;
2267 
2268         if (model == DATAMODEL_LP64) {
2269                 return (true);
2270         }
2271 
2272         bzero(&ymm_zero, sizeof (ymm_zero));
2273         return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2274 }
2275 
2276 static bool
2277 fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2278 {
2279         upad256_t zmm_zero[8];
2280         const prxregset_zmm_t *zmm = datap;
2281 
2282         if (model == DATAMODEL_LP64) {
2283                 return (true);
2284         }
2285 
2286         bzero(&zmm_zero, sizeof (zmm_zero));
2287         return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2288 }
2289 
2290 static bool
2291 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2292 {
2293         prxregset_hi_zmm_t hi_zmm_zero;
2294         const prxregset_hi_zmm_t *hi_zmm = datap;
2295 
2296         if (model == DATAMODEL_LP64) {
2297                 return (true);
2298         }
2299 
2300         bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2301         return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2302 }
2303 
2304 /*
 2305  * The xsave state consists of the first 512 bytes of the XMM state and then the
2306  * xsave header itself. Because of the xsave header, this structure is marked
2307  * with xi_always, so we must always process and consider it.
2308  *
2309  * Semantically if either of the bits around SSE / x87 is set, then we will copy
2310  * the entire thing. This may mean that we end up copying a region that is not
2311  * valid into the save area; however, that should be OK as we still have the
2312  * specific bit flags that indicate what we should consider or not.
2313  *
2314  * There is one additional wrinkle we need to consider and honor here. The CPU
2315  * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
 2316  * anything else. So if this is set and we do not have valid x87/XMM bits
2317  * set then we will set the MXCSR to its default state in case the processor
2318  * tries to load it. For reference see:
2319  *
2320  *   o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
2321  *   o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
2322  *
 2323  * Note, the behavior around this changes depending on whether we are using the
2324  * compressed xrstor or not. We are not, but it's worth being aware of. We do
2325  * not worry about MXCSR_MASK because the instructions ignore it.
2326  */
2327 static void
2328 fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2329     uint64_t xsave_bv, const void *datap)
2330 {
2331         const struct xsave_state *xs = datap;
2332 
2333         if ((xsave_bv & info->xi_bits) != 0) {
2334                 bcopy(&xs->xs_fxsave, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave,
2335                     sizeof (struct fxsave_state));
2336         } else if ((xsave_bv & XFEATURE_AVX) != 0) {
2337                 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr =
2338                     SSE_MXCSR_INIT;
2339         }
2340 
2341         bcopy(&xs->xs_header, &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header,
2342             sizeof (struct xsave_header));
2343         fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2344 }
2345 
2346 static void
2347 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2348     uint64_t xsave_bv, const void *datap)
2349 {
2350         size_t size, off;
2351         void *xsave_off;
2352 
2353         cpuid_get_xsave_info(info->xi_bits, &size, &off);
2354         xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2355             off);
2356         bcopy(datap, xsave_off, size);
2357 }
2358 
2359 /*
2360  * Dealing with XMM data is a little more annoying here. If UC_FPU is set, it
2361  * also contains a copy of the XMM region. That must take priority over anything
2362  * we have here. In the copyout code we do not set the XMM bits here as
2363  * something to copy, therefore if they are set, we currently treat that as an
2364  * error.
2365  *
2366  * The system has always gone through and cleaned up the reserved bits in the
2367  * fxsave state when someone calls setcontext(). Therefore we need to do the
2368  * same thing which is why you see the masking of the mxcsr below.
2369  *
2370  * Finally, there is one last wrinkle here that we need to consider. The
 2371  * fpregset_t has historically had two private words that are used to cache
 2372  * the status/exception information. Therefore, we well...
2373  * cheat. Intel has left bytes 464 (0x1d0) through 511 (0x1ff) available for us
2374  * to do what we want. So we will pass this through that for the moment to help
2375  * us pass this state around without too much extra allocation.
2376  */
2377 static int
2378 fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
2379     const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2380     const uintptr_t max_udata)
2381 {
2382         struct xsave_state *xsave = fpup;
2383 
2384         if ((ucx->ucx_bv & info->xi_bits) != 0) {
2385                 return (EINVAL);
2386         }
2387 
2388         if ((kuc->uc_flags & UC_FPU) != 0) {
2389                 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
2390                     sizeof (struct fxsave_state));
2391                 xsave->xs_fxsave.__fx_ign2[3]._l[0] =
2392                     kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
2393                 xsave->xs_fxsave.__fx_ign2[3]._l[1] =
2394                     kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
2395                 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2396                 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2397         }
2398 
2399         return (0);
2400 }
2401 
2402 static int
2403 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2404     const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2405     const uintptr_t max_udata)
2406 {
2407         size_t len, xsave_off;
2408         void *copy_to;
2409         struct xsave_state *xsave = fpup;
2410 
2411         cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2412         if (*udatap + len > max_udata) {
2413                 return (EOVERFLOW);
2414         }
2415 
2416         copy_to = (void *)((uintptr_t)fpup + xsave_off);
2417         if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2418                 return (EFAULT);
2419         }
2420 
2421         xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2422         *udatap = *udatap + len;
2423 
2424         return (0);
2425 }
2426 
2427 static int
2428 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2429     uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2430 {
2431         size_t len, xsave_off;
2432         const void *copy_from;
2433         void *copy_to;
2434         int ret;
2435 
2436         cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
 2437         copy_from = (void *)((uintptr_t)fpup + xsave_off);
2438         copy_to = (void *)(udatap + ucx->ucx_len);
2439 
2440         ret = copyfunc(copy_from, copy_to, len);
2441         if (ret != 0) {
2442                 return (ret);
2443         }
2444 
2445         ucx->ucx_len += len;
2446         ucx->ucx_bv |= info->xi_bits;
2447         return (0);
2448 }
2449 
2450 /*
2451  * This table contains information about the extended FPU states and synthetic
2452  * information we create for /proc, the ucontext_t, and signal handling. The
2453  * definition of the xsave_proc_info_t describes how each member is used.
2454  *
2455  * In general, this table is expected to be in the order of the xsave data
2456  * structure itself. Synthetic elements that we create can go anywhere and new
2457  * ones should be inserted at the end. This structure is walked in order to
2458  * produce the /proc and signal handling logic, so changing the order is
2459  * meaningful for those and probably should not be done lightly.
2460  */
2461 static const xsave_proc_info_t fpu_xsave_info[] = { {
2462         .xi_type = PRX_INFO_XCR,
2463         .xi_size = sizeof (prxregset_xcr_t),
2464         .xi_align = alignof (prxregset_xcr_t),
2465         .xi_always = true,
2466         .xi_bits = 0,
2467         .xi_fill = fpu_proc_xregs_xcr_fill,
2468         .xi_valid = fpu_proc_xregs_xcr_valid
2469 }, {
2470         /*
2471          * The XSAVE entry covers both the xsave header and the %xmm registers.
2472          * Note, there is no signal copyout information for the %xmm registers
2473          * because it is expected that that data is already in the fpregset_t.
2474          */
2475         .xi_type = PRX_INFO_XSAVE,
2476         .xi_size = sizeof (prxregset_xsave_t),
2477         .xi_align = FPU_ALIGN_XMM,
2478         .xi_always = true,
2479         .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
2480         .xi_fill = fpu_proc_xregs_xsave_fill,
2481         .xi_set = fpu_proc_xregs_xsave_set,
2482         .xi_valid = fpu_proc_xregs_xsave_valid,
2483         .xi_signal_in = fpu_signal_copyin_xmm
2484 }, {
2485         .xi_type = PRX_INFO_YMM,
2486         .xi_size = sizeof (prxregset_ymm_t),
2487         .xi_align = FPU_ALIGN_YMM,
2488         .xi_always = false,
2489         .xi_bits = XFEATURE_AVX,
2490         .xi_fill = fpu_proc_xregs_std_fill,
2491         .xi_set = fpu_proc_xregs_std_set,
2492         .xi_signal_in = fpu_signal_copyin_std,
2493         .xi_valid = fpu_proc_xregs_ymm_valid,
2494         .xi_signal_out = fpu_signal_copyout_std
2495 }, {
2496         /*
2497          * There is no /proc validation function for the mask registers because
2498          * they are the same in ILP32 / LP64 and there is nothing for us to
2499          * actually validate.
2500          */
2501         .xi_type = PRX_INFO_OPMASK,
2502         .xi_size = sizeof (prxregset_opmask_t),
2503         .xi_align = alignof (prxregset_opmask_t),
2504         .xi_always = false,
2505         .xi_bits = XFEATURE_AVX512_OPMASK,
2506         .xi_fill = fpu_proc_xregs_std_fill,
2507         .xi_set = fpu_proc_xregs_std_set,
2508         .xi_signal_in = fpu_signal_copyin_std,
2509         .xi_signal_out = fpu_signal_copyout_std
2510 }, {
2511         .xi_type = PRX_INFO_ZMM,
2512         .xi_size = sizeof (prxregset_zmm_t),
2513         .xi_align = FPU_ALIGN_ZMM,
2514         .xi_always = false,
2515         .xi_bits = XFEATURE_AVX512_ZMM,
2516         .xi_fill = fpu_proc_xregs_std_fill,
2517         .xi_set = fpu_proc_xregs_std_set,
2518         .xi_valid = fpu_proc_xregs_zmm_valid,
2519         .xi_signal_in = fpu_signal_copyin_std,
2520         .xi_signal_out = fpu_signal_copyout_std
2521 }, {
2522         .xi_type = PRX_INFO_HI_ZMM,
2523         .xi_size = sizeof (prxregset_hi_zmm_t),
2524         .xi_align = FPU_ALIGN_ZMM,
2525         .xi_always = false,
2526         .xi_bits = XFEATURE_AVX512_HI_ZMM,
2527         .xi_fill = fpu_proc_xregs_std_fill,
2528         .xi_set = fpu_proc_xregs_std_set,
2529         .xi_valid = fpu_proc_xregs_hi_zmm_valid,
2530         .xi_signal_in = fpu_signal_copyin_std,
2531         .xi_signal_out = fpu_signal_copyout_std
2532 } };
2533 
2534 static bool
2535 fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2536 {
2537         return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2538 }
2539 
2540 void
2541 fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
2542     uint32_t *dstart)
2543 {
2544         size_t ret = sizeof (prxregset_hdr_t);
2545         uint32_t ninfo = 0;
2546 
2547         ASSERT(fpu_xsave_enabled());
2548 
2549         /*
2550          * Right now the set of flags that are enabled in the FPU is global.
 2551  * That is, while the pcb's fpu_ctx_t has the fpu_xsave_mask, the
2552          * actual things that might show up and we care about are all about what
2553          * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
2554          * move to per-process FPU enablement which is likely to come with AMX,
2555          * then this will need the proc_t to look at, hence why we've set things
2556          * up with the unused variable above.
2557          *
2558          * We take two passes through the array. The first is just to count up
2559          * how many informational entries we need.
2560          */
2561         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2562                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2563                         continue;
2564                 ninfo++;
2565         }
2566 
2567         ASSERT3U(ninfo, >, 0);
2568         ret += sizeof (prxregset_info_t) * ninfo;
2569 
2570         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2571                 size_t curphase;
2572                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2573                         continue;
2574 
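                 /*
                  * Align 'ret' up to this entry's required alignment. For
                  * example, with ret = 20 and xi_align = 16, curphase is 4
                  * and we advance by 16 - 4 = 12 to land on 32.
                  */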
2575                 curphase = ret % fpu_xsave_info[i].xi_align;
2576                 if (ret < fpu_xsave_info[i].xi_align) {
2577                         ret = fpu_xsave_info[i].xi_align;
2578                 } else if (curphase != 0) {
 2579                         ret += fpu_xsave_info[i].xi_align - curphase;
2580                 }
2581 
2582                 if (i == 0 && dstart != NULL) {
2583                         *dstart = ret;
2584                 }
2585 
2586                 ret += fpu_xsave_info[i].xi_size;
2587         }
2588 
2589         VERIFY3U(ret, <=, UINT32_MAX);
2590         if (sizep != NULL) {
2591                 *sizep = ret;
2592         }
2593 
2594         if (ninfop != NULL) {
2595                 *ninfop = ninfo;
2596         }
2597 }
2598 
2599 /*
 2600  * This function supports /proc. Because /proc does not have the process locked
 2601  * while processing a PCSXREG, this tries to establish an upper bound that we
 2602  * will validate later in fpu_proc_xregs_set(). We basically say that if you
 2603  * take the maximum xsave size and add 4 KiB that is a good enough approximation
 2604  * for the maximum size.
2605  */
2606 size_t
2607 fpu_proc_xregs_max_size(void)
2608 {
2609         VERIFY(fpu_xsave_enabled());
2610         return (cpuid_get_xsave_size() + 0x1000);
2611 }
2612 
2613 /*
 2614  * This function supports /proc. In particular, it's meant to perform the
 2615  * following:
 2616  *
 2617  *  o Potentially save the current thread's registers.
 2618  *  o Write out the x86 xsave /proc xregs format data from the xsave data we
 2619  *    actually have. Note, this can be a little weird for cases where the FPU is
 2620  *    not actually enabled, which happens for system processes. In that case,
 2621  *    the fill functions supply the illumos defaults so that /proc consumers
       *    always see consistent state.
2622  */
2623 void
2624 fpu_proc_xregs_get(struct _klwp *lwp, void *buf)
2625 {
2626         uint32_t size, ninfo, curinfo, dstart;
2627         fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2628         prxregset_hdr_t *hdr = buf;
2629 
2630         ASSERT(fpu_xsave_enabled());
2631         fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
2632 
2633         /*
 2634          * Before we get going, defensively zero out the entire data buffer so that
2635          * the rest of the fill functions can assume a specific base.
2636          */
2637         bzero(buf, size);
2638 
2639         kpreempt_disable();
2640         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2641                 /*
 2642                  * This case suggests that the thread in question doesn't have a
2643                  * valid FPU save state which should only happen when it is on
2644                  * CPU. If this is the case, we must ensure that we save the
2645                  * current FPU state before proceeding. We also sanity check
2646                  * several things here before doing this as using /proc on
2647                  * yourself is always exciting. fp_save() will ensure that the
2648                  * thread is flagged to go back to being an eager FPU before
2649                  * returning back to userland.
2650                  */
2651                 VERIFY3P(curthread, ==, lwptot(lwp));
2652                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2653                 fp_save(fpu);
2654         }
2655         kpreempt_enable();
2656 
2657         hdr->pr_type = PR_TYPE_XSAVE;
2658         hdr->pr_size = size;
2659         hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
2660             hdr->pr_pad[3] = 0;
2661         hdr->pr_ninfo = ninfo;
2662 
2663         curinfo = 0;
2664         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2665                 void *startp;
2666                 uint32_t phase;
2667 
2668                 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2669                         continue;
2670 
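                 /*
                  * As in fpu_proc_xregs_info(), align dstart up to this
                  * entry's required alignment before recording its offset.
                  */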
2671                 phase = dstart % fpu_xsave_info[i].xi_align;
2672                 if (dstart < fpu_xsave_info[i].xi_align) {
2673                         ASSERT3U(i, !=, 0);
2674                         dstart = fpu_xsave_info[i].xi_align;
2675                 } else if (phase != 0) {
2676                         ASSERT3U(i, !=, 0);
 2677                         dstart += fpu_xsave_info[i].xi_align - phase;
2678                 }
2679 
2680                 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
2681                 hdr->pr_info[curinfo].pri_flags = 0;
2682                 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
2683                 hdr->pr_info[curinfo].pri_offset = dstart;
2684 
2685                 startp = (void *)((uintptr_t)buf + dstart);
2686                 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
2687                 dstart += fpu_xsave_info[i].xi_size;
2688                 ASSERT3U(curinfo, <=, ninfo);
2689                 curinfo++;
2690         }
2691 }
2692 
2693 /*
2694  * We have been asked to set the data in the FPU for a given thread. Our
2695  * prmachdep code has already validated that the raw semantics of the data that
2696  * we have are valid (that is the appropriate sizes, offsets, and flags). We now
2697  * apply additional checking here:
2698  *
2699  *   o The xsave structure is present and only valid bits are set.
2700  *   o If the xsave component bit-vector is set, we have the corresponding proc
2701  *     info item.
2702  *   o Read-only items are ignored if and only if they actually match what we
2703  *     gave the user mostly as a courtesy to simplify things here.
2704  *   o ILP32 processes which can't support many of the regions are allowed to
2705  *     have the items here (as we likely gave them to them), but they must be
2706  *     zero if they are set.
2707  *
2708  * We take a first pass through all the data, validating it makes sense for the
2709  * FPU. Only after that point do we ensure that we have the FPU data in question
2710  * and then we clobber all the FPU data. Part of the semantics of setting this
2711  * is that we're setting the entire extended FPU.
2712  */
2713 int
2714 fpu_proc_xregs_set(struct _klwp *lwp, void *buf)
2715 {
2716         prxregset_hdr_t *prx = buf;
2717         model_t model = lwp_getdatamodel(lwp);
2718         uint64_t bv_found = 0;
2719         const prxregset_xsave_t *xsave = NULL;
2720         fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2721 
2722         VERIFY(fpu_xsave_enabled());
2723 
2724         /*
2725          * First, walk each note info header that we have from the user and
2726          * proceed to validate it. The prmachdep code has already validated that
2727          * the size, type, and offset information is valid, but it has not
2728          * validated the semantic contents of this or if someone is trying to
2729          * write something they shouldn't.
2730          *
2731          * While we walk this, we keep track of where the xsave header is. We
2732          * also track all of the bits that we have found along the way so we can
2733          * match up and ensure that everything that was set has a corresponding
2734          * bit in the xsave bitmap. If we have something in the xsave bitmap,
2735          * but not its corresponding data, then that is an error. However, we
2736          * allow folks to write data regions without the bit set in the xsave
2737          * data to make the read, modify, write process simpler.
2738          */
2739         for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2740                 const prxregset_info_t *info = &prx->pr_info[i];
2741                 bool found = false;
2742 
2743                 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2744                         void *data;
2745                         if (info->pri_type != fpu_xsave_info[pt].xi_type)
2746                                 continue;
2747 
2748                         found = true;
2749                         data = (void *)((uintptr_t)buf + info->pri_offset);
2750                         if (fpu_xsave_info[pt].xi_valid != NULL &&
2751                             !fpu_xsave_info[pt].xi_valid(model, data)) {
2752                                 return (EINVAL);
2753                         }
2754 
2755                         if (info->pri_type == PRX_INFO_XSAVE) {
2756                                 xsave = data;
2757                         }
2758                         bv_found |= fpu_xsave_info[pt].xi_bits;
2759                         break;
2760                 }
2761 
2762                 if (!found) {
2763                         return (EINVAL);
2764                 }
2765         }
2766 
2767         /*
2768          * No xsave data, no dice.
2769          */
2770         if (xsave == NULL) {
2771                 return (EINVAL);
2772         }
2773 
2774         /*
2775          * If anything is set in the xsave header that was not found as we
2776          * walked structures, then that is an error. The opposite is not true as
2777          * discussed above.
2778          */
2779         if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
2780                 return (EINVAL);
2781         }
2782 
2783         /*
2784          * At this point, we consider all the data actually valid. Now we must
2785          * set up this information in the save area. If this is our own lwp, we
2786          * must disable it first. Otherwise, we expect that it is already valid.
 2787          * To try to sanitize this, we will defensively zero the entire region,
 2788          * as we are setting everything that will end up in it.
2789          */
2790         kpreempt_disable();
2791         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2792                 /*
 2793                  * This case suggests that the thread in question doesn't have a
2794                  * valid FPU save state which should only happen when it is on
2795                  * CPU. If this is the case, we explicitly disable the FPU, but
2796                  * do not save it before proceeding. We also sanity check
2797                  * several things here before doing this as using /proc on
2798                  * yourself is always exciting. Unlike fp_save(), fp_free() does
2799                  * not signal that an update is required, so we unconditionally
2800                  * set that for all threads.
2801                  */
2802                 VERIFY3P(curthread, ==, lwptot(lwp));
2803                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2804                 fp_free(fpu);
2805         }
2806         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
2807         bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2808             cpuid_get_xsave_size());
2809 
2810         for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
2811                 const prxregset_info_t *info = &prx->pr_info[i];
2812                 bool found = false;
2813 
2814                 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
2815                         const void *data;
2816                         if (info->pri_type != fpu_xsave_info[pt].xi_type)
2817                                 continue;
2818 
2819                         /*
2820                          * Check if we have a set function and if we should
2821                          * include this. We may not if this is something like
2822                          * PRX_INFO_XCR which is read-only.
2823                          *
2824                          * We may not include a given entry as it may not have
2825                          * been set in the actual xsave state that we have been
2826                          * asked to restore, in which case to not break the
2827                          * xsaveopt logic, we must leave it in its initial
2828                          * state, e.g. zeroed (generally). XMM data initial
2829                          * state is not zeroed, but is marked with xi_always to
2830                          * help account for this.
2831                          */
2832                         found = true;
2833                         if (fpu_xsave_info[pt].xi_set == NULL)
2834                                 break;
2835                         if (!fpu_xsave_info[pt].xi_always &&
2836                             (xsave->prx_xsh_xstate_bv &
2837                             fpu_xsave_info[pt].xi_bits) !=
2838                             fpu_xsave_info[pt].xi_bits) {
2839                                 break;
2840                         }
2841 
2842                         data = (void *)((uintptr_t)buf + info->pri_offset);
2843                         fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
2844                             xsave->prx_xsh_xstate_bv, data);
2845                 }
2846 
2847                 VERIFY(found);
2848         }
2849         kpreempt_enable();
2850 
2851         return (0);
2852 }
2853 
2854 /*
2855  * To be included in the signal copyout logic we must have a copy function and
2856  * the bit in question must be included. Note, we don't consult xi_always here
2857  * as that is really part of what is always present for xsave logic and
2858  * therefore isn't really pertinent here because of our custom format. See the
2859  * big theory statement for more info.
2860  */
2861 static bool
2862 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2863 {
2864         return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2865             infop->xi_signal_out != NULL);
2866 }
2867 
2868 /*
2869  * We need to fill out the xsave related data into the ucontext_t that we've
2870  * been given. We should have a valid user pointer at this point in the uc_xsave
2871  * member. This is much simpler than the copyin that we have. Here are the
2872  * current assumptions:
2873  *
2874  *   o This is being called for the current thread. This is not meant to operate
2875  *     on an arbitrary thread's state.
2876  *   o We cannot assume whether the FPU is valid in the pcb or not. While most
2877  *     callers will have just called getfpregs() which saved the state, don't
2878  *     assume that.
2879  *   o We assume that the user address has the requisite required space for this
2880  *     to be copied out.
2881  *   o We assume that copyfunc() will ensure we are not copying into a kernel
2882  *     address.
2883  *
2884  * For more information on the format of the data, see the 'Signal Handling and
2885  * the ucontext_t' portion of the big theory statement. We copy out all the
2886  * constituent parts and then come back and write out the actual final header
2887  * information.
2888  */
2889 int
2890 fpu_signal_copyout(struct _klwp *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2891 {
2892         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2893         uint64_t xs_bv;
2894         uc_xsave_t ucx;
2895         int ret;
2896 
2897         VERIFY3P(curthread, ==, lwptot(lwp));
2898         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2899         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2900 
2901         if (!fpu_xsave_enabled()) {
2902                 return (ENOTSUP);
2903         }
2904 
2905         /*
2906          * Unlike when we're dealing with /proc, we can unconditionally call
 2907          * fp_save() because this is always called in a context where the lwp
 2908          * we're operating on is the one on CPU (which is what fp_save()
2909          * asserts).
2910          */
2911         fp_save(fpu);
2912 
2913         bzero(&ucx, sizeof (ucx));
2914         ucx.ucx_vers = UC_XSAVE_VERS;
2915         ucx.ucx_len += sizeof (uc_xsave_t);
2916 
2917         xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2918         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2919                 const xsave_proc_info_t *info = &fpu_xsave_info[i];
2920 
2921                 if (!fpu_signal_include(info, xs_bv))
2922                         continue;
2923                 ret = info->xi_signal_out(info, copyfunc, &ucx,
2924                     lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2925                     uaddr);
2926                 if (ret != 0) {
2927                         return (ret);
2928                 }
2930         }
2931 
2932         /*
2933          * Now that everything has been copied out, we should have an accurate
2934          * value in the uc_xsave_t header and we can copy that out at the start
2935          * of the user data.
2936          */
2937         ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
2938         return (ret);
2939 }
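
     /*
      * A sketch of the user buffer that fpu_signal_copyout() produces (the
      * exact set of components depends on xs_bv at the time of the signal):
      *
      *     +----------------------+ <- uaddr
      *     | uc_xsave_t           |    ucx_vers, ucx_len, ucx_bv
      *     +----------------------+ <- uaddr + sizeof (uc_xsave_t)
      *     | component data ...   |    one entry per included component,
      *     | (e.g. x87/XMM, AVX)  |    written by its xi_signal_out function
      *     |                      |    in fpu_xsave_info[] order
      *     +----------------------+ <- uaddr + ucx_len
      */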
2940 
2941 /*
2942  * Here we've been given a ucontext_t which potentially has a user pointer to
2943  * xsave state that we've copied out previously. In this case we need to do the
2944  * following, assuming UC_XSAVE is present:
2945  *
2946  *   o Copy in our header and validate it.
2947  *   o Allocate an fpu context to use as a holding ground for all this data.
2948  *   o If UC_FPU is set, override the xsave structure with the saved XMM state,
2949  *     clear UC_FPU, and make sure that the correct xsave_bv bits are set.
2950  *
2951  * Currently we always allocate the additional state as a holding ground for the
2952  * FPU. What we're copying in may not be valid and we don't want to clobber the
2953  * existing FPU state or deal with merging it until we believe the data is
2954  * reasonable. The proc_t is here to set us up for when we have per-process
2955  * settings in the extended feature disable MSRs.
2956  */
2957 int
2958 fpu_signal_copyin(struct _klwp *lwp, ucontext_t *kuc)
2959 {
2960         uc_xsave_t ucx;
2961         uint64_t bv;
2962         uintptr_t data, max_data;
2963         void *fpu;
2964         proc_t *p = lwp->lwp_procp;
2965         size_t ksize;
2966 
2967         /*
2968          * Because the uc_xsave member has historically been opaque filler
2969          * that the kernel never looked at, we don't care if the pointer is
2970          * garbage when the UC_XSAVE flag is not set. While that is perhaps
2971          * not the most sporting choice, it is a pragmatic one.
2973          */
2974         if ((kuc->uc_flags & UC_XSAVE) == 0) {
2975                 return (0);
2976         }
2977 
2978         if (kuc->uc_xsave == 0) {
2979                 return (EINVAL);
2980         }
2981 
2982         if (!fpu_xsave_enabled()) {
2983                 return (ENOTSUP);
2984         }
2985 
2986         if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
2987             0) {
2988                 return (EFAULT);
2989         }
2990 
2991         ksize = cpuid_get_xsave_size();
2992         if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
2993             ucx.ucx_len > ksize ||
2994             (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
2995             (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
2996             (uintptr_t)kuc->uc_xsave) {
2997                 return (EINVAL);
2998         }
2999 
3000         /*
3001          * OK, our goal right now is to recreate a valid xsave_state structure
3002          * that we'll ultimately end up having to merge with our existing one in
3003          * the FPU save state. The reason we describe this as a merge is to help
3004          * future us when we want to retain supervisor state which will never be
3005          * part of userland signal state. The design of the userland signal
3006          * state is basically to compress it as much as we can. This is done for
3007          * two reasons:
3008          *
3009          *   1) We currently consider this a private interface.
3010          *   2) We want to minimize the amount of stack space we use as much
3011          *      as possible. Most applications aren't using AVX-512 right now,
3012          *      so doing our own style of compression is worthwhile. If libc
3013          *      adopts AVX-512 routines, we may want to revisit this.
3014          *
3015          * On the allocation below, our assumption is that if a thread has taken
3016          * a signal, then it is likely to take a signal again in the future (or
3017          * be shortly headed to its demise). As such, we leave the allocated
3018          * signal save area around for the lwp. Most applications don't allow
3019          * all threads to take signals, so this should hopefully help amortize
3020          * the cost of the allocation.
3021          */
3022         max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
3023         data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
3024         bv = ucx.ucx_bv;
3025         if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
3026                 lwp->lwp_pcb.pcb_fpu.fpu_signal =
3027                     kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
3028         }
3029         fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;
3030 
3031         /*
3032          * Unconditionally initialize this memory so that it is in a known
3033          * state. This ensures that unused regions are mostly left in their
3034          * initial state (the main exception here is the x87/XMM state, but
3035          * that should be OK). We don't fill in the initial xsave state as we
3036          * expect that to happen as part of our processing.
3038          */
3039         bzero(fpu, ksize);
3040 
3041         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3042                 int ret;
3043                 const xsave_proc_info_t *info = &fpu_xsave_info[i];
3044                 if (!info->xi_always && (info->xi_bits & bv) == 0)
3045                         continue;
3046                 bv &= ~info->xi_bits;
3047 
3048                 if (info->xi_signal_in == NULL)
3049                         continue;
3050                 ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
3051                 if (ret != 0) {
3052                         return (ret);
3053                 }
3054         }
3055         ASSERT0(bv);
3056 
3057         /*
3058          * As described in the big theory statement section 'Signal Handling and
3059          * the ucontext_t', we always remove UC_FPU from here as we've taken
3060          * care of reassembling it ourselves.
3061          */
3062         kuc->uc_flags &= ~UC_FPU;
3063         kuc->uc_xsave = (uintptr_t)fpu;
3064 
3065         return (0);
3066 }
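
     /*
      * The header validation in fpu_signal_copyin() can be summarized by the
      * following pseudo-assertions (a sketch, not compiled):
      *
      *     ucx.ucx_vers == UC_XSAVE_VERS
      *     sizeof (uc_xsave_t) <= ucx.ucx_len <= cpuid_get_xsave_size()
      *     (ucx.ucx_bv & ~xsave_bv_all) == 0
      *     (uintptr_t)kuc->uc_xsave + ucx.ucx_len <= p->p_as->a_userlimit
      *
      * That is: the version and length are sane, no bits beyond what the
      * kernel manages are set, and the whole buffer sits below the user
      * address limit.
      */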
3067 
3068 /*
3069  * This determines the amount of signal stack space we need for our custom form
3070  * of the xsave state.
3071  */
3072 size_t
3073 fpu_signal_size(struct _klwp *lwp)
3074 {
3075         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3076         size_t len = sizeof (uc_xsave_t);
3077         uint64_t xs_bv;
3078 
3079         VERIFY3P(curthread, ==, lwptot(lwp));
3080         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3081         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3082 
3083         if (!fpu_xsave_enabled()) {
3084                 return (0);
3085         }
3086 
3087         kpreempt_disable();
3088         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3089                 fp_save(fpu);
3090         }
3091 
3092         xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3093         for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3094                 size_t comp_size;
3095 
3096                 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3097                         continue;
3098 
3099                 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3100                     NULL);
3101                 len += comp_size;
3102         }
3103 
3104         kpreempt_enable();
3105         return (len);
3106 }
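
     /*
      * A caller building a signal frame might pair this with
      * fpu_signal_copyout() roughly as follows (a sketch; 'sp' and
      * 'copyfunc' are assumed to come from the caller's frame setup and
      * error handling is elided):
      *
      *     size_t len = fpu_signal_size(lwp);
      *     uintptr_t sp = <carve len bytes out of the user stack>;
      *     if (fpu_signal_copyout(lwp, sp, copyfunc) != 0)
      *             <abort signal delivery>;
      */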
3107 
3108 /*
3109  * This function is used in service of restorecontext() to set the specified
3110  * thread's extended FPU state to the passed in data. Our assumptions at this
3111  * point from the system are:
3112  *
3113  *   o Someone has already verified that the actual xsave header is correct.
3114  *   o Any traditional XMM state that would cause a #gp has been clamped.
3115  *   o The data is a correctly sized xsave state structure. Right now that
3116  *     means it is not compressed and follows the CPUID-based rules for
3117  *     constructing and laying out data.
3118  *   o The lwp argument refers to the current thread.
3119  *
3120  * Our primary purpose here is to merge the passed-in state with the current
3121  * FPU state. Right now, "merge" is, strictly speaking, just "replace". We can
3122  * get away with replacing everything because all we currently save are user
3123  * states. If we start saving kernel states in here, this will get more nuanced
3124  * and we will need to be more careful about how we store data.
3125  */
3126 void
3127 fpu_set_xsave(struct _klwp *lwp, const void *data)
3128 {
3129         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3130         uint32_t status, xstatus;
3131         struct xsave_state *dst_xsave;
3132 
3133         ASSERT(fpu_xsave_enabled());
3134         VERIFY3P(curthread, ==, lwptot(lwp));
3135         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3136         ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3137 
3138         /*
3139          * We use fp_save() here rather than a stock fpdisable() so we can
3140          * honor our invariant that when the thread's state has been saved,
3141          * the valid flag is set, even though we're going to be overwriting it
3142          * shortly. If we just called fpdisable() then we would basically be
3143          * asking for trouble.
3144          *
3145          * Because we are modifying the state here and we don't want the
3146          * system to end up in an odd state, we are being a little paranoid
3147          * and disabling preemption across this operation. In particular, once
3148          * the state is properly tagged with FPU_VALID, there should be no way
3149          * for this thread to return to userland and have its context cleared
3150          * out from under us; however, we let paranoia win out.
3151          */
3152         kpreempt_disable();
3153         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3154                 fp_save(fpu);
3155         }
3156 
3157         bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
3158             cpuid_get_xsave_size());
3159         dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
3160         status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
3161         xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
3162         dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
3163         dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;
3164 
3165         /*
3166          * These two status words are additional information that the kernel
3167          * itself tracks; they are part of the traditional fpregset_t, but not
3168          * part of our xregs information. Because we are setting this state,
3169          * we leave it up to the rest of the kernel to determine whether this
3170          * came from an fpregset_t or is being reset to the default of 0.
3171          */
3172         fpu->fpu_regs.kfpu_status = status;
3173         fpu->fpu_regs.kfpu_xstatus = xstatus;
3174 
3175         fpu->fpu_flags |= FPU_VALID;
3176         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3177         kpreempt_enable();
3178 }
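
     /*
      * For reference, the two status words above live in the last entry of
      * the fxsave area's ignored region; conceptually:
      *
      *     status  = xs_fxsave.__fx_ign2[3]._l[0];
      *     xstatus = xs_fxsave.__fx_ign2[3]._l[1];
      *
      * They are pulled out into kfpu_status and kfpu_xstatus and then zeroed
      * in the save area, keeping this kernel-only tracking out of the
      * hardware image.
      */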
3179 
3180 /*
3181  * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
3182  * kernel, this is just an fxsave_state with additional values for the status
3183  * and xstatus members.
3184  *
3185  * This has the same nuance as the xregs cases discussed above; it is simpler
3186  * in that we only need to handle the fxsave state, but more complicated in
3187  * that we must check which save mechanism is in use.
3188  */
3189 void
3190 fpu_get_fpregset(struct _klwp *lwp, fpregset_t *fp)
3191 {
3192         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3193 
3194         kpreempt_disable();
3195         fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
3196         fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;
3197 
3198         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3199                 /*
3200                  * If we're requesting the fpregs of a thread whose state
3201                  * isn't currently valid and which isn't the thread we're
3202                  * executing, then we consider getting this information to be
3203                  * best-effort: we will not stop the thread in question to
3204                  * serialize it, which means we may get stale data. These are
3205                  * the traditional semantics that the system has used to
3206                  * service this for /proc.
3207                  */
3208                 if (curthread == lwptot(lwp)) {
3209                         VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3210                         fp_save(fpu);
3211                 }
3212         }
3213 
3214         /*
3215          * If the FPU is not enabled and its state isn't valid (which is
3216          * only possible if someone else set it), just copy the initial state.
3217          */
3218         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
3219                 bcopy(&sse_initial, fp, sizeof (sse_initial));
3220                 kpreempt_enable();
3221                 return;
3222         }
3223 
3224         /*
3225          * Given that we have an enabled FPU, we must look at the type of FPU
3226          * save mechanism to clean this up. In particular, while we can just
3227          * copy the save area with FXSAVE, with XSAVE we must carefully copy
3228          * only the bits that are valid and reset the rest to their default
3229          * state.
3230          */
3231         switch (fp_save_mech) {
3232         case FP_FXSAVE:
3233                 bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
3234                     sizeof (struct fxsave_state));
3235                 break;
3236         case FP_XSAVE:
3237                 fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
3238                     (struct fxsave_state *)fp);
3239                 break;
3240         default:
3241                 panic("Invalid fp_save_mech");
3242         }
3243 
3244         kpreempt_enable();
3245 }
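
     /*
      * A hypothetical consumer might look like the following (a sketch;
      * error handling elided):
      *
      *     fpregset_t fp;
      *
      *     fpu_get_fpregset(lwp, &fp);
      *     <fp.fp_reg_set.fpchip_state.status and .xstatus now hold the
      *      saved status words; the leading 512 bytes mirror the fxsave
      *      area>
      */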
3246 
3247 /*
3248  * This is a request to set our actual hardware state from the ABI fpregset_t.
3249  * In the 64-bit kernel the first 512 bytes of the fpregset_t are the same as
3250  * the 512-byte fxsave area.
3251  */
3252 void
3253 fpu_set_fpregset(struct _klwp *lwp, const fpregset_t *fp)
3254 {
3255         struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3256 
3257         kpreempt_disable();
3258         if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3259                 /*
3260                  * We always save the entire FPU. This is required if we're
3261                  * using xsave. If we're using fxsave, we could skip the
3262                  * 512-byte write and instead just disable the FPU since we'd be
3263                  * replacing it all. For now we don't bother with more
3264                  * conditional logic.
3265                  */
3266                 VERIFY3P(curthread, ==, lwptot(lwp));
3267                 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3268                 fp_save(fpu);
3269         }
3270 
3271         fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
3272         fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
3273         switch (fp_save_mech) {
3274         case FP_FXSAVE:
3275                 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
3276                     sizeof (struct fxsave_state));
3277                 break;
3278         case FP_XSAVE:
3279                 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
3280                     sizeof (struct fxsave_state));
3281                 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
3282                     XFEATURE_LEGACY_FP | XFEATURE_SSE;
3283                 break;
3284         default:
3285                 panic("Invalid fp_save_mech");
3286         }
3287 
3288         fpu->fpu_flags |= FPU_VALID;
3289         PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3290         kpreempt_enable();
3291 }
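
     /*
      * A note on the XSAVE case above: because only the legacy 512-byte
      * region is overwritten, the xstate header must advertise that the x87
      * and SSE components now hold live data; otherwise a later restore
      * would treat them as being in their initial configuration. That is the
      * purpose of:
      *
      *     xs_header.xsh_xstate_bv |= XFEATURE_LEGACY_FP | XFEATURE_SSE;
      */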