1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2021 Joyent, Inc.
  24  */
  25 
  26 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T       */
  28 /*        All Rights Reserved   */
  29 
  30 /*      Copyright (c) 1987, 1988 Microsoft Corporation  */
  31 /*        All Rights Reserved   */
  32 
  33 #include <sys/param.h>
  34 #include <sys/types.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/systm.h>
  37 #include <sys/signal.h>
  38 #include <sys/errno.h>
  39 #include <sys/fault.h>
  40 #include <sys/syscall.h>
  41 #include <sys/cpuvar.h>
  42 #include <sys/sysi86.h>
  43 #include <sys/psw.h>
  44 #include <sys/cred.h>
  45 #include <sys/policy.h>
  46 #include <sys/thread.h>
  47 #include <sys/debug.h>
  48 #include <sys/ontrap.h>
  49 #include <sys/privregs.h>
  50 #include <sys/x86_archext.h>
  51 #include <sys/vmem.h>
  52 #include <sys/kmem.h>
  53 #include <sys/mman.h>
  54 #include <sys/archsystm.h>
  55 #include <vm/hat.h>
  56 #include <vm/as.h>
  57 #include <vm/seg.h>
  58 #include <vm/seg_kmem.h>
  59 #include <vm/faultcode.h>
  60 #include <sys/fp.h>
  61 #include <sys/cmn_err.h>
  62 #include <sys/segments.h>
  63 #include <sys/clock.h>
  64 #include <vm/hat_i86.h>
  65 #if defined(__xpv)
  66 #include <sys/hypervisor.h>
  67 #include <sys/note.h>
  68 #endif
  69 
  70 static void ldt_alloc(proc_t *, uint_t);
  71 static void ldt_free(proc_t *);
  72 static void ldt_dup(proc_t *, proc_t *);
  73 static void ldt_grow(proc_t *, uint_t);
  74 
  75 /*
  76  * sysi86 System Call
  77  */
  78 
  79 /* ARGSUSED */
  80 int
  81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
  82 {
  83         struct ssd ssd;
  84         int error = 0;
  85         int c;
  86         proc_t *pp = curproc;
  87 
  88         switch (cmd) {
  89 
  90         /*
  91          * The SI86V86 subsystem call of the SYSI86 system call
  92          * supports only one subcode -- V86SC_IOPL.
  93          */
  94         case SI86V86:
  95                 if (arg1 == V86SC_IOPL) {
  96 #if defined(__xpv)
  97                         struct ctxop *ctx;
  98 #endif
  99                         struct regs *rp = lwptoregs(ttolwp(curthread));
 100                         greg_t oldpl = rp->r_ps & PS_IOPL;
 101                         greg_t newpl = arg2 & PS_IOPL;
 102 
 103                         /*
 104                          * Must be privileged to run this system call
 105                          * if giving more io privilege.
 106                          */
 107                         if (newpl > oldpl && (error =
 108                             secpolicy_sys_config(CRED(), B_FALSE)) != 0)
 109                                 return (set_errno(error));
 110 #if defined(__xpv)
 111                         ctx = installctx_preallocate();
 112                         kpreempt_disable();
 113                         installctx(curthread, NULL, xen_disable_user_iopl,
 114                             xen_enable_user_iopl, NULL, NULL,
 115                             xen_disable_user_iopl, NULL, ctx);
 116                         xen_enable_user_iopl();
 117                         kpreempt_enable();
 118 #else
 119                         rp->r_ps ^= oldpl ^ newpl;
 120 #endif
 121                 } else
 122                         error = EINVAL;
 123                 break;
 124 
 125         /*
 126          * Set a segment descriptor
 127          */
 128         case SI86DSCR:
 129                 /*
 130                  * There are considerable problems here manipulating
 131                  * resources shared by many running lwps.  Get everyone
 132                  * into a safe state before changing the LDT.
 133                  */
 134                 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
 135                         error = EINTR;
 136                         break;
 137                 }
 138 
 139                 if (get_udatamodel() == DATAMODEL_LP64) {
 140                         error = EINVAL;
 141                         break;
 142                 }
 143 
 144                 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
 145                         error = EFAULT;
 146                         break;
 147                 }
 148 
 149                 error = setdscr(&ssd);
 150 
 151                 mutex_enter(&pp->p_lock);
 152                 if (curthread != pp->p_agenttp)
 153                         continuelwps(pp);
 154                 mutex_exit(&pp->p_lock);
 155                 break;
 156 
 157         case SI86FPHW:
 158                 c = fp_kind & 0xff;
 159                 if (suword32((void *)arg1, c) == -1)
 160                         error = EFAULT;
 161                 break;
 162 
 163         case SI86FPSTART:
 164                 /*
 165                  * arg1 is the address of _fp_hw
 166                  * arg2 is the desired x87 FCW value
 167                  * arg3 is the desired SSE MXCSR value
 168                  * a return value of one means SSE hardware, else none.
 169                  */
 170                 c = fp_kind & 0xff;
 171                 if (suword32((void *)arg1, c) == -1) {
 172                         error = EFAULT;
 173                         break;
 174                 }
 175                 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
 176                 return ((fp_kind & __FP_SSE) ? 1 : 0);
 177 
 178         /* real time clock management commands */
 179 
 180         case WTODC:
 181                 if ((error = secpolicy_settime(CRED())) == 0) {
 182                         timestruc_t ts;
 183                         mutex_enter(&tod_lock);
 184                         gethrestime(&ts);
 185                         tod_set(ts);
 186                         mutex_exit(&tod_lock);
 187                 }
 188                 break;
 189 
 190 /* Give some timezone playing room */
 191 #define ONEWEEK (7 * 24 * 60 * 60)
 192 
 193         case SGMTL:
 194                 /*
 195                  * Called from 32 bit land, negative values
 196                  * are not sign extended, so we do that here
 197                  * by casting it to an int and back.  We also
 198                  * clamp the value to within reason and detect
 199                  * when a 64 bit call overflows an int.
 200                  */
 201                 if ((error = secpolicy_settime(CRED())) == 0) {
 202                         int newlag = (int)arg1;
 203 
 204 #ifdef _SYSCALL32_IMPL
 205                         if (get_udatamodel() == DATAMODEL_NATIVE &&
 206                             (long)newlag != (long)arg1) {
 207                                 error = EOVERFLOW;
 208                         } else
 209 #endif
 210                         if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
 211                                 sgmtl(newlag);
 212                         else
 213                                 error = EOVERFLOW;
 214                 }
 215                 break;
 216 
 217         case GGMTL:
 218                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 219                         if (sulword((void *)arg1, ggmtl()) == -1)
 220                                 error = EFAULT;
 221 #ifdef _SYSCALL32_IMPL
 222                 } else {
 223                         time_t gmtl;
 224 
 225                         if ((gmtl = ggmtl()) > INT32_MAX) {
 226                                 /*
 227                                  * Since gmt_lag can at most be
 228                                  * +/- 12 hours, something is
 229                                  * *seriously* messed up here.
 230                                  */
 231                                 error = EOVERFLOW;
 232                         } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
 233                                 error = EFAULT;
 234 #endif
 235                 }
 236                 break;
 237 
 238         case RTCSYNC:
 239                 if ((error = secpolicy_settime(CRED())) == 0)
 240                         rtcsync();
 241                 break;
 242 
 243         /* END OF real time clock management commands */
 244 
 245         default:
 246                 error = EINVAL;
 247                 break;
 248         }
 249         return (error == 0 ? 0 : set_errno(error));
 250 }
 251 
 252 void
 253 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
 254 {
 255         ssd->bo = USEGD_GETBASE(usd);
 256         ssd->ls = USEGD_GETLIMIT(usd);
 257         ssd->sel = sel;
 258 
 259         /*
 260          * set type, dpl and present bits.
 261          */
 262         ssd->acc1 = usd->usd_type;
 263         ssd->acc1 |= usd->usd_dpl << 5;
 264         ssd->acc1 |= usd->usd_p << (5 + 2);
 265 
 266         /*
 267          * set avl, DB and granularity bits.
 268          */
 269         ssd->acc2 = usd->usd_avl;
 270 
 271 #if defined(__amd64)
 272         ssd->acc2 |= usd->usd_long << 1;
 273 #else
 274         ssd->acc2 |= usd->usd_reserved << 1;
 275 #endif
 276 
 277         ssd->acc2 |= usd->usd_def32 << (1 + 1);
 278         ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
 279 }
 280 
/*
 * Translate the public 'struct ssd' representation into the hardware
 * user_desc_t layout.  The destination must be zeroed (null_udesc) on
 * entry; each bitfield assignment below relies on the C bitfield store
 * truncating the wider acc1/acc2 values to the field width.
 */
static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * Set type, dpl and present bits.
	 *
	 * Force the "accessed" bit to on so that we don't run afoul of
	 * KPTI.
	 */
	usd->usd_type = ssd->acc1 | SDT_A;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	/* Only user (DPL 3) memory segments should reach this point. */
	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
#if defined(__amd64)
	usd->usd_long = 0;
#else
	usd->usd_reserved = 0;
#endif

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}
 320 
 321 
 322 #if defined(__i386)
 323 
/*
 * Translate a 'struct ssd' describing a call gate into the hardware
 * gate_desc_t layout (32-bit kernel only).  The destination must be
 * zeroed (null_sdesc) on entry.
 */
static void
ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
{

	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);

	/* Entry-point offset is split across the low and high halves. */
	sgd->sgd_looffset = ssd->bo;
	sgd->sgd_hioffset = ssd->bo >> 16;

	sgd->sgd_selector = ssd->ls;

	/*
	 * set type, dpl and present bits.
	 */
	sgd->sgd_type = ssd->acc1;
	sgd->sgd_dpl = ssd->acc1 >> 5;
	sgd->sgd_p = ssd->acc1 >> 7;
	/* Only user-callable (DPL 3) call gates are expected here. */
	ASSERT(sgd->sgd_type == SDT_SYSCGT);
	ASSERT(sgd->sgd_dpl == SEL_UPL);
	sgd->sgd_stkcpy = 0;
}
 345 
 346 #endif  /* __i386 */
 347 
 348 /*
 349  * Load LDT register with the current process's LDT.
 350  */
/*
 * Load LDT register with the current process's LDT.
 *
 * Must be called with preemption disabled: it installs state into this
 * CPU's private LDT copy and GDT slot, which would be wrong to carry to
 * another CPU mid-way.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	/* Hypervisor call: point the vCPU at the process's LDT directly. */
	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in the
	 * user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	/* Build a system descriptor for the copy and patch it into the GDT. */
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}
 373 
 374 /*
 375  * Store a NULL selector in the LDTR. All subsequent illegal references to
 376  * the LDT will result in a #gp.
 377  */
/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 *
 * Like ldt_load(), this manipulates per-CPU state and is called with
 * preemption disabled by its callers.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	/* Clear the GDT's LDT slot, then load the null selector. */
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	/* Scrub this CPU's private copy of the old LDT contents. */
	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}
 391 
/*
 * Process context-save op: runs as an lwp of a private-LDT process goes
 * off CPU.  Unloads the LDT and re-enables fast syscalls for whatever
 * runs on this CPU next.
 */
/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

#if defined(__amd64)
	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 *	Wine creats a ldt descriptor and points a segment register
	 *	to it.
	 *
	 *	We then context switch away from wine lwp to kernel
	 *	thread and hit breakpoint in kernel with kmdb
	 *
	 *	When we continue and resume from kmdb we will #gp
	 *	fault since kmdb will have saved the stale ldt selector
	 *	from wine and will try to restore it but we are no longer in
	 *	the context of the wine process and do not have our
	 *	ldtr register pointing to the private ldt.
	 */
	reset_sregs();
#endif

	ldt_unload();
	cpu_fast_syscall_enable();
}
 423 
/*
 * Process context-restore op: runs as an lwp of a private-LDT process
 * comes back on CPU.  Reloads the LDT and disables the fast syscall
 * instructions, which would clobber %cs/%ss for this process.
 */
static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable();
}
 433 
 434 /*
 435  * At exec time, we need to clear up our LDT context and re-enable fast syscalls
 436  * for the new process image.
 437  *
 438  * The same is true for the other case, where we have:
 439  *
 440  * proc_exit()
 441  *  ->exitpctx()->ldt_savectx()
 442  *  ->freepctx()->ldt_freectx()
 443  *
 444  * Because pre-emption is not prevented between the two callbacks, we could have
 445  * come off CPU, and brought back LDT context when coming back on CPU via
 446  * ldt_restorectx().
 447  */
/*
 * Process context-free op: releases the process's private LDT at exec or
 * exit time and re-enables fast syscalls.  Preemption is disabled across
 * the pair so we cannot be rescheduled (and have ldt_restorectx() reload
 * state) between freeing the LDT and re-enabling fast syscalls -- see the
 * block comment above.
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	kpreempt_disable();
	ldt_free(p);
	cpu_fast_syscall_enable();
	kpreempt_enable();
}
 460 
 461 /*
 462  * Install ctx op that ensures syscall/sysenter are disabled.
 463  * See comments below.
 464  *
 465  * When a thread with a private LDT forks, the new process
 466  * must have the LDT context ops installed.
 467  */
 468 /* ARGSUSED */
 469 static void
 470 ldt_installctx(proc_t *p, proc_t *cp)
 471 {
 472         proc_t          *targ = p;
 473         kthread_t       *t;
 474 
 475         /*
 476          * If this is a fork, operate on the child process.
 477          */
 478         if (cp != NULL) {
 479                 targ = cp;
 480                 ldt_dup(p, cp);
 481         }
 482 
 483         /*
 484          * The process context ops expect the target process as their argument.
 485          */
 486         ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
 487             ldt_installctx, ldt_savectx, ldt_freectx) == 0);
 488 
 489         installpctx(targ, targ, ldt_savectx, ldt_restorectx,
 490             ldt_installctx, ldt_savectx, ldt_freectx);
 491 
 492         /*
 493          * We've just disabled fast system call and return instructions; take
 494          * the slow path out to make sure we don't try to use one to return
 495          * back to user. We must set t_post_sys for every thread in the
 496          * process to make sure none of them escape out via fast return.
 497          */
 498 
 499         mutex_enter(&targ->p_lock);
 500         t = targ->p_tlist;
 501         do {
 502                 t->t_post_sys = 1;
 503         } while ((t = t->t_forw) != targ->p_tlist);
 504         mutex_exit(&targ->p_lock);
 505 }
 506 
/*
 * Install (or clear) one user segment descriptor in the current process's
 * private LDT, allocating or growing the LDT as required.  Called from the
 * SI86DSCR case of sysi86() with all other lwps in the process held.
 * Returns 0 on success or an errno value (EINVAL, EBUSY).
 */
int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t	*pp = curproc;
	int	rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then setup a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this processes because they destroy
		 * the contents of %cs and %ss segment registers.
		 *
		 * Explicity disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable();
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {
		/* An LDT exists, so its context ops must already be there. */
		ASSERT(pp->p_pctx != NULL);

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case.  An lwp creates a harmless LDT entry, points
	 * one of it's segment registers at it, then tells the kernel (here)
	 * to delete it.  In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
	 * will happen in the normal case too.  However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger things that we have valid segment registers, we
	 * have the problem that the debugger will try and restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a lot better failure mode than SIGKILL and a core file
	 * from kern_gpfault() too.)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
#if defined(__amd64)
			pcb_t *pcb = &lwp->lwp_pcb;
#endif

			/* %cs/%ss always live in the saved regs. */
			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

#if defined(__amd64)
			/*
			 * On amd64 the data selectors may be cached in the
			 * pcb rather than the regs; check whichever copy
			 * is currently authoritative.
			 */
			if (PCB_NEED_UPDATE_SEGS(pcb)) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else
#endif
			{
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 * Make sure we update the CPU-private copy of the LDT.
	 */
	if (ssd->acc1 == 0) {
		rc  = ldt_update_segd(ldp, &null_udesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check segment type, allow segment not present and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Set up a code or data user segment descriptor, making sure to update
	 * the CPU-private copy of the LDT.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/* Not a user memory segment (e.g. a gate): rejected. */
	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}
 709 
 710 /*
 711  * Allocate new LDT for process just large enough to contain seli.  Note we
 712  * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 713  * implementation and because on the hypervisor it's required, since the LDT
 714  * must live on pages that have PROT_WRITE removed and which are given to the
 715  * hypervisor.
 716  *
 717  * Note that we don't actually load the LDT into the current CPU here: it's done
 718  * later by our caller.
 719  */
/*
 * Allocate a new LDT for 'pp', just large enough to contain selector
 * index 'seli' (see block comment above for the page-granularity
 * rationale).  Caller holds p_ldtlock; the process must not yet have an
 * LDT.  KM_SLEEP means this cannot fail.
 */
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t	*ldt;
	size_t		ldtsz;
	uint_t		nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	/* The hypervisor requires the LDT pages to be read-only. */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	/* p_ldtlimit is the highest valid selector index, hence nsels - 1. */
	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
}
 750 
/*
 * Tear down and free the process's private LDT.  Detaches the LDT from
 * the proc under p_ldtlock first, unloads it from the current CPU if
 * 'pp' is running here, and only then releases the memory.
 */
static void
ldt_free(proc_t *pp)
{
	user_desc_t	*ldt;
	size_t		ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	/* LDTs are always allocated in whole pages (see ldt_alloc()). */
	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}
 786 
 787 /*
 788  * On fork copy new ldt for child.
 789  */
/*
 * On fork copy new ldt for child.
 *
 * Allocates an LDT for child 'cp' sized to match parent 'pp' and copies
 * the parent's descriptors into it.  Both p_ldtlock mutexes are held
 * across the copy, parent first.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t	ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make child's ldt writable so it can be copied into from
	 * parent's ldt. This works since ldt_alloc above did not load
	 * the ldt since its for the child process. If we tried to make
	 * an LDT writable that is loaded in hw the setprot operation
	 * would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	/* Restore the hypervisor-required read-only protection. */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);

}
 830 
 831 /*
 832  * Note that we don't actually load the LDT into the current CPU here: it's done
 833  * later by our caller - unless we take an error.  This works out because
 834  * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
 835  * (and therefore can't be using the freed old LDT), and by definition if the
 836  * new entry didn't pass validation, then the proc shouldn't be referencing an
 837  * entry in the extended region.
 838  */
/*
 * Grow the process's LDT so that selector index 'seli' fits, copying the
 * existing entries into the larger (page-rounded) allocation.  Caller
 * holds p_ldtlock.  See the block comment above for why the new LDT is
 * not loaded here.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t	*oldt, *nldt;
	uint_t		nsels;
	size_t		oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	/* Publish the new LDT, then free the old one (no longer loaded). */
	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	kmem_free(oldt, oldtsz);
}