1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2021 Joyent, Inc.
24 */
25
26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
28 /* All Rights Reserved */
29
30 /* Copyright (c) 1987, 1988 Microsoft Corporation */
31 /* All Rights Reserved */
32
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/signal.h>
38 #include <sys/errno.h>
39 #include <sys/fault.h>
40 #include <sys/syscall.h>
41 #include <sys/cpuvar.h>
42 #include <sys/sysi86.h>
43 #include <sys/psw.h>
44 #include <sys/cred.h>
45 #include <sys/policy.h>
46 #include <sys/thread.h>
47 #include <sys/debug.h>
48 #include <sys/ontrap.h>
49 #include <sys/privregs.h>
50 #include <sys/x86_archext.h>
51 #include <sys/vmem.h>
52 #include <sys/kmem.h>
53 #include <sys/mman.h>
54 #include <sys/archsystm.h>
55 #include <vm/hat.h>
56 #include <vm/as.h>
57 #include <vm/seg.h>
58 #include <vm/seg_kmem.h>
59 #include <vm/faultcode.h>
60 #include <sys/fp.h>
61 #include <sys/cmn_err.h>
62 #include <sys/segments.h>
63 #include <sys/clock.h>
64 #include <vm/hat_i86.h>
65 #if defined(__xpv)
66 #include <sys/hypervisor.h>
67 #include <sys/note.h>
68 #endif
69
70 static void ldt_alloc(proc_t *, uint_t);
71 static void ldt_free(proc_t *);
72 static void ldt_dup(proc_t *, proc_t *);
73 static void ldt_grow(proc_t *, uint_t);
74
75 /*
76 * sysi86 System Call
77 */
78
79 /* ARGSUSED */
80 int
81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
82 {
83 struct ssd ssd;
84 int error = 0;
85 int c;
86 proc_t *pp = curproc;
87
88 switch (cmd) {
89
90 /*
91 * The SI86V86 subsystem call of the SYSI86 system call
92 * supports only one subcode -- V86SC_IOPL.
93 */
94 case SI86V86:
95 if (arg1 == V86SC_IOPL) {
96 #if defined(__xpv)
97 struct ctxop *ctx;
98 #endif
99 struct regs *rp = lwptoregs(ttolwp(curthread));
100 greg_t oldpl = rp->r_ps & PS_IOPL;
101 greg_t newpl = arg2 & PS_IOPL;
102
103 /*
104 * Must be privileged to run this system call
105 * if giving more io privilege.
106 */
107 if (newpl > oldpl && (error =
108 secpolicy_sys_config(CRED(), B_FALSE)) != 0)
109 return (set_errno(error));
110 #if defined(__xpv)
111 ctx = installctx_preallocate();
112 kpreempt_disable();
113 installctx(curthread, NULL, xen_disable_user_iopl,
114 xen_enable_user_iopl, NULL, NULL,
115 xen_disable_user_iopl, NULL, ctx);
116 xen_enable_user_iopl();
117 kpreempt_enable();
118 #else
119 rp->r_ps ^= oldpl ^ newpl;
120 #endif
121 } else
122 error = EINVAL;
123 break;
124
125 /*
126 * Set a segment descriptor
127 */
128 case SI86DSCR:
129 /*
130 * There are considerable problems here manipulating
131 * resources shared by many running lwps. Get everyone
132 * into a safe state before changing the LDT.
133 */
134 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
135 error = EINTR;
136 break;
137 }
138
139 if (get_udatamodel() == DATAMODEL_LP64) {
140 error = EINVAL;
141 break;
142 }
143
144 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
145 error = EFAULT;
146 break;
147 }
148
149 error = setdscr(&ssd);
150
151 mutex_enter(&pp->p_lock);
152 if (curthread != pp->p_agenttp)
153 continuelwps(pp);
154 mutex_exit(&pp->p_lock);
155 break;
156
157 case SI86FPHW:
158 c = fp_kind & 0xff;
159 if (suword32((void *)arg1, c) == -1)
160 error = EFAULT;
161 break;
162
163 case SI86FPSTART:
164 /*
165 * arg1 is the address of _fp_hw
166 * arg2 is the desired x87 FCW value
167 * arg3 is the desired SSE MXCSR value
168 * a return value of one means SSE hardware, else none.
169 */
170 c = fp_kind & 0xff;
171 if (suword32((void *)arg1, c) == -1) {
172 error = EFAULT;
173 break;
174 }
175 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
176 return ((fp_kind & __FP_SSE) ? 1 : 0);
177
178 /* real time clock management commands */
179
180 case WTODC:
181 if ((error = secpolicy_settime(CRED())) == 0) {
182 timestruc_t ts;
183 mutex_enter(&tod_lock);
184 gethrestime(&ts);
185 tod_set(ts);
186 mutex_exit(&tod_lock);
187 }
188 break;
189
190 /* Give some timezone playing room */
191 #define ONEWEEK (7 * 24 * 60 * 60)
192
193 case SGMTL:
194 /*
195 * Called from 32 bit land, negative values
196 * are not sign extended, so we do that here
197 * by casting it to an int and back. We also
198 * clamp the value to within reason and detect
199 * when a 64 bit call overflows an int.
200 */
201 if ((error = secpolicy_settime(CRED())) == 0) {
202 int newlag = (int)arg1;
203
204 #ifdef _SYSCALL32_IMPL
205 if (get_udatamodel() == DATAMODEL_NATIVE &&
206 (long)newlag != (long)arg1) {
207 error = EOVERFLOW;
208 } else
209 #endif
210 if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
211 sgmtl(newlag);
212 else
213 error = EOVERFLOW;
214 }
215 break;
216
217 case GGMTL:
218 if (get_udatamodel() == DATAMODEL_NATIVE) {
219 if (sulword((void *)arg1, ggmtl()) == -1)
220 error = EFAULT;
221 #ifdef _SYSCALL32_IMPL
222 } else {
223 time_t gmtl;
224
225 if ((gmtl = ggmtl()) > INT32_MAX) {
226 /*
227 * Since gmt_lag can at most be
228 * +/- 12 hours, something is
229 * *seriously* messed up here.
230 */
231 error = EOVERFLOW;
232 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
233 error = EFAULT;
234 #endif
235 }
236 break;
237
238 case RTCSYNC:
239 if ((error = secpolicy_settime(CRED())) == 0)
240 rtcsync();
241 break;
242
243 /* END OF real time clock management commands */
244
245 default:
246 error = EINVAL;
247 break;
248 }
249 return (error == 0 ? 0 : set_errno(error));
250 }
251
252 void
253 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
254 {
255 ssd->bo = USEGD_GETBASE(usd);
256 ssd->ls = USEGD_GETLIMIT(usd);
257 ssd->sel = sel;
258
259 /*
260 * set type, dpl and present bits.
261 */
262 ssd->acc1 = usd->usd_type;
263 ssd->acc1 |= usd->usd_dpl << 5;
264 ssd->acc1 |= usd->usd_p << (5 + 2);
265
266 /*
267 * set avl, DB and granularity bits.
268 */
269 ssd->acc2 = usd->usd_avl;
270
271 #if defined(__amd64)
272 ssd->acc2 |= usd->usd_long << 1;
273 #else
274 ssd->acc2 |= usd->usd_reserved << 1;
275 #endif
276
277 ssd->acc2 |= usd->usd_def32 << (1 + 1);
278 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
279 }
280
/*
 * Convert a sysi86 "ssd" descriptor image into the hardware user_desc_t
 * layout.  Approximate inverse of usd_to_ssd(): we additionally force the
 * "accessed" bit on and clear the long-mode bit, so a round trip is not
 * bit-exact.  The target descriptor must be zeroed on entry.
 */
static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * Set type, dpl and present bits.
	 *
	 * Force the "accessed" bit to on so that we don't run afoul of
	 * KPTI.
	 */
	usd->usd_type = ssd->acc1 | SDT_A;
	/* Bitfield assignment truncates; only the relevant bits survive. */
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
#if defined(__amd64)
	usd->usd_long = 0;
#else
	usd->usd_reserved = 0;
#endif

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}
320
321
322 #if defined(__i386)
323
/*
 * Convert a sysi86 "ssd" image into a system gate descriptor (32-bit
 * kernel only).  Only user-callable call gates (SDT_SYSCGT at DPL 3) are
 * expected here, as the ASSERTs below enforce on DEBUG kernels.  The
 * target descriptor must be zeroed on entry.
 */
static void
ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
{

	ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);

	/* Gate target offset is split across the low and high halves. */
	sgd->sgd_looffset = ssd->bo;
	sgd->sgd_hioffset = ssd->bo >> 16;

	sgd->sgd_selector = ssd->ls;

	/*
	 * set type, dpl and present bits.
	 */
	sgd->sgd_type = ssd->acc1;
	sgd->sgd_dpl = ssd->acc1 >> 5;
	sgd->sgd_p = ssd->acc1 >> 7;
	ASSERT(sgd->sgd_type == SDT_SYSCGT);
	ASSERT(sgd->sgd_dpl == SEL_UPL);
	/* No parameters are copied through the gate. */
	sgd->sgd_stkcpy = 0;
}
345
346 #endif /* __i386 */
347
/*
 * Load LDT register with the current process's LDT.
 *
 * On bare metal the process LDT is first copied into this CPU's private
 * per-CPU LDT area (mcpu_ldt) so that, with KPTI, the LDT lives in the
 * user-visible mapping table; the CPU's GDT LDT slot is then pointed at
 * that copy before loading LDTR.  On Xen the hypervisor handles the load.
 * Callers disable preemption around this (see setdscr()/ldt_restorectx()).
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in the
	 * user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}
373
/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 *
 * On bare metal we also null the GDT's LDT slot and scrub this CPU's
 * private copy of the LDT contents so no stale descriptors linger in the
 * per-CPU (KPTI user-visible) area.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}
391
/*
 * Process context-save operation (also installed as the exit op, see
 * ldt_installctx()): runs when a process with a private LDT goes off CPU.
 * Unloads the LDT and re-enables fast syscall instructions on this CPU,
 * since whatever runs next has no private LDT loaded.
 */
/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

#if defined(__amd64)
	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 * Wine creats a ldt descriptor and points a segment register
	 * to it.
	 *
	 * We then context switch away from wine lwp to kernel
	 * thread and hit breakpoint in kernel with kmdb
	 *
	 * When we continue and resume from kmdb we will #gp
	 * fault since kmdb will have saved the stale ldt selector
	 * from wine and will try to restore it but we are no longer in
	 * the context of the wine process and do not have our
	 * ldtr register pointing to the private ldt.
	 */
	reset_sregs();
#endif

	ldt_unload();
	cpu_fast_syscall_enable();
}
423
/*
 * Process context-restore operation: runs when a process with a private
 * LDT comes back on CPU.  Reloads the LDT and disables fast syscall
 * instructions on this CPU (they would clobber %cs/%ss for a process
 * using a private LDT; see the comment in setdscr()).
 */
static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable();
}
433
/*
 * At exec time, we need to clear up our LDT context and re-enable fast
 * syscalls for the new process image.
 *
 * The same is true for the other case, where we have:
 *
 * proc_exit()
 *  ->exitpctx()->ldt_savectx()
 *  ->freepctx()->ldt_freectx()
 *
 * Because pre-emption is not prevented between the two callbacks, we could
 * have come off CPU, and brought back LDT context when coming back on CPU
 * via ldt_restorectx().  Hence the kpreempt_disable() bracket below, which
 * makes the free + fast-syscall re-enable atomic with respect to context
 * switch.
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	kpreempt_disable();
	ldt_free(p);
	cpu_fast_syscall_enable();
	kpreempt_enable();
}
460
/*
 * Install ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.  This function doubles as the
 * fork op itself (it is passed as the fork handler to installpctx()), in
 * which case cp is the child and we also duplicate the parent's LDT.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t *targ = p;
	kthread_t *t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 *
	 * NOTE: the removepctx() call inside the ASSERT only exists on DEBUG
	 * kernels; it must (and does) return 0 here, i.e. the ops are not
	 * already installed, so eliding it on non-DEBUG kernels is harmless.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}
506
/*
 * Validate and install the descriptor described by *ssd into the calling
 * process's private LDT, allocating the LDT on first use and growing it
 * as needed.  Returns 0 on success or an errno value (EINVAL for bad
 * selectors/types, EBUSY if a not-present update would invalidate a
 * descriptor still referenced by some lwp's segment registers).
 *
 * The caller (sysi86()/SI86DSCR) has already brought the process's other
 * lwps to a safe stop; the ASSERT on p_lwprcnt below depends on that.
 */
int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t	*pp = curproc;
	int	rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then setup a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this processes because they destroy
		 * the contents of %cs and %ss segment registers.
		 *
		 * Explicity disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable();
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {
		/* Context ops were installed when the LDT was created. */
		ASSERT(pp->p_pctx != NULL);

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case. An lwp creates a harmless LDT entry, points
	 * one of it's segment registers at it, then tells the kernel (here)
	 * to delete it. In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
	 * will happen in the normal case too. However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger things that we have valid segment registers, we
	 * have the problem that the debugger will try and restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a lot better failure mode than SIGKILL and a core file
	 * from kern_gpfault() too.)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
#if defined(__amd64)
			pcb_t *pcb = &lwp->lwp_pcb;
#endif

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

#if defined(__amd64)
			/*
			 * On amd64 the live data-segment selectors may be
			 * cached in the pcb rather than the reg set.
			 */
			if (PCB_NEED_UPDATE_SEGS(pcb)) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else
#endif
			{
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 * Make sure we update the CPU-private copy of the LDT.
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check segment type, allow segment not present and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Set up a code or data user segment descriptor, making sure to update
	 * the CPU-private copy of the LDT.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/* Anything that isn't a user segment is rejected. */
	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}
709
710 /*
711 * Allocate new LDT for process just large enough to contain seli. Note we
712 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
713 * implementation and because on the hypervisor it's required, since the LDT
714 * must live on pages that have PROT_WRITE removed and which are given to the
715 * hypervisor.
716 *
717 * Note that we don't actually load the LDT into the current CPU here: it's done
718 * later by our caller.
719 */
720 static void
721 ldt_alloc(proc_t *pp, uint_t seli)
722 {
723 user_desc_t *ldt;
724 size_t ldtsz;
725 uint_t nsels;
726
727 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
728 ASSERT(pp->p_ldt == NULL);
729 ASSERT(pp->p_ldtlimit == 0);
730
731 /*
732 * Allocate new LDT just large enough to contain seli. The LDT must
733 * always be allocated in units of pages for KPTI.
734 */
735 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
736 nsels = ldtsz / sizeof (user_desc_t);
737 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
738
739 ldt = kmem_zalloc(ldtsz, KM_SLEEP);
740 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
741
742 #if defined(__xpv)
743 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
744 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
745 #endif
746
747 pp->p_ldt = ldt;
748 pp->p_ldtlimit = nsels - 1;
749 }
750
/*
 * Tear down and free a process's private LDT.  Detaches the LDT from the
 * proc under p_ldtlock, unloads it from the current CPU if pp is curproc,
 * then (on Xen, only after the unload) restores write access and frees
 * the pages.
 */
static void
ldt_free(proc_t *pp)
{
	user_desc_t *ldt;
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	/* Clear the proc's view first so nothing can find the old table. */
	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	mutex_exit(&pp->p_lock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}
786
/*
 * On fork copy new ldt for child.  Allocates a child LDT of the same
 * size as the parent's (never loaded yet, so on Xen it can safely be
 * made writable for the copy) and duplicates the parent's descriptors
 * into it.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make child's ldt writable so it can be copied into from
	 * parent's ldt. This works since ldt_alloc above did not load
	 * the ldt since its for the child process. If we tried to make
	 * an LDT writable that is loaded in hw the setprot operation
	 * would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	/* Restore the hypervisor-required read-only protection. */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);

}
830
/*
 * Grow the process's LDT to cover descriptor index seli, copying the
 * existing entries into a larger page-aligned allocation.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's done
 * later by our caller - unless we take an error. This works out because
 * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
 * (and therefore can't be using the freed old LDT), and by definition if the
 * new entry didn't pass validation, then the proc shouldn't be referencing an
 * entry in the extended region.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t *oldt, *nldt;
	uint_t nsels;
	size_t oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	/* Publish the new table, then free the old one. */
	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	kmem_free(oldt, oldtsz);
}