Print this page
13902 Fix for 13717 may break 8-disk raidz2
13915 installctx() blocking allocate causes problems
Portions contributed by: Jerry Jelinek <gjelinek@gmail.com>
Change-Id: I934d69946cec42630fc541fa8c7385b862b69ca2
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/intel/ia32/os/sysi86.c
+++ new/usr/src/uts/intel/ia32/os/sysi86.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2018 Joyent, Inc.
23 + * Copyright 2021 Joyent, Inc.
24 24 */
25 25
26 26 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
28 -/* All Rights Reserved */
28 +/* All Rights Reserved */
29 29
30 30 /* Copyright (c) 1987, 1988 Microsoft Corporation */
31 31 /* All Rights Reserved */
32 32
33 33 #include <sys/param.h>
34 34 #include <sys/types.h>
35 35 #include <sys/sysmacros.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/signal.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/fault.h>
40 40 #include <sys/syscall.h>
41 41 #include <sys/cpuvar.h>
42 42 #include <sys/sysi86.h>
43 43 #include <sys/psw.h>
44 44 #include <sys/cred.h>
45 45 #include <sys/policy.h>
46 46 #include <sys/thread.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/ontrap.h>
49 49 #include <sys/privregs.h>
50 50 #include <sys/x86_archext.h>
51 51 #include <sys/vmem.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/mman.h>
54 54 #include <sys/archsystm.h>
55 55 #include <vm/hat.h>
56 56 #include <vm/as.h>
57 57 #include <vm/seg.h>
58 58 #include <vm/seg_kmem.h>
59 59 #include <vm/faultcode.h>
60 60 #include <sys/fp.h>
61 61 #include <sys/cmn_err.h>
62 62 #include <sys/segments.h>
63 63 #include <sys/clock.h>
64 64 #include <vm/hat_i86.h>
65 65 #if defined(__xpv)
66 66 #include <sys/hypervisor.h>
67 67 #include <sys/note.h>
68 68 #endif
69 69
70 70 static void ldt_alloc(proc_t *, uint_t);
71 71 static void ldt_free(proc_t *);
72 72 static void ldt_dup(proc_t *, proc_t *);
73 73 static void ldt_grow(proc_t *, uint_t);
74 74
75 75 /*
76 76 * sysi86 System Call
77 77 */
78 78
79 79 /* ARGSUSED */
80 80 int
81 81 sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
82 82 {
83 83 struct ssd ssd;
84 84 int error = 0;
85 85 int c;
|
↓ open down ↓ |
47 lines elided |
↑ open up ↑ |
86 86 proc_t *pp = curproc;
87 87
88 88 switch (cmd) {
89 89
90 90 /*
91 91 * The SI86V86 subsystem call of the SYSI86 system call
92 92 * supports only one subcode -- V86SC_IOPL.
93 93 */
94 94 case SI86V86:
95 95 if (arg1 == V86SC_IOPL) {
96 +#if defined(__xpv)
97 + struct ctxop *ctx;
98 +#endif
96 99 struct regs *rp = lwptoregs(ttolwp(curthread));
97 100 greg_t oldpl = rp->r_ps & PS_IOPL;
98 101 greg_t newpl = arg2 & PS_IOPL;
99 102
100 103 /*
101 104 * Must be privileged to run this system call
102 105 * if giving more io privilege.
103 106 */
104 107 if (newpl > oldpl && (error =
105 108 secpolicy_sys_config(CRED(), B_FALSE)) != 0)
106 109 return (set_errno(error));
107 110 #if defined(__xpv)
111 + ctx = installctx_preallocate();
108 112 kpreempt_disable();
109 113 installctx(curthread, NULL, xen_disable_user_iopl,
110 114 xen_enable_user_iopl, NULL, NULL,
111 - xen_disable_user_iopl, NULL);
115 + xen_disable_user_iopl, NULL, ctx);
112 116 xen_enable_user_iopl();
113 117 kpreempt_enable();
114 118 #else
115 119 rp->r_ps ^= oldpl ^ newpl;
116 120 #endif
117 121 } else
118 122 error = EINVAL;
119 123 break;
120 124
121 125 /*
122 126 * Set a segment descriptor
123 127 */
124 128 case SI86DSCR:
125 129 /*
126 130 * There are considerable problems here manipulating
127 131 * resources shared by many running lwps. Get everyone
128 132 * into a safe state before changing the LDT.
129 133 */
130 134 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
131 135 error = EINTR;
132 136 break;
133 137 }
134 138
135 139 if (get_udatamodel() == DATAMODEL_LP64) {
136 140 error = EINVAL;
137 141 break;
138 142 }
139 143
140 144 if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
141 145 error = EFAULT;
142 146 break;
143 147 }
144 148
145 149 error = setdscr(&ssd);
146 150
147 151 mutex_enter(&pp->p_lock);
148 152 if (curthread != pp->p_agenttp)
149 153 continuelwps(pp);
150 154 mutex_exit(&pp->p_lock);
151 155 break;
152 156
153 157 case SI86FPHW:
154 158 c = fp_kind & 0xff;
155 159 if (suword32((void *)arg1, c) == -1)
156 160 error = EFAULT;
157 161 break;
158 162
159 163 case SI86FPSTART:
160 164 /*
161 165 * arg1 is the address of _fp_hw
162 166 * arg2 is the desired x87 FCW value
163 167 * arg3 is the desired SSE MXCSR value
164 168 * a return value of one means SSE hardware, else none.
165 169 */
166 170 c = fp_kind & 0xff;
167 171 if (suword32((void *)arg1, c) == -1) {
168 172 error = EFAULT;
169 173 break;
170 174 }
171 175 fpsetcw((uint16_t)arg2, (uint32_t)arg3);
172 176 return ((fp_kind & __FP_SSE) ? 1 : 0);
173 177
174 178 /* real time clock management commands */
175 179
176 180 case WTODC:
177 181 if ((error = secpolicy_settime(CRED())) == 0) {
178 182 timestruc_t ts;
179 183 mutex_enter(&tod_lock);
180 184 gethrestime(&ts);
181 185 tod_set(ts);
182 186 mutex_exit(&tod_lock);
183 187 }
184 188 break;
185 189
186 190 /* Give some timezone playing room */
187 191 #define ONEWEEK (7 * 24 * 60 * 60)
188 192
189 193 case SGMTL:
190 194 /*
191 195 * Called from 32 bit land, negative values
192 196 * are not sign extended, so we do that here
193 197 * by casting it to an int and back. We also
194 198 * clamp the value to within reason and detect
195 199 * when a 64 bit call overflows an int.
196 200 */
197 201 if ((error = secpolicy_settime(CRED())) == 0) {
198 202 int newlag = (int)arg1;
199 203
200 204 #ifdef _SYSCALL32_IMPL
201 205 if (get_udatamodel() == DATAMODEL_NATIVE &&
202 206 (long)newlag != (long)arg1) {
203 207 error = EOVERFLOW;
204 208 } else
205 209 #endif
206 210 if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
207 211 sgmtl(newlag);
208 212 else
209 213 error = EOVERFLOW;
210 214 }
211 215 break;
212 216
213 217 case GGMTL:
214 218 if (get_udatamodel() == DATAMODEL_NATIVE) {
215 219 if (sulword((void *)arg1, ggmtl()) == -1)
216 220 error = EFAULT;
217 221 #ifdef _SYSCALL32_IMPL
218 222 } else {
219 223 time_t gmtl;
220 224
221 225 if ((gmtl = ggmtl()) > INT32_MAX) {
222 226 /*
223 227 * Since gmt_lag can at most be
224 228 * +/- 12 hours, something is
225 229 * *seriously* messed up here.
226 230 */
227 231 error = EOVERFLOW;
228 232 } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
229 233 error = EFAULT;
230 234 #endif
231 235 }
232 236 break;
233 237
234 238 case RTCSYNC:
235 239 if ((error = secpolicy_settime(CRED())) == 0)
236 240 rtcsync();
237 241 break;
238 242
239 243 /* END OF real time clock management commands */
240 244
241 245 default:
242 246 error = EINVAL;
243 247 break;
244 248 }
245 249 return (error == 0 ? 0 : set_errno(error));
246 250 }
247 251
248 252 void
249 253 usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
250 254 {
251 255 ssd->bo = USEGD_GETBASE(usd);
252 256 ssd->ls = USEGD_GETLIMIT(usd);
253 257 ssd->sel = sel;
254 258
255 259 /*
256 260 * set type, dpl and present bits.
257 261 */
258 262 ssd->acc1 = usd->usd_type;
259 263 ssd->acc1 |= usd->usd_dpl << 5;
260 264 ssd->acc1 |= usd->usd_p << (5 + 2);
261 265
262 266 /*
263 267 * set avl, DB and granularity bits.
264 268 */
265 269 ssd->acc2 = usd->usd_avl;
266 270
267 271 #if defined(__amd64)
268 272 ssd->acc2 |= usd->usd_long << 1;
269 273 #else
270 274 ssd->acc2 |= usd->usd_reserved << 1;
271 275 #endif
272 276
273 277 ssd->acc2 |= usd->usd_def32 << (1 + 1);
274 278 ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
275 279 }
276 280
277 281 static void
278 282 ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
279 283 {
280 284
281 285 ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
282 286
283 287 USEGD_SETBASE(usd, ssd->bo);
284 288 USEGD_SETLIMIT(usd, ssd->ls);
285 289
286 290 /*
287 291 * Set type, dpl and present bits.
288 292 *
289 293 * Force the "accessed" bit to on so that we don't run afoul of
290 294 * KPTI.
291 295 */
292 296 usd->usd_type = ssd->acc1 | SDT_A;
293 297 usd->usd_dpl = ssd->acc1 >> 5;
294 298 usd->usd_p = ssd->acc1 >> (5 + 2);
295 299
296 300 ASSERT(usd->usd_type >= SDT_MEMRO);
297 301 ASSERT(usd->usd_dpl == SEL_UPL);
298 302
299 303 /*
300 304 * 64-bit code selectors are never allowed in the LDT.
301 305 * Reserved bit is always 0 on 32-bit systems.
302 306 */
303 307 #if defined(__amd64)
304 308 usd->usd_long = 0;
305 309 #else
306 310 usd->usd_reserved = 0;
307 311 #endif
308 312
309 313 /*
310 314 * set avl, DB and granularity bits.
311 315 */
312 316 usd->usd_avl = ssd->acc2;
313 317 usd->usd_def32 = ssd->acc2 >> (1 + 1);
314 318 usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
315 319 }
316 320
317 321
318 322 #if defined(__i386)
319 323
320 324 static void
321 325 ssd_to_sgd(struct ssd *ssd, gate_desc_t *sgd)
322 326 {
323 327
324 328 ASSERT(bcmp(sgd, &null_sdesc, sizeof (*sgd)) == 0);
325 329
326 330 sgd->sgd_looffset = ssd->bo;
327 331 sgd->sgd_hioffset = ssd->bo >> 16;
328 332
329 333 sgd->sgd_selector = ssd->ls;
330 334
331 335 /*
332 336 * set type, dpl and present bits.
333 337 */
334 338 sgd->sgd_type = ssd->acc1;
335 339 sgd->sgd_dpl = ssd->acc1 >> 5;
336 340 sgd->sgd_p = ssd->acc1 >> 7;
337 341 ASSERT(sgd->sgd_type == SDT_SYSCGT);
338 342 ASSERT(sgd->sgd_dpl == SEL_UPL);
339 343 sgd->sgd_stkcpy = 0;
340 344 }
341 345
342 346 #endif /* __i386 */
343 347
344 348 /*
345 349 * Load LDT register with the current process's LDT.
346 350 */
347 351 static void
348 352 ldt_load(void)
349 353 {
350 354 #if defined(__xpv)
351 355 xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
352 356 #else
353 357 size_t len;
354 358 system_desc_t desc;
355 359
356 360 /*
357 361 * Before we can use the LDT on this CPU, we must install the LDT in the
358 362 * user mapping table.
359 363 */
360 364 len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
361 365 bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
362 366 CPU->cpu_m.mcpu_ldt_len = len;
363 367 set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
364 368 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
365 369
366 370 wr_ldtr(ULDT_SEL);
367 371 #endif
368 372 }
369 373
370 374 /*
371 375 * Store a NULL selector in the LDTR. All subsequent illegal references to
372 376 * the LDT will result in a #gp.
373 377 */
374 378 void
375 379 ldt_unload(void)
376 380 {
377 381 #if defined(__xpv)
378 382 xen_set_ldt(NULL, 0);
379 383 #else
380 384 *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
381 385 wr_ldtr(0);
382 386
383 387 bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
384 388 CPU->cpu_m.mcpu_ldt_len = 0;
385 389 #endif
386 390 }
387 391
388 392 /*ARGSUSED*/
389 393 static void
390 394 ldt_savectx(proc_t *p)
|
↓ open down ↓ |
269 lines elided |
↑ open up ↑ |
391 395 {
392 396 ASSERT(p->p_ldt != NULL);
393 397 ASSERT(p == curproc);
394 398
395 399 #if defined(__amd64)
396 400 /*
397 401 * The 64-bit kernel must be sure to clear any stale ldt
398 402 * selectors when context switching away from a process that
399 403 * has a private ldt. Consider the following example:
400 404 *
401 - * Wine creats a ldt descriptor and points a segment register
402 - * to it.
405 + * Wine creats a ldt descriptor and points a segment register
406 + * to it.
403 407 *
404 408 * We then context switch away from wine lwp to kernel
405 409 * thread and hit breakpoint in kernel with kmdb
406 410 *
407 411 * When we continue and resume from kmdb we will #gp
408 - * fault since kmdb will have saved the stale ldt selector
412 + * fault since kmdb will have saved the stale ldt selector
409 413 * from wine and will try to restore it but we are no longer in
410 414 * the context of the wine process and do not have our
411 415 * ldtr register pointing to the private ldt.
412 416 */
413 417 reset_sregs();
414 418 #endif
415 419
416 420 ldt_unload();
417 421 cpu_fast_syscall_enable();
418 422 }
419 423
420 424 static void
421 425 ldt_restorectx(proc_t *p)
422 426 {
423 427 ASSERT(p->p_ldt != NULL);
424 428 ASSERT(p == curproc);
425 429
426 430 ldt_load();
427 431 cpu_fast_syscall_disable();
428 432 }
429 433
430 434 /*
431 435 * At exec time, we need to clear up our LDT context and re-enable fast syscalls
432 436 * for the new process image.
433 437 *
434 438 * The same is true for the other case, where we have:
435 439 *
436 440 * proc_exit()
437 441 * ->exitpctx()->ldt_savectx()
438 442 * ->freepctx()->ldt_freectx()
439 443 *
440 444 * Because pre-emption is not prevented between the two callbacks, we could have
441 445 * come off CPU, and brought back LDT context when coming back on CPU via
442 446 * ldt_restorectx().
443 447 */
444 448 /* ARGSUSED */
445 449 static void
446 450 ldt_freectx(proc_t *p, int isexec)
447 451 {
448 452 ASSERT(p->p_ldt != NULL);
449 453 ASSERT(p == curproc);
450 454
451 455 kpreempt_disable();
452 456 ldt_free(p);
453 457 cpu_fast_syscall_enable();
454 458 kpreempt_enable();
455 459 }
456 460
457 461 /*
458 462 * Install ctx op that ensures syscall/sysenter are disabled.
459 463 * See comments below.
460 464 *
461 465 * When a thread with a private LDT forks, the new process
462 466 * must have the LDT context ops installed.
463 467 */
464 468 /* ARGSUSED */
465 469 static void
466 470 ldt_installctx(proc_t *p, proc_t *cp)
467 471 {
468 472 proc_t *targ = p;
469 473 kthread_t *t;
470 474
471 475 /*
472 476 * If this is a fork, operate on the child process.
473 477 */
474 478 if (cp != NULL) {
475 479 targ = cp;
476 480 ldt_dup(p, cp);
477 481 }
478 482
479 483 /*
480 484 * The process context ops expect the target process as their argument.
481 485 */
482 486 ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
483 487 ldt_installctx, ldt_savectx, ldt_freectx) == 0);
484 488
485 489 installpctx(targ, targ, ldt_savectx, ldt_restorectx,
486 490 ldt_installctx, ldt_savectx, ldt_freectx);
487 491
488 492 /*
489 493 * We've just disabled fast system call and return instructions; take
490 494 * the slow path out to make sure we don't try to use one to return
491 495 * back to user. We must set t_post_sys for every thread in the
492 496 * process to make sure none of them escape out via fast return.
493 497 */
494 498
495 499 mutex_enter(&targ->p_lock);
496 500 t = targ->p_tlist;
497 501 do {
498 502 t->t_post_sys = 1;
499 503 } while ((t = t->t_forw) != targ->p_tlist);
500 504 mutex_exit(&targ->p_lock);
501 505 }
502 506
503 507 int
504 508 setdscr(struct ssd *ssd)
505 509 {
506 510 ushort_t seli; /* selector index */
507 511 user_desc_t *ldp; /* descriptor pointer */
508 512 user_desc_t ndesc; /* new descriptor */
509 513 proc_t *pp = curproc;
510 514 int rc = 0;
511 515
512 516 /*
513 517 * LDT segments: executable and data at DPL 3 only.
514 518 */
515 519 if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
516 520 return (EINVAL);
517 521
518 522 /*
519 523 * check the selector index.
520 524 */
521 525 seli = SELTOIDX(ssd->sel);
522 526 if (seli >= MAXNLDT || seli < LDT_UDBASE)
523 527 return (EINVAL);
524 528
525 529 ndesc = null_udesc;
526 530 mutex_enter(&pp->p_ldtlock);
527 531
528 532 /*
529 533 * If this is the first time for this process then setup a
530 534 * private LDT for it.
531 535 */
532 536 if (pp->p_ldt == NULL) {
533 537 ldt_alloc(pp, seli);
534 538
535 539 /*
536 540 * Now that this process has a private LDT, the use of
537 541 * the syscall/sysret and sysenter/sysexit instructions
538 542 * is forbidden for this process because they destroy
539 543 * the contents of %cs and %ss segment registers.
540 544 *
541 545 * Explicitly disable them here and add a context handler
542 546 * to the process. Note that disabling
543 547 * them here means we can't use sysret or sysexit on
544 548 * the way out of this system call - so we force this
545 549 * thread to take the slow path (which doesn't make use
546 550 * of sysenter or sysexit) back out.
547 551 */
548 552 kpreempt_disable();
549 553 ldt_installctx(pp, NULL);
550 554 cpu_fast_syscall_disable();
551 555 ASSERT(curthread->t_post_sys != 0);
552 556 kpreempt_enable();
553 557
554 558 } else if (seli > pp->p_ldtlimit) {
555 559 ASSERT(pp->p_pctx != NULL);
556 560
557 561 /*
558 562 * Increase size of ldt to include seli.
559 563 */
560 564 ldt_grow(pp, seli);
561 565 }
562 566
563 567 ASSERT(seli <= pp->p_ldtlimit);
564 568 ldp = &pp->p_ldt[seli];
565 569
566 570 /*
567 571 * On the 64-bit kernel, this is where things get more subtle.
568 572 * Recall that in the 64-bit kernel, when we enter the kernel we
569 573 * deliberately -don't- reload the segment selectors we came in on
570 574 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
571 575 * and the underlying descriptors are essentially ignored by the
572 576 * hardware in long mode - except for the base that we override with
573 577 * the gsbase MSRs.
574 578 *
575 579 * However, there's one unfortunate issue with this rosy picture --
576 580 * a descriptor that's not marked as 'present' will still generate
577 581 * an #np when loading a segment register.
578 582 *
579 583 * Consider this case. An lwp creates a harmless LDT entry, points
580 584 * one of its segment registers at it, then tells the kernel (here)
581 585 * to delete it. In the 32-bit kernel, the #np will happen on the
582 586 * way back to userland where we reload the segment registers, and be
583 587 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
584 588 * will happen in the normal case too. However, if we're trying to
585 589 * use a debugger that wants to save and restore the segment registers,
586 590 * and the debugger thinks that we have valid segment registers, we
587 591 * have the problem that the debugger will try and restore the
588 592 * segment register that points at the now 'not present' descriptor
589 593 * and will take a #np right there.
590 594 *
591 595 * We should obviously fix the debugger to be paranoid about
592 596 * -not- restoring segment registers that point to bad descriptors;
593 597 * however we can prevent the problem here if we check to see if any
594 598 * of the segment registers are still pointing at the thing we're
595 599 * destroying; if they are, return an error instead. (That also seems
596 600 * a lot better failure mode than SIGKILL and a core file
597 601 * from kern_gpfault() too.)
598 602 */
599 603 if (SI86SSD_PRES(ssd) == 0) {
600 604 kthread_t *t;
601 605 int bad = 0;
602 606
603 607 /*
604 608 * Look carefully at the segment registers of every lwp
605 609 * in the process (they're all stopped by our caller).
606 610 * If we're about to invalidate a descriptor that's still
607 611 * being referenced by *any* of them, return an error,
608 612 * rather than having them #gp on their way out of the kernel.
609 613 */
610 614 ASSERT(pp->p_lwprcnt == 1);
611 615
612 616 mutex_enter(&pp->p_lock);
613 617 t = pp->p_tlist;
614 618 do {
615 619 klwp_t *lwp = ttolwp(t);
616 620 struct regs *rp = lwp->lwp_regs;
617 621 #if defined(__amd64)
618 622 pcb_t *pcb = &lwp->lwp_pcb;
619 623 #endif
620 624
621 625 if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
622 626 bad = 1;
623 627 break;
624 628 }
625 629
626 630 #if defined(__amd64)
627 631 if (PCB_NEED_UPDATE_SEGS(pcb)) {
628 632 if (ssd->sel == pcb->pcb_ds ||
629 633 ssd->sel == pcb->pcb_es ||
630 634 ssd->sel == pcb->pcb_fs ||
631 635 ssd->sel == pcb->pcb_gs) {
632 636 bad = 1;
633 637 break;
634 638 }
635 639 } else
636 640 #endif
637 641 {
638 642 if (ssd->sel == rp->r_ds ||
639 643 ssd->sel == rp->r_es ||
640 644 ssd->sel == rp->r_fs ||
641 645 ssd->sel == rp->r_gs) {
642 646 bad = 1;
643 647 break;
644 648 }
645 649 }
646 650
647 651 } while ((t = t->t_forw) != pp->p_tlist);
648 652 mutex_exit(&pp->p_lock);
649 653
650 654 if (bad) {
651 655 mutex_exit(&pp->p_ldtlock);
652 656 return (EBUSY);
653 657 }
654 658 }
655 659
656 660 /*
657 661 * If acc1 is zero, clear the descriptor (including the 'present' bit).
658 662 * Make sure we update the CPU-private copy of the LDT.
659 663 */
660 664 if (ssd->acc1 == 0) {
661 665 rc = ldt_update_segd(ldp, &null_udesc);
662 666 kpreempt_disable();
663 667 ldt_load();
664 668 kpreempt_enable();
665 669 mutex_exit(&pp->p_ldtlock);
666 670 return (rc);
667 671 }
668 672
669 673 /*
670 674 * Check segment type, allow segment not present and
671 675 * only user DPL (3).
672 676 */
673 677 if (SI86SSD_DPL(ssd) != SEL_UPL) {
674 678 mutex_exit(&pp->p_ldtlock);
675 679 return (EINVAL);
676 680 }
677 681
678 682 /*
679 683 * Do not allow 32-bit applications to create 64-bit mode code
680 684 * segments.
681 685 */
682 686 if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
683 687 SI86SSD_ISLONG(ssd)) {
684 688 mutex_exit(&pp->p_ldtlock);
685 689 return (EINVAL);
686 690 }
687 691
688 692 /*
689 693 * Set up a code or data user segment descriptor, making sure to update
690 694 * the CPU-private copy of the LDT.
691 695 */
692 696 if (SI86SSD_ISUSEG(ssd)) {
693 697 ssd_to_usd(ssd, &ndesc);
694 698 rc = ldt_update_segd(ldp, &ndesc);
695 699 kpreempt_disable();
696 700 ldt_load();
697 701 kpreempt_enable();
698 702 mutex_exit(&pp->p_ldtlock);
699 703 return (rc);
700 704 }
701 705
702 706 mutex_exit(&pp->p_ldtlock);
703 707 return (EINVAL);
704 708 }
705 709
706 710 /*
707 711 * Allocate new LDT for process just large enough to contain seli. Note we
708 712 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
709 713 * implementation and because on the hypervisor it's required, since the LDT
710 714 * must live on pages that have PROT_WRITE removed and which are given to the
711 715 * hypervisor.
712 716 *
713 717 * Note that we don't actually load the LDT into the current CPU here: it's done
714 718 * later by our caller.
715 719 */
716 720 static void
717 721 ldt_alloc(proc_t *pp, uint_t seli)
718 722 {
719 723 user_desc_t *ldt;
720 724 size_t ldtsz;
721 725 uint_t nsels;
722 726
723 727 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
724 728 ASSERT(pp->p_ldt == NULL);
725 729 ASSERT(pp->p_ldtlimit == 0);
726 730
727 731 /*
728 732 * Allocate new LDT just large enough to contain seli. The LDT must
729 733 * always be allocated in units of pages for KPTI.
730 734 */
731 735 ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
732 736 nsels = ldtsz / sizeof (user_desc_t);
733 737 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
734 738
735 739 ldt = kmem_zalloc(ldtsz, KM_SLEEP);
736 740 ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
737 741
738 742 #if defined(__xpv)
739 743 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
740 744 panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
741 745 #endif
742 746
743 747 pp->p_ldt = ldt;
744 748 pp->p_ldtlimit = nsels - 1;
745 749 }
746 750
747 751 static void
748 752 ldt_free(proc_t *pp)
749 753 {
750 754 user_desc_t *ldt;
751 755 size_t ldtsz;
752 756
753 757 ASSERT(pp->p_ldt != NULL);
754 758
755 759 mutex_enter(&pp->p_ldtlock);
756 760 ldt = pp->p_ldt;
757 761 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
758 762
759 763 ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
760 764
761 765 pp->p_ldt = NULL;
762 766 pp->p_ldtlimit = 0;
763 767 mutex_exit(&pp->p_ldtlock);
764 768
765 769 if (pp == curproc) {
766 770 kpreempt_disable();
767 771 ldt_unload();
768 772 kpreempt_enable();
769 773 }
770 774
771 775 #if defined(__xpv)
772 776 /*
773 777 * We are not allowed to make the ldt writable until after
774 778 * we tell the hypervisor to unload it.
775 779 */
776 780 if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
777 781 panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
778 782 #endif
779 783
780 784 kmem_free(ldt, ldtsz);
781 785 }
782 786
783 787 /*
784 788 * On fork copy new ldt for child.
785 789 */
786 790 static void
787 791 ldt_dup(proc_t *pp, proc_t *cp)
788 792 {
789 793 size_t ldtsz;
790 794
791 795 ASSERT(pp->p_ldt != NULL);
792 796 ASSERT(cp != curproc);
793 797
794 798 /*
795 799 * I assume the parent's ldt can't increase since we're in a fork.
796 800 */
797 801 mutex_enter(&pp->p_ldtlock);
798 802 mutex_enter(&cp->p_ldtlock);
799 803
800 804 ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
801 805
802 806 ldt_alloc(cp, pp->p_ldtlimit);
803 807
804 808 #if defined(__xpv)
805 809 /*
806 810 * Make child's ldt writable so it can be copied into from
807 811 * parent's ldt. This works since ldt_alloc above did not load
808 812 * the ldt since it's for the child process. If we tried to make
809 813 * an LDT writable that is loaded in hw the setprot operation
810 814 * would fail.
811 815 */
812 816 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
813 817 panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
814 818 #endif
815 819
816 820 bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
817 821
818 822 #if defined(__xpv)
819 823 if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
820 824 panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
821 825 #endif
822 826 mutex_exit(&cp->p_ldtlock);
823 827 mutex_exit(&pp->p_ldtlock);
824 828
825 829 }
826 830
827 831 /*
828 832 * Note that we don't actually load the LDT into the current CPU here: it's done
829 833 * later by our caller - unless we take an error. This works out because
830 834 * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
831 835 * (and therefore can't be using the freed old LDT), and by definition if the
832 836 * new entry didn't pass validation, then the proc shouldn't be referencing an
833 837 * entry in the extended region.
834 838 */
835 839 static void
836 840 ldt_grow(proc_t *pp, uint_t seli)
837 841 {
838 842 user_desc_t *oldt, *nldt;
839 843 uint_t nsels;
840 844 size_t oldtsz, nldtsz;
841 845
842 846 ASSERT(MUTEX_HELD(&pp->p_ldtlock));
843 847 ASSERT(pp->p_ldt != NULL);
844 848 ASSERT(pp->p_ldtlimit != 0);
845 849
846 850 /*
847 851 * Allocate larger LDT just large enough to contain seli. The LDT must
848 852 * always be allocated in units of pages for KPTI.
849 853 */
850 854 nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
851 855 nsels = nldtsz / sizeof (user_desc_t);
852 856 ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
853 857 ASSERT(nsels > pp->p_ldtlimit);
854 858
855 859 oldt = pp->p_ldt;
856 860 oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
857 861
858 862 nldt = kmem_zalloc(nldtsz, KM_SLEEP);
859 863 ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
860 864
861 865 bcopy(oldt, nldt, oldtsz);
862 866
863 867 /*
864 868 * unload old ldt.
865 869 */
866 870 kpreempt_disable();
867 871 ldt_unload();
868 872 kpreempt_enable();
869 873
870 874 #if defined(__xpv)
871 875
872 876 /*
873 877 * Make old ldt writable and new ldt read only.
874 878 */
875 879 if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
876 880 panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
877 881
878 882 if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
879 883 panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
880 884 #endif
881 885
882 886 pp->p_ldt = nldt;
883 887 pp->p_ldtlimit = nsels - 1;
884 888
885 889 kmem_free(oldt, oldtsz);
886 890 }
|
↓ open down ↓ |
468 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX