1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2022 Joyent, Inc.
28 */
29
30 #include <sys/errno.h>
31 #include <sys/systm.h>
32 #include <sys/archsystm.h>
33 #include <sys/privregs.h>
34 #include <sys/exec.h>
35 #include <sys/lwp.h>
36 #include <sys/sem.h>
37 #include <sys/brand.h>
38 #include <sys/lx_brand.h>
39 #include <sys/lx_misc.h>
40 #include <sys/lx_siginfo.h>
41 #include <sys/lx_futex.h>
42 #include <lx_errno.h>
43 #include <sys/lx_userhz.h>
44 #include <sys/cmn_err.h>
45 #include <sys/siginfo.h>
46 #include <sys/contract/process_impl.h>
47 #include <sys/x86_archext.h>
48 #include <sys/sdt.h>
49 #include <lx_signum.h>
50 #include <lx_syscall.h>
51 #include <sys/proc.h>
52 #include <sys/procfs.h>
53 #include <net/if.h>
54 #include <inet/ip6.h>
55 #include <sys/sunddi.h>
56 #include <sys/dlpi.h>
57 #include <sys/sysmacros.h>
58
/* Linux specific functions and definitions */

/* Save/restore hooks for the per-LWP Linux TLS GDT entries (see below). */
static void lx_save(void *);
static void lx_restore(void *);

/*
 * Context op template.  Installed on each branded LWP so that lx_save runs
 * when the thread is switched off CPU (or exits) and lx_restore runs when
 * it is switched back on.  Note ct_exit reuses lx_save.
 */
static const struct ctxop_template lx_ctxop_template = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = lx_save,
	.ct_restore = lx_restore,
	.ct_exit = lx_save,
};
70
71 /*
72 * Set the return code for the forked child, always zero
73 */
74 /*ARGSUSED*/
75 void
76 lx_setrval(klwp_t *lwp, int v1, int v2)
77 {
78 lwptoregs(lwp)->r_r0 = 0;
79 }
80
81 /*
82 * Reset process state on exec(2)
83 */
84 void
85 lx_exec()
86 {
87 klwp_t *lwp = ttolwp(curthread);
88 struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
89 proc_t *p = ttoproc(curthread);
90 lx_proc_data_t *pd = ptolxproc(p);
91 struct regs *rp = lwptoregs(lwp);
92
93 /* b_exec is called without p_lock held */
94 VERIFY(MUTEX_NOT_HELD(&p->p_lock));
95
96 /*
97 * Any l_handler handlers set as a result of B_REGISTER are now
98 * invalid; clear them.
99 */
100 pd->l_handler = (uintptr_t)NULL;
101
102 /*
103 * If this was a multi-threaded Linux process and this lwp wasn't the
104 * main lwp, then we need to make its Illumos and Linux PIDs match.
105 */
106 if (curthread->t_tid != 1) {
107 lx_pid_reassign(curthread);
108 }
109
110 /*
111 * Inform ptrace(2) that we are processing an execve(2) call so that if
112 * we are traced we can post either the PTRACE_EVENT_EXEC event or the
113 * legacy SIGTRAP.
114 */
115 (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
116
117 /* clear the fs/gsbase values until the app. can reinitialize them */
118 lwpd->br_lx_fsbase = (uintptr_t)NULL;
119 lwpd->br_ntv_fsbase = (uintptr_t)NULL;
120 lwpd->br_lx_gsbase = (uintptr_t)NULL;
121 lwpd->br_ntv_gsbase = (uintptr_t)NULL;
122
123 /*
124 * Clear the native stack flags. This will be reinitialised by
125 * lx_init() in the new process image.
126 */
127 lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
128 lwpd->br_ntv_stack = 0;
129 lwpd->br_ntv_stack_current = 0;
130
131 ctxop_install(lwptot(lwp), &lx_ctxop_template, lwp);
132
133 /*
134 * clear out the tls array
135 */
136 bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
137
138 /*
139 * reset the tls entries in the gdt
140 */
141 kpreempt_disable();
142 lx_restore(lwp);
143 kpreempt_enable();
144
145 /*
146 * The exec syscall doesn't return (so we don't call lx_syscall_return)
147 * but for our ptrace emulation we need to do this so that a tracer
148 * does not get out of sync. We know that by the time this lx_exec
149 * function is called that the exec has succeeded.
150 */
151 rp->r_r0 = 0;
152 (void) lx_ptrace_stop(LX_PR_SYSEXIT);
153 }
154
/*
 * Release per-LWP emulation state that must be torn down while the process
 * is still intact: ptrace(2) state, the TP_KTHREAD flag, any registered
 * robust futex list, and this LWP's TLS entries in the CPU's GDT.  Safe to
 * call more than once for the same LWP (see the comment at the end, and
 * the callers lx_exitlwp and lx_freelwp).
 */
static void
lx_cleanlwp(klwp_t *lwp, proc_t *p)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	void *rb_list = NULL;

	VERIFY(lwpd != NULL);

	mutex_enter(&p->p_lock);
	if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
		lx_ptrace_exit(p, lwp);
	}

	/*
	 * While we have p_lock, clear the TP_KTHREAD flag. This is needed
	 * to prevent races within lx procfs. It's fine for prchoose() to pick
	 * this thread now since it is exiting and no longer blocked in the
	 * kernel.
	 */
	lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;

	/*
	 * While we have p_lock, safely grab any robust_list references and
	 * clear the lwp field.
	 */
	sprlock_proc(p);
	rb_list = lwpd->br_robust_list;
	lwpd->br_robust_list = NULL;
	sprunlock(p);

	/* Process the captured robust list outside of p_lock. */
	if (rb_list != NULL) {
		lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
	}

	/*
	 * We need to run our context exit operation (lx_save) here to ensure
	 * we don't leave any garbage around. This is necessary to handle the
	 * following calling sequence:
	 *    exit -> proc_exit -> lx_freelwp -> removectx
	 * That is, when our branded process exits, proc_exit will call our
	 * lx_freelwp brand hook which does call this function (lx_cleanlwp),
	 * but lx_freelwp also removes our context exit operation. The context
	 * exit functions are run by exitctx, which is called by either
	 * lwp_exit or thread_exit. The thread_exit function is called at the
	 * end of proc_exit when we'll swtch() to another thread, but by then
	 * our context exit function has been removed.
	 *
	 * It's ok if this function happens to be called more than once (for
	 * example, if we exec a native binary).
	 */
	kpreempt_disable();
	lx_save(lwp);
	kpreempt_enable();
}
209
/*
 * b_lwpexit brand hook: tear down per-LWP emulation state as the LWP exits.
 * Zeroes and wakes the clone(2) child-tid futex word if one was registered,
 * and for clone(2)-created threads posts the requested exit signal to the
 * parent thread or process.
 */
void
lx_exitlwp(klwp_t *lwp)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	proc_t *p = lwptoproc(lwp);
	kthread_t *t;
	sigqueue_t *sqp = NULL;
	pid_t ppid;
	id_t ptid;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	if (lwpd == NULL) {
		/* second time thru' */
		return;
	}

	lx_cleanlwp(lwp, p);

	/* CLONE_CHILD_CLEARTID: zero the tid word and wake one waiter. */
	if (lwpd->br_clear_ctidp != NULL) {
		(void) suword32(lwpd->br_clear_ctidp, 0);
		(void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
		    (uintptr_t)NULL, (uintptr_t)NULL, 0);
		lwpd->br_clear_ctidp = NULL;
	}

	if (lwpd->br_signal != 0) {
		/*
		 * The first thread in a process doesn't cause a signal to
		 * be sent when it exits. It was created by a fork(), not
		 * a clone(), so the parent should get signalled when the
		 * process exits.
		 */
		if (lwpd->br_ptid == -1)
			goto free;

		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
		/*
		 * If br_ppid is 0, it means this is a CLONE_PARENT thread,
		 * so the signal goes to the parent process - not to a
		 * specific thread in this process.
		 */
		p = lwptoproc(lwp);
		if (lwpd->br_ppid == 0) {
			mutex_enter(&p->p_lock);
			ppid = p->p_ppid;
			t = NULL;
		} else {
			/*
			 * If we have been reparented to init or if our
			 * parent thread is gone, then nobody gets
			 * signaled.
			 */
			if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
			    (ptid == -1))
				goto free;

			mutex_enter(&pidlock);
			if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
				mutex_exit(&pidlock);
				goto free;
			}
			/* Take the parent's p_lock before dropping pidlock. */
			mutex_enter(&p->p_lock);
			mutex_exit(&pidlock);

			if ((t = idtot(p, ptid)) == NULL) {
				mutex_exit(&p->p_lock);
				goto free;
			}
		}

		sqp->sq_info.si_signo = lwpd->br_signal;
		sqp->sq_info.si_code = lwpd->br_exitwhy;
		sqp->sq_info.si_status = lwpd->br_exitwhat;
		sqp->sq_info.si_pid = lwpd->br_pid;
		sqp->sq_info.si_uid = crgetruid(CRED());
		sigaddqa(p, t, sqp);
		mutex_exit(&p->p_lock);
		/* Ownership of sqp passed to the signal queue. */
		sqp = NULL;
	}

free:
	if (lwpd->br_scall_args != NULL) {
		ASSERT(lwpd->br_args_size > 0);
		kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
	}
	if (sqp)
		kmem_free(sqp, sizeof (sigqueue_t));
}
299
/*
 * b_lwp_free brand hook: release all brand-private LWP resources.  May run
 * without a prior lx_exitlwp (e.g. during de-branding on exec of a native
 * binary), so it calls lx_cleanlwp itself; that call is safe to repeat.
 */
void
lx_freelwp(klwp_t *lwp)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	proc_t *p = lwptoproc(lwp);
	lx_zone_data_t *lxzdata;
	vfs_t *cgrp;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	if (lwpd == NULL) {
		/*
		 * There is one case where an LX branded process will possess
		 * LWPs which lack their own brand data. During the course of
		 * executing native binary, the process will be preemptively
		 * branded to allow hooks such as b_native_exec to function.
		 * If that process possesses multiple LWPS, they will _not_ be
		 * branded since they will exit if the exec succeeds. It's
		 * during this LWP exit that lx_freelwp would be called on an
		 * unbranded LWP. When that is the case, it is acceptable to
		 * bypass the hook.
		 */
		return;
	}

	/* cgroup integration: notify the cgroup fs of the departing LWP */
	lxzdata = ztolxzd(p->p_zone);
	mutex_enter(&lxzdata->lxzd_lock);
	cgrp = lxzdata->lxzd_cgroup;
	if (cgrp != NULL) {
		/* Hold the cgroup VFS across the callback, sans lxzd_lock. */
		VFS_HOLD(cgrp);
		mutex_exit(&lxzdata->lxzd_lock);
		ASSERT(lx_cgrp_freelwp != NULL);
		(*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
		    lwpd->br_pid);
		VFS_RELE(cgrp);
	} else {
		mutex_exit(&lxzdata->lxzd_lock);
	}

	/*
	 * It is possible for the lx_freelwp hook to be called without a prior
	 * call to lx_exitlwp being made. This happens as part of lwp
	 * de-branding when a native binary is executed from a branded process.
	 *
	 * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well
	 * here in lx_freelwp. When the second call is redundant, the
	 * resources will already be freed and no work will be needed.
	 */
	lx_cleanlwp(lwp, p);

	/*
	 * Remove our system call interposer.
	 */
	lwp->lwp_brand_syscall = NULL;

	/*
	 * If this process is being de-branded during an exec(),
	 * the LX ctxops may have already been removed, so the result
	 * from ctxop_remove is irrelevant.
	 */
	(void) ctxop_remove(lwptot(lwp), &lx_ctxop_template, lwp);
	if (lwpd->br_pid != 0) {
		lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
	}

	/*
	 * Discard the affinity mask.
	 */
	VERIFY(lwpd->br_affinitymask != NULL);
	cpuset_free(lwpd->br_affinitymask);
	lwpd->br_affinitymask = NULL;

	/*
	 * Ensure that lx_ptrace_exit() has been called to detach
	 * ptrace(2) tracers and tracees.
	 */
	VERIFY(lwpd->br_ptrace_tracer == NULL);
	VERIFY(lwpd->br_ptrace_accord == NULL);

	lwp->lwp_brand = NULL;
	kmem_free(lwpd, sizeof (struct lx_lwp_data));
}
383
/*
 * b_lwpdata_alloc brand hook: pre-allocate brand LWP data before the new
 * LWP exists.  Runs without p_lock held so the sleeping allocations (and
 * pid allocation) below are safe; the data is attached to the LWP later by
 * lx_initlwp.  Returns NULL only if an emulation pid cannot be allocated.
 */
void *
lx_lwpdata_alloc(proc_t *p)
{
	lx_lwp_data_t *lwpd;
	struct lx_pid *lpidp;
	cpuset_t *affmask;
	pid_t newpid = 0;
	struct pid *pidp = NULL;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	/*
	 * LWPs beyond the first will require a pid to be allocated to emulate
	 * Linux's goofy thread model. While this allocation may be
	 * unnecessary when a single-lwp process undergoes branding, it cannot
	 * be performed during b_initlwp due to p_lock being held.
	 */
	if (p->p_lwpcnt > 0) {
		if ((newpid = pid_allocate(p, 0, 0)) < 0) {
			return (NULL);
		}
		pidp = pid_find(newpid);
	}

	lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
	lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
	affmask = cpuset_alloc(KM_SLEEP);

	lpidp->lxp_lpid = newpid;
	lpidp->lxp_pidp = pidp;
	lwpd->br_lpid = lpidp;
	lwpd->br_affinitymask = affmask;

	return (lwpd);
}
419
420 /*
421 * Free lwp brand data if an error occurred during lwp_create.
422 * Otherwise, lx_freelwp will be used to free the resources after they're
423 * associated with the lwp via lx_initlwp.
424 */
425 void
426 lx_lwpdata_free(void *lwpbd)
427 {
428 lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
429 VERIFY(lwpd != NULL);
430 VERIFY(lwpd->br_lpid != NULL);
431 VERIFY(lwpd->br_affinitymask != NULL);
432
433 cpuset_free(lwpd->br_affinitymask);
434 if (lwpd->br_lpid->lxp_pidp != NULL) {
435 (void) pid_rele(lwpd->br_lpid->lxp_pidp);
436 }
437 kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
438 kmem_free(lwpd, sizeof (*lwpd));
439 }
440
/*
 * b_initlwp brand hook: attach the brand data pre-allocated by
 * lx_lwpdata_alloc to a newly created LWP.  Called with p_lock held, so
 * no sleeping allocations may occur here (see lx_lwpdata_alloc).
 */
void
lx_initlwp(klwp_t *lwp, void *lwpbd)
{
	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
	kthread_t *tp = lwptot(lwp);
	proc_t *p = lwptoproc(lwp);
	lx_zone_data_t *lxzdata;
	vfs_t *cgrp;

	VERIFY(MUTEX_HELD(&p->p_lock));
	VERIFY(lwp->lwp_brand == NULL);

	lwpd->br_exitwhy = CLD_EXITED;
	lwpd->br_lwp = lwp;
	lwpd->br_clear_ctidp = NULL;
	lwpd->br_set_ctidp = NULL;
	lwpd->br_signal = 0;
	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
	cpuset_all(lwpd->br_affinitymask);

	/*
	 * The first thread in a process has ppid set to the parent
	 * process's pid, and ptid set to -1. Subsequent threads in the
	 * process have their ppid set to the pid of the thread that
	 * created them, and their ptid to that thread's tid.
	 */
	if (tp->t_next == tp) {
		lwpd->br_ppid = tp->t_procp->p_ppid;
		lwpd->br_ptid = -1;
	} else if (plwpd != NULL) {
		bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
		lwpd->br_ppid = plwpd->br_pid;
		lwpd->br_ptid = curthread->t_tid;
		/* The child inherits the fs/gsbase values from the parent */
		lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
		lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
		lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
		lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
	} else {
		/*
		 * Oddball case: the parent thread isn't a Linux process.
		 */
		lwpd->br_ppid = 0;
		lwpd->br_ptid = -1;
	}
	lwp->lwp_brand = lwpd;

	/*
	 * When during lx_lwpdata_alloc, we must decide whether or not to
	 * allocate a new pid to associate with the lwp. Since p_lock is not
	 * held at that point, the only time we can guarantee a new pid isn't
	 * needed is when p_lwpcnt == 0. This is because other lwps won't be
	 * present to race with us with regards to pid allocation.
	 *
	 * This means that in all other cases (where p_lwpcnt > 0), we expect
	 * that lx_lwpdata_alloc will allocate a pid for us to use here, even
	 * if it is uneeded. If this process is undergoing an exec, for
	 * example, the single existing lwp will not need a new pid when it is
	 * rebranded. In that case, lx_pid_assign will free the uneeded pid.
	 */
	VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);

	lx_pid_assign(tp, lwpd->br_lpid);
	lwpd->br_tgid = lwpd->br_pid;
	/*
	 * Having performed the lx pid assignement, the lpid reference is no
	 * longer needed. The underlying data will be freed during lx_freelwp.
	 */
	lwpd->br_lpid = NULL;

	/* Install the TLS save/restore context operations. */
	ctxop_install(lwptot(lwp), &lx_ctxop_template, lwp);

	/*
	 * Install branded system call hooks for this LWP:
	 */
	lwp->lwp_brand_syscall = lx_syscall_enter;

	/*
	 * The new LWP inherits the parent LWP cgroup ID.
	 */
	if (plwpd != NULL) {
		lwpd->br_cgroupid = plwpd->br_cgroupid;
	}
	/*
	 * The new LWP inherits the parent LWP emulated scheduling info.
	 */
	if (plwpd != NULL) {
		lwpd->br_schd_class = plwpd->br_schd_class;
		lwpd->br_schd_pri = plwpd->br_schd_pri;
		lwpd->br_schd_flags = plwpd->br_schd_flags;
		lwpd->br_schd_runtime = plwpd->br_schd_runtime;
		lwpd->br_schd_deadline = plwpd->br_schd_deadline;
		lwpd->br_schd_period = plwpd->br_schd_period;
	}
	/* cgroup integration: notify the cgroup fs of the new LWP. */
	lxzdata = ztolxzd(p->p_zone);
	mutex_enter(&lxzdata->lxzd_lock);
	cgrp = lxzdata->lxzd_cgroup;
	if (cgrp != NULL) {
		VFS_HOLD(cgrp);
		mutex_exit(&lxzdata->lxzd_lock);
		ASSERT(lx_cgrp_initlwp != NULL);
		(*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
		    lwpd->br_pid);
		VFS_RELE(cgrp);
	} else {
		mutex_exit(&lxzdata->lxzd_lock);
	}
}
550
551 void
552 lx_initlwp_post(klwp_t *lwp)
553 {
554 lx_lwp_data_t *plwpd = ttolxlwp(curthread);
555 /*
556 * If the parent LWP has a ptrace(2) tracer, the new LWP may
557 * need to inherit that same tracer.
558 */
559 if (plwpd != NULL) {
560 lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
561 }
562 }
563
564 /*
565 * There is no need to have any locking for either the source or
566 * destination struct lx_lwp_data structs. This is always run in the
567 * thread context of the source thread, and the destination thread is
568 * always newly created and not referred to from anywhere else.
569 */
570 void
571 lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
572 {
573 struct lx_lwp_data *src = srclwp->lwp_brand;
574 struct lx_lwp_data *dst = dstlwp->lwp_brand;
575
576 dst->br_ppid = src->br_pid;
577 dst->br_ptid = lwptot(srclwp)->t_tid;
578 bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
579
580 switch (src->br_stack_mode) {
581 case LX_STACK_MODE_BRAND:
582 case LX_STACK_MODE_NATIVE:
583 /*
584 * The parent LWP has an alternate stack installed.
585 * The child LWP should have the same stack base and extent.
586 */
587 dst->br_stack_mode = src->br_stack_mode;
588 dst->br_ntv_stack = src->br_ntv_stack;
589 dst->br_ntv_stack_current = src->br_ntv_stack_current;
590 break;
591
592 default:
593 /*
594 * Otherwise, clear the stack data for this LWP.
595 */
596 dst->br_stack_mode = LX_STACK_MODE_PREINIT;
597 dst->br_ntv_stack = 0;
598 dst->br_ntv_stack_current = 0;
599 }
600
601 /*
602 * copy only these flags
603 */
604 dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
605 dst->br_scall_args = NULL;
606 lx_affinity_forklwp(srclwp, dstlwp);
607
608 /*
609 * Flag so child doesn't ptrace-stop on syscall exit.
610 */
611 dst->br_ptrace_flags |= LX_PTF_NOSTOP;
612
613 if (src->br_clone_grp_flags != 0) {
614 lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
615 lwptoproc(dstlwp));
616 /* clone group no longer pending on this thread */
617 src->br_clone_grp_flags = 0;
618 }
619 }
620
621 /*
622 * When switching a Linux process off the CPU, clear its GDT entries.
623 */
624 /* ARGSUSED */
625 static void
626 lx_save(void *arg)
627 {
628 klwp_t *t = (klwp_t *)arg;
629 int i;
630
631 #if defined(__amd64)
632 reset_sregs();
633 #endif
634 for (i = 0; i < LX_TLSNUM; i++)
635 gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
636 }
637
638 /*
639 * When switching a Linux process on the CPU, set its GDT entries.
640 *
641 * For 64-bit code we don't have to worry about explicitly setting the
642 * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
643 * automatically in update_sregs if we are executing in user-land. If this
644 * is the case then pcb_rupdate should be set.
645 */
646 static void
647 lx_restore(void *arg)
648 {
649 klwp_t *t = (klwp_t *)arg;
650 struct lx_lwp_data *lwpd = lwptolxlwp(t);
651 user_desc_t *tls;
652 int i;
653
654 ASSERT(lwpd);
655
656 tls = lwpd->br_tls;
657 for (i = 0; i < LX_TLSNUM; i++)
658 gdt_update_usegd(GDT_TLSMIN + i, &tls[i]);
659 }
660
661 void
662 lx_set_gdt(int entry, user_desc_t *descrp)
663 {
664
665 gdt_update_usegd(entry, descrp);
666 }
667
668 void
669 lx_clear_gdt(int entry)
670 {
671 gdt_update_usegd(entry, &null_udesc);
672 }
673
/*
 * Handler for unimplemented emulated system calls: fail with ENOSYS.
 */
longlong_t
lx_nosys()
{
	return (set_errno(ENOSYS));
}
679
680 /*
681 * Brand-specific routine to check if given non-Solaris standard segment
682 * register values should be modified to other values.
683 */
684 /*ARGSUSED*/
685 greg_t
686 lx_fixsegreg(greg_t sr, model_t datamodel)
687 {
688 uint16_t idx = SELTOIDX(sr);
689
690 ASSERT(sr == (sr & 0xffff));
691
692 /*
693 * If the segment selector is a valid TLS selector, just return it.
694 */
695 if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX)
696 return (sr | SEL_UPL);
697
698 /*
699 * Force the SR into the LDT in ring 3 for 32-bit processes.
700 *
701 * 64-bit processes get the null GDT selector since they are not
702 * allowed to have a private LDT.
703 */
704 #if defined(__amd64)
705 return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0);
706 #elif defined(__i386)
707 datamodel = datamodel; /* datamodel currently unused for 32-bit */
708 return (sr | SEL_TI_LDT | SEL_UPL);
709 #endif /* __amd64 */
710 }
711
712 /*
713 * Brand-specific function to convert the fsbase as pulled from the register
714 * into a native fsbase suitable for locating the ulwp_t from the kernel.
715 */
716 uintptr_t
717 lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
718 {
719 lx_lwp_data_t *lwpd = lwp->lwp_brand;
720
721 if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
722 lwpd->br_ntv_fsbase == (uintptr_t)NULL) {
723 return (fsbase);
724 }
725
726 return (lwpd->br_ntv_fsbase);
727 }
728
729 /*
730 * These two functions simulate winfo and post_sigcld for the lx brand. The
731 * difference is delivering a designated signal as opposed to always SIGCLD.
732 */
/*
 * Fill in the siginfo for the exit notification sent to the parent.
 * Analogous to winfo(), except the signal number comes from the brand
 * data (`dat->l_signal') rather than always being SIGCLD.
 */
static void
lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
{
	ASSERT(MUTEX_HELD(&pidlock));
	bzero(ip, sizeof (k_siginfo_t));
	/* Translate the Linux exit signal to its native equivalent. */
	ip->si_signo = ltos_signo[dat->l_signal];
	ip->si_code = pp->p_wcode;
	ip->si_pid = pp->p_pid;
	ip->si_ctid = PRCTID(pp);
	ip->si_zoneid = pp->p_zone->zone_id;
	ip->si_status = pp->p_wdata;
	/*
	 * These siginfo values are converted to USER_HZ in the user-land
	 * brand signal code.
	 */
	ip->si_stime = pp->p_stime;
	ip->si_utime = pp->p_utime;
}
751
752 static void
753 lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
754 {
755 proc_t *pp = cp->p_parent;
756
757 ASSERT(MUTEX_HELD(&pidlock));
758 mutex_enter(&pp->p_lock);
759 /*
760 * Since Linux doesn't queue SIGCHLD, or any other non RT
761 * signals, we just blindly deliver whatever signal we can.
762 */
763 ASSERT(sqp != NULL);
764 lx_winfo(cp, &sqp->sq_info, dat);
765 sigaddqa(pp, NULL, sqp);
766 sqp = NULL;
767 mutex_exit(&pp->p_lock);
768 }
769
770
771 /*
772 * Brand specific code for exiting and sending a signal to the parent, as
773 * opposed to sigcld().
774 */
775 void
776 lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
777 {
778 proc_t *pp = cp->p_parent;
779 lx_proc_data_t *lx_brand_data = ptolxproc(cp);
780 ASSERT(MUTEX_HELD(&pidlock));
781
782 switch (cp->p_wcode) {
783 case CLD_EXITED:
784 case CLD_DUMPED:
785 case CLD_KILLED:
786 ASSERT(cp->p_stat == SZOMB);
787 /*
788 * The broadcast on p_srwchan_cv is a kludge to
789 * wakeup a possible thread in uadmin(A_SHUTDOWN).
790 */
791 cv_broadcast(&cp->p_srwchan_cv);
792
793 /*
794 * Add to newstate list of the parent
795 */
796 add_ns(pp, cp);
797
798 cv_broadcast(&pp->p_cv);
799 if ((pp->p_flag & SNOWAIT) ||
800 PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
801 if (!(cp->p_pidflag & CLDWAITPID))
802 freeproc(cp);
803 } else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
804 lx_brand_data->l_signal != 0) {
805 lx_post_exit_sig(cp, sqp, lx_brand_data);
806 sqp = NULL;
807 }
808 break;
809
810 case CLD_STOPPED:
811 case CLD_CONTINUED:
812 case CLD_TRAPPED:
813 panic("Should not be called in this case");
814 }
815
816 if (sqp)
817 siginfofree(sqp);
818 }
819
820 /*
821 * Filters based on arguments that have been passed in by a separate syscall
822 * using the B_STORE_ARGS mechanism. if the __WALL flag is set, no filter is
823 * applied, otherwise we look at the difference between a clone and non-clone
824 * process.
825 * The definition of a clone process in Linux is a thread that does not deliver
826 * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
827 * processes. Without that option, a process should only wait on normal
828 * children. The following table shows the cases.
829 *
830 * default __WCLONE
831 * no SIGCHLD - X
832 * SIGCHLD X -
833 *
834 * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
835 * process exit.
836 *
837 * More information on wait in lx brands can be found at
838 * usr/src/lib/brand/lx/lx_brand/common/wait.c.
839 */
840 /* ARGSUSED */
841 boolean_t
842 lx_wait_filter(proc_t *pp, proc_t *cp)
843 {
844 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
845 int flags = lwpd->br_waitid_flags;
846 boolean_t ret;
847
848 if (!lwpd->br_waitid_emulate) {
849 return (B_TRUE);
850 }
851
852 mutex_enter(&cp->p_lock);
853 if (flags & LX_WALL) {
854 ret = B_TRUE;
855 } else {
856 lx_proc_data_t *pd = ptolxproc(cp);
857 boolean_t is_sigchld = B_TRUE;
858 boolean_t match_wclone = B_FALSE;
859
860 /*
861 * When calling clone, an alternate signal can be chosen to
862 * deliver to the parent when the child exits.
863 */
864 if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
865 is_sigchld = B_FALSE;
866 }
867 if ((flags & LX_WCLONE) != 0) {
868 match_wclone = B_TRUE;
869 }
870
871 ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
872 }
873 mutex_exit(&cp->p_lock);
874
875 return (ret);
876 }
877
878 void
879 lx_ifname_convert(char *ifname, lx_if_action_t act)
880 {
881 if (act == LX_IF_TONATIVE) {
882 if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
883 (void) strlcpy(ifname, "lo0", IFNAMSIZ);
884 } else {
885 if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
886 (void) strlcpy(ifname, "lo", IFNAMSIZ);
887 }
888 }
889
890 void
891 lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
892 {
893 uint64_t buf;
894
895 buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
896 IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
897 IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
898
899 /* Linux has different shift for multicast flag */
900 if (act == LX_IF_TONATIVE) {
901 if (*flags & 0x1000)
902 buf |= IFF_MULTICAST;
903 } else {
904 if (*flags & IFF_MULTICAST)
905 buf |= 0x1000;
906 }
907 *flags = buf;
908 }
909
910 /*
911 * Convert an IPv6 address into the numbers used by /proc/net/if_inet6
912 */
913 unsigned int
914 lx_ipv6_scope_convert(const in6_addr_t *addr)
915 {
916 if (IN6_IS_ADDR_V4COMPAT(addr)) {
917 return (LX_IPV6_ADDR_COMPATv4);
918 } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
919 return (LX_IPV6_ADDR_LOOPBACK);
920 } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
921 return (LX_IPV6_ADDR_LINKLOCAL);
922 } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
923 return (LX_IPV6_ADDR_SITELOCAL);
924 } else {
925 return (0x0000U);
926 }
927 }
928
929
930 void
931 lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
932 {
933 int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
934
935 switch (src->sdl_type) {
936 case DL_ETHER:
937 dst->sa_family = LX_ARPHRD_ETHER;
938 break;
939 case DL_LOOP:
940 dst->sa_family = LX_ARPHRD_LOOPBACK;
941 break;
942 default:
943 dst->sa_family = LX_ARPHRD_VOID;
944 }
945
946 bcopy(LLADDR(src), dst->sa_data, copy_size);
947 *size = copy_size;
948 }
949
950 /*
951 * Brand hook to convert native kernel siginfo signal number, errno, code, pid
952 * and si_status to Linux values. Similar to the stol_ksiginfo function but
953 * this one converts in-place, converts the pid, and does not copyout.
954 */
955 void
956 lx_sigfd_translate(k_siginfo_t *infop)
957 {
958 zone_t *zone = curproc->p_zone;
959
960 infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
961 infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
962 infop->si_code = lx_stol_sigcode(infop->si_code);
963 infop->si_errno = lx_errno(infop->si_errno, EINVAL);
964
965 /* Map zsched and zone init to pid 1 */
966 if (infop->si_pid == zone->zone_proc_initpid ||
967 infop->si_pid == zone->zone_zsched->p_pid) {
968 infop->si_pid = 1;
969 }
970 }
971
/*
 * Convert a native k_siginfo_t to a Linux lx_siginfo_t and copy it out to
 * the user address `ulxsip'.  Returns 0 on success, or sets EFAULT if the
 * copyout fails.
 */
int
stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
{
	lx_siginfo_t lsi;

	bzero(&lsi, sizeof (lsi));
	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);

	/* The remaining fields depend on the (Linux) signal number. */
	switch (lsi.lsi_signo) {
	case LX_SIGPOLL:
		lsi.lsi_band = sip->si_band;
		lsi.lsi_fd = sip->si_fd;
		break;

	case LX_SIGCHLD:
		lsi.lsi_pid = sip->si_pid;
		if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
			/* si_code <= 0 or normal exit: status as-is. */
			lsi.lsi_status = sip->si_status;
		} else {
			/* Status carries a signal number; translate it. */
			lsi.lsi_status = lx_stol_status(sip->si_status,
			    SIGKILL);
		}
		lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
		lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
		break;

	case LX_SIGILL:
	case LX_SIGBUS:
	case LX_SIGFPE:
	case LX_SIGSEGV:
		lsi.lsi_addr = sip->si_addr;
		break;

	default:
		lsi.lsi_pid = sip->si_pid;
		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
	}

	if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
		return (set_errno(EFAULT));
	}

	return (0);
}
1018
1019 #if defined(_SYSCALL32_IMPL)
1020 int
1021 stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
1022 {
1023 lx_siginfo32_t lsi;
1024
1025 bzero(&lsi, sizeof (lsi));
1026 lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
1027 lsi.lsi_code = lx_stol_sigcode(sip->si_code);
1028 lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
1029
1030 switch (lsi.lsi_signo) {
1031 case LX_SIGPOLL:
1032 lsi.lsi_band = sip->si_band;
1033 lsi.lsi_fd = sip->si_fd;
1034 break;
1035
1036 case LX_SIGCHLD:
1037 lsi.lsi_pid = sip->si_pid;
1038 if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
1039 lsi.lsi_status = sip->si_status;
1040 } else {
1041 lsi.lsi_status = lx_stol_status(sip->si_status,
1042 SIGKILL);
1043 }
1044 lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
1045 lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
1046 break;
1047
1048 case LX_SIGILL:
1049 case LX_SIGBUS:
1050 case LX_SIGFPE:
1051 case LX_SIGSEGV:
1052 lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
1053 break;
1054
1055 default:
1056 lsi.lsi_pid = sip->si_pid;
1057 lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
1058 }
1059
1060 if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
1061 return (set_errno(EFAULT));
1062 }
1063
1064 return (0);
1065 }
1066 #endif
1067
/*
 * Given an LX LWP, determine where user register state is stored.
 *
 * Returns LX_REG_LOC_LWP when the registers live in the LWP itself,
 * LX_REG_LOC_UCP when they must be taken from the ucontext_t preserved by
 * the usermode emulation library (*ucp is set to point at it), and
 * LX_REG_LOC_UNAVAIL when no usable register state can be located.  With
 * `for_write' set, states in which setting registers is not allowed are
 * rejected.
 */
lx_regs_location_t
lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
{
	switch (lwpd->br_stack_mode) {
	case LX_STACK_MODE_BRAND:
		/*
		 * The LWP was stopped with the brand stack and register state
		 * loaded, e.g. during a syscall emulated within the kernel.
		 */
		return (LX_REG_LOC_LWP);

	case LX_STACK_MODE_PREINIT:
		if (for_write) {
			/* setting registers not allowed in this state */
			break;
		}
		if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
		    lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
			/* The LWP was stopped by tracing on exec. */
			return (LX_REG_LOC_LWP);
		}
		break;

	case LX_STACK_MODE_NATIVE:
		if (for_write) {
			/* setting registers not allowed in this state */
			break;
		}
		if (lwpd->br_ptrace_whystop == PR_BRAND) {
			/* Called while ptrace-event-stopped by lx_exec. */
			if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
				return (LX_REG_LOC_LWP);
			}

			/* Called while ptrace-event-stopped after clone. */
			if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
			    lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
				return (LX_REG_LOC_LWP);
			}

			/*
			 * Called to obtain syscall exit for other cases
			 * (e.g. pseudo return from rt_sigreturn).
			 */
			if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
				return (LX_REG_LOC_LWP);
			}
		}
		break;
	default:
		break;
	}

	if (lwpd->br_ptrace_stopucp != (uintptr_t)NULL) {
		/*
		 * The LWP was stopped in the usermode emulation library
		 * but a ucontext_t for the preserved brand stack and
		 * register state was provided. Return the register state
		 * from that ucontext_t.
		 */
		VERIFY(ucp != NULL);
		*ucp = (void *)lwpd->br_ptrace_stopucp;
		return (LX_REG_LOC_UCP);
	}

	return (LX_REG_LOC_UNAVAIL);
}