1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2019 Joyent, Inc.
28 */
29
30 #include <sys/errno.h>
31 #include <sys/systm.h>
32 #include <sys/archsystm.h>
33 #include <sys/privregs.h>
34 #include <sys/exec.h>
35 #include <sys/lwp.h>
36 #include <sys/sem.h>
37 #include <sys/brand.h>
38 #include <sys/lx_brand.h>
39 #include <sys/lx_misc.h>
40 #include <sys/lx_siginfo.h>
41 #include <sys/lx_futex.h>
42 #include <lx_errno.h>
43 #include <sys/lx_userhz.h>
44 #include <sys/cmn_err.h>
45 #include <sys/siginfo.h>
46 #include <sys/contract/process_impl.h>
47 #include <sys/x86_archext.h>
48 #include <sys/sdt.h>
49 #include <lx_signum.h>
50 #include <lx_syscall.h>
51 #include <sys/proc.h>
52 #include <sys/procfs.h>
53 #include <net/if.h>
54 #include <inet/ip6.h>
55 #include <sys/sunddi.h>
56 #include <sys/dlpi.h>
57 #include <sys/sysmacros.h>
58
59 /* Linux specific functions and definitions */
60 static void lx_save(klwp_t *);
61 static void lx_restore(klwp_t *);
62
63 /*
64 * Set the return code for the forked child, always zero
65 */
66 /*ARGSUSED*/
67 void
68 lx_setrval(klwp_t *lwp, int v1, int v2)
69 {
70 lwptoregs(lwp)->r_r0 = 0;
71 }
72
73 /*
74 * Reset process state on exec(2)
75 */
76 void
77 lx_exec()
78 {
79 klwp_t *lwp = ttolwp(curthread);
80 struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
81 proc_t *p = ttoproc(curthread);
82 lx_proc_data_t *pd = ptolxproc(p);
83 struct regs *rp = lwptoregs(lwp);
84
85 /* b_exec is called without p_lock held */
86 VERIFY(MUTEX_NOT_HELD(&p->p_lock));
87
88 /*
89 * Any l_handler handlers set as a result of B_REGISTER are now
90 * invalid; clear them.
91 */
92 pd->l_handler = (uintptr_t)NULL;
93
94 /*
95 * If this was a multi-threaded Linux process and this lwp wasn't the
96 * main lwp, then we need to make its Illumos and Linux PIDs match.
97 */
98 if (curthread->t_tid != 1) {
99 lx_pid_reassign(curthread);
100 }
101
102 /*
103 * Inform ptrace(2) that we are processing an execve(2) call so that if
104 * we are traced we can post either the PTRACE_EVENT_EXEC event or the
105 * legacy SIGTRAP.
106 */
107 (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
108
109 /* clear the fs/gsbase values until the app. can reinitialize them */
110 lwpd->br_lx_fsbase = (uintptr_t)NULL;
111 lwpd->br_ntv_fsbase = (uintptr_t)NULL;
112 lwpd->br_lx_gsbase = (uintptr_t)NULL;
113 lwpd->br_ntv_gsbase = (uintptr_t)NULL;
114
115 /*
116 * Clear the native stack flags. This will be reinitialised by
117 * lx_init() in the new process image.
118 */
119 lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
120 lwpd->br_ntv_stack = 0;
121 lwpd->br_ntv_stack_current = 0;
122
123 installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save,
124 NULL, NULL);
125
126 /*
127 * clear out the tls array
128 */
129 bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
130
131 /*
132 * reset the tls entries in the gdt
133 */
134 kpreempt_disable();
135 lx_restore(lwp);
136 kpreempt_enable();
137
138 /*
139 * The exec syscall doesn't return (so we don't call lx_syscall_return)
140 * but for our ptrace emulation we need to do this so that a tracer
141 * does not get out of sync. We know that by the time this lx_exec
142 * function is called that the exec has succeeded.
143 */
144 rp->r_r0 = 0;
145 (void) lx_ptrace_stop(LX_PR_SYSEXIT);
146 }
147
/*
 * Tear down the per-lwp emulation state that must be released while the lwp
 * is exiting (or being de-branded): detach ptrace(2) state, clear
 * TP_KTHREAD, dispose of the Linux robust futex list, and run the lx_save
 * context op one final time.  This function is idempotent; it may be called
 * from both lx_exitlwp and lx_freelwp for the same lwp.
 */
static void
lx_cleanlwp(klwp_t *lwp, proc_t *p)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	void *rb_list = NULL;

	VERIFY(lwpd != NULL);

	mutex_enter(&p->p_lock);
	if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
		lx_ptrace_exit(p, lwp);
	}

	/*
	 * While we have p_lock, clear the TP_KTHREAD flag. This is needed
	 * to prevent races within lx procfs. It's fine for prchoose() to pick
	 * this thread now since it is exiting and no longer blocked in the
	 * kernel.
	 */
	lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;

	/*
	 * While we have p_lock, safely grab any robust_list references and
	 * clear the lwp field.
	 */
	sprlock_proc(p);
	rb_list = lwpd->br_robust_list;
	lwpd->br_robust_list = NULL;
	sprunlock(p);

	/* Wake any robust futexes held by the departing lwp (locks dropped). */
	if (rb_list != NULL) {
		lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
	}

	/*
	 * We need to run our context exit operation (lx_save) here to ensure
	 * we don't leave any garbage around. This is necessary to handle the
	 * following calling sequence:
	 *   exit -> proc_exit -> lx_freelwp -> removectx
	 * That is, when our branded process exits, proc_exit will call our
	 * lx_freelwp brand hook which does call this function (lx_cleanlwp),
	 * but lx_freelwp also removes our context exit operation. The context
	 * exit functions are run by exitctx, which is called by either
	 * lwp_exit or thread_exit. The thread_exit function is called at the
	 * end of proc_exit when we'll swtch() to another thread, but by then
	 * our context exit function has been removed.
	 *
	 * It's ok if this function happens to be called more than once (for
	 * example, if we exec a native binary).
	 */
	kpreempt_disable();
	lx_save(lwp);
	kpreempt_enable();
}
202
/*
 * Brand hook invoked when an lwp exits.  Performs lx cleanup, honors the
 * CLONE_CHILD_CLEARTID protocol, and delivers the exit signal selected at
 * clone(2) time to the appropriate parent thread or process.
 */
void
lx_exitlwp(klwp_t *lwp)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	proc_t *p = lwptoproc(lwp);
	kthread_t *t;
	sigqueue_t *sqp = NULL;
	pid_t ppid;
	id_t ptid;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	if (lwpd == NULL) {
		/* second time thru' */
		return;
	}

	lx_cleanlwp(lwp, p);

	if (lwpd->br_clear_ctidp != NULL) {
		/*
		 * CLONE_CHILD_CLEARTID semantics: zero the registered user
		 * address and wake one futex waiter there so that a
		 * pthread_join()-style waiter can proceed.
		 */
		(void) suword32(lwpd->br_clear_ctidp, 0);
		(void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
		    (uintptr_t)NULL, (uintptr_t)NULL, 0);
		lwpd->br_clear_ctidp = NULL;
	}

	if (lwpd->br_signal != 0) {
		/*
		 * The first thread in a process doesn't cause a signal to
		 * be sent when it exits. It was created by a fork(), not
		 * a clone(), so the parent should get signalled when the
		 * process exits.
		 */
		if (lwpd->br_ptid == -1)
			goto free;

		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
		/*
		 * If br_ppid is 0, it means this is a CLONE_PARENT thread,
		 * so the signal goes to the parent process - not to a
		 * specific thread in this process.
		 */
		p = lwptoproc(lwp);
		if (lwpd->br_ppid == 0) {
			mutex_enter(&p->p_lock);
			ppid = p->p_ppid;
			t = NULL;
		} else {
			/*
			 * If we have been reparented to init or if our
			 * parent thread is gone, then nobody gets
			 * signaled.
			 */
			if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
			    (ptid == -1))
				goto free;

			/* Look up the parent and lock it before dropping pidlock. */
			mutex_enter(&pidlock);
			if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
				mutex_exit(&pidlock);
				goto free;
			}
			mutex_enter(&p->p_lock);
			mutex_exit(&pidlock);

			if ((t = idtot(p, ptid)) == NULL) {
				mutex_exit(&p->p_lock);
				goto free;
			}
		}

		/* Build and queue the exit siginfo; sqp ownership passes on. */
		sqp->sq_info.si_signo = lwpd->br_signal;
		sqp->sq_info.si_code = lwpd->br_exitwhy;
		sqp->sq_info.si_status = lwpd->br_exitwhat;
		sqp->sq_info.si_pid = lwpd->br_pid;
		sqp->sq_info.si_uid = crgetruid(CRED());
		sigaddqa(p, t, sqp);
		mutex_exit(&p->p_lock);
		sqp = NULL;
	}

free:
	if (lwpd->br_scall_args != NULL) {
		ASSERT(lwpd->br_args_size > 0);
		kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
	}
	if (sqp)
		kmem_free(sqp, sizeof (sigqueue_t));
}
292
/*
 * Brand hook that frees all lx per-lwp data.  Called when an lwp is
 * destroyed and also during de-branding (native exec), in which case
 * lx_exitlwp may not have run first; lx_cleanlwp handles being called in
 * either order.  Tears down the cgroup accounting, syscall interposer,
 * context ops, emulated pid, and affinity mask before freeing the brand
 * data itself.
 */
void
lx_freelwp(klwp_t *lwp)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	proc_t *p = lwptoproc(lwp);
	lx_zone_data_t *lxzdata;
	vfs_t *cgrp;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	if (lwpd == NULL) {
		/*
		 * There is one case where an LX branded process will possess
		 * LWPs which lack their own brand data. During the course of
		 * executing native binary, the process will be preemptively
		 * branded to allow hooks such as b_native_exec to function.
		 * If that process possesses multiple LWPS, they will _not_ be
		 * branded since they will exit if the exec succeeds. It's
		 * during this LWP exit that lx_freelwp would be called on an
		 * unbranded LWP. When that is the case, it is acceptable to
		 * bypass the hook.
		 */
		return;
	}

	/* cgroup integration: notify the cgroup fs that this lwp is gone */
	lxzdata = ztolxzd(p->p_zone);
	mutex_enter(&lxzdata->lxzd_lock);
	cgrp = lxzdata->lxzd_cgroup;
	if (cgrp != NULL) {
		/* Hold the cgroup vfs so it can't vanish once we drop the lock. */
		VFS_HOLD(cgrp);
		mutex_exit(&lxzdata->lxzd_lock);
		ASSERT(lx_cgrp_freelwp != NULL);
		(*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
		    lwpd->br_pid);
		VFS_RELE(cgrp);
	} else {
		mutex_exit(&lxzdata->lxzd_lock);
	}

	/*
	 * It is possible for the lx_freelwp hook to be called without a prior
	 * call to lx_exitlwp being made. This happens as part of lwp
	 * de-branding when a native binary is executed from a branded process.
	 *
	 * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well
	 * here in lx_freelwp. When the second call is redundant, the
	 * resources will already be freed and no work will be needed.
	 */
	lx_cleanlwp(lwp, p);

	/*
	 * Remove our system call interposer.
	 */
	lwp->lwp_brand_syscall = NULL;

	(void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
	    lx_save, NULL);
	/* Release the emulated Linux pid, if one was assigned. */
	if (lwpd->br_pid != 0) {
		lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
	}

	/*
	 * Discard the affinity mask.
	 */
	VERIFY(lwpd->br_affinitymask != NULL);
	cpuset_free(lwpd->br_affinitymask);
	lwpd->br_affinitymask = NULL;

	/*
	 * Ensure that lx_ptrace_exit() has been called to detach
	 * ptrace(2) tracers and tracees.
	 */
	VERIFY(lwpd->br_ptrace_tracer == NULL);
	VERIFY(lwpd->br_ptrace_accord == NULL);

	lwp->lwp_brand = NULL;
	kmem_free(lwpd, sizeof (struct lx_lwp_data));
}
372
/*
 * Pre-allocate the brand lwp data (and, when needed, an emulated Linux
 * pid) before lwp_create.  Performed here, rather than in lx_initlwp,
 * because p_lock is held during b_initlwp and pid allocation may not occur
 * under it.  Returns NULL if a pid could not be allocated; otherwise the
 * data is later attached via lx_initlwp or discarded via lx_lwpdata_free.
 */
void *
lx_lwpdata_alloc(proc_t *p)
{
	lx_lwp_data_t *lwpd;
	struct lx_pid *lpidp;
	cpuset_t *affmask;
	pid_t newpid = 0;
	struct pid *pidp = NULL;

	VERIFY(MUTEX_NOT_HELD(&p->p_lock));

	/*
	 * LWPs beyond the first will require a pid to be allocated to emulate
	 * Linux's goofy thread model. While this allocation may be
	 * unnecessary when a single-lwp process undergoes branding, it cannot
	 * be performed during b_initlwp due to p_lock being held.
	 */
	if (p->p_lwpcnt > 0) {
		if ((newpid = pid_allocate(p, 0, 0)) < 0) {
			return (NULL);
		}
		pidp = pid_find(newpid);
	}

	lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
	lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
	affmask = cpuset_alloc(KM_SLEEP);

	lpidp->lxp_lpid = newpid;
	lpidp->lxp_pidp = pidp;
	lwpd->br_lpid = lpidp;
	lwpd->br_affinitymask = affmask;

	return (lwpd);
}
408
409 /*
410 * Free lwp brand data if an error occurred during lwp_create.
411 * Otherwise, lx_freelwp will be used to free the resources after they're
412 * associated with the lwp via lx_initlwp.
413 */
414 void
415 lx_lwpdata_free(void *lwpbd)
416 {
417 lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
418 VERIFY(lwpd != NULL);
419 VERIFY(lwpd->br_lpid != NULL);
420 VERIFY(lwpd->br_affinitymask != NULL);
421
422 cpuset_free(lwpd->br_affinitymask);
423 if (lwpd->br_lpid->lxp_pidp != NULL) {
424 (void) pid_rele(lwpd->br_lpid->lxp_pidp);
425 }
426 kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
427 kmem_free(lwpd, sizeof (*lwpd));
428 }
429
/*
 * Brand hook (b_initlwp) that attaches the pre-allocated brand data
 * (from lx_lwpdata_alloc) to a newly created lwp: seeds the Linux
 * ppid/ptid relationships, inherits fs/gs bases, TLS, cgroup and
 * scheduler state from the creating lwp, assigns the emulated Linux pid,
 * and installs the context ops and syscall interposer.  Runs with p_lock
 * held.
 */
void
lx_initlwp(klwp_t *lwp, void *lwpbd)
{
	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
	kthread_t *tp = lwptot(lwp);
	proc_t *p = lwptoproc(lwp);
	lx_zone_data_t *lxzdata;
	vfs_t *cgrp;

	VERIFY(MUTEX_HELD(&p->p_lock));
	VERIFY(lwp->lwp_brand == NULL);

	lwpd->br_exitwhy = CLD_EXITED;
	lwpd->br_lwp = lwp;
	lwpd->br_clear_ctidp = NULL;
	lwpd->br_set_ctidp = NULL;
	lwpd->br_signal = 0;
	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
	cpuset_all(lwpd->br_affinitymask);

	/*
	 * The first thread in a process has ppid set to the parent
	 * process's pid, and ptid set to -1. Subsequent threads in the
	 * process have their ppid set to the pid of the thread that
	 * created them, and their ptid to that thread's tid.
	 */
	if (tp->t_next == tp) {
		lwpd->br_ppid = tp->t_procp->p_ppid;
		lwpd->br_ptid = -1;
	} else if (plwpd != NULL) {
		bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
		lwpd->br_ppid = plwpd->br_pid;
		lwpd->br_ptid = curthread->t_tid;
		/* The child inherits the fs/gsbase values from the parent */
		lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
		lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
		lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
		lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
	} else {
		/*
		 * Oddball case: the parent thread isn't a Linux process.
		 */
		lwpd->br_ppid = 0;
		lwpd->br_ptid = -1;
	}
	lwp->lwp_brand = lwpd;

	/*
	 * When during lx_lwpdata_alloc, we must decide whether or not to
	 * allocate a new pid to associate with the lwp. Since p_lock is not
	 * held at that point, the only time we can guarantee a new pid isn't
	 * needed is when p_lwpcnt == 0. This is because other lwps won't be
	 * present to race with us with regards to pid allocation.
	 *
	 * This means that in all other cases (where p_lwpcnt > 0), we expect
	 * that lx_lwpdata_alloc will allocate a pid for us to use here, even
	 * if it is uneeded. If this process is undergoing an exec, for
	 * example, the single existing lwp will not need a new pid when it is
	 * rebranded. In that case, lx_pid_assign will free the uneeded pid.
	 */
	VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);

	lx_pid_assign(tp, lwpd->br_lpid);
	lwpd->br_tgid = lwpd->br_pid;
	/*
	 * Having performed the lx pid assignement, the lpid reference is no
	 * longer needed. The underlying data will be freed during lx_freelwp.
	 */
	lwpd->br_lpid = NULL;

	/* Save/restore the TLS GDT entries across context switches. */
	installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
	    lx_save, NULL, NULL);

	/*
	 * Install branded system call hooks for this LWP:
	 */
	lwp->lwp_brand_syscall = lx_syscall_enter;

	/*
	 * The new LWP inherits the parent LWP cgroup ID.
	 */
	if (plwpd != NULL) {
		lwpd->br_cgroupid = plwpd->br_cgroupid;
	}
	/*
	 * The new LWP inherits the parent LWP emulated scheduling info.
	 */
	if (plwpd != NULL) {
		lwpd->br_schd_class = plwpd->br_schd_class;
		lwpd->br_schd_pri = plwpd->br_schd_pri;
		lwpd->br_schd_flags = plwpd->br_schd_flags;
		lwpd->br_schd_runtime = plwpd->br_schd_runtime;
		lwpd->br_schd_deadline = plwpd->br_schd_deadline;
		lwpd->br_schd_period = plwpd->br_schd_period;
	}
	/* Notify the cgroup fs (if mounted) of the new lwp. */
	lxzdata = ztolxzd(p->p_zone);
	mutex_enter(&lxzdata->lxzd_lock);
	cgrp = lxzdata->lxzd_cgroup;
	if (cgrp != NULL) {
		/* Hold the cgroup vfs so it can't vanish once the lock drops. */
		VFS_HOLD(cgrp);
		mutex_exit(&lxzdata->lxzd_lock);
		ASSERT(lx_cgrp_initlwp != NULL);
		(*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
		    lwpd->br_pid);
		VFS_RELE(cgrp);
	} else {
		mutex_exit(&lxzdata->lxzd_lock);
	}
}
540
541 void
542 lx_initlwp_post(klwp_t *lwp)
543 {
544 lx_lwp_data_t *plwpd = ttolxlwp(curthread);
545 /*
546 * If the parent LWP has a ptrace(2) tracer, the new LWP may
547 * need to inherit that same tracer.
548 */
549 if (plwpd != NULL) {
550 lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
551 }
552 }
553
554 /*
555 * There is no need to have any locking for either the source or
556 * destination struct lx_lwp_data structs. This is always run in the
557 * thread context of the source thread, and the destination thread is
558 * always newly created and not referred to from anywhere else.
559 */
560 void
561 lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
562 {
563 struct lx_lwp_data *src = srclwp->lwp_brand;
564 struct lx_lwp_data *dst = dstlwp->lwp_brand;
565
566 dst->br_ppid = src->br_pid;
567 dst->br_ptid = lwptot(srclwp)->t_tid;
568 bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
569
570 switch (src->br_stack_mode) {
571 case LX_STACK_MODE_BRAND:
572 case LX_STACK_MODE_NATIVE:
573 /*
574 * The parent LWP has an alternate stack installed.
575 * The child LWP should have the same stack base and extent.
576 */
577 dst->br_stack_mode = src->br_stack_mode;
578 dst->br_ntv_stack = src->br_ntv_stack;
579 dst->br_ntv_stack_current = src->br_ntv_stack_current;
580 break;
581
582 default:
583 /*
584 * Otherwise, clear the stack data for this LWP.
585 */
586 dst->br_stack_mode = LX_STACK_MODE_PREINIT;
587 dst->br_ntv_stack = 0;
588 dst->br_ntv_stack_current = 0;
589 }
590
591 /*
592 * copy only these flags
593 */
594 dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
595 dst->br_scall_args = NULL;
596 lx_affinity_forklwp(srclwp, dstlwp);
597
598 /*
599 * Flag so child doesn't ptrace-stop on syscall exit.
600 */
601 dst->br_ptrace_flags |= LX_PTF_NOSTOP;
602
603 if (src->br_clone_grp_flags != 0) {
604 lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
605 lwptoproc(dstlwp));
606 /* clone group no longer pending on this thread */
607 src->br_clone_grp_flags = 0;
608 }
609 }
610
611 /*
612 * When switching a Linux process off the CPU, clear its GDT entries.
613 */
614 /* ARGSUSED */
615 static void
616 lx_save(klwp_t *t)
617 {
618 int i;
619
620 #if defined(__amd64)
621 reset_sregs();
622 #endif
623 for (i = 0; i < LX_TLSNUM; i++)
624 gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
625 }
626
627 /*
628 * When switching a Linux process on the CPU, set its GDT entries.
629 *
630 * For 64-bit code we don't have to worry about explicitly setting the
631 * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
632 * automatically in update_sregs if we are executing in user-land. If this
633 * is the case then pcb_rupdate should be set.
634 */
635 static void
636 lx_restore(klwp_t *t)
637 {
638 struct lx_lwp_data *lwpd = lwptolxlwp(t);
639 user_desc_t *tls;
640 int i;
641
642 ASSERT(lwpd);
643
644 tls = lwpd->br_tls;
645 for (i = 0; i < LX_TLSNUM; i++)
646 gdt_update_usegd(GDT_TLSMIN + i, &tls[i]);
647 }
648
/*
 * Load a single user segment descriptor into the current CPU's GDT;
 * used when the emulation updates a Linux TLS entry.
 */
void
lx_set_gdt(int entry, user_desc_t *descrp)
{

	gdt_update_usegd(entry, descrp);
}
655
/* Reset a GDT entry to the null descriptor, invalidating a TLS slot. */
void
lx_clear_gdt(int entry)
{
	gdt_update_usegd(entry, &null_udesc);
}
661
/*
 * Common handler for Linux system calls the brand does not implement;
 * fails the call with ENOSYS.
 */
longlong_t
lx_nosys()
{
	return (set_errno(ENOSYS));
}
667
668 /*
669 * Brand-specific routine to check if given non-Solaris standard segment
670 * register values should be modified to other values.
671 */
672 /*ARGSUSED*/
673 greg_t
674 lx_fixsegreg(greg_t sr, model_t datamodel)
675 {
676 uint16_t idx = SELTOIDX(sr);
677
678 ASSERT(sr == (sr & 0xffff));
679
680 /*
681 * If the segment selector is a valid TLS selector, just return it.
682 */
683 if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX)
684 return (sr | SEL_UPL);
685
686 /*
687 * Force the SR into the LDT in ring 3 for 32-bit processes.
688 *
689 * 64-bit processes get the null GDT selector since they are not
690 * allowed to have a private LDT.
691 */
692 #if defined(__amd64)
693 return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0);
694 #elif defined(__i386)
695 datamodel = datamodel; /* datamodel currently unused for 32-bit */
696 return (sr | SEL_TI_LDT | SEL_UPL);
697 #endif /* __amd64 */
698 }
699
700 /*
701 * Brand-specific function to convert the fsbase as pulled from the register
702 * into a native fsbase suitable for locating the ulwp_t from the kernel.
703 */
704 uintptr_t
705 lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
706 {
707 lx_lwp_data_t *lwpd = lwp->lwp_brand;
708
709 if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
710 lwpd->br_ntv_fsbase == (uintptr_t)NULL) {
711 return (fsbase);
712 }
713
714 return (lwpd->br_ntv_fsbase);
715 }
716
717 /*
718 * These two functions simulate winfo and post_sigcld for the lx brand. The
719 * difference is delivering a designated signal as opposed to always SIGCLD.
720 */
/*
 * Populate the siginfo describing a child's state change for delivery to
 * its parent.  Unlike the native winfo(), the signal number is the exit
 * signal the child selected at clone(2) time (translated to its native
 * value), not necessarily SIGCLD.  Caller must hold pidlock.
 */
static void
lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
{
	ASSERT(MUTEX_HELD(&pidlock));
	bzero(ip, sizeof (k_siginfo_t));
	/* l_signal is a Linux signal number; map it to the native one. */
	ip->si_signo = ltos_signo[dat->l_signal];
	ip->si_code = pp->p_wcode;
	ip->si_pid = pp->p_pid;
	ip->si_ctid = PRCTID(pp);
	ip->si_zoneid = pp->p_zone->zone_id;
	ip->si_status = pp->p_wdata;
	/*
	 * These siginfo values are converted to USER_HZ in the user-land
	 * brand signal code.
	 */
	ip->si_stime = pp->p_stime;
	ip->si_utime = pp->p_utime;
}
739
/*
 * Queue the child cp's exit signal on its parent.  Analogous to the native
 * post_sigcld() but delivers the clone(2)-designated signal via lx_winfo.
 * Caller must hold pidlock; ownership of sqp passes to the signal queue.
 */
static void
lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
{
	proc_t *pp = cp->p_parent;

	ASSERT(MUTEX_HELD(&pidlock));
	mutex_enter(&pp->p_lock);
	/*
	 * Since Linux doesn't queue SIGCHLD, or any other non RT
	 * signals, we just blindly deliver whatever signal we can.
	 */
	ASSERT(sqp != NULL);
	lx_winfo(cp, &sqp->sq_info, dat);
	sigaddqa(pp, NULL, sqp);
	sqp = NULL;
	mutex_exit(&pp->p_lock);
}
757
758
759 /*
760 * Brand specific code for exiting and sending a signal to the parent, as
761 * opposed to sigcld().
762 */
763 void
764 lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
765 {
766 proc_t *pp = cp->p_parent;
767 lx_proc_data_t *lx_brand_data = ptolxproc(cp);
768 ASSERT(MUTEX_HELD(&pidlock));
769
770 switch (cp->p_wcode) {
771 case CLD_EXITED:
772 case CLD_DUMPED:
773 case CLD_KILLED:
774 ASSERT(cp->p_stat == SZOMB);
775 /*
776 * The broadcast on p_srwchan_cv is a kludge to
777 * wakeup a possible thread in uadmin(A_SHUTDOWN).
778 */
779 cv_broadcast(&cp->p_srwchan_cv);
780
781 /*
782 * Add to newstate list of the parent
783 */
784 add_ns(pp, cp);
785
786 cv_broadcast(&pp->p_cv);
787 if ((pp->p_flag & SNOWAIT) ||
788 PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
789 if (!(cp->p_pidflag & CLDWAITPID))
790 freeproc(cp);
791 } else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
792 lx_brand_data->l_signal != 0) {
793 lx_post_exit_sig(cp, sqp, lx_brand_data);
794 sqp = NULL;
795 }
796 break;
797
798 case CLD_STOPPED:
799 case CLD_CONTINUED:
800 case CLD_TRAPPED:
801 panic("Should not be called in this case");
802 }
803
804 if (sqp)
805 siginfofree(sqp);
806 }
807
808 /*
809 * Filters based on arguments that have been passed in by a separate syscall
810 * using the B_STORE_ARGS mechanism. if the __WALL flag is set, no filter is
811 * applied, otherwise we look at the difference between a clone and non-clone
812 * process.
813 * The definition of a clone process in Linux is a thread that does not deliver
814 * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
815 * processes. Without that option, a process should only wait on normal
816 * children. The following table shows the cases.
817 *
818 * default __WCLONE
819 * no SIGCHLD - X
820 * SIGCHLD X -
821 *
822 * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
823 * process exit.
824 *
825 * More information on wait in lx brands can be found at
826 * usr/src/lib/brand/lx/lx_brand/common/wait.c.
827 */
/* ARGSUSED */
boolean_t
lx_wait_filter(proc_t *pp, proc_t *cp)
{
	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
	int flags = lwpd->br_waitid_flags;
	boolean_t ret;

	/*
	 * If no waitid arguments were stashed via B_STORE_ARGS there is
	 * nothing Linux-specific to emulate; every child matches.
	 */
	if (!lwpd->br_waitid_emulate) {
		return (B_TRUE);
	}

	mutex_enter(&cp->p_lock);
	if (flags & LX_WALL) {
		/* __WALL: wait for all children regardless of type. */
		ret = B_TRUE;
	} else {
		lx_proc_data_t *pd = ptolxproc(cp);
		boolean_t is_sigchld = B_TRUE;
		boolean_t match_wclone = B_FALSE;

		/*
		 * When calling clone, an alternate signal can be chosen to
		 * deliver to the parent when the child exits.
		 */
		if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
			is_sigchld = B_FALSE;
		}
		if ((flags & LX_WCLONE) != 0) {
			match_wclone = B_TRUE;
		}

		/* See the truth table above: XOR of __WCLONE and SIGCHLD. */
		ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
	}
	mutex_exit(&cp->p_lock);

	return (ret);
}
865
866 void
867 lx_ifname_convert(char *ifname, lx_if_action_t act)
868 {
869 if (act == LX_IF_TONATIVE) {
870 if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
871 (void) strlcpy(ifname, "lo0", IFNAMSIZ);
872 } else {
873 if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
874 (void) strlcpy(ifname, "lo", IFNAMSIZ);
875 }
876 }
877
878 void
879 lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
880 {
881 uint64_t buf;
882
883 buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
884 IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
885 IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
886
887 /* Linux has different shift for multicast flag */
888 if (act == LX_IF_TONATIVE) {
889 if (*flags & 0x1000)
890 buf |= IFF_MULTICAST;
891 } else {
892 if (*flags & IFF_MULTICAST)
893 buf |= 0x1000;
894 }
895 *flags = buf;
896 }
897
898 /*
899 * Convert an IPv6 address into the numbers used by /proc/net/if_inet6
900 */
901 unsigned int
902 lx_ipv6_scope_convert(const in6_addr_t *addr)
903 {
904 if (IN6_IS_ADDR_V4COMPAT(addr)) {
905 return (LX_IPV6_ADDR_COMPATv4);
906 } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
907 return (LX_IPV6_ADDR_LOOPBACK);
908 } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
909 return (LX_IPV6_ADDR_LINKLOCAL);
910 } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
911 return (LX_IPV6_ADDR_SITELOCAL);
912 } else {
913 return (0x0000U);
914 }
915 }
916
917
918 void
919 lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
920 {
921 int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
922
923 switch (src->sdl_type) {
924 case DL_ETHER:
925 dst->sa_family = LX_ARPHRD_ETHER;
926 break;
927 case DL_LOOP:
928 dst->sa_family = LX_ARPHRD_LOOPBACK;
929 break;
930 default:
931 dst->sa_family = LX_ARPHRD_VOID;
932 }
933
934 bcopy(LLADDR(src), dst->sa_data, copy_size);
935 *size = copy_size;
936 }
937
938 /*
939 * Brand hook to convert native kernel siginfo signal number, errno, code, pid
940 * and si_status to Linux values. Similar to the stol_ksiginfo function but
941 * this one converts in-place, converts the pid, and does not copyout.
942 */
943 void
944 lx_sigfd_translate(k_siginfo_t *infop)
945 {
946 zone_t *zone = curproc->p_zone;
947
948 infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
949 infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
950 infop->si_code = lx_stol_sigcode(infop->si_code);
951 infop->si_errno = lx_errno(infop->si_errno, EINVAL);
952
953 /* Map zsched and zone init to pid 1 */
954 if (infop->si_pid == zone->zone_proc_initpid ||
955 infop->si_pid == zone->zone_zsched->p_pid) {
956 infop->si_pid = 1;
957 }
958 }
959
/*
 * Translate a native k_siginfo_t into a Linux lx_siginfo_t and copy it out
 * to the user address ulxsip.  Which union fields are filled in depends on
 * the (already translated) signal number, mirroring Linux's siginfo layout.
 * Returns 0 on success or sets EFAULT if the copyout fails.
 */
int
stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
{
	lx_siginfo_t lsi;

	bzero(&lsi, sizeof (lsi));
	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);

	switch (lsi.lsi_signo) {
	case LX_SIGPOLL:
		lsi.lsi_band = sip->si_band;
		lsi.lsi_fd = sip->si_fd;
		break;

	case LX_SIGCHLD:
		lsi.lsi_pid = sip->si_pid;
		/*
		 * A plain exit status passes through unchanged; a
		 * terminating signal number must itself be translated to
		 * its Linux value.
		 */
		if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
			lsi.lsi_status = sip->si_status;
		} else {
			lsi.lsi_status = lx_stol_status(sip->si_status,
			    SIGKILL);
		}
		/* CPU times are reported to Linux in USER_HZ units. */
		lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
		lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
		break;

	case LX_SIGILL:
	case LX_SIGBUS:
	case LX_SIGFPE:
	case LX_SIGSEGV:
		lsi.lsi_addr = sip->si_addr;
		break;

	default:
		lsi.lsi_pid = sip->si_pid;
		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
	}

	if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
		return (set_errno(EFAULT));
	}

	return (0);
}
1006
1007 #if defined(_SYSCALL32_IMPL)
/*
 * ILP32 variant of stol_ksiginfo_copyout: translate a native k_siginfo_t
 * into a 32-bit Linux lx_siginfo32_t and copy it out to ulxsip.  Returns 0
 * on success or sets EFAULT if the copyout fails.
 */
int
stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
{
	lx_siginfo32_t lsi;

	bzero(&lsi, sizeof (lsi));
	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);

	switch (lsi.lsi_signo) {
	case LX_SIGPOLL:
		lsi.lsi_band = sip->si_band;
		lsi.lsi_fd = sip->si_fd;
		break;

	case LX_SIGCHLD:
		lsi.lsi_pid = sip->si_pid;
		/*
		 * A plain exit status passes through unchanged; a
		 * terminating signal number must itself be translated to
		 * its Linux value.
		 */
		if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
			lsi.lsi_status = sip->si_status;
		} else {
			lsi.lsi_status = lx_stol_status(sip->si_status,
			    SIGKILL);
		}
		/* CPU times are reported to Linux in USER_HZ units. */
		lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
		lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
		break;

	case LX_SIGILL:
	case LX_SIGBUS:
	case LX_SIGFPE:
	case LX_SIGSEGV:
		/* Pointers must be narrowed to 32 bits for the ILP32 layout. */
		lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
		break;

	default:
		lsi.lsi_pid = sip->si_pid;
		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
	}

	if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
		return (set_errno(EFAULT));
	}

	return (0);
}
1054 #endif
1055
/*
 * Given an LX LWP, determine where user register state is stored.
 *
 * Returns LX_REG_LOC_LWP when the lwp's own pcb holds the registers,
 * LX_REG_LOC_UCP when they must be read from a ucontext_t preserved by the
 * usermode emulation (address returned via *ucp), or LX_REG_LOC_UNAVAIL
 * when no trustworthy register state exists (e.g. writes requested in a
 * state that forbids them).
 */
lx_regs_location_t
lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
{
	switch (lwpd->br_stack_mode) {
	case LX_STACK_MODE_BRAND:
		/*
		 * The LWP was stopped with the brand stack and register state
		 * loaded, e.g. during a syscall emulated within the kernel.
		 */
		return (LX_REG_LOC_LWP);

	case LX_STACK_MODE_PREINIT:
		if (for_write) {
			/* setting registers not allowed in this state */
			break;
		}
		if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
		    lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
			/* The LWP was stopped by tracing on exec. */
			return (LX_REG_LOC_LWP);
		}
		break;

	case LX_STACK_MODE_NATIVE:
		if (for_write) {
			/* setting registers not allowed in this state */
			break;
		}
		if (lwpd->br_ptrace_whystop == PR_BRAND) {
			/* Called while ptrace-event-stopped by lx_exec. */
			if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
				return (LX_REG_LOC_LWP);
			}

			/* Called while ptrace-event-stopped after clone. */
			if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
			    lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
				return (LX_REG_LOC_LWP);
			}

			/*
			 * Called to obtain syscall exit for other cases
			 * (e.g. pseudo return from rt_sigreturn).
			 */
			if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
				return (LX_REG_LOC_LWP);
			}
		}
		break;
	default:
		break;
	}

	if (lwpd->br_ptrace_stopucp != (uintptr_t)NULL) {
		/*
		 * The LWP was stopped in the usermode emulation library
		 * but a ucontext_t for the preserved brand stack and
		 * register state was provided. Return the register state
		 * from that ucontext_t.
		 */
		VERIFY(ucp != NULL);
		*ucp = (void *)lwpd->br_ptrace_stopucp;
		return (LX_REG_LOC_UCP);
	}

	return (LX_REG_LOC_UNAVAIL);
}