1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright 2019 Joyent, Inc.
  28  */
  29 
  30 #include <sys/errno.h>
  31 #include <sys/systm.h>
  32 #include <sys/archsystm.h>
  33 #include <sys/privregs.h>
  34 #include <sys/exec.h>
  35 #include <sys/lwp.h>
  36 #include <sys/sem.h>
  37 #include <sys/brand.h>
  38 #include <sys/lx_brand.h>
  39 #include <sys/lx_misc.h>
  40 #include <sys/lx_siginfo.h>
  41 #include <sys/lx_futex.h>
  42 #include <lx_errno.h>
  43 #include <sys/lx_userhz.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/siginfo.h>
  46 #include <sys/contract/process_impl.h>
  47 #include <sys/x86_archext.h>
  48 #include <sys/sdt.h>
  49 #include <lx_signum.h>
  50 #include <lx_syscall.h>
  51 #include <sys/proc.h>
  52 #include <sys/procfs.h>
  53 #include <net/if.h>
  54 #include <inet/ip6.h>
  55 #include <sys/sunddi.h>
  56 #include <sys/dlpi.h>
  57 #include <sys/sysmacros.h>
  58 
  59 /* Linux-specific functions and definitions */
  60 static void lx_save(klwp_t *);
  61 static void lx_restore(klwp_t *);
  62 
  63 /*
  64  * Set the return code for the forked child, always zero
  65  */
  66 /*ARGSUSED*/
  67 void
  68 lx_setrval(klwp_t *lwp, int v1, int v2)
  69 {
  70         lwptoregs(lwp)->r_r0 = 0;
  71 }
  72 
  73 /*
  74  * Reset process state on exec(2)
  75  */
  76 void
  77 lx_exec()
  78 {
  79         klwp_t *lwp = ttolwp(curthread);
  80         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
  81         proc_t *p = ttoproc(curthread);
  82         lx_proc_data_t *pd = ptolxproc(p);
  83         struct regs *rp = lwptoregs(lwp);
  84 
  85         /* b_exec is called without p_lock held */
  86         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
  87 
  88         /*
  89          * Any l_handler handlers set as a result of B_REGISTER are now
  90          * invalid; clear them.
  91          */
  92         pd->l_handler = (uintptr_t)NULL;
  93 
  94         /*
  95          * If this was a multi-threaded Linux process and this lwp wasn't the
  96          * main lwp, then we need to make its Illumos and Linux PIDs match.
  97          */
  98         if (curthread->t_tid != 1) {
  99                 lx_pid_reassign(curthread);
 100         }
 101 
 102         /*
 103          * Inform ptrace(2) that we are processing an execve(2) call so that if
 104          * we are traced we can post either the PTRACE_EVENT_EXEC event or the
 105          * legacy SIGTRAP.
 106          */
 107         (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
 108 
 109         /* clear the fs/gsbase values until the app can reinitialize them */
 110         lwpd->br_lx_fsbase = (uintptr_t)NULL;
 111         lwpd->br_ntv_fsbase = (uintptr_t)NULL;
 112         lwpd->br_lx_gsbase = (uintptr_t)NULL;
 113         lwpd->br_ntv_gsbase = (uintptr_t)NULL;
 114 
 115         /*
 116          * Clear the native stack flags.  This will be reinitialised by
 117          * lx_init() in the new process image.
 118          */
 119         lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
 120         lwpd->br_ntv_stack = 0;
 121         lwpd->br_ntv_stack_current = 0;
 122 
 123         installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save,
 124             NULL, NULL);
 125 
 126         /*
 127          * clear out the tls array
 128          */
 129         bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
 130 
 131         /*
 132          * reset the tls entries in the gdt
 133          */
 134         kpreempt_disable();
 135         lx_restore(lwp);
 136         kpreempt_enable();
 137 
 138         /*
 139          * The exec syscall doesn't return (so we don't call lx_syscall_return)
 140          * but for our ptrace emulation we need to do this so that a tracer
 141          * does not get out of sync. We know that by the time this lx_exec
 142          * function is called that the exec has succeeded.
 143          */
 144         rp->r_r0 = 0;
 145         (void) lx_ptrace_stop(LX_PR_SYSEXIT);
 146 }
 147 
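     /*
      * Perform the brand-specific LWP cleanup that is common to both LWP
      * exit and de-branding: detach any ptrace(2) tracers and tracees,
      * clear TP_KTHREAD, process the robust futex list and run our lx_save
      * context operation so that no stale TLS descriptors are left behind.
      */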
 148 static void
 149 lx_cleanlwp(klwp_t *lwp, proc_t *p)
 150 {
 151         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 152         void *rb_list = NULL;
 153 
 154         VERIFY(lwpd != NULL);
 155 
 156         mutex_enter(&p->p_lock);
 157         if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
 158                 lx_ptrace_exit(p, lwp);
 159         }
 160 
 161         /*
 162          * While we have p_lock, clear the TP_KTHREAD flag. This is needed
 163          * to prevent races within lx procfs. It's fine for prchoose() to pick
 164          * this thread now since it is exiting and no longer blocked in the
 165          * kernel.
 166          */
 167         lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;
 168 
 169         /*
 170          * While we have p_lock, safely grab any robust_list references and
 171          * clear the lwp field.
 172          */
 173         sprlock_proc(p);
 174         rb_list = lwpd->br_robust_list;
 175         lwpd->br_robust_list = NULL;
 176         sprunlock(p);
 177 
 178         if (rb_list != NULL) {
 179                 lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
 180         }
 181 
 182         /*
 183          * We need to run our context exit operation (lx_save) here to ensure
 184          * we don't leave any garbage around. This is necessary to handle the
 185          * following calling sequence:
 186          *    exit -> proc_exit -> lx_freelwp -> removectx
 187          * That is, when our branded process exits, proc_exit will call our
 188          * lx_freelwp brand hook which does call this function (lx_cleanlwp),
 189          * but lx_freelwp also removes our context exit operation. The context
 190          * exit functions are run by exitctx, which is called by either
 191          * lwp_exit or thread_exit. The thread_exit function is called at the
 192          * end of proc_exit when we'll swtch() to another thread, but by then
 193          * our context exit function has been removed.
 194          *
 195          * It's ok if this function happens to be called more than once (for
 196          * example, if we exec a native binary).
 197          */
 198         kpreempt_disable();
 199         lx_save(lwp);
 200         kpreempt_enable();
 201 }
 202 
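     /*
      * Brand hook invoked when an LWP exits.  After the common cleanup we
      * clear the child-tid address and wake any futex waiters on it, and,
      * if an exit signal was requested at clone(2) time, post it to the
      * parent thread or process.
      */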
 203 void
 204 lx_exitlwp(klwp_t *lwp)
 205 {
 206         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 207         proc_t *p = lwptoproc(lwp);
 208         kthread_t *t;
 209         sigqueue_t *sqp = NULL;
 210         pid_t ppid;
 211         id_t ptid;
 212 
 213         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 214 
 215         if (lwpd == NULL) {
 216                 /* second time through */
 217                 return;
 218         }
 219 
 220         lx_cleanlwp(lwp, p);
 221 
 222         if (lwpd->br_clear_ctidp != NULL) {
 223                 (void) suword32(lwpd->br_clear_ctidp, 0);
 224                 (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
 225                     (uintptr_t)NULL, (uintptr_t)NULL, 0);
 226                 lwpd->br_clear_ctidp = NULL;
 227         }
 228 
 229         if (lwpd->br_signal != 0) {
 230                 /*
 231                  * The first thread in a process doesn't cause a signal to
 232                  * be sent when it exits.  It was created by a fork(), not
 233                  * a clone(), so the parent should get signalled when the
 234                  * process exits.
 235                  */
 236                 if (lwpd->br_ptid == -1)
 237                         goto free;
 238 
 239                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 240                 /*
 241                  * If br_ppid is 0, it means this is a CLONE_PARENT thread,
 242                  * so the signal goes to the parent process - not to a
 243                  * specific thread in this process.
 244                  */
 245                 p = lwptoproc(lwp);
 246                 if (lwpd->br_ppid == 0) {
 247                         mutex_enter(&p->p_lock);
 248                         ppid = p->p_ppid;
 249                         t = NULL;
 250                 } else {
 251                         /*
 252                          * If we have been reparented to init or if our
 253                          * parent thread is gone, then nobody gets
 254                          * signaled.
 255                          */
 256                         if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
 257                             (ptid == -1))
 258                                 goto free;
 259 
 260                         mutex_enter(&pidlock);
 261                         if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
 262                                 mutex_exit(&pidlock);
 263                                 goto free;
 264                         }
 265                         mutex_enter(&p->p_lock);
 266                         mutex_exit(&pidlock);
 267 
 268                         if ((t = idtot(p, ptid)) == NULL) {
 269                                 mutex_exit(&p->p_lock);
 270                                 goto free;
 271                         }
 272                 }
 273 
 274                 sqp->sq_info.si_signo = lwpd->br_signal;
 275                 sqp->sq_info.si_code = lwpd->br_exitwhy;
 276                 sqp->sq_info.si_status = lwpd->br_exitwhat;
 277                 sqp->sq_info.si_pid = lwpd->br_pid;
 278                 sqp->sq_info.si_uid = crgetruid(CRED());
 279                 sigaddqa(p, t, sqp);
 280                 mutex_exit(&p->p_lock);
 281                 sqp = NULL;
 282         }
 283 
 284 free:
 285         if (lwpd->br_scall_args != NULL) {
 286                 ASSERT(lwpd->br_args_size > 0);
 287                 kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
 288         }
 289         if (sqp)
 290                 kmem_free(sqp, sizeof (sigqueue_t));
 291 }
 292 
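     /*
      * Brand hook to tear down and free the per-LWP brand data.  This runs
      * both at LWP exit and at de-branding (e.g. when a native binary is
      * exec'd), so it must tolerate lx_cleanlwp having already been called.
      */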
 293 void
 294 lx_freelwp(klwp_t *lwp)
 295 {
 296         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 297         proc_t *p = lwptoproc(lwp);
 298         lx_zone_data_t *lxzdata;
 299         vfs_t *cgrp;
 300 
 301         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 302 
 303         if (lwpd == NULL) {
 304                 /*
 305                  * There is one case where an LX branded process will possess
 306                  * LWPs which lack their own brand data.  During the course of
 307                  * executing a native binary, the process will be preemptively
 308                  * branded to allow hooks such as b_native_exec to function.
 309                  * If that process possesses multiple LWPs, they will _not_ be
 310                  * branded since they will exit if the exec succeeds.  It's
 311                  * during this LWP exit that lx_freelwp would be called on an
 312                  * unbranded LWP.  When that is the case, it is acceptable to
 313                  * bypass the hook.
 314                  */
 315                 return;
 316         }
 317 
 318         /* cgroup integration */
 319         lxzdata = ztolxzd(p->p_zone);
 320         mutex_enter(&lxzdata->lxzd_lock);
 321         cgrp = lxzdata->lxzd_cgroup;
 322         if (cgrp != NULL) {
 323                 VFS_HOLD(cgrp);
 324                 mutex_exit(&lxzdata->lxzd_lock);
 325                 ASSERT(lx_cgrp_freelwp != NULL);
 326                 (*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
 327                     lwpd->br_pid);
 328                 VFS_RELE(cgrp);
 329         } else {
 330                 mutex_exit(&lxzdata->lxzd_lock);
 331         }
 332 
 333         /*
 334          * It is possible for the lx_freelwp hook to be called without a prior
 335          * call to lx_exitlwp being made.  This happens as part of lwp
 336          * de-branding when a native binary is executed from a branded process.
 337          *
 338          * To cover all cases, lx_cleanlwp is called from lx_exitlwp and from
 339          * here in lx_freelwp.  When the second call is redundant, the
 340          * resources will already be freed and no work will be needed.
 341          */
 342         lx_cleanlwp(lwp, p);
 343 
 344         /*
 345          * Remove our system call interposer.
 346          */
 347         lwp->lwp_brand_syscall = NULL;
 348 
 349         (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
 350             lx_save, NULL);
 351         if (lwpd->br_pid != 0) {
 352                 lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
 353         }
 354 
 355         /*
 356          * Discard the affinity mask.
 357          */
 358         VERIFY(lwpd->br_affinitymask != NULL);
 359         cpuset_free(lwpd->br_affinitymask);
 360         lwpd->br_affinitymask = NULL;
 361 
 362         /*
 363          * Ensure that lx_ptrace_exit() has been called to detach
 364          * ptrace(2) tracers and tracees.
 365          */
 366         VERIFY(lwpd->br_ptrace_tracer == NULL);
 367         VERIFY(lwpd->br_ptrace_accord == NULL);
 368 
 369         lwp->lwp_brand = NULL;
 370         kmem_free(lwpd, sizeof (struct lx_lwp_data));
 371 }
 372 
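     /*
      * Pre-allocate the per-LWP brand data, and a Linux pid for any LWP
      * beyond the first, before the new LWP is created.  This runs without
      * p_lock held so that the allocations may sleep.
      */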
 373 void *
 374 lx_lwpdata_alloc(proc_t *p)
 375 {
 376         lx_lwp_data_t *lwpd;
 377         struct lx_pid *lpidp;
 378         cpuset_t *affmask;
 379         pid_t newpid = 0;
 380         struct pid *pidp = NULL;
 381 
 382         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 383 
 384         /*
 385          * LWPs beyond the first will require a pid to be allocated to emulate
 386          * Linux's goofy thread model.  While this allocation may be
 387          * unnecessary when a single-lwp process undergoes branding, it cannot
 388          * be performed during b_initlwp due to p_lock being held.
 389          */
 390         if (p->p_lwpcnt > 0) {
 391                 if ((newpid = pid_allocate(p, 0, 0)) < 0) {
 392                         return (NULL);
 393                 }
 394                 pidp = pid_find(newpid);
 395         }
 396 
 397         lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
 398         lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
 399         affmask = cpuset_alloc(KM_SLEEP);
 400 
 401         lpidp->lxp_lpid = newpid;
 402         lpidp->lxp_pidp = pidp;
 403         lwpd->br_lpid = lpidp;
 404         lwpd->br_affinitymask = affmask;
 405 
 406         return (lwpd);
 407 }
 408 
 409 /*
 410  * Free lwp brand data if an error occurred during lwp_create.
 411  * Otherwise, lx_freelwp will be used to free the resources after they're
 412  * associated with the lwp via lx_initlwp.
 413  */
 414 void
 415 lx_lwpdata_free(void *lwpbd)
 416 {
 417         lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
 418         VERIFY(lwpd != NULL);
 419         VERIFY(lwpd->br_lpid != NULL);
 420         VERIFY(lwpd->br_affinitymask != NULL);
 421 
 422         cpuset_free(lwpd->br_affinitymask);
 423         if (lwpd->br_lpid->lxp_pidp != NULL) {
 424                 (void) pid_rele(lwpd->br_lpid->lxp_pidp);
 425         }
 426         kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
 427         kmem_free(lwpd, sizeof (*lwpd));
 428 }
 429 
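     /*
      * Brand hook to attach the pre-allocated brand data to a newly created
      * LWP.  This is called with p_lock held, so no blocking allocations may
      * be performed here; anything needed was set up in lx_lwpdata_alloc.
      */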
 430 void
 431 lx_initlwp(klwp_t *lwp, void *lwpbd)
 432 {
 433         lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
 434         lx_lwp_data_t *plwpd = ttolxlwp(curthread);
 435         kthread_t *tp = lwptot(lwp);
 436         proc_t *p = lwptoproc(lwp);
 437         lx_zone_data_t *lxzdata;
 438         vfs_t *cgrp;
 439 
 440         VERIFY(MUTEX_HELD(&p->p_lock));
 441         VERIFY(lwp->lwp_brand == NULL);
 442 
 443         lwpd->br_exitwhy = CLD_EXITED;
 444         lwpd->br_lwp = lwp;
 445         lwpd->br_clear_ctidp = NULL;
 446         lwpd->br_set_ctidp = NULL;
 447         lwpd->br_signal = 0;
 448         lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
 449         cpuset_all(lwpd->br_affinitymask);
 450 
 451         /*
 452          * The first thread in a process has ppid set to the parent
 453          * process's pid, and ptid set to -1.  Subsequent threads in the
 454          * process have their ppid set to the pid of the thread that
 455          * created them, and their ptid to that thread's tid.
 456          */
 457         if (tp->t_next == tp) {
 458                 lwpd->br_ppid = tp->t_procp->p_ppid;
 459                 lwpd->br_ptid = -1;
 460         } else if (plwpd != NULL) {
 461                 bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
 462                 lwpd->br_ppid = plwpd->br_pid;
 463                 lwpd->br_ptid = curthread->t_tid;
 464                 /* The child inherits the fs/gsbase values from the parent */
 465                 lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
 466                 lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
 467                 lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
 468                 lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
 469         } else {
 470                 /*
 471                  * Oddball case: the parent thread isn't a Linux process.
 472                  */
 473                 lwpd->br_ppid = 0;
 474                 lwpd->br_ptid = -1;
 475         }
 476         lwp->lwp_brand = lwpd;
 477 
 478         /*
 479          * During lx_lwpdata_alloc, we must decide whether or not to
 480          * allocate a new pid to associate with the lwp. Since p_lock is not
 481          * held at that point, the only time we can guarantee a new pid isn't
 482          * needed is when p_lwpcnt == 0.  This is because other lwps won't be
 483          * present to race with us with regard to pid allocation.
 484          *
 485          * This means that in all other cases (where p_lwpcnt > 0), we expect
 486          * that lx_lwpdata_alloc will allocate a pid for us to use here, even
 487          * if it is unneeded.  If this process is undergoing an exec, for
 488          * example, the single existing lwp will not need a new pid when it is
 489          * rebranded.  In that case, lx_pid_assign will free the unneeded pid.
 490          */
 491         VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);
 492 
 493         lx_pid_assign(tp, lwpd->br_lpid);
 494         lwpd->br_tgid = lwpd->br_pid;
 495         /*
 496          * Having performed the lx pid assignment, the lpid reference is no
 497          * longer needed.  The underlying data will be freed during lx_freelwp.
 498          */
 499         lwpd->br_lpid = NULL;
 500 
 501         installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
 502             lx_save, NULL, NULL);
 503 
 504         /*
 505          * Install branded system call hooks for this LWP:
 506          */
 507         lwp->lwp_brand_syscall = lx_syscall_enter;
 508 
 509         /*
 510          * The new LWP inherits the parent LWP cgroup ID.
 511          */
 512         if (plwpd != NULL) {
 513                 lwpd->br_cgroupid = plwpd->br_cgroupid;
 514         }
 515         /*
 516          * The new LWP inherits the parent LWP emulated scheduling info.
 517          */
 518         if (plwpd != NULL) {
 519                 lwpd->br_schd_class = plwpd->br_schd_class;
 520                 lwpd->br_schd_pri = plwpd->br_schd_pri;
 521                 lwpd->br_schd_flags = plwpd->br_schd_flags;
 522                 lwpd->br_schd_runtime = plwpd->br_schd_runtime;
 523                 lwpd->br_schd_deadline = plwpd->br_schd_deadline;
 524                 lwpd->br_schd_period = plwpd->br_schd_period;
 525         }
 526         lxzdata = ztolxzd(p->p_zone);
 527         mutex_enter(&lxzdata->lxzd_lock);
 528         cgrp = lxzdata->lxzd_cgroup;
 529         if (cgrp != NULL) {
 530                 VFS_HOLD(cgrp);
 531                 mutex_exit(&lxzdata->lxzd_lock);
 532                 ASSERT(lx_cgrp_initlwp != NULL);
 533                 (*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
 534                     lwpd->br_pid);
 535                 VFS_RELE(cgrp);
 536         } else {
 537                 mutex_exit(&lxzdata->lxzd_lock);
 538         }
 539 }
 540 
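     /*
      * Brand hook run once the new LWP is otherwise fully initialized; used
      * to inherit a ptrace(2) tracer from the creating LWP when one exists.
      */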
 541 void
 542 lx_initlwp_post(klwp_t *lwp)
 543 {
 544         lx_lwp_data_t *plwpd = ttolxlwp(curthread);
 545         /*
 546          * If the parent LWP has a ptrace(2) tracer, the new LWP may
 547          * need to inherit that same tracer.
 548          */
 549         if (plwpd != NULL) {
 550                 lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
 551         }
 552 }
 553 
 554 /*
 555  * There is no need to have any locking for either the source or
 556  * destination lx_lwp_data structures.  This is always run in the
 557  * thread context of the source thread, and the destination thread is
 558  * always newly created and not referred to from anywhere else.
 559  */
 560 void
 561 lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
 562 {
 563         struct lx_lwp_data *src = srclwp->lwp_brand;
 564         struct lx_lwp_data *dst = dstlwp->lwp_brand;
 565 
 566         dst->br_ppid = src->br_pid;
 567         dst->br_ptid = lwptot(srclwp)->t_tid;
 568         bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
 569 
 570         switch (src->br_stack_mode) {
 571         case LX_STACK_MODE_BRAND:
 572         case LX_STACK_MODE_NATIVE:
 573                 /*
 574                  * The parent LWP has an alternate stack installed.
 575                  * The child LWP should have the same stack base and extent.
 576                  */
 577                 dst->br_stack_mode = src->br_stack_mode;
 578                 dst->br_ntv_stack = src->br_ntv_stack;
 579                 dst->br_ntv_stack_current = src->br_ntv_stack_current;
 580                 break;
 581 
 582         default:
 583                 /*
 584                  * Otherwise, clear the stack data for this LWP.
 585                  */
 586                 dst->br_stack_mode = LX_STACK_MODE_PREINIT;
 587                 dst->br_ntv_stack = 0;
 588                 dst->br_ntv_stack_current = 0;
 589         }
 590 
 591         /*
 592          * copy only these flags
 593          */
 594         dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
 595         dst->br_scall_args = NULL;
 596         lx_affinity_forklwp(srclwp, dstlwp);
 597 
 598         /*
 599          * Flag so child doesn't ptrace-stop on syscall exit.
 600          */
 601         dst->br_ptrace_flags |= LX_PTF_NOSTOP;
 602 
 603         if (src->br_clone_grp_flags != 0) {
 604                 lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
 605                     lwptoproc(dstlwp));
 606                 /* clone group no longer pending on this thread */
 607                 src->br_clone_grp_flags = 0;
 608         }
 609 }
 610 
 611 /*
 612  * When switching a Linux process off the CPU, clear its GDT entries.
 613  */
 614 /* ARGSUSED */
 615 static void
 616 lx_save(klwp_t *t)
 617 {
 618         int i;
 619 
 620 #if defined(__amd64)
 621         reset_sregs();
 622 #endif
 623         for (i = 0; i < LX_TLSNUM; i++)
 624                 gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
 625 }
 626 
 627 /*
 628  * When switching a Linux process on the CPU, set its GDT entries.
 629  *
 630  * For 64-bit code we don't have to worry about explicitly setting the
 631  * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
 632  * automatically in update_sregs if we are executing in user-land. If this
 633  * is the case then pcb_rupdate should be set.
 634  */
 635 static void
 636 lx_restore(klwp_t *t)
 637 {
 638         struct lx_lwp_data *lwpd = lwptolxlwp(t);
 639         user_desc_t *tls;
 640         int i;
 641 
 642         ASSERT(lwpd);
 643 
 644         tls = lwpd->br_tls;
 645         for (i = 0; i < LX_TLSNUM; i++)
 646                 gdt_update_usegd(GDT_TLSMIN + i, &tls[i]);
 647 }
 648 
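     /*
      * Install the given user descriptor into the specified GDT entry.
      */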
 649 void
 650 lx_set_gdt(int entry, user_desc_t *descrp)
 651 {
 652 
 653         gdt_update_usegd(entry, descrp);
 654 }
 655 
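     /*
      * Reset the specified GDT entry to the null descriptor.
      */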
 656 void
 657 lx_clear_gdt(int entry)
 658 {
 659         gdt_update_usegd(entry, &null_udesc);
 660 }
 661 
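     /*
      * Stub that simply fails with ENOSYS; used for system calls we do not
      * emulate.
      */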
 662 longlong_t
 663 lx_nosys()
 664 {
 665         return (set_errno(ENOSYS));
 666 }
 667 
 668 /*
 669  * Brand-specific routine to check whether given non-native segment register
 670  * values should be modified to other values.
 671  */
 672 /*ARGSUSED*/
 673 greg_t
 674 lx_fixsegreg(greg_t sr, model_t datamodel)
 675 {
 676         uint16_t idx = SELTOIDX(sr);
 677 
 678         ASSERT(sr == (sr & 0xffff));
 679 
 680         /*
 681          * If the segment selector is a valid TLS selector, just return it.
 682          */
 683         if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX)
 684                 return (sr | SEL_UPL);
 685 
 686         /*
 687          * Force the SR into the LDT in ring 3 for 32-bit processes.
 688          *
 689          * 64-bit processes get the null GDT selector since they are not
 690          * allowed to have a private LDT.
 691          */
 692 #if defined(__amd64)
 693         return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0);
 694 #elif defined(__i386)
 695         datamodel = datamodel;  /* datamodel currently unused for 32-bit */
 696         return (sr | SEL_TI_LDT | SEL_UPL);
 697 #endif  /* __amd64 */
 698 }
 699 
 700 /*
 701  * Brand-specific function to convert the fsbase as pulled from the register
 702  * into a native fsbase suitable for locating the ulwp_t from the kernel.
 703  */
 704 uintptr_t
 705 lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
 706 {
 707         lx_lwp_data_t *lwpd = lwp->lwp_brand;
 708 
 709         if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
 710             lwpd->br_ntv_fsbase == (uintptr_t)NULL) {
 711                 return (fsbase);
 712         }
 713 
 714         return (lwpd->br_ntv_fsbase);
 715 }
 716 
 717 /*
 718  * These two functions simulate winfo and post_sigcld for the lx brand. The
 719  * difference is delivering a designated signal as opposed to always SIGCLD.
 720  */
 721 static void
 722 lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
 723 {
 724         ASSERT(MUTEX_HELD(&pidlock));
 725         bzero(ip, sizeof (k_siginfo_t));
 726         ip->si_signo = ltos_signo[dat->l_signal];
 727         ip->si_code = pp->p_wcode;
 728         ip->si_pid = pp->p_pid;
 729         ip->si_ctid = PRCTID(pp);
 730         ip->si_zoneid = pp->p_zone->zone_id;
 731         ip->si_status = pp->p_wdata;
 732         /*
 733          * These siginfo values are converted to USER_HZ in the user-land
 734          * brand signal code.
 735          */
 736         ip->si_stime = pp->p_stime;
 737         ip->si_utime = pp->p_utime;
 738 }
 739 
 740 static void
 741 lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
 742 {
 743         proc_t *pp = cp->p_parent;
 744 
 745         ASSERT(MUTEX_HELD(&pidlock));
 746         mutex_enter(&pp->p_lock);
 747         /*
 748          * Since Linux doesn't queue SIGCHLD, or any other non-RT
 749          * signals, we just blindly deliver whatever signal we can.
 750          */
 751         ASSERT(sqp != NULL);
 752         lx_winfo(cp, &sqp->sq_info, dat);
 753         sigaddqa(pp, NULL, sqp);
 754         sqp = NULL;
 755         mutex_exit(&pp->p_lock);
 756 }
 757 
 758 
 759 /*
 760  * Brand specific code for exiting and sending a signal to the parent, as
 761  * opposed to sigcld().
 762  */
 763 void
 764 lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
 765 {
 766         proc_t *pp = cp->p_parent;
 767         lx_proc_data_t *lx_brand_data = ptolxproc(cp);
 768         ASSERT(MUTEX_HELD(&pidlock));
 769 
 770         switch (cp->p_wcode) {
 771         case CLD_EXITED:
 772         case CLD_DUMPED:
 773         case CLD_KILLED:
 774                         ASSERT(cp->p_stat == SZOMB);
 775                         /*
 776                          * The broadcast on p_srwchan_cv is a kludge to
 777                          * wakeup a possible thread in uadmin(A_SHUTDOWN).
 778                          */
 779                         cv_broadcast(&cp->p_srwchan_cv);
 780 
 781                         /*
 782                          * Add to newstate list of the parent
 783                          */
 784                         add_ns(pp, cp);
 785 
 786                         cv_broadcast(&pp->p_cv);
 787                         if ((pp->p_flag & SNOWAIT) ||
 788                             PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
 789                                 if (!(cp->p_pidflag & CLDWAITPID))
 790                                         freeproc(cp);
 791                         } else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
 792                             lx_brand_data->l_signal != 0) {
 793                                 lx_post_exit_sig(cp, sqp, lx_brand_data);
 794                                 sqp = NULL;
 795                         }
 796                         break;
 797 
 798         case CLD_STOPPED:
 799         case CLD_CONTINUED:
 800         case CLD_TRAPPED:
 801                         panic("Should not be called in this case");
 802         }
 803 
 804         if (sqp)
 805                 siginfofree(sqp);
 806 }
 807 
 808 /*
 809  * Filters based on arguments that have been passed in by a separate syscall
 810  * using the B_STORE_ARGS mechanism. If the __WALL flag is set, no filter is
 811  * applied, otherwise we look at the difference between a clone and non-clone
 812  * process.
 813  * The definition of a clone process in Linux is a thread that does not deliver
 814  * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
 815  * processes. Without that option, a process should only wait on normal
 816  * children. The following table shows the cases.
 817  *
 818  *                   default    __WCLONE
 819  *   no SIGCHLD      -           X
 820  *   SIGCHLD         X           -
 821  *
 822  * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
 823  * process exit.
 824  *
 825  * More information on wait in lx brands can be found at
 826  * usr/src/lib/brand/lx/lx_brand/common/wait.c.
 827  */
 828 /* ARGSUSED */
 829 boolean_t
 830 lx_wait_filter(proc_t *pp, proc_t *cp)
 831 {
 832         lx_lwp_data_t *lwpd = ttolxlwp(curthread);
 833         int flags = lwpd->br_waitid_flags;
 834         boolean_t ret;
 835 
 836         if (!lwpd->br_waitid_emulate) {
 837                 return (B_TRUE);
 838         }
 839 
 840         mutex_enter(&cp->p_lock);
 841         if (flags & LX_WALL) {
 842                 ret = B_TRUE;
 843         } else {
 844                 lx_proc_data_t *pd = ptolxproc(cp);
 845                 boolean_t is_sigchld = B_TRUE;
 846                 boolean_t match_wclone = B_FALSE;
 847 
 848                 /*
 849                  * When calling clone, an alternate signal can be chosen to
 850                  * deliver to the parent when the child exits.
 851                  */
 852                 if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
 853                         is_sigchld = B_FALSE;
 854                 }
 855                 if ((flags & LX_WCLONE) != 0) {
 856                         match_wclone = B_TRUE;
 857                 }
 858 
 859                 ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
 860         }
 861         mutex_exit(&cp->p_lock);
 862 
 863         return (ret);
 864 }
 865 
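     /*
      * Convert the loopback interface name between its Linux ("lo") and
      * native ("lo0") forms; all other interface names pass through
      * unchanged.
      */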
 866 void
 867 lx_ifname_convert(char *ifname, lx_if_action_t act)
 868 {
 869         if (act == LX_IF_TONATIVE) {
 870                 if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
 871                         (void) strlcpy(ifname, "lo0", IFNAMSIZ);
 872         } else {
 873                 if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
 874                         (void) strlcpy(ifname, "lo", IFNAMSIZ);
 875         }
 876 }
 877 
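     /*
      * Convert interface flags between their native and Linux
      * representations.  Only the flags common to both are carried across;
      * the multicast flag is handled separately since Linux uses a different
      * bit for it.
      */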
 878 void
 879 lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
 880 {
 881         uint64_t buf;
 882 
 883         buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
 884             IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
 885             IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
 886 
 887         /* Linux has different shift for multicast flag */
 888         if (act == LX_IF_TONATIVE) {
 889                 if (*flags & 0x1000)
 890                         buf |= IFF_MULTICAST;
 891         } else {
 892                 if (*flags & IFF_MULTICAST)
 893                         buf |= 0x1000;
 894         }
 895         *flags = buf;
 896 }
 897 
 898 /*
 899  * Convert an IPv6 address into the scope value used by /proc/net/if_inet6
 900  */
 901 unsigned int
 902 lx_ipv6_scope_convert(const in6_addr_t *addr)
 903 {
 904         if (IN6_IS_ADDR_V4COMPAT(addr)) {
 905                 return (LX_IPV6_ADDR_COMPATv4);
 906         } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
 907                 return (LX_IPV6_ADDR_LOOPBACK);
 908         } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
 909                 return (LX_IPV6_ADDR_LINKLOCAL);
 910         } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
 911                 return (LX_IPV6_ADDR_SITELOCAL);
 912         } else {
 913                 return (0x0000U);
 914         }
 915 }
 916 
 917 
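     /*
      * Translate a native datalink hardware address (sockaddr_dl) into the
      * Linux form: map the DLPI media type to an ARPHRD_* value and copy as
      * much of the link-layer address as fits in sa_data.
      */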
 918 void
 919 lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
 920 {
 921         int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
 922 
 923         switch (src->sdl_type) {
 924         case DL_ETHER:
 925                 dst->sa_family = LX_ARPHRD_ETHER;
 926                 break;
 927         case DL_LOOP:
 928                 dst->sa_family = LX_ARPHRD_LOOPBACK;
 929                 break;
 930         default:
 931                 dst->sa_family = LX_ARPHRD_VOID;
 932         }
 933 
 934         bcopy(LLADDR(src), dst->sa_data, copy_size);
 935         *size = copy_size;
 936 }
 937 
 938 /*
 939  * Brand hook to convert native kernel siginfo signal number, errno, code, pid
 940  * and si_status to Linux values. Similar to the stol_ksiginfo function but
 941  * this one converts in-place, converts the pid, and does not copyout.
 942  */
 943 void
 944 lx_sigfd_translate(k_siginfo_t *infop)
 945 {
 946         zone_t *zone = curproc->p_zone;
 947 
 948         infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
 949         infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
 950         infop->si_code = lx_stol_sigcode(infop->si_code);
 951         infop->si_errno = lx_errno(infop->si_errno, EINVAL);
 952 
 953         /* Map zsched and zone init to pid 1 */
 954         if (infop->si_pid == zone->zone_proc_initpid ||
 955             infop->si_pid == zone->zone_zsched->p_pid) {
 956                 infop->si_pid = 1;
 957         }
 958 }
 959 
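     /*
      * Convert a native k_siginfo_t into a Linux siginfo and copy it out to
      * the user address provided.  Unlike lx_sigfd_translate above, this
      * does not modify the source structure.
      */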
 960 int
 961 stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
 962 {
 963         lx_siginfo_t lsi;
 964 
 965         bzero(&lsi, sizeof (lsi));
 966         lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
 967         lsi.lsi_code = lx_stol_sigcode(sip->si_code);
 968         lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
 969 
 970         switch (lsi.lsi_signo) {
 971         case LX_SIGPOLL:
 972                 lsi.lsi_band = sip->si_band;
 973                 lsi.lsi_fd = sip->si_fd;
 974                 break;
 975 
 976         case LX_SIGCHLD:
 977                 lsi.lsi_pid = sip->si_pid;
 978                 if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
 979                         lsi.lsi_status = sip->si_status;
 980                 } else {
 981                         lsi.lsi_status = lx_stol_status(sip->si_status,
 982                             SIGKILL);
 983                 }
 984                 lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
 985                 lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
 986                 break;
 987 
 988         case LX_SIGILL:
 989         case LX_SIGBUS:
 990         case LX_SIGFPE:
 991         case LX_SIGSEGV:
 992                 lsi.lsi_addr = sip->si_addr;
 993                 break;
 994 
 995         default:
 996                 lsi.lsi_pid = sip->si_pid;
 997                 lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
 998         }
 999 
1000         if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
1001                 return (set_errno(EFAULT));
1002         }
1003 
1004         return (0);
1005 }
1006 
1007 #if defined(_SYSCALL32_IMPL)
1008 int
1009 stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
1010 {
1011         lx_siginfo32_t lsi;
1012 
1013         bzero(&lsi, sizeof (lsi));
1014         lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
1015         lsi.lsi_code = lx_stol_sigcode(sip->si_code);
1016         lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
1017 
1018         switch (lsi.lsi_signo) {
1019         case LX_SIGPOLL:
1020                 lsi.lsi_band = sip->si_band;
1021                 lsi.lsi_fd = sip->si_fd;
1022                 break;
1023 
1024         case LX_SIGCHLD:
1025                 lsi.lsi_pid = sip->si_pid;
1026                 if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
1027                         lsi.lsi_status = sip->si_status;
1028                 } else {
1029                         lsi.lsi_status = lx_stol_status(sip->si_status,
1030                             SIGKILL);
1031                 }
1032                 lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
1033                 lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
1034                 break;
1035 
1036         case LX_SIGILL:
1037         case LX_SIGBUS:
1038         case LX_SIGFPE:
1039         case LX_SIGSEGV:
1040                 lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
1041                 break;
1042 
1043         default:
1044                 lsi.lsi_pid = sip->si_pid;
1045                 lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
1046         }
1047 
1048         if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
1049                 return (set_errno(EFAULT));
1050         }
1051 
1052         return (0);
1053 }
1054 #endif
1055 
1056 /* Given an LX LWP, determine where user register state is stored. */
1057 lx_regs_location_t
1058 lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
1059 {
1060         switch (lwpd->br_stack_mode) {
1061         case LX_STACK_MODE_BRAND:
1062                 /*
1063                  * The LWP was stopped with the brand stack and register state
1064                  * loaded, e.g. during a syscall emulated within the kernel.
1065                  */
1066                 return (LX_REG_LOC_LWP);
1067 
1068         case LX_STACK_MODE_PREINIT:
1069                 if (for_write) {
1070                         /* setting registers not allowed in this state */
1071                         break;
1072                 }
1073                 if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
1074                     lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
1075                         /* The LWP was stopped by tracing on exec. */
1076                         return (LX_REG_LOC_LWP);
1077                 }
1078                 break;
1079 
1080         case LX_STACK_MODE_NATIVE:
1081                 if (for_write) {
1082                         /* setting registers not allowed in this state */
1083                         break;
1084                 }
1085                 if (lwpd->br_ptrace_whystop == PR_BRAND) {
1086                         /* Called while ptrace-event-stopped by lx_exec. */
1087                         if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
1088                                 return (LX_REG_LOC_LWP);
1089                         }
1090 
1091                         /* Called while ptrace-event-stopped after clone. */
1092                         if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
1093                             lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
1094                             (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
1095                                 return (LX_REG_LOC_LWP);
1096                         }
1097 
1098                         /*
1099                          * Called to obtain syscall exit for other cases
1100                          * (e.g. pseudo return from rt_sigreturn).
1101                          */
1102                         if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
1103                             (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
1104                                 return (LX_REG_LOC_LWP);
1105                         }
1106                 }
1107                 break;
1108         default:
1109                 break;
1110         }
1111 
1112         if (lwpd->br_ptrace_stopucp != (uintptr_t)NULL) {
1113                 /*
1114                  * The LWP was stopped in the usermode emulation library
1115                  * but a ucontext_t for the preserved brand stack and
1116                  * register state was provided.  Return the register state
1117                  * from that ucontext_t.
1118                  */
1119                 VERIFY(ucp != NULL);
1120                 *ucp = (void *)lwpd->br_ptrace_stopucp;
1121                 return (LX_REG_LOC_UCP);
1122         }
1123 
1124         return (LX_REG_LOC_UNAVAIL);
1125 }