1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright 2022 Joyent, Inc.
  28  */
  29 
  30 #include <sys/errno.h>
  31 #include <sys/systm.h>
  32 #include <sys/archsystm.h>
  33 #include <sys/privregs.h>
  34 #include <sys/exec.h>
  35 #include <sys/lwp.h>
  36 #include <sys/sem.h>
  37 #include <sys/brand.h>
  38 #include <sys/lx_brand.h>
  39 #include <sys/lx_misc.h>
  40 #include <sys/lx_siginfo.h>
  41 #include <sys/lx_futex.h>
  42 #include <lx_errno.h>
  43 #include <sys/lx_userhz.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/siginfo.h>
  46 #include <sys/contract/process_impl.h>
  47 #include <sys/x86_archext.h>
  48 #include <sys/sdt.h>
  49 #include <lx_signum.h>
  50 #include <lx_syscall.h>
  51 #include <sys/proc.h>
  52 #include <sys/procfs.h>
  53 #include <net/if.h>
  54 #include <inet/ip6.h>
  55 #include <sys/sunddi.h>
  56 #include <sys/dlpi.h>
  57 #include <sys/sysmacros.h>
  58 
  59 /* Linux-specific functions and definitions */
  60 static void lx_save(void *);
  61 static void lx_restore(void *);
  62 
  63 /* Context op template. */
  64 static const struct ctxop_template lx_ctxop_template = {
  65         .ct_rev         = CTXOP_TPL_REV,
  66         .ct_save        = lx_save,
  67         .ct_restore     = lx_restore,
  68         .ct_exit        = lx_save,
  69 };
  70 
  71 /*
  72  * Set the return code for the forked child, always zero
  73  */
  74 /*ARGSUSED*/
  75 void
  76 lx_setrval(klwp_t *lwp, int v1, int v2)
  77 {
  78         lwptoregs(lwp)->r_r0 = 0;
  79 }
  80 
  81 /*
  82  * Reset process state on exec(2)
  83  */
  84 void
  85 lx_exec()
  86 {
  87         klwp_t *lwp = ttolwp(curthread);
  88         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
  89         proc_t *p = ttoproc(curthread);
  90         lx_proc_data_t *pd = ptolxproc(p);
  91         struct regs *rp = lwptoregs(lwp);
  92 
  93         /* b_exec is called without p_lock held */
  94         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
  95 
  96         /*
  97          * Any l_handler handlers set as a result of B_REGISTER are now
  98          * invalid; clear them.
  99          */
 100         pd->l_handler = (uintptr_t)NULL;
 101 
 102         /*
 103          * If this was a multi-threaded Linux process and this lwp wasn't the
 104          * main lwp, then we need to make its Illumos and Linux PIDs match.
 105          */
 106         if (curthread->t_tid != 1) {
 107                 lx_pid_reassign(curthread);
 108         }
 109 
 110         /*
 111          * Inform ptrace(2) that we are processing an execve(2) call so that if
 112          * we are traced we can post either the PTRACE_EVENT_EXEC event or the
 113          * legacy SIGTRAP.
 114          */
 115         (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
 116 
 117         /* Clear the fs/gsbase values until the app can reinitialize them. */
 118         lwpd->br_lx_fsbase = (uintptr_t)NULL;
 119         lwpd->br_ntv_fsbase = (uintptr_t)NULL;
 120         lwpd->br_lx_gsbase = (uintptr_t)NULL;
 121         lwpd->br_ntv_gsbase = (uintptr_t)NULL;
 122 
 123         /*
 124          * Clear the native stack flags.  This will be reinitialised by
 125          * lx_init() in the new process image.
 126          */
 127         lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
 128         lwpd->br_ntv_stack = 0;
 129         lwpd->br_ntv_stack_current = 0;
 130 
 131         ctxop_install(lwptot(lwp), &lx_ctxop_template, lwp);
 132 
 133         /*
 134          * Clear out the TLS array.
 135          */
 136         bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
 137 
 138         /*
 139          * Reset the TLS entries in the GDT.
 140          */
 141         kpreempt_disable();
 142         lx_restore(lwp);
 143         kpreempt_enable();
 144 
 145         /*
 146          * The exec syscall doesn't return (so we don't call lx_syscall_return)
 147          * but for our ptrace emulation we need to do this so that a tracer
 148          * does not get out of sync. We know that by the time this lx_exec
 149          * function is called, the exec has succeeded.
 150          */
 151         rp->r_r0 = 0;
 152         (void) lx_ptrace_stop(LX_PR_SYSEXIT);
 153 }
 154 
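/*
 * Perform brand-specific cleanup for an LWP that is exiting or being
 * de-branded: detach it from any ptrace(2) relationship, clear TP_KTHREAD,
 * hand any registered robust futex list to lx_futex_robust_exit(), and run
 * our context-save handler so no stale TLS GDT state is left behind.
 */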
 155 static void
 156 lx_cleanlwp(klwp_t *lwp, proc_t *p)
 157 {
 158         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 159         void *rb_list = NULL;
 160 
 161         VERIFY(lwpd != NULL);
 162 
 163         mutex_enter(&p->p_lock);
 164         if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
 165                 lx_ptrace_exit(p, lwp);
 166         }
 167 
 168         /*
 169          * While we have p_lock, clear the TP_KTHREAD flag. This is needed
 170          * to prevent races within lx procfs. It's fine for prchoose() to pick
 171          * this thread now since it is exiting and no longer blocked in the
 172          * kernel.
 173          */
 174         lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;
 175 
 176         /*
 177          * While we have p_lock, safely grab any robust_list references and
 178          * clear the lwp field.
 179          */
 180         sprlock_proc(p);
 181         rb_list = lwpd->br_robust_list;
 182         lwpd->br_robust_list = NULL;
 183         sprunlock(p);
 184 
 185         if (rb_list != NULL) {
 186                 lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
 187         }
 188 
 189         /*
 190          * We need to run our context exit operation (lx_save) here to ensure
 191          * we don't leave any garbage around. This is necessary to handle the
 192          * following calling sequence:
 193          *    exit -> proc_exit -> lx_freelwp -> removectx
 194          * That is, when our branded process exits, proc_exit will call our
 195          * lx_freelwp brand hook, which calls this function (lx_cleanlwp),
 196          * but lx_freelwp also removes our context exit operation. The context
 197          * exit functions are run by exitctx, which is called by either
 198          * lwp_exit or thread_exit. The thread_exit function is called at the
 199          * end of proc_exit when we'll swtch() to another thread, but by then
 200          * our context exit function has been removed.
 201          *
 202          * It's ok if this function happens to be called more than once (for
 203          * example, if we exec a native binary).
 204          */
 205         kpreempt_disable();
 206         lx_save(lwp);
 207         kpreempt_enable();
 208 }
 209 
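/*
 * Brand hook called when an LWP exits.  In addition to the common cleanup in
 * lx_cleanlwp(), this clears the clone(2) child-tid address (waking any futex
 * waiters on it) and, when a clone exit signal was requested, posts that
 * signal to the appropriate parent thread or process.
 */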
 210 void
 211 lx_exitlwp(klwp_t *lwp)
 212 {
 213         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 214         proc_t *p = lwptoproc(lwp);
 215         kthread_t *t;
 216         sigqueue_t *sqp = NULL;
 217         pid_t ppid;
 218         id_t ptid;
 219 
 220         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 221 
 222         if (lwpd == NULL) {
 223                 /* second time through */
 224                 return;
 225         }
 226 
 227         lx_cleanlwp(lwp, p);
 228 
 229         if (lwpd->br_clear_ctidp != NULL) {
 230                 (void) suword32(lwpd->br_clear_ctidp, 0);
 231                 (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
 232                     (uintptr_t)NULL, (uintptr_t)NULL, 0);
 233                 lwpd->br_clear_ctidp = NULL;
 234         }
 235 
 236         if (lwpd->br_signal != 0) {
 237                 /*
 238                  * The first thread in a process doesn't cause a signal to
 239                  * be sent when it exits.  It was created by a fork(), not
 240                  * a clone(), so the parent should get signalled when the
 241                  * process exits.
 242                  */
 243                 if (lwpd->br_ptid == -1)
 244                         goto free;
 245 
 246                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 247                 /*
 248                  * If br_ppid is 0, it means this is a CLONE_PARENT thread,
 249                  * so the signal goes to the parent process - not to a
 250                  * specific thread in this process.
 251                  */
 252                 p = lwptoproc(lwp);
 253                 if (lwpd->br_ppid == 0) {
 254                         mutex_enter(&p->p_lock);
 255                         ppid = p->p_ppid;
 256                         t = NULL;
 257                 } else {
 258                         /*
 259                          * If we have been reparented to init or if our
 260                          * parent thread is gone, then nobody gets
 261                          * signaled.
 262                          */
 263                         if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
 264                             (ptid == -1))
 265                                 goto free;
 266 
 267                         mutex_enter(&pidlock);
 268                         if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
 269                                 mutex_exit(&pidlock);
 270                                 goto free;
 271                         }
 272                         mutex_enter(&p->p_lock);
 273                         mutex_exit(&pidlock);
 274 
 275                         if ((t = idtot(p, ptid)) == NULL) {
 276                                 mutex_exit(&p->p_lock);
 277                                 goto free;
 278                         }
 279                 }
 280 
 281                 sqp->sq_info.si_signo = lwpd->br_signal;
 282                 sqp->sq_info.si_code = lwpd->br_exitwhy;
 283                 sqp->sq_info.si_status = lwpd->br_exitwhat;
 284                 sqp->sq_info.si_pid = lwpd->br_pid;
 285                 sqp->sq_info.si_uid = crgetruid(CRED());
 286                 sigaddqa(p, t, sqp);
 287                 mutex_exit(&p->p_lock);
 288                 sqp = NULL;
 289         }
 290 
 291 free:
 292         if (lwpd->br_scall_args != NULL) {
 293                 ASSERT(lwpd->br_args_size > 0);
 294                 kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
 295         }
 296         if (sqp)
 297                 kmem_free(sqp, sizeof (sigqueue_t));
 298 }
 299 
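/*
 * Brand hook to free the per-LWP brand data.  This runs both when an LWP is
 * destroyed and when a process is de-branded during the exec of a native
 * binary; see the comments below for the cases where some state may already
 * have been torn down.
 */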
 300 void
 301 lx_freelwp(klwp_t *lwp)
 302 {
 303         struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
 304         proc_t *p = lwptoproc(lwp);
 305         lx_zone_data_t *lxzdata;
 306         vfs_t *cgrp;
 307 
 308         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 309 
 310         if (lwpd == NULL) {
 311                 /*
 312                  * There is one case where an LX branded process will possess
 313                  * LWPs which lack their own brand data.  During the course of
 314                  * executing a native binary, the process will be preemptively
 315                  * branded to allow hooks such as b_native_exec to function.
 316                  * If that process possesses multiple LWPs, they will _not_ be
 317                  * branded since they will exit if the exec succeeds.  It's
 318                  * during this LWP exit that lx_freelwp would be called on an
 319                  * unbranded LWP.  When that is the case, it is acceptable to
 320                  * bypass the hook.
 321                  */
 322                 return;
 323         }
 324 
 325         /* cgroup integration */
 326         lxzdata = ztolxzd(p->p_zone);
 327         mutex_enter(&lxzdata->lxzd_lock);
 328         cgrp = lxzdata->lxzd_cgroup;
 329         if (cgrp != NULL) {
 330                 VFS_HOLD(cgrp);
 331                 mutex_exit(&lxzdata->lxzd_lock);
 332                 ASSERT(lx_cgrp_freelwp != NULL);
 333                 (*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
 334                     lwpd->br_pid);
 335                 VFS_RELE(cgrp);
 336         } else {
 337                 mutex_exit(&lxzdata->lxzd_lock);
 338         }
 339 
 340         /*
 341          * It is possible for the lx_freelwp hook to be called without a prior
 342          * call to lx_exitlwp being made.  This happens as part of lwp
 343          * de-branding when a native binary is executed from a branded process.
 344          *
 345          * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well
 346          * as here in lx_freelwp.  When the second call is redundant, the
 347          * resources will already be freed and no work will be needed.
 348          */
 349         lx_cleanlwp(lwp, p);
 350 
 351         /*
 352          * Remove our system call interposer.
 353          */
 354         lwp->lwp_brand_syscall = NULL;
 355 
 356         /*
 357          * If this process is being de-branded during an exec(),
 358          * the LX ctxops may have already been removed, so the result
 359          * from ctxop_remove is irrelevant.
 360          */
 361         (void) ctxop_remove(lwptot(lwp), &lx_ctxop_template, lwp);
 362         if (lwpd->br_pid != 0) {
 363                 lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
 364         }
 365 
 366         /*
 367          * Discard the affinity mask.
 368          */
 369         VERIFY(lwpd->br_affinitymask != NULL);
 370         cpuset_free(lwpd->br_affinitymask);
 371         lwpd->br_affinitymask = NULL;
 372 
 373         /*
 374          * Ensure that lx_ptrace_exit() has been called to detach
 375          * ptrace(2) tracers and tracees.
 376          */
 377         VERIFY(lwpd->br_ptrace_tracer == NULL);
 378         VERIFY(lwpd->br_ptrace_accord == NULL);
 379 
 380         lwp->lwp_brand = NULL;
 381         kmem_free(lwpd, sizeof (struct lx_lwp_data));
 382 }
 383 
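/*
 * Allocate the brand-specific LWP data (and, for LWPs beyond the first, an
 * emulated Linux pid) ahead of LWP creation.  This runs without p_lock held,
 * which is why the pid allocation cannot be deferred to lx_initlwp.
 */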
 384 void *
 385 lx_lwpdata_alloc(proc_t *p)
 386 {
 387         lx_lwp_data_t *lwpd;
 388         struct lx_pid *lpidp;
 389         cpuset_t *affmask;
 390         pid_t newpid = 0;
 391         struct pid *pidp = NULL;
 392 
 393         VERIFY(MUTEX_NOT_HELD(&p->p_lock));
 394 
 395         /*
 396          * LWPs beyond the first will require a pid to be allocated to emulate
 397          * Linux's goofy thread model.  While this allocation may be
 398          * unnecessary when a single-lwp process undergoes branding, it cannot
 399          * be performed during b_initlwp due to p_lock being held.
 400          */
 401         if (p->p_lwpcnt > 0) {
 402                 if ((newpid = pid_allocate(p, 0, 0)) < 0) {
 403                         return (NULL);
 404                 }
 405                 pidp = pid_find(newpid);
 406         }
 407 
 408         lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
 409         lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
 410         affmask = cpuset_alloc(KM_SLEEP);
 411 
 412         lpidp->lxp_lpid = newpid;
 413         lpidp->lxp_pidp = pidp;
 414         lwpd->br_lpid = lpidp;
 415         lwpd->br_affinitymask = affmask;
 416 
 417         return (lwpd);
 418 }
 419 
 420 /*
 421  * Free lwp brand data if an error occurred during lwp_create.
 422  * Otherwise, lx_freelwp will be used to free the resources after they're
 423  * associated with the lwp via lx_initlwp.
 424  */
 425 void
 426 lx_lwpdata_free(void *lwpbd)
 427 {
 428         lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
 429         VERIFY(lwpd != NULL);
 430         VERIFY(lwpd->br_lpid != NULL);
 431         VERIFY(lwpd->br_affinitymask != NULL);
 432 
 433         cpuset_free(lwpd->br_affinitymask);
 434         if (lwpd->br_lpid->lxp_pidp != NULL) {
 435                 (void) pid_rele(lwpd->br_lpid->lxp_pidp);
 436         }
 437         kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
 438         kmem_free(lwpd, sizeof (*lwpd));
 439 }
 440 
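/*
 * Brand hook to attach the previously allocated brand data to a new LWP.
 * This is called with p_lock held, so no blocking allocations may be
 * performed here; anything that needed them was done in lx_lwpdata_alloc.
 */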
 441 void
 442 lx_initlwp(klwp_t *lwp, void *lwpbd)
 443 {
 444         lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
 445         lx_lwp_data_t *plwpd = ttolxlwp(curthread);
 446         kthread_t *tp = lwptot(lwp);
 447         proc_t *p = lwptoproc(lwp);
 448         lx_zone_data_t *lxzdata;
 449         vfs_t *cgrp;
 450 
 451         VERIFY(MUTEX_HELD(&p->p_lock));
 452         VERIFY(lwp->lwp_brand == NULL);
 453 
 454         lwpd->br_exitwhy = CLD_EXITED;
 455         lwpd->br_lwp = lwp;
 456         lwpd->br_clear_ctidp = NULL;
 457         lwpd->br_set_ctidp = NULL;
 458         lwpd->br_signal = 0;
 459         lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
 460         cpuset_all(lwpd->br_affinitymask);
 461 
 462         /*
 463          * The first thread in a process has ppid set to the parent
 464          * process's pid, and ptid set to -1.  Subsequent threads in the
 465          * process have their ppid set to the pid of the thread that
 466          * created them, and their ptid to that thread's tid.
 467          */
 468         if (tp->t_next == tp) {
 469                 lwpd->br_ppid = tp->t_procp->p_ppid;
 470                 lwpd->br_ptid = -1;
 471         } else if (plwpd != NULL) {
 472                 bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
 473                 lwpd->br_ppid = plwpd->br_pid;
 474                 lwpd->br_ptid = curthread->t_tid;
 475                 /* The child inherits the fs/gsbase values from the parent */
 476                 lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
 477                 lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
 478                 lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
 479                 lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
 480         } else {
 481                 /*
 482                  * Oddball case: the parent thread isn't a Linux process.
 483                  */
 484                 lwpd->br_ppid = 0;
 485                 lwpd->br_ptid = -1;
 486         }
 487         lwp->lwp_brand = lwpd;
 488 
 489         /*
 490          * During lx_lwpdata_alloc, we must decide whether or not to
 491          * allocate a new pid to associate with the lwp. Since p_lock is not
 492          * held at that point, the only time we can guarantee a new pid isn't
 493          * needed is when p_lwpcnt == 0.  This is because other lwps won't be
 494          * present to race with us with regard to pid allocation.
 495          *
 496          * This means that in all other cases (where p_lwpcnt > 0), we expect
 497          * that lx_lwpdata_alloc will allocate a pid for us to use here, even
 498          * if it is unneeded.  If this process is undergoing an exec, for
 499          * example, the single existing lwp will not need a new pid when it is
 500          * rebranded.  In that case, lx_pid_assign will free the unneeded pid.
 501          */
 502         VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);
 503 
 504         lx_pid_assign(tp, lwpd->br_lpid);
 505         lwpd->br_tgid = lwpd->br_pid;
 506         /*
 507          * Having performed the lx pid assignment, the lpid reference is no
 508          * longer needed.  The underlying data will be freed during lx_freelwp.
 509          */
 510         lwpd->br_lpid = NULL;
 511 
 512         ctxop_install(lwptot(lwp), &lx_ctxop_template, lwp);
 513 
 514         /*
 515          * Install branded system call hooks for this LWP:
 516          */
 517         lwp->lwp_brand_syscall = lx_syscall_enter;
 518 
 519         /*
 520          * The new LWP inherits the parent LWP cgroup ID.
 521          */
 522         if (plwpd != NULL) {
 523                 lwpd->br_cgroupid = plwpd->br_cgroupid;
 524         }
 525         /*
 526          * The new LWP inherits the parent LWP emulated scheduling info.
 527          */
 528         if (plwpd != NULL) {
 529                 lwpd->br_schd_class = plwpd->br_schd_class;
 530                 lwpd->br_schd_pri = plwpd->br_schd_pri;
 531                 lwpd->br_schd_flags = plwpd->br_schd_flags;
 532                 lwpd->br_schd_runtime = plwpd->br_schd_runtime;
 533                 lwpd->br_schd_deadline = plwpd->br_schd_deadline;
 534                 lwpd->br_schd_period = plwpd->br_schd_period;
 535         }
 536         lxzdata = ztolxzd(p->p_zone);
 537         mutex_enter(&lxzdata->lxzd_lock);
 538         cgrp = lxzdata->lxzd_cgroup;
 539         if (cgrp != NULL) {
 540                 VFS_HOLD(cgrp);
 541                 mutex_exit(&lxzdata->lxzd_lock);
 542                 ASSERT(lx_cgrp_initlwp != NULL);
 543                 (*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
 544                     lwpd->br_pid);
 545                 VFS_RELE(cgrp);
 546         } else {
 547                 mutex_exit(&lxzdata->lxzd_lock);
 548         }
 549 }
 550 
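/*
 * Post-creation brand hook, run once the new LWP has been fully constructed;
 * used here so the child can inherit the parent LWP's ptrace(2) tracer.
 */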
 551 void
 552 lx_initlwp_post(klwp_t *lwp)
 553 {
 554         lx_lwp_data_t *plwpd = ttolxlwp(curthread);
 555         /*
 556          * If the parent LWP has a ptrace(2) tracer, the new LWP may
 557          * need to inherit that same tracer.
 558          */
 559         if (plwpd != NULL) {
 560                 lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
 561         }
 562 }
 563 
 564 /*
 565  * There is no need to have any locking for either the source or
 566  * destination struct lx_lwp_data structs.  This is always run in the
 567  * thread context of the source thread, and the destination thread is
 568  * always newly created and not referred to from anywhere else.
 569  */
 570 void
 571 lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
 572 {
 573         struct lx_lwp_data *src = srclwp->lwp_brand;
 574         struct lx_lwp_data *dst = dstlwp->lwp_brand;
 575 
 576         dst->br_ppid = src->br_pid;
 577         dst->br_ptid = lwptot(srclwp)->t_tid;
 578         bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
 579 
 580         switch (src->br_stack_mode) {
 581         case LX_STACK_MODE_BRAND:
 582         case LX_STACK_MODE_NATIVE:
 583                 /*
 584                  * The parent LWP has an alternate stack installed.
 585                  * The child LWP should have the same stack base and extent.
 586                  */
 587                 dst->br_stack_mode = src->br_stack_mode;
 588                 dst->br_ntv_stack = src->br_ntv_stack;
 589                 dst->br_ntv_stack_current = src->br_ntv_stack_current;
 590                 break;
 591 
 592         default:
 593                 /*
 594                  * Otherwise, clear the stack data for this LWP.
 595                  */
 596                 dst->br_stack_mode = LX_STACK_MODE_PREINIT;
 597                 dst->br_ntv_stack = 0;
 598                 dst->br_ntv_stack_current = 0;
 599         }
 600 
 601         /*
 602          * Copy only these flags.
 603          */
 604         dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
 605         dst->br_scall_args = NULL;
 606         lx_affinity_forklwp(srclwp, dstlwp);
 607 
 608         /*
 609          * Flag so child doesn't ptrace-stop on syscall exit.
 610          */
 611         dst->br_ptrace_flags |= LX_PTF_NOSTOP;
 612 
 613         if (src->br_clone_grp_flags != 0) {
 614                 lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
 615                     lwptoproc(dstlwp));
 616                 /* clone group no longer pending on this thread */
 617                 src->br_clone_grp_flags = 0;
 618         }
 619 }
 620 
 621 /*
 622  * When switching a Linux process off the CPU, clear its GDT entries.
 623  */
 624 /* ARGSUSED */
 625 static void
 626 lx_save(void *arg)
 627 {
 628         klwp_t *t = (klwp_t *)arg;
 629         int i;
 630 
 631 #if defined(__amd64)
 632         reset_sregs();
 633 #endif
 634         for (i = 0; i < LX_TLSNUM; i++)
 635                 gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
 636 }
 637 
 638 /*
 639  * When switching a Linux process on the CPU, set its GDT entries.
 640  *
 641  * For 64-bit code we don't have to worry about explicitly setting the
 642  * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
 643  * automatically in update_sregs if we are executing in user-land. If this
 644  * is the case, pcb_rupdate should be set.
 645  */
 646 static void
 647 lx_restore(void *arg)
 648 {
 649         klwp_t *t = (klwp_t *)arg;
 650         struct lx_lwp_data *lwpd = lwptolxlwp(t);
 651         user_desc_t *tls;
 652         int i;
 653 
 654         ASSERT(lwpd);
 655 
 656         tls = lwpd->br_tls;
 657         for (i = 0; i < LX_TLSNUM; i++)
 658                 gdt_update_usegd(GDT_TLSMIN + i, &tls[i]);
 659 }
 660 
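/*
 * Thin wrappers around gdt_update_usegd() used by the brand to set or clear
 * a single GDT entry (e.g. for emulated TLS segments).
 */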
 661 void
 662 lx_set_gdt(int entry, user_desc_t *descrp)
 663 {
 664 
 665         gdt_update_usegd(entry, descrp);
 666 }
 667 
 668 void
 669 lx_clear_gdt(int entry)
 670 {
 671         gdt_update_usegd(entry, &null_udesc);
 672 }
 673 
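/*
 * Generic handler for unimplemented emulated system calls; it simply fails
 * the call with ENOSYS.
 */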
 674 longlong_t
 675 lx_nosys()
 676 {
 677         return (set_errno(ENOSYS));
 678 }
 679 
 680 /*
 681  * Brand-specific routine to check whether a given segment register value
 682  * that is not a standard Solaris selector should be mapped to another value.
 683  */
 684 /*ARGSUSED*/
 685 greg_t
 686 lx_fixsegreg(greg_t sr, model_t datamodel)
 687 {
 688         uint16_t idx = SELTOIDX(sr);
 689 
 690         ASSERT(sr == (sr & 0xffff));
 691 
 692         /*
 693          * If the segment selector is a valid TLS selector, just return it.
 694          */
 695         if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX)
 696                 return (sr | SEL_UPL);
 697 
 698         /*
 699          * Force the SR into the LDT in ring 3 for 32-bit processes.
 700          *
 701          * 64-bit processes get the null GDT selector since they are not
 702          * allowed to have a private LDT.
 703          */
 704 #if defined(__amd64)
 705         return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0);
 706 #elif defined(__i386)
 707         datamodel = datamodel;  /* datamodel currently unused for 32-bit */
 708         return (sr | SEL_TI_LDT | SEL_UPL);
 709 #endif  /* __amd64 */
 710 }
 711 
 712 /*
 713  * Brand-specific function to convert the fsbase as pulled from the register
 714  * into a native fsbase suitable for locating the ulwp_t from the kernel.
 715  */
 716 uintptr_t
 717 lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
 718 {
 719         lx_lwp_data_t *lwpd = lwp->lwp_brand;
 720 
 721         if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND ||
 722             lwpd->br_ntv_fsbase == (uintptr_t)NULL) {
 723                 return (fsbase);
 724         }
 725 
 726         return (lwpd->br_ntv_fsbase);
 727 }
 728 
 729 /*
 730  * These two functions simulate winfo and post_sigcld for the lx brand. The
 731  * difference is delivering a designated signal as opposed to always SIGCLD.
 732  */
 733 static void
 734 lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
 735 {
 736         ASSERT(MUTEX_HELD(&pidlock));
 737         bzero(ip, sizeof (k_siginfo_t));
 738         ip->si_signo = ltos_signo[dat->l_signal];
 739         ip->si_code = pp->p_wcode;
 740         ip->si_pid = pp->p_pid;
 741         ip->si_ctid = PRCTID(pp);
 742         ip->si_zoneid = pp->p_zone->zone_id;
 743         ip->si_status = pp->p_wdata;
 744         /*
 745          * These siginfo values are converted to USER_HZ in the user-land
 746          * brand signal code.
 747          */
 748         ip->si_stime = pp->p_stime;
 749         ip->si_utime = pp->p_utime;
 750 }
 751 
 752 static void
 753 lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
 754 {
 755         proc_t *pp = cp->p_parent;
 756 
 757         ASSERT(MUTEX_HELD(&pidlock));
 758         mutex_enter(&pp->p_lock);
 759         /*
 760          * Since Linux doesn't queue SIGCHLD, or any other non-RT
 761          * signals, we just blindly deliver whatever signal we can.
 762          */
 763         ASSERT(sqp != NULL);
 764         lx_winfo(cp, &sqp->sq_info, dat);
 765         sigaddqa(pp, NULL, sqp);
 766         sqp = NULL;
 767         mutex_exit(&pp->p_lock);
 768 }
 769 
 770 
 771 /*
 772  * Brand specific code for exiting and sending a signal to the parent, as
 773  * opposed to sigcld().
 774  */
 775 void
 776 lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
 777 {
 778         proc_t *pp = cp->p_parent;
 779         lx_proc_data_t *lx_brand_data = ptolxproc(cp);
 780         ASSERT(MUTEX_HELD(&pidlock));
 781 
 782         switch (cp->p_wcode) {
 783         case CLD_EXITED:
 784         case CLD_DUMPED:
 785         case CLD_KILLED:
 786                         ASSERT(cp->p_stat == SZOMB);
 787                         /*
 788                          * The broadcast on p_srwchan_cv is a kludge to
 789                          * wake up a possible thread in uadmin(A_SHUTDOWN).
 790                          */
 791                         cv_broadcast(&cp->p_srwchan_cv);
 792 
 793                         /*
 794                          * Add to newstate list of the parent
 795                          */
 796                         add_ns(pp, cp);
 797 
 798                         cv_broadcast(&pp->p_cv);
 799                         if ((pp->p_flag & SNOWAIT) ||
 800                             PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
 801                                 if (!(cp->p_pidflag & CLDWAITPID))
 802                                         freeproc(cp);
 803                         } else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
 804                             lx_brand_data->l_signal != 0) {
 805                                 lx_post_exit_sig(cp, sqp, lx_brand_data);
 806                                 sqp = NULL;
 807                         }
 808                         break;
 809 
 810         case CLD_STOPPED:
 811         case CLD_CONTINUED:
 812         case CLD_TRAPPED:
 813                         panic("Should not be called in this case");
 814         }
 815 
 816         if (sqp)
 817                 siginfofree(sqp);
 818 }
 819 
 820 /*
 821  * Filters based on arguments that have been passed in by a separate syscall
 822  * using the B_STORE_ARGS mechanism. If the __WALL flag is set, no filter is
 823  * applied, otherwise we look at the difference between a clone and non-clone
 824  * process.
 825  * The definition of a clone process in Linux is a thread that does not deliver
 826  * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
 827  * processes. Without that option, a process should only wait on normal
 828  * children. The following table shows the cases.
 829  *
 830  *                   default    __WCLONE
 831  *   no SIGCHLD      -           X
 832  *   SIGCHLD         X           -
 833  *
 834  * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
 835  * process exit.
 836  *
 837  * More information on wait in lx brands can be found at
 838  * usr/src/lib/brand/lx/lx_brand/common/wait.c.
 839  */
 840 /* ARGSUSED */
 841 boolean_t
 842 lx_wait_filter(proc_t *pp, proc_t *cp)
 843 {
 844         lx_lwp_data_t *lwpd = ttolxlwp(curthread);
 845         int flags = lwpd->br_waitid_flags;
 846         boolean_t ret;
 847 
 848         if (!lwpd->br_waitid_emulate) {
 849                 return (B_TRUE);
 850         }
 851 
 852         mutex_enter(&cp->p_lock);
 853         if (flags & LX_WALL) {
 854                 ret = B_TRUE;
 855         } else {
 856                 lx_proc_data_t *pd = ptolxproc(cp);
 857                 boolean_t is_sigchld = B_TRUE;
 858                 boolean_t match_wclone = B_FALSE;
 859 
 860                 /*
 861                  * When calling clone, an alternate signal can be chosen to
 862                  * deliver to the parent when the child exits.
 863                  */
 864                 if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
 865                         is_sigchld = B_FALSE;
 866                 }
 867                 if ((flags & LX_WCLONE) != 0) {
 868                         match_wclone = B_TRUE;
 869                 }
 870 
 871                 ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
 872         }
 873         mutex_exit(&cp->p_lock);
 874 
 875         return (ret);
 876 }
 877 
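/*
 * Translate the loopback interface name between its Linux ("lo") and native
 * ("lo0") forms.
 */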
 878 void
 879 lx_ifname_convert(char *ifname, lx_if_action_t act)
 880 {
 881         if (act == LX_IF_TONATIVE) {
 882                 if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
 883                         (void) strlcpy(ifname, "lo0", IFNAMSIZ);
 884         } else {
 885                 if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
 886                         (void) strlcpy(ifname, "lo", IFNAMSIZ);
 887         }
 888 }
 889 
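/*
 * Translate interface flags between the native and Linux representations.
 * Most of the flag bits coincide; only the multicast flag occupies a
 * different bit position on Linux (see below).
 */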
 890 void
 891 lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
 892 {
 893         uint64_t buf;
 894 
 895         buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
 896             IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
 897             IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
 898 
 899         /* Linux uses a different bit position for the multicast flag */
 900         if (act == LX_IF_TONATIVE) {
 901                 if (*flags & 0x1000)
 902                         buf |= IFF_MULTICAST;
 903         } else {
 904                 if (*flags & IFF_MULTICAST)
 905                         buf |= 0x1000;
 906         }
 907         *flags = buf;
 908 }
 909 
 910 /*
 911  * Convert an IPv6 address into the scope value used by /proc/net/if_inet6
 912  */
 913 unsigned int
 914 lx_ipv6_scope_convert(const in6_addr_t *addr)
 915 {
 916         if (IN6_IS_ADDR_V4COMPAT(addr)) {
 917                 return (LX_IPV6_ADDR_COMPATv4);
 918         } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
 919                 return (LX_IPV6_ADDR_LOOPBACK);
 920         } else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
 921                 return (LX_IPV6_ADDR_LINKLOCAL);
 922         } else if (IN6_IS_ADDR_SITELOCAL(addr)) {
 923                 return (LX_IPV6_ADDR_SITELOCAL);
 924         } else {
 925                 return (0x0000U);
 926         }
 927 }
 928 
 929 
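/*
 * Convert a native datalink (sockaddr_dl) hardware address to the Linux
 * sockaddr form, mapping the DLPI media type to the corresponding
 * LX_ARPHRD_* value and copying as much of the address as fits in sa_data.
 */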
 930 void
 931 lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
 932 {
 933         int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
 934 
 935         switch (src->sdl_type) {
 936         case DL_ETHER:
 937                 dst->sa_family = LX_ARPHRD_ETHER;
 938                 break;
 939         case DL_LOOP:
 940                 dst->sa_family = LX_ARPHRD_LOOPBACK;
 941                 break;
 942         default:
 943                 dst->sa_family = LX_ARPHRD_VOID;
 944         }
 945 
 946         bcopy(LLADDR(src), dst->sa_data, copy_size);
 947         *size = copy_size;
 948 }
 949 
 950 /*
 951  * Brand hook to convert native kernel siginfo signal number, errno, code, pid
 952  * and si_status to Linux values. Similar to the stol_ksiginfo function but
 953  * this one converts in-place, converts the pid, and does not copyout.
 954  */
 955 void
 956 lx_sigfd_translate(k_siginfo_t *infop)
 957 {
 958         zone_t *zone = curproc->p_zone;
 959 
 960         infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
 961         infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
 962         infop->si_code = lx_stol_sigcode(infop->si_code);
 963         infop->si_errno = lx_errno(infop->si_errno, EINVAL);
 964 
 965         /* Map zsched and zone init to pid 1 */
 966         if (infop->si_pid == zone->zone_proc_initpid ||
 967             infop->si_pid == zone->zone_zsched->p_pid) {
 968                 infop->si_pid = 1;
 969         }
 970 }
 971 
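/*
 * Convert a native k_siginfo_t to the Linux lx_siginfo_t layout and copy the
 * result out to the supplied user address.
 */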
 972 int
 973 stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
 974 {
 975         lx_siginfo_t lsi;
 976 
 977         bzero(&lsi, sizeof (lsi));
 978         lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
 979         lsi.lsi_code = lx_stol_sigcode(sip->si_code);
 980         lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
 981 
 982         switch (lsi.lsi_signo) {
 983         case LX_SIGPOLL:
 984                 lsi.lsi_band = sip->si_band;
 985                 lsi.lsi_fd = sip->si_fd;
 986                 break;
 987 
 988         case LX_SIGCHLD:
 989                 lsi.lsi_pid = sip->si_pid;
 990                 if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
 991                         lsi.lsi_status = sip->si_status;
 992                 } else {
 993                         lsi.lsi_status = lx_stol_status(sip->si_status,
 994                             SIGKILL);
 995                 }
 996                 lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
 997                 lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
 998                 break;
 999 
1000         case LX_SIGILL:
1001         case LX_SIGBUS:
1002         case LX_SIGFPE:
1003         case LX_SIGSEGV:
1004                 lsi.lsi_addr = sip->si_addr;
1005                 break;
1006 
1007         default:
1008                 lsi.lsi_pid = sip->si_pid;
1009                 lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
1010         }
1011 
1012         if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
1013                 return (set_errno(EFAULT));
1014         }
1015 
1016         return (0);
1017 }
1018 
1019 #if defined(_SYSCALL32_IMPL)
1020 int
1021 stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
1022 {
1023         lx_siginfo32_t lsi;
1024 
1025         bzero(&lsi, sizeof (lsi));
1026         lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
1027         lsi.lsi_code = lx_stol_sigcode(sip->si_code);
1028         lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
1029 
1030         switch (lsi.lsi_signo) {
1031         case LX_SIGPOLL:
1032                 lsi.lsi_band = sip->si_band;
1033                 lsi.lsi_fd = sip->si_fd;
1034                 break;
1035 
1036         case LX_SIGCHLD:
1037                 lsi.lsi_pid = sip->si_pid;
1038                 if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
1039                         lsi.lsi_status = sip->si_status;
1040                 } else {
1041                         lsi.lsi_status = lx_stol_status(sip->si_status,
1042                             SIGKILL);
1043                 }
1044                 lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
1045                 lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
1046                 break;
1047 
1048         case LX_SIGILL:
1049         case LX_SIGBUS:
1050         case LX_SIGFPE:
1051         case LX_SIGSEGV:
1052                 lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
1053                 break;
1054 
1055         default:
1056                 lsi.lsi_pid = sip->si_pid;
1057                 lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
1058         }
1059 
1060         if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
1061                 return (set_errno(EFAULT));
1062         }
1063 
1064         return (0);
1065 }
1066 #endif
1067 
1068 /* Given an LX LWP, determine where user register state is stored. */
1069 lx_regs_location_t
1070 lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
1071 {
1072         switch (lwpd->br_stack_mode) {
1073         case LX_STACK_MODE_BRAND:
1074                 /*
1075                  * The LWP was stopped with the brand stack and register state
1076                  * loaded, e.g. during a syscall emulated within the kernel.
1077                  */
1078                 return (LX_REG_LOC_LWP);
1079 
1080         case LX_STACK_MODE_PREINIT:
1081                 if (for_write) {
1082                         /* setting registers not allowed in this state */
1083                         break;
1084                 }
1085                 if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
1086                     lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
1087                         /* The LWP was stopped by tracing on exec. */
1088                         return (LX_REG_LOC_LWP);
1089                 }
1090                 break;
1091 
1092         case LX_STACK_MODE_NATIVE:
1093                 if (for_write) {
1094                         /* setting registers not allowed in this state */
1095                         break;
1096                 }
1097                 if (lwpd->br_ptrace_whystop == PR_BRAND) {
1098                         /* Called while ptrace-event-stopped by lx_exec. */
1099                         if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
1100                                 return (LX_REG_LOC_LWP);
1101                         }
1102 
1103                         /* Called while ptrace-event-stopped after clone. */
1104                         if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
1105                             lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
1106                             (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
1107                                 return (LX_REG_LOC_LWP);
1108                         }
1109 
1110                         /*
1111                          * Called to obtain syscall exit for other cases
1112                          * (e.g. pseudo return from rt_sigreturn).
1113                          */
1114                         if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
1115                             (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
1116                                 return (LX_REG_LOC_LWP);
1117                         }
1118                 }
1119                 break;
1120         default:
1121                 break;
1122         }
1123 
1124         if (lwpd->br_ptrace_stopucp != (uintptr_t)NULL) {
1125                 /*
1126                  * The LWP was stopped in the usermode emulation library
1127                  * but a ucontext_t for the preserved brand stack and
1128                  * register state was provided.  Return the register state
1129                  * from that ucontext_t.
1130                  */
1131                 VERIFY(ucp != NULL);
1132                 *ucp = (void *)lwpd->br_ptrace_stopucp;
1133                 return (LX_REG_LOC_UCP);
1134         }
1135 
1136         return (LX_REG_LOC_UNAVAIL);
1137 }