1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016, Joyent, Inc.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/cred.h>
  35 #include <sys/policy.h>
  36 #include <sys/user.h>
  37 #include <sys/systm.h>
  38 #include <sys/cpuvar.h>
  39 #include <sys/vfs.h>
  40 #include <sys/vnode.h>
  41 #include <sys/file.h>
  42 #include <sys/errno.h>
  43 #include <sys/time.h>
  44 #include <sys/proc.h>
  45 #include <sys/cmn_err.h>
  46 #include <sys/acct.h>
  47 #include <sys/tuneable.h>
  48 #include <sys/class.h>
  49 #include <sys/kmem.h>
  50 #include <sys/session.h>
  51 #include <sys/ucontext.h>
  52 #include <sys/stack.h>
  53 #include <sys/procfs.h>
  54 #include <sys/prsystm.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/vtrace.h>
  57 #include <sys/debug.h>
  58 #include <sys/shm_impl.h>
  59 #include <sys/door_data.h>
  60 #include <vm/as.h>
  61 #include <vm/rm.h>
  62 #include <c2/audit.h>
  63 #include <sys/var.h>
  64 #include <sys/schedctl.h>
  65 #include <sys/utrap.h>
  66 #include <sys/task.h>
  67 #include <sys/resource.h>
  68 #include <sys/cyclic.h>
  69 #include <sys/lgrp.h>
  70 #include <sys/rctl.h>
  71 #include <sys/contract_impl.h>
  72 #include <sys/contract/process_impl.h>
  73 #include <sys/list.h>
  74 #include <sys/dtrace.h>
  75 #include <sys/pool.h>
  76 #include <sys/zone.h>
  77 #include <sys/sdt.h>
  78 #include <sys/class.h>
  79 #include <sys/corectl.h>
  80 #include <sys/brand.h>
  81 #include <sys/fork.h>
  82 
  83 static int64_t cfork(int, int, int);
  84 static int getproc(proc_t **, pid_t, uint_t);
  85 #define GETPROC_USER    0x0
  86 #define GETPROC_KERNEL  0x1
  87 #define GETPROC_ZSCHED  0x2
  88 
  89 static void fork_fail(proc_t *);
  90 static void forklwp_fail(proc_t *);
  91 
  92 int fork_fail_pending;
  93 
  94 extern struct kmem_cache *process_cache;
  95 
  96 /*
  97  * The vfork() system call trap is no longer invoked by libc.
  98  * It is retained only for the benefit of applications running
  99  * within a solaris10 branded zone.  It should be eliminated
 100  * when we no longer support solaris10 branded zones.
 101  */
 102 int64_t
 103 vfork(void)
 104 {
 105         curthread->t_post_sys = 1;   /* so vfwait() will be called */
 106         return (cfork(1, 1, 0));
 107 }
 108 
 109 /*
 110  * forksys system call - forkx, forkallx, vforkx.  This is the
 111  * interface invoked by libc for fork1(), forkall(), and vfork()
 112  */
 113 int64_t
 114 forksys(int subcode, int flags)
 115 {
 116         switch (subcode) {
 117         case 0:
 118                 return (cfork(0, 1, flags));    /* forkx(flags) */
 119         case 1:
 120                 return (cfork(0, 0, flags));    /* forkallx(flags) */
 121         case 2:
 122                 curthread->t_post_sys = 1;   /* so vfwait() will be called */
 123                 return (cfork(1, 1, flags));    /* vforkx(flags) */
 124         default:
 125                 return ((int64_t)set_errno(EINVAL));
 126         }
 127 }
 128 
 129 /*
 130  * Remove the associations of a child process from its parent and siblings.
 131  */
 132 static void
 133 disown_proc(proc_t *pp, proc_t *cp)
 134 {
 135         proc_t **orphpp;
 136 
 137         ASSERT(MUTEX_HELD(&pidlock));
 138 
 139         orphpp = &pp->p_orphan;
 140         while (*orphpp != cp)
 141                 orphpp = &(*orphpp)->p_nextorph;
 142         *orphpp = cp->p_nextorph;
 143 
 144         if (pp->p_child == cp)
 145                 pp->p_child = cp->p_sibling;
 146         if (cp->p_sibling)
 147                 cp->p_sibling->p_psibling = cp->p_psibling;
 148         if (cp->p_psibling)
 149                 cp->p_psibling->p_sibling = cp->p_sibling;
 150 }
 151 
/*
 * Common implementation behind vfork() and forksys().
 *
 * isvfork - non-zero for vfork semantics: the child is given the parent's
 *           address space (and shm segment list) instead of a duplicate,
 *           SVFORK is set on the child, and the parent is marked
 *           SVFWAIT / T_VFPARENT before the child is set running.
 * isfork1 - non-zero to clone only the calling lwp; zero to clone every
 *           entry of the parent's lwp directory, including zombie lwps.
 * flags   - may contain only FORK_NOSIGCHLD and FORK_WAITPID; these are
 *           translated into CLDNOSIGCHLD / CLDWAITPID on the child.
 *
 * On success the parent's return values are r_val1 = child pid and
 * r_val2 = 0; the clone of the calling lwp is given (parent pid, 1)
 * through lwp_setrval() or the brand's b_lwp_setrval().
 *
 * On failure the result of set_errno() is returned with one of:
 *      EINVAL   unsupported bits in flags
 *      ENOTSUP  caller is the /proc agent lwp
 *      EINTR    holdlwps() failed
 *      EAGAIN   getproc(), lwp duplication, or contract setup failed,
 *               or as_dup() failed with something other than ENOMEM
 *      ENOMEM   as_dup() failed with ENOMEM
 * or whatever error secpolicy_basic_fork() returned.
 */
/* ARGSUSED */
static int64_t
cfork(int isvfork, int isfork1, int flags)
{
        proc_t *p = ttoproc(curthread);
        struct as *as;
        proc_t *cp;
        klwp_t *clone;
        kthread_t *t;
        task_t *tk;
        rval_t  r;
        int error;
        int i;
        rctl_set_t *dup_set;
        rctl_alloc_gp_t *dup_gp;
        rctl_entity_p_t e;
        lwpdir_t *ldp;
        lwpent_t *lep;
        lwpent_t *clep;

        /*
         * Allow only these two flags.
         */
        if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
                error = EINVAL;
                atomic_inc_32(&curproc->p_zone->zone_ffmisc);
                goto forkerr;
        }

        /*
         * fork is not supported for the /proc agent lwp.
         */
        if (curthread == p->p_agenttp) {
                error = ENOTSUP;
                atomic_inc_32(&curproc->p_zone->zone_ffmisc);
                goto forkerr;
        }

        if ((error = secpolicy_basic_fork(CRED())) != 0) {
                atomic_inc_32(&p->p_zone->zone_ffmisc);
                goto forkerr;
        }

        /*
         * If the calling lwp is doing a fork1() then the
         * other lwps in this process are not duplicated and
         * don't need to be held where their kernel stacks can be
         * cloned.  If doing forkall(), the process is held with
         * SHOLDFORK, so that the lwps are at a point where their
         * stacks can be copied which is on entry or exit from
         * the kernel.
         */
        if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
                aston(curthread);
                error = EINTR;
                atomic_inc_32(&p->p_zone->zone_ffmisc);
                goto forkerr;
        }

#if defined(__sparc)
        /*
         * Ensure that the user stack is fully constructed
         * before creating the child process structure.
         */
        (void) flush_user_windows_to_stack(NULL);
#endif

        mutex_enter(&p->p_lock);
        /*
         * If this is vfork(), cancel any suspend request we might
         * have gotten from some other thread via lwp_suspend().
         * Otherwise we could end up with a deadlock on return
         * from the vfork() in both the parent and the child.
         */
        if (isvfork)
                curthread->t_proc_flag &= ~TP_HOLDLWP;
        /*
         * Prevent our resource set associations from being changed during fork.
         */
        pool_barrier_enter();
        mutex_exit(&p->p_lock);

        /*
         * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
         */
        if (getproc(&cp, 0, GETPROC_USER) < 0) {
                /*
                 * No child exists yet, so the only things to undo are
                 * the pool barrier and the hold on the parent's lwps.
                 */
                mutex_enter(&p->p_lock);
                pool_barrier_exit();
                continuelwps(p);
                mutex_exit(&p->p_lock);
                error = EAGAIN;
                goto forkerr;
        }

        TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);

        /*
         * Assign an address space to child
         */
        if (isvfork) {
                /*
                 * Clear any watched areas and remember the
                 * watched pages for restoring in vfwait().
                 */
                as = p->p_as;
                if (avl_numnodes(&as->a_wpage) != 0) {
                        AS_LOCK_ENTER(as, RW_WRITER);
                        as_clearwatch(as);
                        p->p_wpage = as->a_wpage;
                        avl_create(&as->a_wpage, wp_compare,
                            sizeof (struct watched_page),
                            offsetof(struct watched_page, wp_link));
                        AS_LOCK_EXIT(as);
                }
                cp->p_as = as;
                cp->p_flag |= SVFORK;

                /*
                 * Use the parent's shm segment list information for
                 * the child as it uses its address space till it execs.
                 */
                cp->p_segacct = p->p_segacct;
        } else {
                /*
                 * We need to hold P_PR_LOCK until the address space has
                 * been duplicated and we've had a chance to remove from the
                 * child any DTrace probes that were in the parent. Holding
                 * P_PR_LOCK prevents any new probes from being added and any
                 * extant probes from being removed.
                 */
                mutex_enter(&p->p_lock);
                sprlock_proc(p);
                p->p_flag |= SFORKING;
                mutex_exit(&p->p_lock);

                error = as_dup(p->p_as, cp);
                if (error != 0) {
                        /*
                         * The child has no rctls, lwp directory, or lwps
                         * yet, so it is torn down here rather than via
                         * the forklwperr path.
                         */
                        mutex_enter(&p->p_lock);
                        sprunlock(p);
                        fork_fail(cp);
                        mutex_enter(&pidlock);
                        disown_proc(p, cp);
                        mutex_enter(&cp->p_lock);
                        tk = cp->p_task;
                        task_detach(cp);
                        ASSERT(cp->p_pool->pool_ref > 0);
                        atomic_dec_32(&cp->p_pool->pool_ref);
                        mutex_exit(&cp->p_lock);
                        pid_exit(cp, tk);
                        mutex_exit(&pidlock);
                        task_rele(tk);

                        mutex_enter(&p->p_lock);
                        p->p_flag &= ~SFORKING;
                        pool_barrier_exit();
                        continuelwps(p);
                        mutex_exit(&p->p_lock);
                        /*
                         * Preserve ENOMEM error condition but
                         * map all others to EAGAIN.
                         */
                        error = (error == ENOMEM) ? ENOMEM : EAGAIN;
                        atomic_inc_32(&p->p_zone->zone_ffnomem);
                        goto forkerr;
                }

                /*
                 * Remove all DTrace tracepoints from the child process. We
                 * need to do this _before_ duplicating USDT providers since
                 * any associated probes may be immediately enabled.
                 */
                if (p->p_dtrace_count > 0)
                        dtrace_fasttrap_fork(p, cp);

                mutex_enter(&p->p_lock);
                sprunlock(p);

                /* Duplicate parent's shared memory */
                if (p->p_segacct)
                        shmfork(p, cp);

                /*
                 * Duplicate any helper actions and providers. The SFORKING
                 * flag we set above informs the code that enables USDT
                 * probes that sprlock() may fail because the child is being
                 * forked.
                 */
                if (p->p_dtrace_helpers != NULL) {
                        ASSERT(dtrace_helpers_fork != NULL);
                        (*dtrace_helpers_fork)(p, cp);
                }

                mutex_enter(&p->p_lock);
                p->p_flag &= ~SFORKING;
                mutex_exit(&p->p_lock);
        }

        /*
         * Duplicate parent's resource controls.
         * The preallocation is done without rcs_lock held and is retried
         * until rctl_set_dup_ready() accepts it with the lock held.
         */
        dup_set = rctl_set_create();
        for (;;) {
                dup_gp = rctl_set_dup_prealloc(p->p_rctls);
                mutex_enter(&p->p_rctls->rcs_lock);
                if (rctl_set_dup_ready(p->p_rctls, dup_gp))
                        break;
                mutex_exit(&p->p_rctls->rcs_lock);
                rctl_prealloc_destroy(dup_gp);
        }
        e.rcep_p.proc = cp;
        e.rcep_t = RCENTITY_PROCESS;
        cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
            RCD_DUP | RCD_CALLBACK);
        mutex_exit(&p->p_rctls->rcs_lock);

        rctl_prealloc_destroy(dup_gp);

        /*
         * Allocate the child's lwp directory and lwpid hash table.
         */
        if (isfork1)
                cp->p_lwpdir_sz = 2;
        else
                cp->p_lwpdir_sz = p->p_lwpdir_sz;
        cp->p_lwpdir = cp->p_lwpfree = ldp =
            kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
        /*
         * Chain the entries into a free list; the last entry's ld_next
         * is left as zeroed by kmem_zalloc().
         */
        for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
                ldp->ld_next = ldp + 1;
        cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
        cp->p_tidhash =
            kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);

        /*
         * Duplicate parent's lwps.
         * Mutual exclusion is not needed because the process is
         * in the hold state and only the current lwp is running.
         */
        klgrpset_clear(cp->p_lgrpset);
        if (isfork1) {
                clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
                if (clone == NULL)
                        goto forklwperr;
                /*
                 * Inherit only the lwp_wait()able flag,
                 * Daemon threads should not call fork1(), but oh well...
                 */
                lwptot(clone)->t_proc_flag |=
                    (curthread->t_proc_flag & TP_TWAIT);
        } else {
                /* this is forkall(), no one can be in lwp_wait() */
                ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
                /* for each entry in the parent's lwp directory... */
                for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
                        klwp_t *clwp;
                        kthread_t *ct;

                        if ((lep = ldp->ld_entry) == NULL)
                                continue;

                        if ((t = lep->le_thread) != NULL) {
                                clwp = forklwp(ttolwp(t), cp, t->t_tid);
                                if (clwp == NULL)
                                        goto forklwperr;
                                ct = lwptot(clwp);
                                /*
                                 * Inherit lwp_wait()able and daemon flags.
                                 */
                                ct->t_proc_flag |=
                                    (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
                                /*
                                 * Keep track of the clone of curthread to
                                 * post return values through lwp_setrval().
                                 * Mark other threads for special treatment
                                 * by lwp_rtt() / post_syscall().
                                 */
                                if (t == curthread)
                                        clone = clwp;
                                else
                                        ct->t_flag |= T_FORKALL;
                        } else {
                                /*
                                 * Replicate zombie lwps in the child.
                                 */
                                clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
                                clep->le_lwpid = lep->le_lwpid;
                                clep->le_start = lep->le_start;
                                lwp_hash_in(cp, clep,
                                    cp->p_tidhash, cp->p_tidhash_sz, 0);
                        }
                }
        }

        /*
         * Put new process in the parent's process contract, or put it
         * in a new one if there is an active process template.  Send a
         * fork event (if requested) to whatever contract the child is
         * a member of.  Fails if the parent has been SIGKILLed.
         */
        if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL) {
                atomic_inc_32(&p->p_zone->zone_ffmisc);
                goto forklwperr;
        }

        /*
         * No fork failures occur beyond this point.
         */

        cp->p_lwpid = p->p_lwpid;
        if (!isfork1) {
                cp->p_lwpdaemon = p->p_lwpdaemon;
                cp->p_zombcnt = p->p_zombcnt;
                /*
                 * If the parent's lwp ids have wrapped around, so have the
                 * child's.
                 */
                cp->p_flag |= p->p_flag & SLWPWRAP;
        }

        /* Share the parent's core file path and content settings. */
        mutex_enter(&p->p_lock);
        corectl_path_hold(cp->p_corefile = p->p_corefile);
        corectl_content_hold(cp->p_content = p->p_content);
        mutex_exit(&p->p_lock);

        /*
         * Duplicate process context ops, if any.
         */
        if (p->p_pctx)
                forkpctx(p, cp);

#ifdef __sparc
        utrap_dup(p, cp);
#endif
        /*
         * If the child process has been marked to stop on exit
         * from this fork, arrange for all other lwps to stop in
         * sympathy with the active lwp.
         */
        if (PTOU(cp)->u_systrap &&
            prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
                mutex_enter(&cp->p_lock);
                t = cp->p_tlist;
                do {
                        t->t_proc_flag |= TP_PRSTOP;
                        aston(t);       /* so TP_PRSTOP will be seen */
                } while ((t = t->t_forw) != cp->p_tlist);
                mutex_exit(&cp->p_lock);
        }
        /*
         * If the parent process has been marked to stop on exit
         * from this fork, and its asynchronous-stop flag has not
         * been set, arrange for all other lwps to stop before
         * they return back to user level.
         */
        if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
            prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
                mutex_enter(&p->p_lock);
                t = p->p_tlist;
                do {
                        t->t_proc_flag |= TP_PRSTOP;
                        aston(t);       /* so TP_PRSTOP will be seen */
                } while ((t = t->t_forw) != p->p_tlist);
                mutex_exit(&p->p_lock);
        }

        /* set return values for the child: (parent pid, 1) */
        if (PROC_IS_BRANDED(p))
                BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
        else
                lwp_setrval(clone, p->p_pid, 1);

        /* set return values for parent */
        r.r_val1 = (int)cp->p_pid;
        r.r_val2 = 0;

        /*
         * pool_barrier_exit() can now be called because the child process has:
         * - all identifying features cloned or set (p_pid, p_task, p_pool)
         * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
         * - any other fields set which are used in resource set binding.
         */
        mutex_enter(&p->p_lock);
        pool_barrier_exit();
        mutex_exit(&p->p_lock);

        /*
         * pidlock is taken here and is released either explicitly in the
         * vfork branch below or by CL_FORKRET() in the non-vfork branch.
         */
        mutex_enter(&pidlock);
        mutex_enter(&cp->p_lock);

        /*
         * Set flags telling the child what (not) to do on exit.
         */
        if (flags & FORK_NOSIGCHLD)
                cp->p_pidflag |= CLDNOSIGCHLD;
        if (flags & FORK_WAITPID)
                cp->p_pidflag |= CLDWAITPID;

        /*
         * Now that there are lwps and threads attached, add the new
         * process to the process group.
         */
        pgjoin(cp, p->p_pgidp);
        cp->p_stat = SRUN;
        /*
         * We are now done with all the lwps in the child process.
         */
        t = cp->p_tlist;
        do {
                /*
                 * Set the lwp_suspend()ed lwps running.
                 * They will suspend properly at syscall exit.
                 */
                if (t->t_proc_flag & TP_HOLDLWP)
                        lwp_create_done(t);
                else {
                        /* set TS_CREATE to allow continuelwps() to work */
                        thread_lock(t);
                        ASSERT(t->t_state == TS_STOPPED &&
                            !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
                        t->t_schedflag |= TS_CREATE;
                        thread_unlock(t);
                }
        } while ((t = t->t_forw) != cp->p_tlist);
        mutex_exit(&cp->p_lock);

        if (isvfork) {
                CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
                mutex_enter(&p->p_lock);
                p->p_flag |= SVFWAIT;
                curthread->t_flag |= T_VFPARENT;
                DTRACE_PROC1(create, proc_t *, cp);
                cv_broadcast(&pr_pid_cv[p->p_slot]);     /* inform /proc */
                mutex_exit(&p->p_lock);
                /*
                 * Grab child's p_lock before dropping pidlock to ensure
                 * the process will not disappear before we set it running.
                 */
                mutex_enter(&cp->p_lock);
                mutex_exit(&pidlock);
                sigdefault(cp);
                continuelwps(cp);
                mutex_exit(&cp->p_lock);
        } else {
                CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
                DTRACE_PROC1(create, proc_t *, cp);
                /*
                 * It is CL_FORKRET's job to drop pidlock.
                 * If we do it here, the process could be set running
                 * and disappear before CL_FORKRET() is called.
                 */
                CL_FORKRET(curthread, cp->p_tlist);
                schedctl_set_cidpri(curthread);
                ASSERT(MUTEX_NOT_HELD(&pidlock));
        }

        return (r.r_vals);

        /*
         * Unwind path for failures that occur after the child's address
         * space, resource controls, and lwp directory have been set up
         * (lwp duplication or contract setup failed).
         */
forklwperr:
        if (isvfork) {
                if (avl_numnodes(&p->p_wpage) != 0) {
                        /* restore watchpoints to parent */
                        as = p->p_as;
                        AS_LOCK_ENTER(as, RW_WRITER);
                        as->a_wpage = p->p_wpage;
                        avl_create(&p->p_wpage, wp_compare,
                            sizeof (struct watched_page),
                            offsetof(struct watched_page, wp_link));
                        as_setwatch(as);
                        AS_LOCK_EXIT(as);
                }
        } else {
                /*
                 * The child owns a duplicated address space; point the
                 * child at kas before freeing the duplicate.
                 */
                if (cp->p_segacct)
                        shmexit(cp);
                as = cp->p_as;
                cp->p_as = &kas;
                as_free(as);
        }

        /* Free any lwp directory entries, then the directory itself. */
        if (cp->p_lwpdir) {
                for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
                        if ((lep = ldp->ld_entry) != NULL)
                                kmem_free(lep, sizeof (*lep));
                kmem_free(cp->p_lwpdir,
                    cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
        }
        cp->p_lwpdir = NULL;
        cp->p_lwpfree = NULL;
        cp->p_lwpdir_sz = 0;

        if (cp->p_tidhash)
                kmem_free(cp->p_tidhash,
                    cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
        cp->p_tidhash = NULL;
        cp->p_tidhash_sz = 0;

        forklwp_fail(cp);
        fork_fail(cp);
        rctl_set_free(cp->p_rctls);
        mutex_enter(&pidlock);

        /*
         * Detach failed child from task.
         */
        mutex_enter(&cp->p_lock);
        tk = cp->p_task;
        task_detach(cp);
        ASSERT(cp->p_pool->pool_ref > 0);
        atomic_dec_32(&cp->p_pool->pool_ref);
        mutex_exit(&cp->p_lock);

        disown_proc(p, cp);
        pid_exit(cp, tk);
        mutex_exit(&pidlock);

        task_rele(tk);

        /* Let the parent's held lwps run again. */
        mutex_enter(&p->p_lock);
        pool_barrier_exit();
        continuelwps(p);
        mutex_exit(&p->p_lock);
        error = EAGAIN;
        /* Every failure path ends here with 'error' already set. */
forkerr:
        return ((int64_t)set_errno(error));
}
 672 
 673 /*
 674  * Free allocated resources from getproc() if a fork failed.
 675  */
 676 static void
 677 fork_fail(proc_t *cp)
 678 {
 679         uf_info_t *fip = P_FINFO(cp);
 680 
 681         fcnt_add(fip, -1);
 682         sigdelq(cp, NULL, 0);
 683 
 684         mutex_enter(&pidlock);
 685         upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
 686         mutex_exit(&pidlock);
 687 
 688         /*
 689          * single threaded, so no locking needed here
 690          */
 691         crfree(cp->p_cred);
 692 
 693         kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
 694 
 695         VN_RELE(PTOU(curproc)->u_cdir);
 696         if (PTOU(curproc)->u_rdir)
 697                 VN_RELE(PTOU(curproc)->u_rdir);
 698         if (cp->p_exec)
 699                 VN_RELE(cp->p_exec);
 700         if (cp->p_execdir)
 701                 VN_RELE(cp->p_execdir);
 702         if (PTOU(curproc)->u_cwd)
 703                 refstr_rele(PTOU(curproc)->u_cwd);
 704         if (PROC_IS_BRANDED(cp)) {
 705                 brand_clearbrand(cp, B_FALSE);
 706         }
 707 }
 708 
 709 /*
 710  * Clean up the lwps already created for this child process.
 711  * The fork failed while duplicating all the lwps of the parent
 712  * and those lwps already created must be freed.
 713  * This process is invisible to the rest of the system,
 714  * so we don't need to hold p->p_lock to protect the list.
 715  */
 716 static void
 717 forklwp_fail(proc_t *p)
 718 {
 719         kthread_t *t;
 720         task_t *tk;
 721         int branded = 0;
 722 
 723         if (PROC_IS_BRANDED(p))
 724                 branded = 1;
 725 
 726         while ((t = p->p_tlist) != NULL) {
 727                 /*
 728                  * First remove the lwp from the process's p_tlist.
 729                  */
 730                 if (t != t->t_forw)
 731                         p->p_tlist = t->t_forw;
 732                 else
 733                         p->p_tlist = NULL;
 734                 p->p_lwpcnt--;
 735                 t->t_forw->t_back = t->t_back;
 736                 t->t_back->t_forw = t->t_forw;
 737 
 738                 tk = p->p_task;
 739                 mutex_enter(&p->p_zone->zone_nlwps_lock);
 740                 tk->tk_nlwps--;
 741                 tk->tk_proj->kpj_nlwps--;
 742                 p->p_zone->zone_nlwps--;
 743                 mutex_exit(&p->p_zone->zone_nlwps_lock);
 744 
 745                 ASSERT(t->t_schedctl == NULL);
 746 
 747                 if (branded)
 748                         BROP(p)->b_freelwp(ttolwp(t));
 749 
 750                 if (t->t_door != NULL) {
 751                         kmem_free(t->t_door, sizeof (door_data_t));
 752                         t->t_door = NULL;
 753                 }
 754                 lwp_ctmpl_clear(ttolwp(t), B_FALSE);
 755 
 756                 /*
 757                  * Remove the thread from the all threads list.
 758                  * We need to hold pidlock for this.
 759                  */
 760                 mutex_enter(&pidlock);
 761                 t->t_next->t_prev = t->t_prev;
 762                 t->t_prev->t_next = t->t_next;
 763                 CL_EXIT(t);     /* tell the scheduler that we're exiting */
 764                 cv_broadcast(&t->t_joincv);      /* tell anyone in thread_join */
 765                 mutex_exit(&pidlock);
 766 
 767                 /*
 768                  * Let the lgroup load averages know that this thread isn't
 769                  * going to show up (i.e. un-do what was done on behalf of
 770                  * this thread by the earlier lgrp_move_thread()).
 771                  */
 772                 kpreempt_disable();
 773                 lgrp_move_thread(t, NULL, 1);
 774                 kpreempt_enable();
 775 
 776                 /*
 777                  * The thread was created TS_STOPPED.
 778                  * We change it to TS_FREE to avoid an
 779                  * ASSERT() panic in thread_free().
 780                  */
 781                 t->t_state = TS_FREE;
 782                 thread_rele(t);
 783                 thread_free(t);
 784         }
 785 }
 786 
/*
 * Address space that cfork() points a failed child at (cp->p_as = &kas)
 * before freeing the child's duplicated address space.
 * NOTE(review): presumably the kernel's own address space -- confirm
 * against its definition.
 */
extern struct as kas;
 788 
 789 /*
 790  * fork a kernel process.
 791  *
 792  * Passing a pid argument of -1 indicates that the new process should be
 793  * launched as a child of 'zsched' within the zone.
 794  */
 795 int
 796 newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 797     pid_t pid)
 798 {
 799         proc_t *p;
 800         struct user *up;
 801         kthread_t *t;
 802         cont_process_t *ctp = NULL;
 803         rctl_entity_p_t e;
 804 
 805         ASSERT(cid != sysdccid);
 806         ASSERT(cid != syscid || ct == NULL);
 807         if (CLASS_KERNEL(cid)) {
 808                 rctl_alloc_gp_t *init_gp;
 809                 rctl_set_t *init_set;
 810 
 811                 ASSERT(pid != 1);
 812                 ASSERT(pid >= 0);
 813 
 814                 if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 815                         return (EAGAIN);
 816 
 817                 /*
 818                  * Release the hold on the p_exec and p_execdir, these
 819                  * were acquired in getproc()
 820                  */
 821                 if (p->p_execdir != NULL)
 822                         VN_RELE(p->p_execdir);
 823                 if (p->p_exec != NULL)
 824                         VN_RELE(p->p_exec);
 825                 p->p_flag |= SNOWAIT;
 826                 p->p_exec = NULL;
 827                 p->p_execdir = NULL;
 828 
 829                 init_set = rctl_set_create();
 830                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 831 
 832                 /*
 833                  * kernel processes do not inherit /proc tracing flags.
 834                  */
 835                 sigemptyset(&p->p_sigmask);
 836                 premptyset(&p->p_fltmask);
 837                 up = PTOU(p);
 838                 up->u_systrap = 0;
 839                 premptyset(&(up->u_entrymask));
 840                 premptyset(&(up->u_exitmask));
 841                 mutex_enter(&p->p_lock);
 842                 e.rcep_p.proc = p;
 843                 e.rcep_t = RCENTITY_PROCESS;
 844                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 845                     init_gp);
 846                 mutex_exit(&p->p_lock);
 847 
 848                 rctl_prealloc_destroy(init_gp);
 849 
 850                 t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
 851         } else {
 852                 rctl_alloc_gp_t *init_gp, *default_gp;
 853                 rctl_set_t *init_set;
 854                 task_t *tk, *tk_old;
 855                 klwp_t *lwp;
 856                 boolean_t pzsched = B_FALSE;
 857                 int flag = GETPROC_USER;
 858 
 859                 /* Handle a new user-level thread as child of zsched. */
 860                 if (pid < 0) {
 861                         VERIFY(curzone != global_zone);
 862                         flag = GETPROC_ZSCHED;
 863                         pzsched = B_TRUE;
 864                         pid = 0;
 865                 }
 866 
 867                 if (getproc(&p, pid, flag) < 0)
 868                         return (EAGAIN);
 869                 /*
 870                  * init creates a new task, distinct from the task
 871                  * containing kernel "processes".
 872                  */
 873                 tk = task_create(0, p->p_zone);
 874                 mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 875                 tk->tk_proj->kpj_ntasks++;
 876                 tk->tk_nprocs++;
 877                 mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 878 
 879                 default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
 880                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 881                 init_set = rctl_set_create();
 882 
 883                 mutex_enter(&pidlock);
 884                 mutex_enter(&p->p_lock);
 885                 tk_old = p->p_task;  /* switch to new task */
 886 
 887                 task_detach(p);
 888                 task_begin(tk, p);
 889                 mutex_exit(&pidlock);
 890 
 891                 mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
 892                 tk_old->tk_nprocs--;
 893                 mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
 894 
 895                 e.rcep_p.proc = p;
 896                 e.rcep_t = RCENTITY_PROCESS;
 897                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 898                     init_gp);
 899                 rctlproc_default_init(p, default_gp);
 900                 mutex_exit(&p->p_lock);
 901 
 902                 task_rele(tk_old);
 903                 rctl_prealloc_destroy(default_gp);
 904                 rctl_prealloc_destroy(init_gp);
 905 
 906                 if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
 907                     &curthread->t_hold, cid, 1)) == NULL) {
 908                         task_t *tk;
 909 
 910                         fork_fail(p);
 911                         mutex_enter(&pidlock);
 912                         disown_proc(p->p_parent, p);
 913 
 914                         mutex_enter(&p->p_lock);
 915                         tk = p->p_task;
 916                         task_detach(p);
 917                         ASSERT(p->p_pool->pool_ref > 0);
 918                         atomic_add_32(&p->p_pool->pool_ref, -1);
 919                         mutex_exit(&p->p_lock);
 920 
 921                         pid_exit(p, tk);
 922                         mutex_exit(&pidlock);
 923                         task_rele(tk);
 924                         return (EAGAIN);
 925                 }
 926                 t = lwptot(lwp);
 927 
 928                 ctp = contract_process_fork(sys_process_tmpl, p,
 929                     (pzsched ? curproc->p_zone->zone_zsched : curproc),
 930                     B_FALSE);
 931                 ASSERT(ctp != NULL);
 932                 if (ct != NULL)
 933                         *ct = &ctp->conp_contract;
 934         }
 935 
 936         ASSERT3U(t->t_tid, ==, 1);
 937         p->p_lwpid = 1;
 938         mutex_enter(&pidlock);
 939         pgjoin(p, p->p_parent->p_pgidp);
 940         p->p_stat = SRUN;
 941         mutex_enter(&p->p_lock);
 942         t->t_proc_flag &= ~TP_HOLDLWP;
 943         lwp_create_done(t);
 944         mutex_exit(&p->p_lock);
 945         mutex_exit(&pidlock);
 946         return (0);
 947 }
 948 
 949 /*
 950  * create a child proc struct.
 951  */
 952 static int
 953 getproc(proc_t **cpp, pid_t pid, uint_t flags)
 954 {
 955         proc_t          *pp, *cp;
 956         pid_t           newpid;
 957         struct user     *uarea;
 958         extern uint_t   nproc;
 959         struct cred     *cr;
 960         uid_t           ruid;
 961         zoneid_t        zoneid;
 962         task_t          *task;
 963         kproject_t      *proj;
 964         zone_t          *zone;
 965         int             rctlfail = 0;
 966 
 967         if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 968                 return (-1);    /* no point in starting new processes */
 969 
 970         if (flags & GETPROC_ZSCHED) {
 971                 pp = curproc->p_zone->zone_zsched;
 972         } else {
 973                 pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
 974         }
 975         task = pp->p_task;
 976         proj = task->tk_proj;
 977         zone = pp->p_zone;
 978 
 979         mutex_enter(&pp->p_lock);
 980         mutex_enter(&zone->zone_nlwps_lock);
 981         if (proj != proj0p) {
 982                 if (task->tk_nprocs >= task->tk_nprocs_ctl)
 983                         if (rctl_test(rc_task_nprocs, task->tk_rctls,
 984                             pp, 1, 0) & RCT_DENY)
 985                                 rctlfail = 1;
 986 
 987                 if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
 988                         if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
 989                             pp, 1, 0) & RCT_DENY)
 990                                 rctlfail = 1;
 991 
 992                 if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
 993                         if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
 994                             pp, 1, 0) & RCT_DENY)
 995                                 rctlfail = 1;
 996 
 997                 if (rctlfail) {
 998                         mutex_exit(&zone->zone_nlwps_lock);
 999                         mutex_exit(&pp->p_lock);
1000                         atomic_inc_32(&zone->zone_ffcap);
1001                         goto punish;
1002                 }
1003         }
1004         task->tk_nprocs++;
1005         proj->kpj_nprocs++;
1006         zone->zone_nprocs++;
1007         mutex_exit(&zone->zone_nlwps_lock);
1008         mutex_exit(&pp->p_lock);
1009 
1010         cp = kmem_cache_alloc(process_cache, KM_SLEEP);
1011         bzero(cp, sizeof (proc_t));
1012 
1013         /*
1014          * Make proc entry for child process
1015          */
1016         mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
1017         mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
1018         mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
1019 #if defined(__x86)
1020         mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
1021 #endif
1022         mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
1023         cp->p_stat = SIDL;
1024         cp->p_mstart = gethrtime();
1025         cp->p_as = &kas;
1026         /*
1027          * p_zone must be set before we call pid_allocate since the process
1028          * will be visible after that and code such as prfind_zone will
1029          * look at the p_zone field.
1030          */
1031         cp->p_zone = pp->p_zone;
1032         cp->p_t1_lgrpid = LGRP_NONE;
1033         cp->p_tr_lgrpid = LGRP_NONE;
1034 
1035         /* Default to native brand initially */
1036         cp->p_brand = &native_brand;
1037 
1038         if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1039                 if (nproc == v.v_proc) {
1040                         CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1041                         cmn_err(CE_WARN, "out of processes");
1042                 }
1043                 goto bad;
1044         }
1045 
1046         mutex_enter(&pp->p_lock);
1047         cp->p_exec = pp->p_exec;
1048         cp->p_execdir = pp->p_execdir;
1049         mutex_exit(&pp->p_lock);
1050 
1051         if (cp->p_exec) {
1052                 VN_HOLD(cp->p_exec);
1053                 /*
1054                  * Each VOP_OPEN() must be paired with a corresponding
1055                  * VOP_CLOSE(). In this case, the executable will be
1056                  * closed for the child in either proc_exit() or gexec().
1057                  */
1058                 if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
1059                         VN_RELE(cp->p_exec);
1060                         cp->p_exec = NULLVP;
1061                         cp->p_execdir = NULLVP;
1062                         goto bad;
1063                 }
1064         }
1065         if (cp->p_execdir)
1066                 VN_HOLD(cp->p_execdir);
1067 
1068         /*
1069          * If not privileged make sure that this user hasn't exceeded
1070          * v.v_maxup processes, and that users collectively haven't
1071          * exceeded v.v_maxupttl processes.
1072          */
1073         mutex_enter(&pidlock);
1074         ASSERT(nproc < v.v_proc);    /* otherwise how'd we get our pid? */
1075         cr = CRED();
1076         ruid = crgetruid(cr);
1077         zoneid = crgetzoneid(cr);
1078         if (nproc >= v.v_maxup &&    /* short-circuit; usually false */
1079             (nproc >= v.v_maxupttl ||
1080             upcount_get(ruid, zoneid) >= v.v_maxup) &&
1081             secpolicy_newproc(cr) != 0) {
1082                 mutex_exit(&pidlock);
1083                 zcmn_err(zoneid, CE_NOTE,
1084                     "out of per-user processes for uid %d", ruid);
1085                 goto bad;
1086         }
1087 
1088         /*
1089          * Everything is cool, put the new proc on the active process list.
1090          * It is already on the pid list and in /proc.
1091          * Increment the per uid process count (upcount).
1092          */
1093         nproc++;
1094         upcount_inc(ruid, zoneid);
1095 
1096         cp->p_next = practive;
1097         practive->p_prev = cp;
1098         practive = cp;
1099 
1100         cp->p_ignore = pp->p_ignore;
1101         cp->p_siginfo = pp->p_siginfo;
1102         cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1103         cp->p_sessp = pp->p_sessp;
1104         sess_hold(pp);
1105         cp->p_bssbase = pp->p_bssbase;
1106         cp->p_brkbase = pp->p_brkbase;
1107         cp->p_brksize = pp->p_brksize;
1108         cp->p_brkpageszc = pp->p_brkpageszc;
1109         cp->p_stksize = pp->p_stksize;
1110         cp->p_stkpageszc = pp->p_stkpageszc;
1111         cp->p_stkprot = pp->p_stkprot;
1112         cp->p_datprot = pp->p_datprot;
1113         cp->p_usrstack = pp->p_usrstack;
1114         cp->p_model = pp->p_model;
1115         cp->p_ppid = pp->p_pid;
1116         cp->p_ancpid = pp->p_pid;
1117         cp->p_portcnt = pp->p_portcnt;
1118 
1119         /*
1120          * Initialize watchpoint structures
1121          */
1122         avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1123             offsetof(struct watched_area, wa_link));
1124 
1125         /*
1126          * Initialize immediate resource control values.
1127          */
1128         cp->p_stk_ctl = pp->p_stk_ctl;
1129         cp->p_fsz_ctl = pp->p_fsz_ctl;
1130         cp->p_vmem_ctl = pp->p_vmem_ctl;
1131         cp->p_fno_ctl = pp->p_fno_ctl;
1132 
1133         /*
1134          * Link up to parent-child-sibling chain.  No need to lock
1135          * in general since only a call to freeproc() (done by the
1136          * same parent as newproc()) diddles with the child chain.
1137          */
1138         cp->p_sibling = pp->p_child;
1139         if (pp->p_child)
1140                 pp->p_child->p_psibling = cp;
1141 
1142         cp->p_parent = pp;
1143         pp->p_child = cp;
1144 
1145         cp->p_child_ns = NULL;
1146         cp->p_sibling_ns = NULL;
1147 
1148         cp->p_nextorph = pp->p_orphan;
1149         cp->p_nextofkin = pp;
1150         pp->p_orphan = cp;
1151 
1152         /*
1153          * Inherit profiling state; do not inherit REALPROF profiling state.
1154          */
1155         cp->p_prof = pp->p_prof;
1156         cp->p_rprof_cyclic = CYCLIC_NONE;
1157 
1158         /*
1159          * Inherit pool pointer from the parent.  Kernel processes are
1160          * always bound to the default pool.
1161          */
1162         mutex_enter(&pp->p_lock);
1163         if (flags & GETPROC_KERNEL) {
1164                 cp->p_pool = pool_default;
1165                 cp->p_flag |= SSYS;
1166         } else {
1167                 cp->p_pool = pp->p_pool;
1168         }
1169         atomic_inc_32(&cp->p_pool->pool_ref);
1170         mutex_exit(&pp->p_lock);
1171 
1172         /*
1173          * Add the child process to the current task.  Kernel processes
1174          * are always attached to task0.
1175          */
1176         mutex_enter(&cp->p_lock);
1177         if (flags & GETPROC_KERNEL)
1178                 task_attach(task0p, cp);
1179         else
1180                 task_attach(pp->p_task, cp);
1181         mutex_exit(&cp->p_lock);
1182         mutex_exit(&pidlock);
1183 
1184         if (PROC_IS_BRANDED(pp)) {
1185                 /*
1186                  * The only reason why process branding should fail is when
1187                  * the procedure is complicated by multiple LWPs on the scene.
1188                  * With an LWP count of 0, this newly allocated process has no
1189                  * reason to fail branding.
1190                  */
1191                 VERIFY0(brand_setbrand(cp, B_FALSE));
1192 
1193                 BROP(pp)->b_copy_procdata(cp, pp);
1194         }
1195 
1196         avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1197             offsetof(contract_t, ct_ctlist));
1198 
1199         /*
1200          * Duplicate any audit information kept in the process table
1201          */
1202         if (audit_active)       /* copy audit data to cp */
1203                 audit_newproc(cp);
1204 
1205         crhold(cp->p_cred = cr);
1206 
1207         /*
1208          * Bump up the counts on the file structures pointed at by the
1209          * parent's file table since the child will point at them too.
1210          */
1211         fcnt_add(P_FINFO(pp), 1);
1212 
1213         if (PTOU(pp)->u_cdir) {
1214                 VN_HOLD(PTOU(pp)->u_cdir);
1215         } else {
1216                 ASSERT(pp == &p0);
1217                 /*
1218                  * We must be at or before vfs_mountroot(); it will take care of
1219                  * assigning our current directory.
1220                  */
1221         }
1222         if (PTOU(pp)->u_rdir)
1223                 VN_HOLD(PTOU(pp)->u_rdir);
1224         if (PTOU(pp)->u_cwd)
1225                 refstr_hold(PTOU(pp)->u_cwd);
1226 
1227         /*
1228          * copy the parent's uarea.
1229          */
1230         uarea = PTOU(cp);
1231         bcopy(PTOU(pp), uarea, sizeof (*uarea));
1232         flist_fork(P_FINFO(pp), P_FINFO(cp));
1233 
1234         gethrestime(&uarea->u_start);
1235         uarea->u_ticks = ddi_get_lbolt();
1236         uarea->u_mem = rm_asrss(pp->p_as);
1237         uarea->u_acflag = AFORK;
1238 
1239         /*
1240          * If inherit-on-fork, copy /proc tracing flags to child.
1241          */
1242         if ((pp->p_proc_flag & P_PR_FORK) != 0) {
1243                 cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
1244                 cp->p_sigmask = pp->p_sigmask;
1245                 cp->p_fltmask = pp->p_fltmask;
1246         } else {
1247                 sigemptyset(&cp->p_sigmask);
1248                 premptyset(&cp->p_fltmask);
1249                 uarea->u_systrap = 0;
1250                 premptyset(&uarea->u_entrymask);
1251                 premptyset(&uarea->u_exitmask);
1252         }
1253         /*
1254          * If microstate accounting is being inherited, mark child
1255          */
1256         if ((pp->p_flag & SMSFORK) != 0)
1257                 cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
1258 
1259         /*
1260          * Inherit fixalignment flag from the parent
1261          */
1262         cp->p_fixalignment = pp->p_fixalignment;
1263 
1264         *cpp = cp;
1265         return (0);
1266 
1267 bad:
1268         ASSERT(MUTEX_NOT_HELD(&pidlock));
1269 
1270         mutex_destroy(&cp->p_crlock);
1271         mutex_destroy(&cp->p_pflock);
1272 #if defined(__x86)
1273         mutex_destroy(&cp->p_ldtlock);
1274 #endif
1275         if (newpid != -1) {
1276                 proc_entry_free(cp->p_pidp);
1277                 (void) pid_rele(cp->p_pidp);
1278         }
1279         kmem_cache_free(process_cache, cp);
1280 
1281         mutex_enter(&zone->zone_nlwps_lock);
1282         task->tk_nprocs--;
1283         proj->kpj_nprocs--;
1284         zone->zone_nprocs--;
1285         mutex_exit(&zone->zone_nlwps_lock);
1286         atomic_inc_32(&zone->zone_ffnoproc);
1287 
1288 punish:
1289         /*
1290          * We most likely got into this situation because some process is
1291          * forking out of control.  As punishment, put it to sleep for a
1292          * bit so it can't eat the machine alive.  Sleep interval is chosen
1293          * to allow no more than one fork failure per cpu per clock tick
1294          * on average (yes, I just made this up).  This has two desirable
1295          * properties: (1) it sets a constant limit on the fork failure
1296          * rate, and (2) the busier the system is, the harsher the penalty
1297          * for abusing it becomes.
1298          */
1299         INCR_COUNT(&fork_fail_pending, &pidlock);
1300         delay(fork_fail_pending / ncpus + 1);
1301         DECR_COUNT(&fork_fail_pending, &pidlock);
1302 
1303         return (-1); /* out of memory or proc slots */
1304 }
1305 
1306 /*
1307  * Release virtual memory.
1308  * In the case of vfork(), the child was given exclusive access to its
1309  * parent's address space.  The parent is waiting in vfwait() for the
1310  * child to release its exclusive claim via relvm().
1311  */
1312 void
1313 relvm()
1314 {
1315         proc_t *p = curproc;
1316 
1317         ASSERT((unsigned)p->p_lwpcnt <= 1);
1318 
1319         prrelvm();      /* inform /proc */
1320 
1321         if (p->p_flag & SVFORK) {
1322                 proc_t *pp = p->p_parent;
1323                 /*
1324                  * The child process is either exec'ing or exit'ing.
1325                  * The child is now separated from the parent's address
1326                  * space.  The parent process is made dispatchable.
1327                  *
1328                  * This is a delicate locking maneuver, involving
1329                  * both the parent's p_lock and the child's p_lock.
1330                  * As soon as the SVFORK flag is turned off, the
1331                  * parent is free to run, but it must not run until
1332                  * we wake it up using its p_cv because it might
1333                  * exit and we would be referencing invalid memory.
1334                  * Therefore, we hold the parent with its p_lock
1335                  * while protecting our p_flags with our own p_lock.
1336                  */
1337 try_again:
1338                 mutex_enter(&p->p_lock); /* grab child's lock first */
1339                 prbarrier(p);           /* make sure /proc is blocked out */
1340                 mutex_enter(&pp->p_lock);
1341 
1342                 /*
1343                  * Check if parent is locked by /proc.
1344                  */
1345                 if (pp->p_proc_flag & P_PR_LOCK) {
1346                         /*
1347                          * Delay until /proc is done with the parent.
1348                          * We must drop our (the child's) p->p_lock, wait
1349                          * via prbarrier() on the parent, then start over.
1350                          */
1351                         mutex_exit(&p->p_lock);
1352                         prbarrier(pp);
1353                         mutex_exit(&pp->p_lock);
1354                         goto try_again;
1355                 }
1356                 p->p_flag &= ~SVFORK;
1357                 kpreempt_disable();
1358                 p->p_as = &kas;
1359 
1360                 /*
1361                  * notify hat of change in thread's address space
1362                  */
1363                 hat_thread_exit(curthread);
1364                 kpreempt_enable();
1365 
1366                 /*
1367                  * child sizes are copied back to parent because
1368                  * child may have grown.
1369                  */
1370                 pp->p_brkbase = p->p_brkbase;
1371                 pp->p_brksize = p->p_brksize;
1372                 pp->p_stksize = p->p_stksize;
1373 
1374                 /*
1375                  * Copy back the shm accounting information
1376                  * to the parent process.
1377                  */
1378                 pp->p_segacct = p->p_segacct;
1379                 p->p_segacct = NULL;
1380 
1381                 /*
1382                  * The parent is no longer waiting for the vfork()d child.
1383                  * Restore the parent's watched pages, if any.  This is
1384                  * safe because we know the parent is not locked by /proc
1385                  */
1386                 pp->p_flag &= ~SVFWAIT;
1387                 if (avl_numnodes(&pp->p_wpage) != 0) {
1388                         pp->p_as->a_wpage = pp->p_wpage;
1389                         avl_create(&pp->p_wpage, wp_compare,
1390                             sizeof (struct watched_page),
1391                             offsetof(struct watched_page, wp_link));
1392                 }
1393                 cv_signal(&pp->p_cv);
1394                 mutex_exit(&pp->p_lock);
1395                 mutex_exit(&p->p_lock);
1396         } else {
1397                 if (p->p_as != &kas) {
1398                         struct as *as;
1399 
1400                         if (p->p_segacct)
1401                                 shmexit(p);
1402 
1403                         /*
1404                          * We grab p_lock for the benefit of /proc
1405                          */
1406                         kpreempt_disable();
1407                         mutex_enter(&p->p_lock);
1408                         prbarrier(p);   /* make sure /proc is blocked out */
1409                         as = p->p_as;
1410                         p->p_as = &kas;
1411                         mutex_exit(&p->p_lock);
1412 
1413                         /*
1414                          * notify hat of change in thread's address space
1415                          */
1416                         hat_thread_exit(curthread);
1417                         kpreempt_enable();
1418 
1419                         as_free(as);
1420                         p->p_tr_lgrpid = LGRP_NONE;
1421                 }
1422         }
1423 }
1424 
1425 /*
1426  * Wait for child to exec or exit.
1427  * Called by parent of vfork'ed process.
1428  * See important comments in relvm(), above.
1429  */
1430 void
1431 vfwait(pid_t pid)
1432 {
1433         int signalled = 0;
1434         proc_t *pp = ttoproc(curthread);
1435         proc_t *cp;
1436 
1437         /*
1438          * Wait for child to exec or exit.
1439          */
1440         for (;;) {
1441                 mutex_enter(&pidlock);
1442                 cp = prfind(pid);
1443                 if (cp == NULL || cp->p_parent != pp) {
1444                         /*
1445                          * Child has exit()ed.
1446                          */
1447                         mutex_exit(&pidlock);
1448                         break;
1449                 }
1450                 /*
1451                  * Grab the child's p_lock before releasing pidlock.
1452                  * Otherwise, the child could exit and we would be
1453                  * referencing invalid memory.
1454                  */
1455                 mutex_enter(&cp->p_lock);
1456                 mutex_exit(&pidlock);
1457                 if (!(cp->p_flag & SVFORK)) {
1458                         /*
1459                          * Child has exec()ed or is exit()ing.
1460                          */
1461                         mutex_exit(&cp->p_lock);
1462                         break;
1463                 }
1464                 mutex_enter(&pp->p_lock);
1465                 mutex_exit(&cp->p_lock);
1466                 /*
1467                  * We might be waked up spuriously from the cv_wait().
1468                  * We have to do the whole operation over again to be
1469                  * sure the child's SVFORK flag really is turned off.
1470                  * We cannot make reference to the child because it can
1471                  * exit before we return and we would be referencing
1472                  * invalid memory.
1473                  *
1474                  * Because this is potentially a very long-term wait,
1475                  * we call cv_wait_sig() (for its jobcontrol and /proc
1476                  * side-effects) unless there is a current signal, in
1477                  * which case we use cv_wait() because we cannot return
1478                  * from this function until the child has released the
1479                  * address space.  Calling cv_wait_sig() with a current
1480                  * signal would lead to an indefinite loop here because
1481                  * cv_wait_sig() returns immediately in this case.
1482                  */
1483                 if (signalled)
1484                         cv_wait(&pp->p_cv, &pp->p_lock);
1485                 else
1486                         signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
1487                 mutex_exit(&pp->p_lock);
1488         }
1489 
1490         /* restore watchpoints to parent */
1491         if (pr_watch_active(pp)) {
1492                 struct as *as = pp->p_as;
1493                 AS_LOCK_ENTER(as, RW_WRITER);
1494                 as_setwatch(as);
1495                 AS_LOCK_EXIT(as);
1496         }
1497 
1498         mutex_enter(&pp->p_lock);
1499         prbarrier(pp);  /* barrier against /proc locking */
1500         continuelwps(pp);
1501         mutex_exit(&pp->p_lock);
1502 }