          --- old/usr/src/uts/common/os/fork.c
          +++ new/usr/src/uts/common/os/fork.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2016, Joyent, Inc.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/param.h>
  32   32  #include <sys/sysmacros.h>
  33   33  #include <sys/signal.h>
  34   34  #include <sys/cred.h>
  35   35  #include <sys/policy.h>
  36   36  #include <sys/user.h>
  37   37  #include <sys/systm.h>
  38   38  #include <sys/cpuvar.h>
  39   39  #include <sys/vfs.h>
  40   40  #include <sys/vnode.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/errno.h>
  43   43  #include <sys/time.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/cmn_err.h>
  46   46  #include <sys/acct.h>
  47   47  #include <sys/tuneable.h>
  48   48  #include <sys/class.h>
  49   49  #include <sys/kmem.h>
  50   50  #include <sys/session.h>
  51   51  #include <sys/ucontext.h>
  52   52  #include <sys/stack.h>
  53   53  #include <sys/procfs.h>
  54   54  #include <sys/prsystm.h>
  55   55  #include <sys/vmsystm.h>
  56   56  #include <sys/vtrace.h>
  57   57  #include <sys/debug.h>
  58   58  #include <sys/shm_impl.h>
  59   59  #include <sys/door_data.h>
  60   60  #include <vm/as.h>
  61   61  #include <vm/rm.h>
  62   62  #include <c2/audit.h>
  63   63  #include <sys/var.h>
  64   64  #include <sys/schedctl.h>
  65   65  #include <sys/utrap.h>
  66   66  #include <sys/task.h>
  67   67  #include <sys/resource.h>
  68   68  #include <sys/cyclic.h>
  69   69  #include <sys/lgrp.h>
  70   70  #include <sys/rctl.h>
  71   71  #include <sys/contract_impl.h>
  72   72  #include <sys/contract/process_impl.h>
  73   73  #include <sys/list.h>
  74   74  #include <sys/dtrace.h>
  75   75  #include <sys/pool.h>
  76   76  #include <sys/zone.h>
  77   77  #include <sys/sdt.h>
  78   78  #include <sys/class.h>
  79   79  #include <sys/corectl.h>
  80   80  #include <sys/brand.h>
  81   81  #include <sys/fork.h>
  82   82  
  83   83  static int64_t cfork(int, int, int);
  84   84  static int getproc(proc_t **, pid_t, uint_t);
  85   85  #define GETPROC_USER    0x0
  86   86  #define GETPROC_KERNEL  0x1
  87   87  #define GETPROC_ZSCHED  0x2
  88   88  
  89   89  static void fork_fail(proc_t *);
  90   90  static void forklwp_fail(proc_t *);
  91   91  
  92   92  int fork_fail_pending;
  93   93  
  94   94  extern struct kmem_cache *process_cache;
  95   95  
  96   96  /*
  97   97   * The vfork() system call trap is no longer invoked by libc.
  98   98   * It is retained only for the benefit of applications running
  99   99   * within a solaris10 branded zone.  It should be eliminated
 100  100   * when we no longer support solaris10 branded zones.
 101  101   */
 102  102  int64_t
 103  103  vfork(void)
 104  104  {
 105  105          curthread->t_post_sys = 1;      /* so vfwait() will be called */
 106  106          return (cfork(1, 1, 0));
 107  107  }
 108  108  
 109  109  /*
 110  110   * forksys system call - forkx, forkallx, vforkx.  This is the
  111  111   * interface invoked by libc for fork1(), forkall(), and vfork().
 112  112   */
 113  113  int64_t
 114  114  forksys(int subcode, int flags)
 115  115  {
 116  116          switch (subcode) {
 117  117          case 0:
 118  118                  return (cfork(0, 1, flags));    /* forkx(flags) */
 119  119          case 1:
 120  120                  return (cfork(0, 0, flags));    /* forkallx(flags) */
 121  121          case 2:
 122  122                  curthread->t_post_sys = 1;      /* so vfwait() will be called */
 123  123                  return (cfork(1, 1, flags));    /* vforkx(flags) */
 124  124          default:
 125  125                  return ((int64_t)set_errno(EINVAL));
 126  126          }
 127  127  }
 128  128  
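A sketch of how this entry point is reached from userland: the forkx(2),
forkallx(2), and vforkx(2) wrappers in libc select the subcodes dispatched
above. The wrapper below only illustrates that mapping with the documented
FORK_* flags; it is not libc's actual implementation:

        #include <sys/types.h>
        #include <sys/fork.h>   /* forkx(), FORK_NOSIGCHLD, FORK_WAITPID */

        /*
         * Illustrative mapping onto forksys():
         *   forkx(flags)    -> forksys(0, flags)
         *   forkallx(flags) -> forksys(1, flags)
         *   vforkx(flags)   -> forksys(2, flags)
         */
        pid_t
        spawn_quiet_child(void)
        {
                /* no SIGCHLD on exit; reap only via explicit waitpid/waitid */
                return (forkx(FORK_NOSIGCHLD | FORK_WAITPID));
        }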
 129  129  /*
 130  130   * Remove the associations of a child process from its parent and siblings.
 131  131   */
 132  132  static void
 133  133  disown_proc(proc_t *pp, proc_t *cp)
 134  134  {
 135  135          proc_t **orphpp;
 136  136  
 137  137          ASSERT(MUTEX_HELD(&pidlock));
 138  138  
 139  139          orphpp = &pp->p_orphan;
 140  140          while (*orphpp != cp)
 141  141                  orphpp = &(*orphpp)->p_nextorph;
 142  142          *orphpp = cp->p_nextorph;
 143  143  
 144  144          if (pp->p_child == cp)
 145  145                  pp->p_child = cp->p_sibling;
 146  146          if (cp->p_sibling)
 147  147                  cp->p_sibling->p_psibling = cp->p_psibling;
 148  148          if (cp->p_psibling)
 149  149                  cp->p_psibling->p_sibling = cp->p_sibling;
 150  150  }
 151  151  
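disown_proc() unlinks the child from the singly-linked orphan list with the
classic pointer-to-pointer walk, which needs no special case for removal at
the list head. A minimal self-contained illustration of the same idiom (the
node type and names here are invented for the example):

        struct node {
                struct node *next;
        };

        static void
        unlink_node(struct node **headp, struct node *victim)
        {
                struct node **npp = headp;

                /* walk until *npp is the pointer that refers to victim */
                while (*npp != victim)
                        npp = &(*npp)->next;
                *npp = victim->next;    /* splice out; head needs no case */
        }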
 152  152  /* ARGSUSED */
 153  153  static int64_t
 154  154  cfork(int isvfork, int isfork1, int flags)
 155  155  {
 156  156          proc_t *p = ttoproc(curthread);
 157  157          struct as *as;
 158  158          proc_t *cp;
 159  159          klwp_t *clone;
 160  160          kthread_t *t;
 161  161          task_t *tk;
 162  162          rval_t  r;
 163  163          int error;
 164  164          int i;
 165  165          rctl_set_t *dup_set;
 166  166          rctl_alloc_gp_t *dup_gp;
 167  167          rctl_entity_p_t e;
 168  168          lwpdir_t *ldp;
 169  169          lwpent_t *lep;
 170  170          lwpent_t *clep;
 171  171  
 172  172          /*
 173  173           * Allow only these two flags.
 174  174           */
 175  175          if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
 176  176                  error = EINVAL;
 177  177                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 178  178                  goto forkerr;
 179  179          }
 180  180  
 181  181          /*
 182  182           * fork is not supported for the /proc agent lwp.
 183  183           */
 184  184          if (curthread == p->p_agenttp) {
 185  185                  error = ENOTSUP;
 186  186                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 187  187                  goto forkerr;
 188  188          }
 189  189  
 190  190          if ((error = secpolicy_basic_fork(CRED())) != 0) {
 191  191                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 192  192                  goto forkerr;
 193  193          }
 194  194  
 195  195          /*
 196  196           * If the calling lwp is doing a fork1() then the
 197  197           * other lwps in this process are not duplicated and
 198  198           * don't need to be held where their kernel stacks can be
 199  199           * cloned.  If doing forkall(), the process is held with
 200  200           * SHOLDFORK, so that the lwps are at a point where their
 201  201           * stacks can be copied which is on entry or exit from
 202  202           * the kernel.
 203  203           */
 204  204          if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
 205  205                  aston(curthread);
 206  206                  error = EINTR;
 207  207                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 208  208                  goto forkerr;
 209  209          }
 210  210  
 211  211  #if defined(__sparc)
 212  212          /*
 213  213           * Ensure that the user stack is fully constructed
 214  214           * before creating the child process structure.
 215  215           */
 216  216          (void) flush_user_windows_to_stack(NULL);
 217  217  #endif
 218  218  
 219  219          mutex_enter(&p->p_lock);
 220  220          /*
 221  221           * If this is vfork(), cancel any suspend request we might
 222  222           * have gotten from some other thread via lwp_suspend().
 223  223           * Otherwise we could end up with a deadlock on return
 224  224           * from the vfork() in both the parent and the child.
 225  225           */
 226  226          if (isvfork)
 227  227                  curthread->t_proc_flag &= ~TP_HOLDLWP;
 228  228          /*
 229  229           * Prevent our resource set associations from being changed during fork.
 230  230           */
 231  231          pool_barrier_enter();
 232  232          mutex_exit(&p->p_lock);
 233  233  
 234  234          /*
 235  235           * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 236  236           */
 237  237          if (getproc(&cp, 0, GETPROC_USER) < 0) {
 238  238                  mutex_enter(&p->p_lock);
 239  239                  pool_barrier_exit();
 240  240                  continuelwps(p);
 241  241                  mutex_exit(&p->p_lock);
 242  242                  error = EAGAIN;
 243  243                  goto forkerr;
 244  244          }
 245  245  
 246  246          TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);
 247  247  
 248  248          /*
 249  249           * Assign an address space to child
 250  250           */
 251  251          if (isvfork) {
 252  252                  /*
 253  253                   * Clear any watched areas and remember the
 254  254                   * watched pages for restoring in vfwait().
 255  255                   */
 256  256                  as = p->p_as;
 257  257                  if (avl_numnodes(&as->a_wpage) != 0) {
 258  258                          AS_LOCK_ENTER(as, RW_WRITER);
 259  259                          as_clearwatch(as);
 260  260                          p->p_wpage = as->a_wpage;
 261  261                          avl_create(&as->a_wpage, wp_compare,
 262  262                              sizeof (struct watched_page),
 263  263                              offsetof(struct watched_page, wp_link));
 264  264                          AS_LOCK_EXIT(as);
 265  265                  }
 266  266                  cp->p_as = as;
 267  267                  cp->p_flag |= SVFORK;
 268  268  
 269  269                  /*
 270  270                   * Use the parent's shm segment list information for
 271  271                   * the child as it uses its address space till it execs.
 272  272                   */
 273  273                  cp->p_segacct = p->p_segacct;
 274  274          } else {
 275  275                  /*
 276  276                   * We need to hold P_PR_LOCK until the address space has
 277  277                   * been duplicated and we've had a chance to remove from the
 278  278                   * child any DTrace probes that were in the parent. Holding
 279  279                   * P_PR_LOCK prevents any new probes from being added and any
 280  280                   * extant probes from being removed.
 281  281                   */
 282  282                  mutex_enter(&p->p_lock);
 283  283                  sprlock_proc(p);
 284  284                  p->p_flag |= SFORKING;
 285  285                  mutex_exit(&p->p_lock);
 286  286  
 287  287                  error = as_dup(p->p_as, cp);
 288  288                  if (error != 0) {
 289  289                          mutex_enter(&p->p_lock);
 290  290                          sprunlock(p);
 291  291                          fork_fail(cp);
 292  292                          mutex_enter(&pidlock);
 293  293                          disown_proc(p, cp);
 294  294                          mutex_enter(&cp->p_lock);
 295  295                          tk = cp->p_task;
 296  296                          task_detach(cp);
 297  297                          ASSERT(cp->p_pool->pool_ref > 0);
 298  298                          atomic_dec_32(&cp->p_pool->pool_ref);
 299  299                          mutex_exit(&cp->p_lock);
 300  300                          pid_exit(cp, tk);
 301  301                          mutex_exit(&pidlock);
 302  302                          task_rele(tk);
 303  303  
 304  304                          mutex_enter(&p->p_lock);
 305  305                          p->p_flag &= ~SFORKING;
 306  306                          pool_barrier_exit();
 307  307                          continuelwps(p);
 308  308                          mutex_exit(&p->p_lock);
 309  309                          /*
 310  310                           * Preserve ENOMEM error condition but
 311  311                           * map all others to EAGAIN.
 312  312                           */
 313  313                          error = (error == ENOMEM) ? ENOMEM : EAGAIN;
 314  314                          atomic_inc_32(&p->p_zone->zone_ffnomem);
 315  315                          goto forkerr;
 316  316                  }
 317  317  
 318  318                  /*
 319  319                   * Remove all DTrace tracepoints from the child process. We
 320  320                   * need to do this _before_ duplicating USDT providers since
 321  321                   * any associated probes may be immediately enabled.
 322  322                   */
 323  323                  if (p->p_dtrace_count > 0)
 324  324                          dtrace_fasttrap_fork(p, cp);
 325  325  
 326  326                  mutex_enter(&p->p_lock);
 327  327                  sprunlock(p);
 328  328  
 329  329                  /* Duplicate parent's shared memory */
 330  330                  if (p->p_segacct)
 331  331                          shmfork(p, cp);
 332  332  
 333  333                  /*
 334  334                   * Duplicate any helper actions and providers. The SFORKING
  335  335           * we set above informs the code that enables USDT probes
  336  336           * that sprlock() may fail because the child is being forked.
 337  337                   */
 338  338                  if (p->p_dtrace_helpers != NULL) {
 339  339                          ASSERT(dtrace_helpers_fork != NULL);
 340  340                          (*dtrace_helpers_fork)(p, cp);
 341  341                  }
 342  342  
 343  343                  mutex_enter(&p->p_lock);
 344  344                  p->p_flag &= ~SFORKING;
 345  345                  mutex_exit(&p->p_lock);
 346  346          }
 347  347  
 348  348          /*
 349  349           * Duplicate parent's resource controls.
 350  350           */
 351  351          dup_set = rctl_set_create();
 352  352          for (;;) {
 353  353                  dup_gp = rctl_set_dup_prealloc(p->p_rctls);
 354  354                  mutex_enter(&p->p_rctls->rcs_lock);
 355  355                  if (rctl_set_dup_ready(p->p_rctls, dup_gp))
 356  356                          break;
 357  357                  mutex_exit(&p->p_rctls->rcs_lock);
 358  358                  rctl_prealloc_destroy(dup_gp);
 359  359          }
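                    /*
                     * Note: the retry loop above exits via the break with
                     * p_rctls->rcs_lock still held; rctl_set_dup() below
                     * runs under that lock, which is dropped just after.
                     */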
 360  360          e.rcep_p.proc = cp;
 361  361          e.rcep_t = RCENTITY_PROCESS;
 362  362          cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
 363  363              RCD_DUP | RCD_CALLBACK);
 364  364          mutex_exit(&p->p_rctls->rcs_lock);
 365  365  
 366  366          rctl_prealloc_destroy(dup_gp);
 367  367  
 368  368          /*
 369  369           * Allocate the child's lwp directory and lwpid hash table.
 370  370           */
 371  371          if (isfork1)
 372  372                  cp->p_lwpdir_sz = 2;
 373  373          else
 374  374                  cp->p_lwpdir_sz = p->p_lwpdir_sz;
 375  375          cp->p_lwpdir = cp->p_lwpfree = ldp =
 376  376              kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
 377  377          for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
 378  378                  ldp->ld_next = ldp + 1;
 379  379          cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
 380  380          cp->p_tidhash =
 381  381              kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);
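                    /*
                     * Sizing example: a fork1() child starts with
                     * p_lwpdir_sz = 2, so p_tidhash_sz = (2 + 2) / 2 = 2
                     * buckets; a forkall() child inherits the parent's
                     * directory size and gets about half that many buckets.
                     */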
 382  382  
 383  383          /*
 384  384           * Duplicate parent's lwps.
 385  385           * Mutual exclusion is not needed because the process is
 386  386           * in the hold state and only the current lwp is running.
 387  387           */
 388  388          klgrpset_clear(cp->p_lgrpset);
 389  389          if (isfork1) {
 390  390                  clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
 391  391                  if (clone == NULL)
 392  392                          goto forklwperr;
 393  393                  /*
  394  394                   * Inherit only the lwp_wait()able flag.
 395  395                   * Daemon threads should not call fork1(), but oh well...
 396  396                   */
 397  397                  lwptot(clone)->t_proc_flag |=
 398  398                      (curthread->t_proc_flag & TP_TWAIT);
 399  399          } else {
  400  400                  /* this is forkall(); no one can be in lwp_wait() */
 401  401                  ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
 402  402                  /* for each entry in the parent's lwp directory... */
 403  403                  for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
 404  404                          klwp_t *clwp;
 405  405                          kthread_t *ct;
 406  406  
 407  407                          if ((lep = ldp->ld_entry) == NULL)
 408  408                                  continue;
 409  409  
 410  410                          if ((t = lep->le_thread) != NULL) {
 411  411                                  clwp = forklwp(ttolwp(t), cp, t->t_tid);
 412  412                                  if (clwp == NULL)
 413  413                                          goto forklwperr;
 414  414                                  ct = lwptot(clwp);
 415  415                                  /*
 416  416                                   * Inherit lwp_wait()able and daemon flags.
 417  417                                   */
 418  418                                  ct->t_proc_flag |=
 419  419                                      (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
 420  420                                  /*
 421  421                                   * Keep track of the clone of curthread to
 422  422                                   * post return values through lwp_setrval().
 423  423                                   * Mark other threads for special treatment
 424  424                                   * by lwp_rtt() / post_syscall().
 425  425                                   */
 426  426                                  if (t == curthread)
 427  427                                          clone = clwp;
 428  428                                  else
 429  429                                          ct->t_flag |= T_FORKALL;
 430  430                          } else {
 431  431                                  /*
 432  432                                   * Replicate zombie lwps in the child.
 433  433                                   */
 434  434                                  clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
 435  435                                  clep->le_lwpid = lep->le_lwpid;
 436  436                                  clep->le_start = lep->le_start;
 437  437                                  lwp_hash_in(cp, clep,
 438  438                                      cp->p_tidhash, cp->p_tidhash_sz, 0);
 439  439                          }
 440  440                  }
 441  441          }
 442  442  
 443  443          /*
 444  444           * Put new process in the parent's process contract, or put it
 445  445           * in a new one if there is an active process template.  Send a
 446  446           * fork event (if requested) to whatever contract the child is
 447  447           * a member of.  Fails if the parent has been SIGKILLed.
 448  448           */
 449  449          if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL) {
 450  450                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 451  451                  goto forklwperr;
 452  452          }
 453  453  
 454  454          /*
 455  455           * No fork failures occur beyond this point.
 456  456           */
 457  457  
 458  458          cp->p_lwpid = p->p_lwpid;
 459  459          if (!isfork1) {
 460  460                  cp->p_lwpdaemon = p->p_lwpdaemon;
 461  461                  cp->p_zombcnt = p->p_zombcnt;
 462  462                  /*
 463  463                   * If the parent's lwp ids have wrapped around, so have the
 464  464                   * child's.
 465  465                   */
 466  466                  cp->p_flag |= p->p_flag & SLWPWRAP;
 467  467          }
 468  468  
 469  469          mutex_enter(&p->p_lock);
 470  470          corectl_path_hold(cp->p_corefile = p->p_corefile);
 471  471          corectl_content_hold(cp->p_content = p->p_content);
 472  472          mutex_exit(&p->p_lock);
 473  473  
 474  474          /*
 475  475           * Duplicate process context ops, if any.
 476  476           */
 477  477          if (p->p_pctx)
 478  478                  forkpctx(p, cp);
 479  479  
 480  480  #ifdef __sparc
 481  481          utrap_dup(p, cp);
 482  482  #endif
 483  483          /*
 484  484           * If the child process has been marked to stop on exit
 485  485           * from this fork, arrange for all other lwps to stop in
 486  486           * sympathy with the active lwp.
 487  487           */
 488  488          if (PTOU(cp)->u_systrap &&
 489  489              prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
 490  490                  mutex_enter(&cp->p_lock);
 491  491                  t = cp->p_tlist;
 492  492                  do {
 493  493                          t->t_proc_flag |= TP_PRSTOP;
 494  494                          aston(t);       /* so TP_PRSTOP will be seen */
 495  495                  } while ((t = t->t_forw) != cp->p_tlist);
 496  496                  mutex_exit(&cp->p_lock);
 497  497          }
 498  498          /*
 499  499           * If the parent process has been marked to stop on exit
 500  500           * from this fork, and its asynchronous-stop flag has not
 501  501           * been set, arrange for all other lwps to stop before
 502  502           * they return back to user level.
 503  503           */
 504  504          if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
 505  505              prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
 506  506                  mutex_enter(&p->p_lock);
 507  507                  t = p->p_tlist;
 508  508                  do {
 509  509                          t->t_proc_flag |= TP_PRSTOP;
 510  510                          aston(t);       /* so TP_PRSTOP will be seen */
 511  511                  } while ((t = t->t_forw) != p->p_tlist);
 512  512                  mutex_exit(&p->p_lock);
 513  513          }
 514  514  
 515  515          if (PROC_IS_BRANDED(p))
 516  516                  BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
 517  517          else
 518  518                  lwp_setrval(clone, p->p_pid, 1);
 519  519  
 520  520          /* set return values for parent */
 521  521          r.r_val1 = (int)cp->p_pid;
 522  522          r.r_val2 = 0;
 523  523  
 524  524          /*
 525  525           * pool_barrier_exit() can now be called because the child process has:
 526  526           * - all identifying features cloned or set (p_pid, p_task, p_pool)
 527  527           * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
 528  528           * - any other fields set which are used in resource set binding.
 529  529           */
 530  530          mutex_enter(&p->p_lock);
 531  531          pool_barrier_exit();
 532  532          mutex_exit(&p->p_lock);
 533  533  
 534  534          mutex_enter(&pidlock);
 535  535          mutex_enter(&cp->p_lock);
 536  536  
 537  537          /*
 538  538           * Set flags telling the child what (not) to do on exit.
 539  539           */
 540  540          if (flags & FORK_NOSIGCHLD)
 541  541                  cp->p_pidflag |= CLDNOSIGCHLD;
 542  542          if (flags & FORK_WAITPID)
 543  543                  cp->p_pidflag |= CLDWAITPID;
 544  544  
 545  545          /*
 546  546           * Now that there are lwps and threads attached, add the new
 547  547           * process to the process group.
 548  548           */
 549  549          pgjoin(cp, p->p_pgidp);
 550  550          cp->p_stat = SRUN;
 551  551          /*
 552  552           * We are now done with all the lwps in the child process.
 553  553           */
 554  554          t = cp->p_tlist;
 555  555          do {
 556  556                  /*
 557  557                   * Set the lwp_suspend()ed lwps running.
 558  558                   * They will suspend properly at syscall exit.
 559  559                   */
 560  560                  if (t->t_proc_flag & TP_HOLDLWP)
 561  561                          lwp_create_done(t);
 562  562                  else {
 563  563                          /* set TS_CREATE to allow continuelwps() to work */
 564  564                          thread_lock(t);
 565  565                          ASSERT(t->t_state == TS_STOPPED &&
 566  566                              !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
 567  567                          t->t_schedflag |= TS_CREATE;
 568  568                          thread_unlock(t);
 569  569                  }
 570  570          } while ((t = t->t_forw) != cp->p_tlist);
 571  571          mutex_exit(&cp->p_lock);
 572  572  
 573  573          if (isvfork) {
 574  574                  CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
 575  575                  mutex_enter(&p->p_lock);
 576  576                  p->p_flag |= SVFWAIT;
 577  577                  curthread->t_flag |= T_VFPARENT;
 578  578                  DTRACE_PROC1(create, proc_t *, cp);
 579  579                  cv_broadcast(&pr_pid_cv[p->p_slot]);    /* inform /proc */
 580  580                  mutex_exit(&p->p_lock);
 581  581                  /*
 582  582                   * Grab child's p_lock before dropping pidlock to ensure
 583  583                   * the process will not disappear before we set it running.
 584  584                   */
 585  585                  mutex_enter(&cp->p_lock);
 586  586                  mutex_exit(&pidlock);
 587  587                  sigdefault(cp);
 588  588                  continuelwps(cp);
 589  589                  mutex_exit(&cp->p_lock);
 590  590          } else {
 591  591                  CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
 592  592                  DTRACE_PROC1(create, proc_t *, cp);
 593  593                  /*
 594  594                   * It is CL_FORKRET's job to drop pidlock.
 595  595                   * If we do it here, the process could be set running
 596  596                   * and disappear before CL_FORKRET() is called.
 597  597                   */
 598  598                  CL_FORKRET(curthread, cp->p_tlist);
 599  599                  schedctl_set_cidpri(curthread);
 600  600                  ASSERT(MUTEX_NOT_HELD(&pidlock));
 601  601          }
 602  602  
 603  603          return (r.r_vals);
 604  604  
 605  605  forklwperr:
 606  606          if (isvfork) {
 607  607                  if (avl_numnodes(&p->p_wpage) != 0) {
 608  608                          /* restore watchpoints to parent */
 609  609                          as = p->p_as;
 610  610                          AS_LOCK_ENTER(as, RW_WRITER);
 611  611                          as->a_wpage = p->p_wpage;
 612  612                          avl_create(&p->p_wpage, wp_compare,
 613  613                              sizeof (struct watched_page),
 614  614                              offsetof(struct watched_page, wp_link));
 615  615                          as_setwatch(as);
 616  616                          AS_LOCK_EXIT(as);
 617  617                  }
 618  618          } else {
 619  619                  if (cp->p_segacct)
 620  620                          shmexit(cp);
 621  621                  as = cp->p_as;
 622  622                  cp->p_as = &kas;
 623  623                  as_free(as);
 624  624          }
 625  625  
 626  626          if (cp->p_lwpdir) {
 627  627                  for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
 628  628                          if ((lep = ldp->ld_entry) != NULL)
 629  629                                  kmem_free(lep, sizeof (*lep));
 630  630                  kmem_free(cp->p_lwpdir,
 631  631                      cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
 632  632          }
 633  633          cp->p_lwpdir = NULL;
 634  634          cp->p_lwpfree = NULL;
 635  635          cp->p_lwpdir_sz = 0;
 636  636  
 637  637          if (cp->p_tidhash)
 638  638                  kmem_free(cp->p_tidhash,
 639  639                      cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
 640  640          cp->p_tidhash = NULL;
 641  641          cp->p_tidhash_sz = 0;
 642  642  
 643  643          forklwp_fail(cp);
 644  644          fork_fail(cp);
 645  645          rctl_set_free(cp->p_rctls);
 646  646          mutex_enter(&pidlock);
 647  647  
 648  648          /*
 649  649           * Detach failed child from task.
 650  650           */
 651  651          mutex_enter(&cp->p_lock);
 652  652          tk = cp->p_task;
 653  653          task_detach(cp);
 654  654          ASSERT(cp->p_pool->pool_ref > 0);
 655  655          atomic_dec_32(&cp->p_pool->pool_ref);
 656  656          mutex_exit(&cp->p_lock);
 657  657  
 658  658          disown_proc(p, cp);
 659  659          pid_exit(cp, tk);
 660  660          mutex_exit(&pidlock);
 661  661  
 662  662          task_rele(tk);
 663  663  
 664  664          mutex_enter(&p->p_lock);
 665  665          pool_barrier_exit();
 666  666          continuelwps(p);
 667  667          mutex_exit(&p->p_lock);
 668  668          error = EAGAIN;
 669  669  forkerr:
 670  670          return ((int64_t)set_errno(error));
 671  671  }
 672  672  
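For reference, a sketch of how the 64-bit rval assembled above might be
decoded by a caller of the raw trap. The parent leaves with r_val1 set to
the child's pid and r_val2 set to 0; lwp_setrval() gives the child's clone
r_val1 = parent's pid and r_val2 = 1. The stub name and the little-endian
rval_t layout assumed below are illustrative only:

        /* __forksys_raw() is a hypothetical stand-in for the trap stub. */
        extern int64_t __forksys_raw(int subcode, int flags);

        static pid_t
        fork_wrapper(int flags)
        {
                int64_t rv = __forksys_raw(0, flags);   /* the forkx() path */
                pid_t val1 = (pid_t)(uint32_t)rv;       /* r_val1 (LE layout) */
                int val2 = (int)(rv >> 32);             /* r_val2 */

                /* by convention, the wrapper returns 0 in the child */
                return (val2 == 1 ? 0 : val1);
        }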
 673  673  /*
 674  674   * Free allocated resources from getproc() if a fork failed.
 675  675   */
 676  676  static void
 677  677  fork_fail(proc_t *cp)
 678  678  {
 679  679          uf_info_t *fip = P_FINFO(cp);
 680  680  
 681  681          fcnt_add(fip, -1);
 682  682          sigdelq(cp, NULL, 0);
 683  683  
 684  684          mutex_enter(&pidlock);
 685  685          upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
 686  686          mutex_exit(&pidlock);
 687  687  
 688  688          /*
 689  689           * single threaded, so no locking needed here
 690  690           */
 691  691          crfree(cp->p_cred);
 692  692  
 693  693          kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
 694  694  
 695  695          VN_RELE(PTOU(curproc)->u_cdir);
 696  696          if (PTOU(curproc)->u_rdir)
 697  697                  VN_RELE(PTOU(curproc)->u_rdir);
 698  698          if (cp->p_exec)
 699  699                  VN_RELE(cp->p_exec);
 700  700          if (cp->p_execdir)
 701  701                  VN_RELE(cp->p_execdir);
 702  702          if (PTOU(curproc)->u_cwd)
 703  703                  refstr_rele(PTOU(curproc)->u_cwd);
 704  704          if (PROC_IS_BRANDED(cp)) {
 705  705                  brand_clearbrand(cp, B_FALSE);
 706  706          }
 707  707  }
 708  708  
 709  709  /*
 710  710   * Clean up the lwps already created for this child process.
 711  711   * The fork failed while duplicating all the lwps of the parent
 712  712   * and those lwps already created must be freed.
 713  713   * This process is invisible to the rest of the system,
 714  714   * so we don't need to hold p->p_lock to protect the list.
 715  715   */
 716  716  static void
 717  717  forklwp_fail(proc_t *p)
 718  718  {
 719  719          kthread_t *t;
 720  720          task_t *tk;
 721  721          int branded = 0;
 722  722  
 723  723          if (PROC_IS_BRANDED(p))
 724  724                  branded = 1;
 725  725  
 726  726          while ((t = p->p_tlist) != NULL) {
 727  727                  /*
 728  728                   * First remove the lwp from the process's p_tlist.
 729  729                   */
 730  730                  if (t != t->t_forw)
 731  731                          p->p_tlist = t->t_forw;
 732  732                  else
 733  733                          p->p_tlist = NULL;
 734  734                  p->p_lwpcnt--;
 735  735                  t->t_forw->t_back = t->t_back;
 736  736                  t->t_back->t_forw = t->t_forw;
 737  737  
 738  738                  tk = p->p_task;
 739  739                  mutex_enter(&p->p_zone->zone_nlwps_lock);
 740  740                  tk->tk_nlwps--;
 741  741                  tk->tk_proj->kpj_nlwps--;
 742  742                  p->p_zone->zone_nlwps--;
 743  743                  mutex_exit(&p->p_zone->zone_nlwps_lock);
 744  744  
 745  745                  ASSERT(t->t_schedctl == NULL);
 746  746  
 747  747                  if (branded)
 748  748                          BROP(p)->b_freelwp(ttolwp(t));
 749  749  
 750  750                  if (t->t_door != NULL) {
 751  751                          kmem_free(t->t_door, sizeof (door_data_t));
 752  752                          t->t_door = NULL;
 753  753                  }
 754  754                  lwp_ctmpl_clear(ttolwp(t), B_FALSE);
 755  755  
 756  756                  /*
 757  757                   * Remove the thread from the all threads list.
 758  758                   * We need to hold pidlock for this.
 759  759                   */
 760  760                  mutex_enter(&pidlock);
 761  761                  t->t_next->t_prev = t->t_prev;
 762  762                  t->t_prev->t_next = t->t_next;
 763  763                  CL_EXIT(t);     /* tell the scheduler that we're exiting */
 764  764                  cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
 765  765                  mutex_exit(&pidlock);
 766  766  
 767  767                  /*
 768  768                   * Let the lgroup load averages know that this thread isn't
 769  769                   * going to show up (i.e. un-do what was done on behalf of
 770  770                   * this thread by the earlier lgrp_move_thread()).
 771  771                   */
 772  772                  kpreempt_disable();
 773  773                  lgrp_move_thread(t, NULL, 1);
 774  774                  kpreempt_enable();
 775  775  
 776  776                  /*
 777  777                   * The thread was created TS_STOPPED.
 778  778                   * We change it to TS_FREE to avoid an
 779  779                   * ASSERT() panic in thread_free().
 780  780                   */
 781  781                  t->t_state = TS_FREE;
 782  782                  thread_rele(t);
 783  783                  thread_free(t);
 784  784          }
 785  785  }
 786  786  
 787  787  extern struct as kas;
 788  788  
 789  789  /*
 790  790   * fork a kernel process.
 791  791   *
 792  792   * Passing a pid argument of -1 indicates that the new process should be
 793  793   * launched as a child of 'zsched' within the zone.
 794  794   */
 795  795  int
 796  796  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 797  797      pid_t pid)
 798  798  {
 799  799          proc_t *p;
 800  800          struct user *up;
 801  801          kthread_t *t;
 802  802          cont_process_t *ctp = NULL;
 803  803          rctl_entity_p_t e;
 804  804  
 805  805          ASSERT(cid != sysdccid);
 806  806          ASSERT(cid != syscid || ct == NULL);
 807  807          if (CLASS_KERNEL(cid)) {
 808  808                  rctl_alloc_gp_t *init_gp;
 809  809                  rctl_set_t *init_set;
 810  810  
 811  811                  ASSERT(pid != 1);
 812  812                  ASSERT(pid >= 0);
 813  813  
 814  814                  if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 815  815                          return (EAGAIN);
 816  816  
 817  817                  /*
  818  818                   * Release the holds on p_exec and p_execdir; these
  819  819                   * were acquired in getproc().
 820  820                   */
 821  821                  if (p->p_execdir != NULL)
 822  822                          VN_RELE(p->p_execdir);
 823  823                  if (p->p_exec != NULL)
 824  824                          VN_RELE(p->p_exec);
 825  825                  p->p_flag |= SNOWAIT;
 826  826                  p->p_exec = NULL;
 827  827                  p->p_execdir = NULL;
 828  828  
 829  829                  init_set = rctl_set_create();
 830  830                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 831  831  
 832  832                  /*
 833  833                   * kernel processes do not inherit /proc tracing flags.
 834  834                   */
 835  835                  sigemptyset(&p->p_sigmask);
 836  836                  premptyset(&p->p_fltmask);
 837  837                  up = PTOU(p);
 838  838                  up->u_systrap = 0;
 839  839                  premptyset(&(up->u_entrymask));
 840  840                  premptyset(&(up->u_exitmask));
 841  841                  mutex_enter(&p->p_lock);
 842  842                  e.rcep_p.proc = p;
 843  843                  e.rcep_t = RCENTITY_PROCESS;
 844  844                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 845  845                      init_gp);
 846  846                  mutex_exit(&p->p_lock);
 847  847  
 848  848                  rctl_prealloc_destroy(init_gp);
 849  849  
 850  850                  t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
 851  851          } else {
 852  852                  rctl_alloc_gp_t *init_gp, *default_gp;
 853  853                  rctl_set_t *init_set;
 854  854                  task_t *tk, *tk_old;
 855  855                  klwp_t *lwp;
 856  856                  boolean_t pzsched = B_FALSE;
 857  857                  int flag = GETPROC_USER;
 858  858  
 859  859                  /* Handle a new user-level thread as child of zsched. */
 860  860                  if (pid < 0) {
 861  861                          VERIFY(curzone != global_zone);
 862  862                          flag = GETPROC_ZSCHED;
 863  863                          pzsched = B_TRUE;
 864  864                          pid = 0;
 865  865                  }
 866  866  
 867  867                  if (getproc(&p, pid, flag) < 0)
 868  868                          return (EAGAIN);
 869  869                  /*
 870  870                   * init creates a new task, distinct from the task
 871  871                   * containing kernel "processes".
 872  872                   */
 873  873                  tk = task_create(0, p->p_zone);
 874  874                  mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 875  875                  tk->tk_proj->kpj_ntasks++;
 876  876                  tk->tk_nprocs++;
 877  877                  mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 878  878  
 879  879                  default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
 880  880                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 881  881                  init_set = rctl_set_create();
 882  882  
 883  883                  mutex_enter(&pidlock);
 884  884                  mutex_enter(&p->p_lock);
 885  885                  tk_old = p->p_task;     /* switch to new task */
 886  886  
 887  887                  task_detach(p);
 888  888                  task_begin(tk, p);
 889  889                  mutex_exit(&pidlock);
 890  890  
 891  891                  mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
 892  892                  tk_old->tk_nprocs--;
 893  893                  mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
 894  894  
 895  895                  e.rcep_p.proc = p;
 896  896                  e.rcep_t = RCENTITY_PROCESS;
 897  897                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 898  898                      init_gp);
 899  899                  rctlproc_default_init(p, default_gp);
 900  900                  mutex_exit(&p->p_lock);
 901  901  
 902  902                  task_rele(tk_old);
 903  903                  rctl_prealloc_destroy(default_gp);
 904  904                  rctl_prealloc_destroy(init_gp);
 905  905  
 906  906                  if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
 907  907                      &curthread->t_hold, cid, 1)) == NULL) {
 908  908                          task_t *tk;
 909  909  
 910  910                          fork_fail(p);
 911  911                          mutex_enter(&pidlock);
 912  912                          disown_proc(p->p_parent, p);
 913  913  
 914  914                          mutex_enter(&p->p_lock);
 915  915                          tk = p->p_task;
 916  916                          task_detach(p);
 917  917                          ASSERT(p->p_pool->pool_ref > 0);
 918  918                          atomic_add_32(&p->p_pool->pool_ref, -1);
 919  919                          mutex_exit(&p->p_lock);
 920  920  
 921  921                          pid_exit(p, tk);
 922  922                          mutex_exit(&pidlock);
 923  923                          task_rele(tk);
 924  924                          return (EAGAIN);
 925  925                  }
 926  926                  t = lwptot(lwp);
 927  927  
 928  928                  ctp = contract_process_fork(sys_process_tmpl, p,
 929  929                      (pzsched ? curproc->p_zone->zone_zsched : curproc),
 930  930                      B_FALSE);
 931  931                  ASSERT(ctp != NULL);
 932  932                  if (ct != NULL)
 933  933                          *ct = &ctp->conp_contract;
 934  934          }
 935  935  
 936  936          ASSERT3U(t->t_tid, ==, 1);
 937  937          p->p_lwpid = 1;
 938  938          mutex_enter(&pidlock);
 939  939          pgjoin(p, p->p_parent->p_pgidp);
 940  940          p->p_stat = SRUN;
 941  941          mutex_enter(&p->p_lock);
 942  942          t->t_proc_flag &= ~TP_HOLDLWP;
 943  943          lwp_create_done(t);
 944  944          mutex_exit(&p->p_lock);
 945  945          mutex_exit(&pidlock);
 946  946          return (0);
 947  947  }
 948  948  
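A sketch of a typical kernel caller of newproc(). The daemon below is
invented, but syscid and minclsyspri are the usual class id and priority
for kernel processes, much as the system starts its own housekeeping
processes:

        /* Hypothetical kernel daemon process. */
        static void
        mydaemon(void)
        {
                for (;;) {
                        /* ... periodic housekeeping work ... */
                }
        }

        void
        mydaemon_init(void)
        {
                /* kernel process: parented to p0, no contract, any pid */
                if (newproc(mydaemon, NULL, syscid, minclsyspri, NULL, 0) != 0)
                        cmn_err(CE_WARN, "mydaemon: cannot create process");
        }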
 949  949  /*
 950  950   * create a child proc struct.
 951  951   */
 952  952  static int
 953  953  getproc(proc_t **cpp, pid_t pid, uint_t flags)
 954  954  {
 955  955          proc_t          *pp, *cp;
 956  956          pid_t           newpid;
 957  957          struct user     *uarea;
 958  958          extern uint_t   nproc;
 959  959          struct cred     *cr;
 960  960          uid_t           ruid;
 961  961          zoneid_t        zoneid;
 962  962          task_t          *task;
 963  963          kproject_t      *proj;
 964  964          zone_t          *zone;
 965  965          int             rctlfail = 0;
 966  966  
 967  967          if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 968  968                  return (-1);    /* no point in starting new processes */
 969  969  
 970  970          if (flags & GETPROC_ZSCHED) {
 971  971                  pp = curproc->p_zone->zone_zsched;
 972  972          } else {
 973  973                  pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
 974  974          }
 975  975          task = pp->p_task;
 976  976          proj = task->tk_proj;
 977  977          zone = pp->p_zone;
 978  978  
 979  979          mutex_enter(&pp->p_lock);
 980  980          mutex_enter(&zone->zone_nlwps_lock);
 981  981          if (proj != proj0p) {
 982  982                  if (task->tk_nprocs >= task->tk_nprocs_ctl)
 983  983                          if (rctl_test(rc_task_nprocs, task->tk_rctls,
 984  984                              pp, 1, 0) & RCT_DENY)
 985  985                                  rctlfail = 1;
 986  986  
 987  987                  if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
 988  988                          if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
 989  989                              pp, 1, 0) & RCT_DENY)
 990  990                                  rctlfail = 1;
 991  991  
 992  992                  if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
 993  993                          if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
 994  994                              pp, 1, 0) & RCT_DENY)
 995  995                                  rctlfail = 1;
 996  996  
 997  997                  if (rctlfail) {
 998  998                          mutex_exit(&zone->zone_nlwps_lock);
 999  999                          mutex_exit(&pp->p_lock);
1000 1000                          atomic_inc_32(&zone->zone_ffcap);
1001 1001                          goto punish;
1002 1002                  }
1003 1003          }
1004 1004          task->tk_nprocs++;
1005 1005          proj->kpj_nprocs++;
1006 1006          zone->zone_nprocs++;
1007 1007          mutex_exit(&zone->zone_nlwps_lock);
1008 1008          mutex_exit(&pp->p_lock);
1009 1009  
1010 1010          cp = kmem_cache_alloc(process_cache, KM_SLEEP);
1011 1011          bzero(cp, sizeof (proc_t));
1012 1012  
1013 1013          /*
1014 1014           * Make proc entry for child process
1015 1015           */
1016 1016          mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
1017 1017          mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
1018 1018          mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
1019 1019  #if defined(__x86)
1020 1020          mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
1021 1021  #endif
1022 1022          mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
1023 1023          cp->p_stat = SIDL;
1024 1024          cp->p_mstart = gethrtime();
1025 1025          cp->p_as = &kas;
1026 1026          /*
1027 1027           * p_zone must be set before we call pid_allocate since the process
1028 1028           * will be visible after that and code such as prfind_zone will
1029 1029           * look at the p_zone field.
1030 1030           */
1031 1031          cp->p_zone = pp->p_zone;
1032 1032          cp->p_t1_lgrpid = LGRP_NONE;
1033 1033          cp->p_tr_lgrpid = LGRP_NONE;
1034 1034  
1035 1035          /* Default to native brand initially */
1036 1036          cp->p_brand = &native_brand;
1037 1037  
1038 1038          if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1039 1039                  if (nproc == v.v_proc) {
1040 1040                          CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1041 1041                          cmn_err(CE_WARN, "out of processes");
1042 1042                  }
1043 1043                  goto bad;
1044 1044          }
1045 1045  
1046 1046          mutex_enter(&pp->p_lock);
1047 1047          cp->p_exec = pp->p_exec;
1048 1048          cp->p_execdir = pp->p_execdir;
1049 1049          mutex_exit(&pp->p_lock);
1050 1050  
1051 1051          if (cp->p_exec) {
1052 1052                  VN_HOLD(cp->p_exec);
1053 1053                  /*
1054 1054                   * Each VOP_OPEN() must be paired with a corresponding
1055 1055                   * VOP_CLOSE(). In this case, the executable will be
1056 1056                   * closed for the child in either proc_exit() or gexec().
1057 1057                   */
1058 1058                  if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
1059 1059                          VN_RELE(cp->p_exec);
1060 1060                          cp->p_exec = NULLVP;
1061 1061                          cp->p_execdir = NULLVP;
1062 1062                          goto bad;
1063 1063                  }
1064 1064          }
1065 1065          if (cp->p_execdir)
1066 1066                  VN_HOLD(cp->p_execdir);
1067 1067  
1068 1068          /*
1069 1069           * If not privileged make sure that this user hasn't exceeded
1070 1070           * v.v_maxup processes, and that users collectively haven't
1071 1071           * exceeded v.v_maxupttl processes.
1072 1072           */
1073 1073          mutex_enter(&pidlock);
1074 1074          ASSERT(nproc < v.v_proc);       /* otherwise how'd we get our pid? */
1075 1075          cr = CRED();
1076 1076          ruid = crgetruid(cr);
1077 1077          zoneid = crgetzoneid(cr);
1078 1078          if (nproc >= v.v_maxup &&       /* short-circuit; usually false */
1079 1079              (nproc >= v.v_maxupttl ||
1080 1080              upcount_get(ruid, zoneid) >= v.v_maxup) &&
1081 1081              secpolicy_newproc(cr) != 0) {
1082 1082                  mutex_exit(&pidlock);
1083 1083                  zcmn_err(zoneid, CE_NOTE,
1084 1084                      "out of per-user processes for uid %d", ruid);
1085 1085                  goto bad;
1086 1086          }
1087 1087  
1088 1088          /*
1089 1089           * Everything is cool, put the new proc on the active process list.
1090 1090           * It is already on the pid list and in /proc.
1091 1091           * Increment the per uid process count (upcount).
1092 1092           */
1093 1093          nproc++;
1094 1094          upcount_inc(ruid, zoneid);
1095 1095  
1096 1096          cp->p_next = practive;
1097 1097          practive->p_prev = cp;
1098 1098          practive = cp;
1099 1099  
1100 1100          cp->p_ignore = pp->p_ignore;
1101 1101          cp->p_siginfo = pp->p_siginfo;
1102 1102          cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1103 1103          cp->p_sessp = pp->p_sessp;
1104 1104          sess_hold(pp);
1105 1105          cp->p_bssbase = pp->p_bssbase;
1106 1106          cp->p_brkbase = pp->p_brkbase;
1107 1107          cp->p_brksize = pp->p_brksize;
1108 1108          cp->p_brkpageszc = pp->p_brkpageszc;
1109 1109          cp->p_stksize = pp->p_stksize;
1110 1110          cp->p_stkpageszc = pp->p_stkpageszc;
1111 1111          cp->p_stkprot = pp->p_stkprot;
1112 1112          cp->p_datprot = pp->p_datprot;
1113 1113          cp->p_usrstack = pp->p_usrstack;
1114 1114          cp->p_model = pp->p_model;
1115 1115          cp->p_ppid = pp->p_pid;
1116 1116          cp->p_ancpid = pp->p_pid;
1117 1117          cp->p_portcnt = pp->p_portcnt;
1118 1118  
1119 1119          /*
1120 1120           * Initialize watchpoint structures
1121 1121           */
1122 1122          avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1123 1123              offsetof(struct watched_area, wa_link));
1124 1124  
1125 1125          /*
1126 1126           * Initialize immediate resource control values.
1127 1127           */
1128 1128          cp->p_stk_ctl = pp->p_stk_ctl;
1129 1129          cp->p_fsz_ctl = pp->p_fsz_ctl;
1130 1130          cp->p_vmem_ctl = pp->p_vmem_ctl;
1131 1131          cp->p_fno_ctl = pp->p_fno_ctl;
1132 1132  
1133 1133          /*
1134 1134           * Link up to parent-child-sibling chain.  No need to lock
1135 1135           * in general since only a call to freeproc() (done by the
1136 1136           * same parent as newproc()) diddles with the child chain.
1137 1137           */
1138 1138          cp->p_sibling = pp->p_child;
1139 1139          if (pp->p_child)
1140 1140                  pp->p_child->p_psibling = cp;
1141 1141  
1142 1142          cp->p_parent = pp;
1143 1143          pp->p_child = cp;
1144 1144  
1145 1145          cp->p_child_ns = NULL;
1146 1146          cp->p_sibling_ns = NULL;
1147 1147  
1148 1148          cp->p_nextorph = pp->p_orphan;
1149 1149          cp->p_nextofkin = pp;
1150 1150          pp->p_orphan = cp;
1151 1151  
1152 1152          /*
1153 1153           * Inherit profiling state; do not inherit REALPROF profiling state.
1154 1154           */
1155 1155          cp->p_prof = pp->p_prof;
1156 1156          cp->p_rprof_cyclic = CYCLIC_NONE;
1157 1157  
1158 1158          /*
1159 1159           * Inherit pool pointer from the parent.  Kernel processes are
1160 1160           * always bound to the default pool.
1161 1161           */
1162 1162          mutex_enter(&pp->p_lock);
1163 1163          if (flags & GETPROC_KERNEL) {
1164 1164                  cp->p_pool = pool_default;
1165 1165                  cp->p_flag |= SSYS;
1166 1166          } else {
1167 1167                  cp->p_pool = pp->p_pool;
1168 1168          }
1169 1169          atomic_inc_32(&cp->p_pool->pool_ref);
1170 1170          mutex_exit(&pp->p_lock);
1171 1171  
1172 1172          /*
1173 1173           * Add the child process to the current task.  Kernel processes
1174 1174           * are always attached to task0.
1175 1175           */
1176 1176          mutex_enter(&cp->p_lock);
1177 1177          if (flags & GETPROC_KERNEL)
1178 1178                  task_attach(task0p, cp);
1179 1179          else
1180 1180                  task_attach(pp->p_task, cp);
1181 1181          mutex_exit(&cp->p_lock);
1182 1182          mutex_exit(&pidlock);
1183 1183  
1184 1184          if (PROC_IS_BRANDED(pp)) {
1185 1185                  /*
1186 1186                   * The only reason why process branding should fail is when
1187 1187                   * the procedure is complicated by multiple LWPs on the scene.
1188 1188                   * With an LWP count of 0, this newly allocated process has no
1189 1189                   * reason to fail branding.
1190 1190                   */
1191 1191                  VERIFY0(brand_setbrand(cp, B_FALSE));
1192 1192  
1193 1193                  BROP(pp)->b_copy_procdata(cp, pp);
1194 1194          }
1195 1195  
1196 1196          avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1197 1197              offsetof(contract_t, ct_ctlist));
1198 1198  
1199 1199          /*
1200 1200           * Duplicate any audit information kept in the process table
1201 1201           */
1202 1202          if (audit_active)       /* copy audit data to cp */
1203 1203                  audit_newproc(cp);
1204 1204  
1205 1205          crhold(cp->p_cred = cr);
1206 1206  
1207 1207          /*
1208 1208           * Bump up the counts on the file structures pointed at by the
1209 1209           * parent's file table since the child will point at them too.
1210 1210           */
1211 1211          fcnt_add(P_FINFO(pp), 1);
1212 1212  
1213 1213          if (PTOU(pp)->u_cdir) {
1214 1214                  VN_HOLD(PTOU(pp)->u_cdir);
1215 1215          } else {
1216 1216                  ASSERT(pp == &p0);
1217 1217                  /*
1218 1218                   * We must be at or before vfs_mountroot(); it will take care of
1219 1219                   * assigning our current directory.
1220 1220                   */
1221 1221          }
1222 1222          if (PTOU(pp)->u_rdir)
1223 1223                  VN_HOLD(PTOU(pp)->u_rdir);
1224 1224          if (PTOU(pp)->u_cwd)
1225 1225                  refstr_hold(PTOU(pp)->u_cwd);
1226 1226  
1227 1227          /*
1228 1228           * copy the parent's uarea.
1229 1229           */
1230 1230          uarea = PTOU(cp);
1231 1231          bcopy(PTOU(pp), uarea, sizeof (*uarea));
1232 1232          flist_fork(P_FINFO(pp), P_FINFO(cp));
1233 1233  
1234 1234          gethrestime(&uarea->u_start);
1235 1235          uarea->u_ticks = ddi_get_lbolt();
1236 1236          uarea->u_mem = rm_asrss(pp->p_as);
1237 1237          uarea->u_acflag = AFORK;
1238 1238  
1239 1239          /*
1240 1240           * If inherit-on-fork, copy /proc tracing flags to child.
1241 1241           */
1242 1242          if ((pp->p_proc_flag & P_PR_FORK) != 0) {
1243 1243                  cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
1244 1244                  cp->p_sigmask = pp->p_sigmask;
1245 1245                  cp->p_fltmask = pp->p_fltmask;
1246 1246          } else {
1247 1247                  sigemptyset(&cp->p_sigmask);
1248 1248                  premptyset(&cp->p_fltmask);
1249 1249                  uarea->u_systrap = 0;
1250 1250                  premptyset(&uarea->u_entrymask);
1251 1251                  premptyset(&uarea->u_exitmask);
1252 1252          }
1253 1253          /*
1254 1254           * If microstate accounting is being inherited, mark the child.
1255 1255           */
1256 1256          if ((pp->p_flag & SMSFORK) != 0)
1257 1257                  cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
1258 1258  
1259 1259          /*
1260 1260           * Inherit fixalignment flag from the parent
1261 1261           */
1262 1262          cp->p_fixalignment = pp->p_fixalignment;
1263 1263  
1264 1264          *cpp = cp;
1265 1265          return (0);
1266 1266  
1267 1267  bad:
1268 1268          ASSERT(MUTEX_NOT_HELD(&pidlock));
1269 1269  
1270 1270          mutex_destroy(&cp->p_crlock);
1271 1271          mutex_destroy(&cp->p_pflock);
1272 1272  #if defined(__x86)
1273 1273          mutex_destroy(&cp->p_ldtlock);
1274 1274  #endif
1275 1275          if (newpid != -1) {
1276 1276                  proc_entry_free(cp->p_pidp);
1277 1277                  (void) pid_rele(cp->p_pidp);
1278 1278          }
1279 1279          kmem_cache_free(process_cache, cp);
1280 1280  
1281 1281          mutex_enter(&zone->zone_nlwps_lock);
1282 1282          task->tk_nprocs--;
1283 1283          proj->kpj_nprocs--;
1284 1284          zone->zone_nprocs--;
1285 1285          mutex_exit(&zone->zone_nlwps_lock);
1286 1286          atomic_inc_32(&zone->zone_ffnoproc);
1287 1287  
1288 1288  punish:
1289 1289          /*
1290 1290           * We most likely got into this situation because some process is
1291 1291           * forking out of control.  As punishment, put it to sleep for a
1292 1292           * bit so it can't eat the machine alive.  Sleep interval is chosen
1293 1293           * to allow no more than one fork failure per cpu per clock tick
1294 1294           * on average (yes, I just made this up).  This has two desirable
1295 1295           * properties: (1) it sets a constant limit on the fork failure
1296 1296           * rate, and (2) the busier the system is, the harsher the penalty
1297 1297           * for abusing it becomes.
1298 1298           */
1299 1299          INCR_COUNT(&fork_fail_pending, &pidlock);
1300 1300          delay(fork_fail_pending / ncpus + 1);
1301 1301          DECR_COUNT(&fork_fail_pending, &pidlock);
1302 1302  
1303 1303          return (-1); /* out of memory or proc slots */
1304 1304  }
1305 1305  
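[To make the arithmetic in the punish path concrete: with 64 failed forkers pending on an 8-CPU machine, each caller sleeps 64 / 8 + 1 = 9 ticks, so the aggregate failure rate stays near one per cpu per tick no matter how many processes are hammering fork(). A userland sketch of the same throttle follows; names such as fail_pending and TICK_USEC are invented for illustration:

	#include <pthread.h>
	#include <unistd.h>

	#define	TICK_USEC	10000		/* model a 10 ms clock tick */

	static pthread_mutex_t fail_lock = PTHREAD_MUTEX_INITIALIZER;
	static long fail_pending;		/* callers currently delayed */

	static void
	fork_fail_penalty(long ncpus)
	{
		long ticks;

		pthread_mutex_lock(&fail_lock);
		fail_pending++;
		ticks = fail_pending / ncpus + 1;
		pthread_mutex_unlock(&fail_lock);

		usleep(ticks * TICK_USEC);	/* analogue of delay(ticks) */

		pthread_mutex_lock(&fail_lock);
		fail_pending--;
		pthread_mutex_unlock(&fail_lock);
	}

Because the penalty grows with the number of concurrent failures, the scheme is self-scaling: a single unlucky caller barely notices, while a fork bomb throttles itself harder the more it misbehaves.]
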
1306 1306  /*
1307 1307   * Release virtual memory.
1308 1308   * In the case of vfork(), the child was given exclusive access to its
1309 1309   * parent's address space.  The parent is waiting in vfwait() for the
1310 1310   * child to release its exclusive claim via relvm().
1311 1311   */
1312 1312  void
1313 1313  relvm()
1314 1314  {
1315 1315          proc_t *p = curproc;
1316 1316  
1317 1317          ASSERT((unsigned)p->p_lwpcnt <= 1);
1318 1318  
1319 1319          prrelvm();      /* inform /proc */
1320 1320  
1321 1321          if (p->p_flag & SVFORK) {
1322 1322                  proc_t *pp = p->p_parent;
1323 1323                  /*
1324 1324                   * The child process is either exec'ing or exit'ing.
1325 1325                   * The child is now separated from the parent's address
1326 1326                   * space.  The parent process is made dispatchable.
1327 1327                   *
1328 1328                   * This is a delicate locking maneuver, involving
1329 1329                   * both the parent's p_lock and the child's p_lock.
1330 1330                   * As soon as the SVFORK flag is turned off, the
1331 1331                   * parent is free to run, but it must not run until
1332 1332                   * we wake it up using its p_cv because it might
1333 1333                   * exit and we would be referencing invalid memory.
1334 1334                   * Therefore, we hold the parent with its p_lock
1335 1335                   * while protecting our p_flag with our own p_lock.
1336 1336                   */
1337 1337  try_again:
1338 1338                  mutex_enter(&p->p_lock);        /* grab child's lock first */
1339 1339                  prbarrier(p);           /* make sure /proc is blocked out */
1340 1340                  mutex_enter(&pp->p_lock);
1341 1341  
1342 1342                  /*
1343 1343                   * Check if parent is locked by /proc.
1344 1344                   */
1345 1345                  if (pp->p_proc_flag & P_PR_LOCK) {
1346 1346                          /*
1347 1347                           * Delay until /proc is done with the parent.
1348 1348                           * We must drop our (the child's) p->p_lock, wait
1349 1349                           * via prbarrier() on the parent, then start over.
1350 1350                           */
1351 1351                          mutex_exit(&p->p_lock);
1352 1352                          prbarrier(pp);
1353 1353                          mutex_exit(&pp->p_lock);
1354 1354                          goto try_again;
1355 1355                  }
1356 1356                  p->p_flag &= ~SVFORK;
1357 1357                  kpreempt_disable();
1358 1358                  p->p_as = &kas;
1359 1359  
1360 1360                  /*
1361 1361                   * notify hat of change in thread's address space
1362 1362                   */
1363 1363                  hat_thread_exit(curthread);
1364 1364                  kpreempt_enable();
1365 1365  
1366 1366                  /*
1367 1367                   * Child sizes are copied back to the parent because
1368 1368                   * the child may have grown.
1369 1369                   */
1370 1370                  pp->p_brkbase = p->p_brkbase;
1371 1371                  pp->p_brksize = p->p_brksize;
1372 1372                  pp->p_stksize = p->p_stksize;
1373 1373  
1374 1374                  /*
1375 1375                   * Copy back the shm accounting information
1376 1376                   * to the parent process.
1377 1377                   */
1378 1378                  pp->p_segacct = p->p_segacct;
1379 1379                  p->p_segacct = NULL;
1380 1380  
1381 1381                  /*
1382 1382                   * The parent is no longer waiting for the vfork()d child.
1383 1383                   * Restore the parent's watched pages, if any.  This is
1384 1384                   * safe because we know the parent is not locked by /proc
1385 1385                   * safe because we know the parent is not locked by /proc.
1386 1386                  pp->p_flag &= ~SVFWAIT;
1387 1387                  if (avl_numnodes(&pp->p_wpage) != 0) {
1388 1388                          pp->p_as->a_wpage = pp->p_wpage;
1389 1389                          avl_create(&pp->p_wpage, wp_compare,
1390 1390                              sizeof (struct watched_page),
1391 1391                              offsetof(struct watched_page, wp_link));
1392 1392                  }
1393 1393                  cv_signal(&pp->p_cv);
1394 1394                  mutex_exit(&pp->p_lock);
1395 1395                  mutex_exit(&p->p_lock);
1396 1396          } else {
1397 1397                  if (p->p_as != &kas) {
1398 1398                          struct as *as;
1399 1399  
1400 1400                          if (p->p_segacct)
1401 1401                                  shmexit(p);
1402 1402  
1403 1403                          /*
1404 1404                           * We grab p_lock for the benefit of /proc
1405 1405                           */
1406 1406                          kpreempt_disable();
1407 1407                          mutex_enter(&p->p_lock);
1408 1408                          prbarrier(p);   /* make sure /proc is blocked out */
1409 1409                          as = p->p_as;
1410 1410                          p->p_as = &kas;
1411 1411                          mutex_exit(&p->p_lock);
1412 1412  
1413 1413                          /*
1414 1414                           * notify hat of change in thread's address space
1415 1415                           */
1416 1416                          hat_thread_exit(curthread);
1417 1417                          kpreempt_enable();
1418 1418  
1419 1419                          as_free(as);
1420 1420                          p->p_tr_lgrpid = LGRP_NONE;
1421 1421                  }
1422 1422          }
1423 1423  }
1424 1424  
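[The try_again loop in relvm() is a general drop-and-retry pattern: take the locks in the fixed order (child first, then parent), and if the second object turns out to be busy, release everything, wait for it to become free, and restart from the top rather than sleeping with a lock held. A minimal pthread rendering of the pattern, with hypothetical names; the busy flag plays the role of P_PR_LOCK and the wait loop the role of prbarrier():

	#include <pthread.h>
	#include <stdbool.h>

	struct res {
		pthread_mutex_t	lock;
		pthread_cond_t	cv;
		bool		busy;		/* analogue of P_PR_LOCK */
	};

	static void
	with_child_and_parent(struct res *child, struct res *parent)
	{
		for (;;) {
			pthread_mutex_lock(&child->lock);	/* child first */
			pthread_mutex_lock(&parent->lock);
			if (!parent->busy)
				break;		/* holding both, parent free */
			/* Parent busy: drop the child, wait it out, retry. */
			pthread_mutex_unlock(&child->lock);
			while (parent->busy)
				pthread_cond_wait(&parent->cv, &parent->lock);
			pthread_mutex_unlock(&parent->lock);
		}
		/* ... transfer state under both locks ... */
		pthread_mutex_unlock(&parent->lock);
		pthread_mutex_unlock(&child->lock);
	}

Restarting from the top keeps the lock ordering invariant intact, which is what prevents this dance from deadlocking against /proc.]
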
1425 1425  /*
1426 1426   * Wait for child to exec or exit.
1427 1427   * Called by parent of vfork'ed process.
1428 1428   * See important comments in relvm(), above.
1429 1429   */
1430 1430  void
1431 1431  vfwait(pid_t pid)
1432 1432  {
1433 1433          int signalled = 0;
1434 1434          proc_t *pp = ttoproc(curthread);
1435 1435          proc_t *cp;
1436 1436  
1437 1437          /*
1438 1438           * Wait for child to exec or exit.
1439 1439           */
1440 1440          for (;;) {
1441 1441                  mutex_enter(&pidlock);
1442 1442                  cp = prfind(pid);
1443 1443                  if (cp == NULL || cp->p_parent != pp) {
1444 1444                          /*
1445 1445                           * Child has exit()ed.
1446 1446                           */
1447 1447                          mutex_exit(&pidlock);
1448 1448                          break;
1449 1449                  }
1450 1450                  /*
1451 1451                   * Grab the child's p_lock before releasing pidlock.
1452 1452                   * Otherwise, the child could exit and we would be
1453 1453                   * referencing invalid memory.
1454 1454                   */
1455 1455                  mutex_enter(&cp->p_lock);
1456 1456                  mutex_exit(&pidlock);
1457 1457                  if (!(cp->p_flag & SVFORK)) {
1458 1458                          /*
1459 1459                           * Child has exec()ed or is exit()ing.
1460 1460                           */
1461 1461                          mutex_exit(&cp->p_lock);
1462 1462                          break;
1463 1463                  }
1464 1464                  mutex_enter(&pp->p_lock);
1465 1465                  mutex_exit(&cp->p_lock);
1466 1466                  /*
1467 1467                   * We might be woken up spuriously from the cv_wait().
1468 1468                   * We have to do the whole operation over again to be
1469 1469                   * sure the child's SVFORK flag really is turned off.
1470 1470                   * We cannot make reference to the child because it can
1471 1471                   * exit before we return and we would be referencing
1472 1472                   * invalid memory.
1473 1473                   *
1474 1474                   * Because this is potentially a very long-term wait,
1475 1475                   * we call cv_wait_sig() (for its jobcontrol and /proc
1476 1476                   * side-effects) unless there is a current signal, in
1477 1477                   * which case we use cv_wait() because we cannot return
1478 1478                   * from this function until the child has released the
1479 1479                   * address space.  Calling cv_wait_sig() with a current
1480 1480                   * signal would lead to an indefinite loop here because
1481 1481                   * cv_wait_sig() returns immediately in this case.
1482 1482                   */
1483 1483                  if (signalled)
1484 1484                          cv_wait(&pp->p_cv, &pp->p_lock);
1485 1485                  else
1486 1486                          signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
1487 1487                  mutex_exit(&pp->p_lock);
1488 1488          }
1489 1489  
1490 1490          /* restore watchpoints to parent */
1491 1491          if (pr_watch_active(pp)) {
1492 1492                  struct as *as = pp->p_as;
1493 1493                  AS_LOCK_ENTER(as, RW_WRITER);
1494 1494                  as_setwatch(as);
1495 1495                  AS_LOCK_EXIT(as);
1496 1496          }
1497 1497  
1498 1498          mutex_enter(&pp->p_lock);
1499 1499          prbarrier(pp);  /* barrier against /proc locking */
1500 1500          continuelwps(pp);
1501 1501          mutex_exit(&pp->p_lock);
1502 1502  }
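
[The loop structure in vfwait() is the classic condition-variable discipline: never trust a wakeup, always re-evaluate the predicate from scratch under the lock before proceeding. Stripped of the /proc and signal subtleties, the pattern looks like this in POSIX threads, with hypothetical names; pthread_cond_wait(), like cv_wait(), may wake spuriously:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
	static bool child_released;	/* analogue of SVFORK being cleared */

	/* Called by the child once it has given the address space back. */
	void
	release_child(void)
	{
		pthread_mutex_lock(&m);
		child_released = true;
		pthread_cond_signal(&cv);	/* analogue of cv_signal() */
		pthread_mutex_unlock(&m);
	}

	/* Called by the parent; returns only when the predicate holds. */
	void
	wait_for_release(void)
	{
		pthread_mutex_lock(&m);
		while (!child_released)		/* re-check on every wakeup */
			pthread_cond_wait(&cv, &m);
		pthread_mutex_unlock(&m);
	}

The kernel version is more involved only because the predicate lives in the child (so vfwait() must re-find and re-lock the child each pass) and because it alternates cv_wait_sig()/cv_wait() to keep a pending signal from turning the loop into a busy spin.]
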
  