Print this page
    
OS-5598 newproc() performs inadequate clean-up after failed lwp_create()
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4818 contract template disappears on exec
OS-4825 cgroup user agent should be launched from the kernel
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4460 exec brands processes that still have multiple threads
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4151 setbrand hooks should be sane during fork
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4129 lxbrand should not abuse p_brand_data for storing exit signal
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/os/fork.c
          +++ new/usr/src/uts/common/os/fork.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2013, Joyent, Inc. All rights reserved.
       24 + * Copyright 2016, Joyent, Inc.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/param.h>
  32   32  #include <sys/sysmacros.h>
  33   33  #include <sys/signal.h>
  34   34  #include <sys/cred.h>
  35   35  #include <sys/policy.h>
  36   36  #include <sys/user.h>
  37   37  #include <sys/systm.h>
  38   38  #include <sys/cpuvar.h>
  39   39  #include <sys/vfs.h>
  40   40  #include <sys/vnode.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/errno.h>
  43   43  #include <sys/time.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/cmn_err.h>
  46   46  #include <sys/acct.h>
  47   47  #include <sys/tuneable.h>
  48   48  #include <sys/class.h>
  49   49  #include <sys/kmem.h>
  50   50  #include <sys/session.h>
  51   51  #include <sys/ucontext.h>
  52   52  #include <sys/stack.h>
  53   53  #include <sys/procfs.h>
  54   54  #include <sys/prsystm.h>
  55   55  #include <sys/vmsystm.h>
  56   56  #include <sys/vtrace.h>
  57   57  #include <sys/debug.h>
  58   58  #include <sys/shm_impl.h>
  59   59  #include <sys/door_data.h>
  60   60  #include <vm/as.h>
  61   61  #include <vm/rm.h>
  62   62  #include <c2/audit.h>
  63   63  #include <sys/var.h>
  64   64  #include <sys/schedctl.h>
  65   65  #include <sys/utrap.h>
  66   66  #include <sys/task.h>
  67   67  #include <sys/resource.h>
  68   68  #include <sys/cyclic.h>
  69   69  #include <sys/lgrp.h>
  70   70  #include <sys/rctl.h>
  71   71  #include <sys/contract_impl.h>
  72   72  #include <sys/contract/process_impl.h>
  73   73  #include <sys/list.h>
  74   74  #include <sys/dtrace.h>
  75   75  #include <sys/pool.h>
  76   76  #include <sys/zone.h>
  
    | 
      ↓ open down ↓ | 
    42 lines elided | 
    
      ↑ open up ↑ | 
  
  77   77  #include <sys/sdt.h>
  78   78  #include <sys/class.h>
  79   79  #include <sys/corectl.h>
  80   80  #include <sys/brand.h>
  81   81  #include <sys/fork.h>
  82   82  
  83   83  static int64_t cfork(int, int, int);
  84   84  static int getproc(proc_t **, pid_t, uint_t);
  85   85  #define GETPROC_USER    0x0
  86   86  #define GETPROC_KERNEL  0x1
       87 +#define GETPROC_ZSCHED  0x2
  87   88  
  88   89  static void fork_fail(proc_t *);
  89   90  static void forklwp_fail(proc_t *);
  90   91  
  91   92  int fork_fail_pending;
  92   93  
  93   94  extern struct kmem_cache *process_cache;
  94   95  
  95   96  /*
  96   97   * The vfork() system call trap is no longer invoked by libc.
  97   98   * It is retained only for the benefit of applications running
  98   99   * within a solaris10 branded zone.  It should be eliminated
  99  100   * when we no longer support solaris10 branded zones.
 100  101   */
 101  102  int64_t
 102  103  vfork(void)
 103  104  {
 104  105          curthread->t_post_sys = 1;      /* so vfwait() will be called */
 105  106          return (cfork(1, 1, 0));        /* isvfork = 1, isfork1 = 1, flags = 0 */
 106  107  }
 107  108  
 108  109  /*
 109  110   * forksys system call - forkx, forkallx, vforkx.  This is the
 110  111   * interface invoked by libc for fork1(), forkall(), and vfork().
 111  112   */
 112  113  int64_t
 113  114  forksys(int subcode, int flags)
 114  115  {
 115  116          switch (subcode) {              /* subcode picks the fork variant */
 116  117          case 0:
 117  118                  return (cfork(0, 1, flags));    /* forkx(flags) */
  
    | 
      ↓ open down ↓ | 
    21 lines elided | 
    
      ↑ open up ↑ | 
  
 118  119          case 1:
 119  120                  return (cfork(0, 0, flags));    /* forkallx(flags) */
 120  121          case 2:
 121  122                  curthread->t_post_sys = 1;      /* so vfwait() will be called */
 122  123                  return (cfork(1, 1, flags));    /* vforkx(flags) */
 123  124          default:                        /* unrecognized subcode */
 124  125                  return ((int64_t)set_errno(EINVAL));
 125  126          }
 126  127  }
 127  128  
      129 +/*
      130 + * Remove the associations of a child process from its parent and siblings.
      131 + */
      132 +static void
      133 +disown_proc(proc_t *pp, proc_t *cp)
      134 +{
      135 +        proc_t **orphpp;        /* cursor over the links of pp's orphan list */
      136 +
      137 +        ASSERT(MUTEX_HELD(&pidlock));   /* caller must already hold pidlock */
      138 +
      139 +        orphpp = &pp->p_orphan;
      140 +        while (*orphpp != cp)   /* no NULL check: assumes cp is on the list */
      141 +                orphpp = &(*orphpp)->p_nextorph;
      142 +        *orphpp = cp->p_nextorph;       /* unlink cp from the orphan list */
      143 +
      144 +        if (pp->p_child == cp)  /* cp is what pp->p_child points at */
      145 +                pp->p_child = cp->p_sibling;
      146 +        if (cp->p_sibling)      /* fix back-link of the next sibling */
      147 +                cp->p_sibling->p_psibling = cp->p_psibling;
      148 +        if (cp->p_psibling)     /* fix forward-link of the previous sibling */
      149 +                cp->p_psibling->p_sibling = cp->p_sibling;
      150 +}
      151 +
 128  152  /* ARGSUSED */
 129  153  static int64_t
 130  154  cfork(int isvfork, int isfork1, int flags)
 131  155  {
 132  156          proc_t *p = ttoproc(curthread);
 133  157          struct as *as;
 134      -        proc_t *cp, **orphpp;
      158 +        proc_t *cp;
 135  159          klwp_t *clone;
 136  160          kthread_t *t;
 137  161          task_t *tk;
 138  162          rval_t  r;
 139  163          int error;
 140  164          int i;
 141  165          rctl_set_t *dup_set;
 142  166          rctl_alloc_gp_t *dup_gp;
 143  167          rctl_entity_p_t e;
 144  168          lwpdir_t *ldp;
 145  169          lwpent_t *lep;
 146  170          lwpent_t *clep;
 147  171  
 148  172          /*
 149  173           * Allow only these two flags.
 150  174           */
 151  175          if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
 152  176                  error = EINVAL;
 153  177                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 154  178                  goto forkerr;
 155  179          }
 156  180  
 157  181          /*
 158  182           * fork is not supported for the /proc agent lwp.
 159  183           */
 160  184          if (curthread == p->p_agenttp) {
 161  185                  error = ENOTSUP;
 162  186                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 163  187                  goto forkerr;
 164  188          }
 165  189  
 166  190          if ((error = secpolicy_basic_fork(CRED())) != 0) {
 167  191                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 168  192                  goto forkerr;
 169  193          }
 170  194  
 171  195          /*
 172  196           * If the calling lwp is doing a fork1() then the
 173  197           * other lwps in this process are not duplicated and
 174  198           * don't need to be held where their kernel stacks can be
 175  199           * cloned.  If doing forkall(), the process is held with
 176  200           * SHOLDFORK, so that the lwps are at a point where their
 177  201           * stacks can be copied which is on entry or exit from
 178  202           * the kernel.
 179  203           */
 180  204          if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
 181  205                  aston(curthread);
 182  206                  error = EINTR;
 183  207                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 184  208                  goto forkerr;
 185  209          }
 186  210  
 187  211  #if defined(__sparc)
 188  212          /*
 189  213           * Ensure that the user stack is fully constructed
 190  214           * before creating the child process structure.
 191  215           */
 192  216          (void) flush_user_windows_to_stack(NULL);
 193  217  #endif
 194  218  
 195  219          mutex_enter(&p->p_lock);
 196  220          /*
 197  221           * If this is vfork(), cancel any suspend request we might
 198  222           * have gotten from some other thread via lwp_suspend().
 199  223           * Otherwise we could end up with a deadlock on return
 200  224           * from the vfork() in both the parent and the child.
 201  225           */
 202  226          if (isvfork)
 203  227                  curthread->t_proc_flag &= ~TP_HOLDLWP;
 204  228          /*
 205  229           * Prevent our resource set associations from being changed during fork.
 206  230           */
 207  231          pool_barrier_enter();
 208  232          mutex_exit(&p->p_lock);
 209  233  
 210  234          /*
 211  235           * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 212  236           */
 213  237          if (getproc(&cp, 0, GETPROC_USER) < 0) {
 214  238                  mutex_enter(&p->p_lock);
 215  239                  pool_barrier_exit();
 216  240                  continuelwps(p);
 217  241                  mutex_exit(&p->p_lock);
 218  242                  error = EAGAIN;
 219  243                  goto forkerr;
 220  244          }
 221  245  
 222  246          TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);
 223  247  
 224  248          /*
 225  249           * Assign an address space to child
 226  250           */
 227  251          if (isvfork) {
 228  252                  /*
 229  253                   * Clear any watched areas and remember the
 230  254                   * watched pages for restoring in vfwait().
 231  255                   */
 232  256                  as = p->p_as;
 233  257                  if (avl_numnodes(&as->a_wpage) != 0) {
 234  258                          AS_LOCK_ENTER(as, RW_WRITER);
 235  259                          as_clearwatch(as);
 236  260                          p->p_wpage = as->a_wpage;
 237  261                          avl_create(&as->a_wpage, wp_compare,
 238  262                              sizeof (struct watched_page),
 239  263                              offsetof(struct watched_page, wp_link));
 240  264                          AS_LOCK_EXIT(as);
 241  265                  }
 242  266                  cp->p_as = as;
 243  267                  cp->p_flag |= SVFORK;
 244  268  
 245  269                  /*
 246  270                   * Use the parent's shm segment list information for
 247  271                   * the child as it uses its address space till it execs.
 248  272                   */
 249  273                  cp->p_segacct = p->p_segacct;
 250  274          } else {
 251  275                  /*
 252  276                   * We need to hold P_PR_LOCK until the address space has
 253  277                   * been duplicated and we've had a chance to remove from the
 254  278                   * child any DTrace probes that were in the parent. Holding
 255  279                   * P_PR_LOCK prevents any new probes from being added and any
 256  280                   * extant probes from being removed.
 257  281                   */
 258  282                  mutex_enter(&p->p_lock);
  
    | 
      ↓ open down ↓ | 
    114 lines elided | 
    
      ↑ open up ↑ | 
  
 259  283                  sprlock_proc(p);
 260  284                  p->p_flag |= SFORKING;
 261  285                  mutex_exit(&p->p_lock);
 262  286  
 263  287                  error = as_dup(p->p_as, cp);
 264  288                  if (error != 0) {
 265  289                          mutex_enter(&p->p_lock);
 266  290                          sprunlock(p);
 267  291                          fork_fail(cp);
 268  292                          mutex_enter(&pidlock);
 269      -                        orphpp = &p->p_orphan;
 270      -                        while (*orphpp != cp)
 271      -                                orphpp = &(*orphpp)->p_nextorph;
 272      -                        *orphpp = cp->p_nextorph;
 273      -                        if (p->p_child == cp)
 274      -                                p->p_child = cp->p_sibling;
 275      -                        if (cp->p_sibling)
 276      -                                cp->p_sibling->p_psibling = cp->p_psibling;
 277      -                        if (cp->p_psibling)
 278      -                                cp->p_psibling->p_sibling = cp->p_sibling;
      293 +                        disown_proc(p, cp);
 279  294                          mutex_enter(&cp->p_lock);
 280  295                          tk = cp->p_task;
 281  296                          task_detach(cp);
 282  297                          ASSERT(cp->p_pool->pool_ref > 0);
 283  298                          atomic_dec_32(&cp->p_pool->pool_ref);
 284  299                          mutex_exit(&cp->p_lock);
 285  300                          pid_exit(cp, tk);
 286  301                          mutex_exit(&pidlock);
 287  302                          task_rele(tk);
 288  303  
 289  304                          mutex_enter(&p->p_lock);
 290  305                          p->p_flag &= ~SFORKING;
 291  306                          pool_barrier_exit();
 292  307                          continuelwps(p);
 293  308                          mutex_exit(&p->p_lock);
 294  309                          /*
 295  310                           * Preserve ENOMEM error condition but
 296  311                           * map all others to EAGAIN.
 297  312                           */
 298  313                          error = (error == ENOMEM) ? ENOMEM : EAGAIN;
 299  314                          atomic_inc_32(&p->p_zone->zone_ffnomem);
 300  315                          goto forkerr;
 301  316                  }
 302  317  
 303  318                  /*
 304  319                   * Remove all DTrace tracepoints from the child process. We
 305  320                   * need to do this _before_ duplicating USDT providers since
 306  321                   * any associated probes may be immediately enabled.
 307  322                   */
 308  323                  if (p->p_dtrace_count > 0)
 309  324                          dtrace_fasttrap_fork(p, cp);
 310  325  
 311  326                  mutex_enter(&p->p_lock);
 312  327                  sprunlock(p);
 313  328  
 314  329                  /* Duplicate parent's shared memory */
 315  330                  if (p->p_segacct)
 316  331                          shmfork(p, cp);
 317  332  
 318  333                  /*
 319  334                   * Duplicate any helper actions and providers. The SFORKING
 320  335                   * flag we set above tells the code that enables USDT probes
 321  336                   * that sprlock() may fail because the child is being forked.
 322  337                   */
 323  338                  if (p->p_dtrace_helpers != NULL) {
 324  339                          ASSERT(dtrace_helpers_fork != NULL);
 325  340                          (*dtrace_helpers_fork)(p, cp);
 326  341                  }
 327  342  
 328  343                  mutex_enter(&p->p_lock);
 329  344                  p->p_flag &= ~SFORKING;
 330  345                  mutex_exit(&p->p_lock);
 331  346          }
 332  347  
 333  348          /*
 334  349           * Duplicate parent's resource controls.
 335  350           */
 336  351          dup_set = rctl_set_create();
 337  352          for (;;) {
 338  353                  dup_gp = rctl_set_dup_prealloc(p->p_rctls);
 339  354                  mutex_enter(&p->p_rctls->rcs_lock);
 340  355                  if (rctl_set_dup_ready(p->p_rctls, dup_gp))
 341  356                          break;
 342  357                  mutex_exit(&p->p_rctls->rcs_lock);
 343  358                  rctl_prealloc_destroy(dup_gp);
 344  359          }
 345  360          e.rcep_p.proc = cp;
 346  361          e.rcep_t = RCENTITY_PROCESS;
 347  362          cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
 348  363              RCD_DUP | RCD_CALLBACK);
 349  364          mutex_exit(&p->p_rctls->rcs_lock);
 350  365  
 351  366          rctl_prealloc_destroy(dup_gp);
 352  367  
 353  368          /*
 354  369           * Allocate the child's lwp directory and lwpid hash table.
 355  370           */
 356  371          if (isfork1)
 357  372                  cp->p_lwpdir_sz = 2;
 358  373          else
 359  374                  cp->p_lwpdir_sz = p->p_lwpdir_sz;
 360  375          cp->p_lwpdir = cp->p_lwpfree = ldp =
 361  376              kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
 362  377          for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
 363  378                  ldp->ld_next = ldp + 1;
 364  379          cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
 365  380          cp->p_tidhash =
 366  381              kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);
 367  382  
 368  383          /*
 369  384           * Duplicate parent's lwps.
 370  385           * Mutual exclusion is not needed because the process is
 371  386           * in the hold state and only the current lwp is running.
 372  387           */
 373  388          klgrpset_clear(cp->p_lgrpset);
 374  389          if (isfork1) {
 375  390                  clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
 376  391                  if (clone == NULL)
 377  392                          goto forklwperr;
 378  393                  /*
 379  394                   * Inherit only the lwp_wait()able flag.
 380  395                   * Daemon threads should not call fork1(), but oh well...
 381  396                   */
 382  397                  lwptot(clone)->t_proc_flag |=
 383  398                      (curthread->t_proc_flag & TP_TWAIT);
 384  399          } else {
 385  400                  /* this is forkall(), no one can be in lwp_wait() */
 386  401                  ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
 387  402                  /* for each entry in the parent's lwp directory... */
 388  403                  for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
 389  404                          klwp_t *clwp;
 390  405                          kthread_t *ct;
 391  406  
 392  407                          if ((lep = ldp->ld_entry) == NULL)
 393  408                                  continue;
 394  409  
 395  410                          if ((t = lep->le_thread) != NULL) {
 396  411                                  clwp = forklwp(ttolwp(t), cp, t->t_tid);
 397  412                                  if (clwp == NULL)
 398  413                                          goto forklwperr;
 399  414                                  ct = lwptot(clwp);
 400  415                                  /*
 401  416                                   * Inherit lwp_wait()able and daemon flags.
 402  417                                   */
 403  418                                  ct->t_proc_flag |=
 404  419                                      (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
 405  420                                  /*
 406  421                                   * Keep track of the clone of curthread to
 407  422                                   * post return values through lwp_setrval().
 408  423                                   * Mark other threads for special treatment
 409  424                                   * by lwp_rtt() / post_syscall().
 410  425                                   */
 411  426                                  if (t == curthread)
 412  427                                          clone = clwp;
 413  428                                  else
 414  429                                          ct->t_flag |= T_FORKALL;
 415  430                          } else {
 416  431                                  /*
 417  432                                   * Replicate zombie lwps in the child.
 418  433                                   */
 419  434                                  clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
 420  435                                  clep->le_lwpid = lep->le_lwpid;
 421  436                                  clep->le_start = lep->le_start;
 422  437                                  lwp_hash_in(cp, clep,
 423  438                                      cp->p_tidhash, cp->p_tidhash_sz, 0);
 424  439                          }
 425  440                  }
 426  441          }
 427  442  
 428  443          /*
 429  444           * Put new process in the parent's process contract, or put it
 430  445           * in a new one if there is an active process template.  Send a
 431  446           * fork event (if requested) to whatever contract the child is
 432  447           * a member of.  Fails if the parent has been SIGKILLed.
 433  448           */
 434  449          if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL) {
 435  450                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 436  451                  goto forklwperr;
 437  452          }
 438  453  
 439  454          /*
 440  455           * No fork failures occur beyond this point.
 441  456           */
 442  457  
 443  458          cp->p_lwpid = p->p_lwpid;
 444  459          if (!isfork1) {
 445  460                  cp->p_lwpdaemon = p->p_lwpdaemon;
 446  461                  cp->p_zombcnt = p->p_zombcnt;
 447  462                  /*
 448  463                   * If the parent's lwp ids have wrapped around, so have the
 449  464                   * child's.
 450  465                   */
 451  466                  cp->p_flag |= p->p_flag & SLWPWRAP;
 452  467          }
 453  468  
 454  469          mutex_enter(&p->p_lock);
 455  470          corectl_path_hold(cp->p_corefile = p->p_corefile);
 456  471          corectl_content_hold(cp->p_content = p->p_content);
 457  472          mutex_exit(&p->p_lock);
 458  473  
 459  474          /*
 460  475           * Duplicate process context ops, if any.
 461  476           */
 462  477          if (p->p_pctx)
 463  478                  forkpctx(p, cp);
 464  479  
 465  480  #ifdef __sparc
 466  481          utrap_dup(p, cp);
 467  482  #endif
 468  483          /*
 469  484           * If the child process has been marked to stop on exit
 470  485           * from this fork, arrange for all other lwps to stop in
 471  486           * sympathy with the active lwp.
 472  487           */
 473  488          if (PTOU(cp)->u_systrap &&
 474  489              prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
 475  490                  mutex_enter(&cp->p_lock);
 476  491                  t = cp->p_tlist;
 477  492                  do {
 478  493                          t->t_proc_flag |= TP_PRSTOP;
 479  494                          aston(t);       /* so TP_PRSTOP will be seen */
 480  495                  } while ((t = t->t_forw) != cp->p_tlist);
 481  496                  mutex_exit(&cp->p_lock);
 482  497          }
 483  498          /*
 484  499           * If the parent process has been marked to stop on exit
 485  500           * from this fork, and its asynchronous-stop flag has not
 486  501           * been set, arrange for all other lwps to stop before
 487  502           * they return back to user level.
 488  503           */
 489  504          if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
 490  505              prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
 491  506                  mutex_enter(&p->p_lock);
 492  507                  t = p->p_tlist;
 493  508                  do {
 494  509                          t->t_proc_flag |= TP_PRSTOP;
 495  510                          aston(t);       /* so TP_PRSTOP will be seen */
 496  511                  } while ((t = t->t_forw) != p->p_tlist);
 497  512                  mutex_exit(&p->p_lock);
 498  513          }
 499  514  
 500  515          if (PROC_IS_BRANDED(p))
 501  516                  BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
 502  517          else
 503  518                  lwp_setrval(clone, p->p_pid, 1);
 504  519  
 505  520          /* set return values for parent */
 506  521          r.r_val1 = (int)cp->p_pid;
 507  522          r.r_val2 = 0;
 508  523  
 509  524          /*
 510  525           * pool_barrier_exit() can now be called because the child process has:
 511  526           * - all identifying features cloned or set (p_pid, p_task, p_pool)
 512  527           * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
 513  528           * - any other fields set which are used in resource set binding.
 514  529           */
 515  530          mutex_enter(&p->p_lock);
 516  531          pool_barrier_exit();
 517  532          mutex_exit(&p->p_lock);
 518  533  
 519  534          mutex_enter(&pidlock);
 520  535          mutex_enter(&cp->p_lock);
 521  536  
 522  537          /*
 523  538           * Set flags telling the child what (not) to do on exit.
 524  539           */
 525  540          if (flags & FORK_NOSIGCHLD)
 526  541                  cp->p_pidflag |= CLDNOSIGCHLD;
 527  542          if (flags & FORK_WAITPID)
 528  543                  cp->p_pidflag |= CLDWAITPID;
 529  544  
 530  545          /*
 531  546           * Now that there are lwps and threads attached, add the new
 532  547           * process to the process group.
 533  548           */
 534  549          pgjoin(cp, p->p_pgidp);
 535  550          cp->p_stat = SRUN;
 536  551          /*
 537  552           * We are now done with all the lwps in the child process.
 538  553           */
 539  554          t = cp->p_tlist;
 540  555          do {
 541  556                  /*
 542  557                   * Set the lwp_suspend()ed lwps running.
 543  558                   * They will suspend properly at syscall exit.
 544  559                   */
 545  560                  if (t->t_proc_flag & TP_HOLDLWP)
 546  561                          lwp_create_done(t);
 547  562                  else {
 548  563                          /* set TS_CREATE to allow continuelwps() to work */
 549  564                          thread_lock(t);
 550  565                          ASSERT(t->t_state == TS_STOPPED &&
 551  566                              !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
 552  567                          t->t_schedflag |= TS_CREATE;
 553  568                          thread_unlock(t);
 554  569                  }
 555  570          } while ((t = t->t_forw) != cp->p_tlist);
 556  571          mutex_exit(&cp->p_lock);
 557  572  
 558  573          if (isvfork) {
 559  574                  CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
 560  575                  mutex_enter(&p->p_lock);
 561  576                  p->p_flag |= SVFWAIT;
 562  577                  curthread->t_flag |= T_VFPARENT;
 563  578                  DTRACE_PROC1(create, proc_t *, cp);
 564  579                  cv_broadcast(&pr_pid_cv[p->p_slot]);    /* inform /proc */
 565  580                  mutex_exit(&p->p_lock);
 566  581                  /*
 567  582                   * Grab child's p_lock before dropping pidlock to ensure
 568  583                   * the process will not disappear before we set it running.
 569  584                   */
 570  585                  mutex_enter(&cp->p_lock);
 571  586                  mutex_exit(&pidlock);
 572  587                  sigdefault(cp);
 573  588                  continuelwps(cp);
 574  589                  mutex_exit(&cp->p_lock);
 575  590          } else {
 576  591                  CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
 577  592                  DTRACE_PROC1(create, proc_t *, cp);
 578  593                  /*
 579  594                   * It is CL_FORKRET's job to drop pidlock.
 580  595                   * If we do it here, the process could be set running
 581  596                   * and disappear before CL_FORKRET() is called.
 582  597                   */
 583  598                  CL_FORKRET(curthread, cp->p_tlist);
 584  599                  schedctl_set_cidpri(curthread);
 585  600                  ASSERT(MUTEX_NOT_HELD(&pidlock));
 586  601          }
 587  602  
 588  603          return (r.r_vals);
 589  604  
 590  605  forklwperr:
 591  606          if (isvfork) {
 592  607                  if (avl_numnodes(&p->p_wpage) != 0) {
 593  608                          /* restore watchpoints to parent */
 594  609                          as = p->p_as;
 595  610                          AS_LOCK_ENTER(as, RW_WRITER);
 596  611                          as->a_wpage = p->p_wpage;
 597  612                          avl_create(&p->p_wpage, wp_compare,
 598  613                              sizeof (struct watched_page),
 599  614                              offsetof(struct watched_page, wp_link));
 600  615                          as_setwatch(as);
 601  616                          AS_LOCK_EXIT(as);
 602  617                  }
 603  618          } else {
 604  619                  if (cp->p_segacct)
 605  620                          shmexit(cp);
 606  621                  as = cp->p_as;
 607  622                  cp->p_as = &kas;
 608  623                  as_free(as);
 609  624          }
 610  625  
 611  626          if (cp->p_lwpdir) {
 612  627                  for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
 613  628                          if ((lep = ldp->ld_entry) != NULL)
 614  629                                  kmem_free(lep, sizeof (*lep));
 615  630                  kmem_free(cp->p_lwpdir,
 616  631                      cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
 617  632          }
 618  633          cp->p_lwpdir = NULL;
 619  634          cp->p_lwpfree = NULL;
 620  635          cp->p_lwpdir_sz = 0;
 621  636  
 622  637          if (cp->p_tidhash)
 623  638                  kmem_free(cp->p_tidhash,
 624  639                      cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
 625  640          cp->p_tidhash = NULL;
 626  641          cp->p_tidhash_sz = 0;
 627  642  
 628  643          forklwp_fail(cp);
 629  644          fork_fail(cp);
 630  645          rctl_set_free(cp->p_rctls);
 631  646          mutex_enter(&pidlock);
 632  647  
  
    | 
      ↓ open down ↓ | 
    344 lines elided | 
    
      ↑ open up ↑ | 
  
 633  648          /*
 634  649           * Detach failed child from task.
 635  650           */
 636  651          mutex_enter(&cp->p_lock);
 637  652          tk = cp->p_task;
 638  653          task_detach(cp);
 639  654          ASSERT(cp->p_pool->pool_ref > 0);
 640  655          atomic_dec_32(&cp->p_pool->pool_ref);
 641  656          mutex_exit(&cp->p_lock);
 642  657  
 643      -        orphpp = &p->p_orphan;
 644      -        while (*orphpp != cp)
 645      -                orphpp = &(*orphpp)->p_nextorph;
 646      -        *orphpp = cp->p_nextorph;
 647      -        if (p->p_child == cp)
 648      -                p->p_child = cp->p_sibling;
 649      -        if (cp->p_sibling)
 650      -                cp->p_sibling->p_psibling = cp->p_psibling;
 651      -        if (cp->p_psibling)
 652      -                cp->p_psibling->p_sibling = cp->p_sibling;
      658 +        disown_proc(p, cp);
 653  659          pid_exit(cp, tk);
 654  660          mutex_exit(&pidlock);
 655  661  
 656  662          task_rele(tk);
 657  663  
 658  664          mutex_enter(&p->p_lock);
 659  665          pool_barrier_exit();
 660  666          continuelwps(p);
 661  667          mutex_exit(&p->p_lock);
 662  668          error = EAGAIN;
 663  669  forkerr:
 664  670          return ((int64_t)set_errno(error));
 665  671  }
 666  672  
 667  673  /*
 668  674   * Free allocated resources from getproc() if a fork failed.
            *
            * cp is the child process being torn down.  In order, this calls
            * fcnt_add(-1) on the child's file info, calls sigdelq() for the
            * child, decrements the per-uid process count under pidlock, frees
            * the credential reference, frees the fi_list array, releases the
            * directory and executable vnode holds and the cwd refstr, and clears
            * the brand if the child is branded.
            *
            * NOTE(review): the u_cdir, u_rdir and u_cwd references are released
            * through PTOU(curproc), while getproc() takes them through PTOU(pp),
            * where pp may be &p0 or the zone's zsched rather than curproc.
            * Confirm these refer to the same objects for every caller.
 669  675   */
 670  676  static void
 671  677  fork_fail(proc_t *cp)
 672  678  {
 673  679          uf_info_t *fip = P_FINFO(cp);
 674  680  
 675  681          fcnt_add(fip, -1);
 676  682          sigdelq(cp, NULL, 0);
 677  683  
                   /* The per-uid process count is adjusted with pidlock held. */
 678  684          mutex_enter(&pidlock);
 679  685          upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
 680  686          mutex_exit(&pidlock);
 681  687  
 682  688          /*
 683  689           * single threaded, so no locking needed here
 684  690           */
 685  691          crfree(cp->p_cred);
 686  692  
                   /* fi_list is sized as fi_nfiles entries of uf_entry_t. */
 687  693          kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
 688  694  
  
    | 
      ↓ open down ↓ | 
    26 lines elided | 
    
      ↑ open up ↑ | 
  
 689  695          VN_RELE(PTOU(curproc)->u_cdir);
 690  696          if (PTOU(curproc)->u_rdir)
 691  697                  VN_RELE(PTOU(curproc)->u_rdir);
 692  698          if (cp->p_exec)
 693  699                  VN_RELE(cp->p_exec);
 694  700          if (cp->p_execdir)
 695  701                  VN_RELE(cp->p_execdir);
 696  702          if (PTOU(curproc)->u_cwd)
 697  703                  refstr_rele(PTOU(curproc)->u_cwd);
 698  704          if (PROC_IS_BRANDED(cp)) {
 699      -                brand_clearbrand(cp, B_TRUE);
      705 +                brand_clearbrand(cp, B_FALSE);
 700  706          }
 701  707  }
 702  708  
 703  709  /*
 704  710   * Clean up the lwps already created for this child process.
 705  711   * The fork failed while duplicating all the lwps of the parent
 706  712   * and those lwps already created must be freed.
 707  713   * This process is invisible to the rest of the system,
 708  714   * so we don't need to hold p->p_lock to protect the list.
            *
            * p is the failed child.  Each thread on p->p_tlist is unlinked from
            * that list, backed out of the task, project and zone lwp counts,
            * handed to the brand's b_freelwp hook (branded processes only),
            * stripped of any door data, passed to lwp_ctmpl_clear(), removed
            * from the all-threads list under pidlock, and finally freed.
 709  715   */
 710  716  static void
 711  717  forklwp_fail(proc_t *p)
 712  718  {
 713  719          kthread_t *t;
 714  720          task_t *tk;
 715  721          int branded = 0;
 716  722  
                   /* Sampled once, before any thread is torn down. */
 717  723          if (PROC_IS_BRANDED(p))
 718  724                  branded = 1;
 719  725  
 720  726          while ((t = p->p_tlist) != NULL) {
 721  727                  /*
 722  728                   * First remove the lwp from the process's p_tlist.
 723  729                   */
 724  730                  if (t != t->t_forw)
 725  731                          p->p_tlist = t->t_forw;
 726  732                  else
 727  733                          p->p_tlist = NULL;
 728  734                  p->p_lwpcnt--;
 729  735                  t->t_forw->t_back = t->t_back;
 730  736                  t->t_back->t_forw = t->t_forw;
 731  737  
                           /*
                            * Back this lwp out of the task, project and zone
                            * lwp counts, all under zone_nlwps_lock.
                            */
 732  738                  tk = p->p_task;
 733  739                  mutex_enter(&p->p_zone->zone_nlwps_lock);
 734  740                  tk->tk_nlwps--;
 735  741                  tk->tk_proj->kpj_nlwps--;
 736  742                  p->p_zone->zone_nlwps--;
 737  743                  mutex_exit(&p->p_zone->zone_nlwps_lock);
  
    | 
      ↓ open down ↓ | 
    28 lines elided | 
    
      ↑ open up ↑ | 
  
 738  744  
 739  745                  ASSERT(t->t_schedctl == NULL);
 740  746  
                           /* Branded processes get their b_freelwp hook called. */
 741  747                  if (branded)
 742  748                          BROP(p)->b_freelwp(ttolwp(t));
 743  749  
 744  750                  if (t->t_door != NULL) {
 745  751                          kmem_free(t->t_door, sizeof (door_data_t));
 746  752                          t->t_door = NULL;
 747  753                  }
 748      -                lwp_ctmpl_clear(ttolwp(t));
      754 +                lwp_ctmpl_clear(ttolwp(t), B_FALSE);
 749  755  
 750  756                  /*
 751  757                   * Remove the thread from the all threads list.
 752  758                   * We need to hold pidlock for this.
 753  759                   */
 754  760                  mutex_enter(&pidlock);
 755  761                  t->t_next->t_prev = t->t_prev;
 756  762                  t->t_prev->t_next = t->t_next;
 757  763                  CL_EXIT(t);     /* tell the scheduler that we're exiting */
 758  764                  cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
 759  765                  mutex_exit(&pidlock);
 760  766  
 761  767                  /*
 762  768                   * Let the lgroup load averages know that this thread isn't
 763  769                   * going to show up (i.e. un-do what was done on behalf of
 764  770                   * this thread by the earlier lgrp_move_thread()).
 765  771                   */
 766  772                  kpreempt_disable();
 767  773                  lgrp_move_thread(t, NULL, 1);
 768  774                  kpreempt_enable();
 769  775  
 770  776                  /*
 771  777                   * The thread was created TS_STOPPED.
 772  778                   * We change it to TS_FREE to avoid an
 773  779                   * ASSERT() panic in thread_free().
 774  780                   */
  
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
  
 775  781                  t->t_state = TS_FREE;
 776  782                  thread_rele(t);
 777  783                  thread_free(t);
 778  784          }
 779  785  }
 780  786  
 781  787  extern struct as kas;
 782  788  
 783  789  /*
 784  790   * fork a kernel process.
      791 + *
      792 + * Passing a pid argument of -1 indicates that the new process should be
      793 + * launched as a child of 'zsched' within the zone.
            *
            * Despite the summary line, two kinds of process are created here,
            * selected by CLASS_KERNEL(cid):
            *
            *  - kernel class: getproc() is called with GETPROC_KERNEL and the
            *    single lwp comes from lwp_kernel_create().
            *  - any other class: getproc() is called with GETPROC_USER (or
            *    GETPROC_ZSCHED when pid is negative), the process is moved into
            *    a newly created task, the lwp comes from lwp_create(), and a
            *    process contract is created via contract_process_fork().
            *
            * Arguments:
            *   pc, arg   handed to lwp_kernel_create() or lwp_create()
            *   cid       scheduling class id; must not be sysdccid
            *   pri       priority handed to the lwp creation routine
            *   ct        if non-NULL, receives a pointer to the new process
            *             contract (non-kernel path only; must be NULL when cid
            *             is syscid)
            *   pid       pid handed to getproc(); any negative value selects the
            *             zsched-child behaviour, in which case getproc() is
            *             given pid 0
            *
            * Returns 0 on success, or EAGAIN if getproc() fails or lwp_create()
            * returns NULL.
 785  794   */
 786  795  int
 787  796  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 788  797      pid_t pid)
 789  798  {
 790  799          proc_t *p;
 791  800          struct user *up;
 792  801          kthread_t *t;
 793  802          cont_process_t *ctp = NULL;
 794  803          rctl_entity_p_t e;
 795  804  
 796  805          ASSERT(cid != sysdccid);
 797  806          ASSERT(cid != syscid || ct == NULL);
 798  807          if (CLASS_KERNEL(cid)) {
 799  808                  rctl_alloc_gp_t *init_gp;
 800  809                  rctl_set_t *init_set;
 801  810  
                           /* The zsched-child (negative pid) form is not valid here. */
 802  811                  ASSERT(pid != 1);
      812 +                ASSERT(pid >= 0);
 803  813  
 804  814                  if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 805  815                          return (EAGAIN);
 806  816  
 807  817                  /*
 808  818                   * Release the hold on the p_exec and p_execdir, these
 809  819                   * were acquired in getproc()
 810  820                   */
 811  821                  if (p->p_execdir != NULL)
 812  822                          VN_RELE(p->p_execdir);
 813  823                  if (p->p_exec != NULL)
 814  824                          VN_RELE(p->p_exec);
 815  825                  p->p_flag |= SNOWAIT;
 816  826                  p->p_exec = NULL;
 817  827                  p->p_execdir = NULL;
 818  828  
 819  829                  init_set = rctl_set_create();
 820  830                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 821  831  
 822  832                  /*
 823  833                   * kernel processes do not inherit /proc tracing flags.
 824  834                   */
 825  835                  sigemptyset(&p->p_sigmask);
 826  836                  premptyset(&p->p_fltmask);
 827  837                  up = PTOU(p);
 828  838                  up->u_systrap = 0;
 829  839                  premptyset(&(up->u_entrymask));
 830  840                  premptyset(&(up->u_exitmask));
 831  841                  mutex_enter(&p->p_lock);
 832  842                  e.rcep_p.proc = p;
 833  843                  e.rcep_t = RCENTITY_PROCESS;
 834  844                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 835  845                      init_gp);
  
    | 
      ↓ open down ↓ | 
    23 lines elided | 
    
      ↑ open up ↑ | 
  
 836  846                  mutex_exit(&p->p_lock);
 837  847  
 838  848                  rctl_prealloc_destroy(init_gp);
 839  849  
                           /*
                            * NOTE(review): t is dereferenced below without a NULL
                            * check; presumably lwp_kernel_create() cannot fail --
                            * confirm.
                            */
 840  850                  t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
 841  851          } else {
 842  852                  rctl_alloc_gp_t *init_gp, *default_gp;
 843  853                  rctl_set_t *init_set;
 844  854                  task_t *tk, *tk_old;
 845  855                  klwp_t *lwp;
      856 +                boolean_t pzsched = B_FALSE;
      857 +                int flag = GETPROC_USER;
 846  858  
 847      -                if (getproc(&p, pid, GETPROC_USER) < 0)
      859 +                /* Handle a new user-level thread as child of zsched. */
      860 +                if (pid < 0) {
      861 +                        VERIFY(curzone != global_zone);
      862 +                        flag = GETPROC_ZSCHED;
      863 +                        pzsched = B_TRUE;
      864 +                        pid = 0;
      865 +                }
      866 +
      867 +                if (getproc(&p, pid, flag) < 0)
 848  868                          return (EAGAIN);
 849  869                  /*
 850  870                   * init creates a new task, distinct from the task
 851  871                   * containing kernel "processes".
 852  872                   */
 853  873                  tk = task_create(0, p->p_zone);
 854  874                  mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 855  875                  tk->tk_proj->kpj_ntasks++;
 856  876                  tk->tk_nprocs++;
 857  877                  mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 858  878  
 859  879                  default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
 860  880                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 861  881                  init_set = rctl_set_create();
 862  882  
                           /*
                            * Move p from the task it was attached to in getproc()
                            * into the new task.  pidlock is dropped once the switch
                            * is done; p_lock stays held through the rctl setup.
                            */
 863  883                  mutex_enter(&pidlock);
 864  884                  mutex_enter(&p->p_lock);
 865  885                  tk_old = p->p_task;     /* switch to new task */
 866  886  
 867  887                  task_detach(p);
 868  888                  task_begin(tk, p);
 869  889                  mutex_exit(&pidlock);
 870  890  
 871  891                  mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
 872  892                  tk_old->tk_nprocs--;
 873  893                  mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
 874  894  
 875  895                  e.rcep_p.proc = p;
 876  896                  e.rcep_t = RCENTITY_PROCESS;
 877  897                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 878  898                      init_gp);
  
    | 
      ↓ open down ↓ | 
    21 lines elided | 
    
      ↑ open up ↑ | 
  
 879  899                  rctlproc_default_init(p, default_gp);
 880  900                  mutex_exit(&p->p_lock);
 881  901  
 882  902                  task_rele(tk_old);
 883  903                  rctl_prealloc_destroy(default_gp);
 884  904                  rctl_prealloc_destroy(init_gp);
 885  905  
                           /*
                            * lwp creation failed: undo getproc() and the task switch.
                            * The tk declared in this inner block shadows the outer tk.
                            *
                            * NOTE(review): the forklwperr path earlier in this file
                            * calls rctl_set_free(cp->p_rctls) after fork_fail(); no
                            * matching call is visible here for p->p_rctls -- confirm
                            * whether the set is released elsewhere.
                            */
 886  906                  if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
 887  907                      &curthread->t_hold, cid, 1)) == NULL) {
 888  908                          task_t *tk;
      909 +
 889  910                          fork_fail(p);
 890  911                          mutex_enter(&pidlock);
      912 +                        disown_proc(p->p_parent, p);
      913 +
 891  914                          mutex_enter(&p->p_lock);
 892  915                          tk = p->p_task;
 893  916                          task_detach(p);
 894  917                          ASSERT(p->p_pool->pool_ref > 0);
 895  918                          atomic_add_32(&p->p_pool->pool_ref, -1);
 896  919                          mutex_exit(&p->p_lock);
      920 +
 897  921                          pid_exit(p, tk);
 898  922                          mutex_exit(&pidlock);
 899  923                          task_rele(tk);
 900      -
 901  924                          return (EAGAIN);
 902  925                  }
 903  926                  t = lwptot(lwp);
 904  927  
                           /*
                            * The contract's parent is the zone's zsched when the
                            * process was created as a zsched child, else curproc.
                            */
 905      -                ctp = contract_process_fork(sys_process_tmpl, p, curproc,
      928 +                ctp = contract_process_fork(sys_process_tmpl, p,
      929 +                    (pzsched ? curproc->p_zone->zone_zsched : curproc),
 906  930                      B_FALSE);
 907  931                  ASSERT(ctp != NULL);
 908  932                  if (ct != NULL)
 909  933                          *ct = &ctp->conp_contract;
 910  934          }
 911  935  
                   /*
                    * Common tail: join the parent's process group, mark the
                    * process SRUN, clear TP_HOLDLWP and call lwp_create_done().
                    */
 912  936          ASSERT3U(t->t_tid, ==, 1);
 913  937          p->p_lwpid = 1;
 914  938          mutex_enter(&pidlock);
 915  939          pgjoin(p, p->p_parent->p_pgidp);
 916  940          p->p_stat = SRUN;
 917  941          mutex_enter(&p->p_lock);
 918  942          t->t_proc_flag &= ~TP_HOLDLWP;
 919  943          lwp_create_done(t);
 920  944          mutex_exit(&p->p_lock);
 921  945          mutex_exit(&pidlock);
 922  946          return (0);
 923  947  }
 924  948  
 925  949  /*
 926  950   * create a child proc struct.
            *
            * Arguments:
            *   cpp     on success, receives the newly created child proc_t
            *   pid     handed to pid_allocate() for the child
            *   flags   GETPROC_ZSCHED makes the parent curproc's zone_zsched;
            *           GETPROC_KERNEL makes the parent &p0 and binds the child to
            *           pool_default and task0p with SSYS set; otherwise the
            *           parent is curproc
            *
            * Returns 0 on success and -1 on failure.  Failure occurs when the
            * zone is shutting down, a task/project/zone nprocs rctl denies the
            * request, pid_allocate() fails, VOP_OPEN() of the parent's
            * executable fails, or the per-user process limit is exceeded.
            * Every failure except the zone-shutdown case also sleeps in delay()
            * before returning (see the "punish" label).
 927  951   */
 928  952  static int
 929  953  getproc(proc_t **cpp, pid_t pid, uint_t flags)
 930  954  {
 931  955          proc_t          *pp, *cp;
 932  956          pid_t           newpid;
 933  957          struct user     *uarea;
 934  958          extern uint_t   nproc;
 935  959          struct cred     *cr;
  
    | 
      ↓ open down ↓ | 
    20 lines elided | 
    
      ↑ open up ↑ | 
  
 936  960          uid_t           ruid;
 937  961          zoneid_t        zoneid;
 938  962          task_t          *task;
 939  963          kproject_t      *proj;
 940  964          zone_t          *zone;
 941  965          int             rctlfail = 0;
 942  966  
 943  967          if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 944  968                  return (-1);    /* no point in starting new processes */
 945  969  
                   /* Pick the parent: the zone's zsched, p0, or the caller. */
 946      -        pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
      970 +        if (flags & GETPROC_ZSCHED) {
      971 +                pp = curproc->p_zone->zone_zsched;
      972 +        } else {
      973 +                pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
      974 +        }
 947  975          task = pp->p_task;
 948  976          proj = task->tk_proj;
 949  977          zone = pp->p_zone;
 950  978  
                   /*
                    * Test the task, project and zone nprocs resource controls
                    * (skipped when the project is proj0p), then charge the new
                    * process to all three counts.
                    */
 951  979          mutex_enter(&pp->p_lock);
 952  980          mutex_enter(&zone->zone_nlwps_lock);
 953  981          if (proj != proj0p) {
 954  982                  if (task->tk_nprocs >= task->tk_nprocs_ctl)
 955  983                          if (rctl_test(rc_task_nprocs, task->tk_rctls,
 956  984                              pp, 1, 0) & RCT_DENY)
 957  985                                  rctlfail = 1;
 958  986  
 959  987                  if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
 960  988                          if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
 961  989                              pp, 1, 0) & RCT_DENY)
 962  990                                  rctlfail = 1;
 963  991  
 964  992                  if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
 965  993                          if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
 966  994                              pp, 1, 0) & RCT_DENY)
 967  995                                  rctlfail = 1;
 968  996  
 969  997                  if (rctlfail) {
 970  998                          mutex_exit(&zone->zone_nlwps_lock);
 971  999                          mutex_exit(&pp->p_lock);
 972 1000                          atomic_inc_32(&zone->zone_ffcap);
 973 1001                          goto punish;
 974 1002                  }
 975 1003          }
 976 1004          task->tk_nprocs++;
 977 1005          proj->kpj_nprocs++;
 978 1006          zone->zone_nprocs++;
 979 1007          mutex_exit(&zone->zone_nlwps_lock);
 980 1008          mutex_exit(&pp->p_lock);
 981 1009  
 982 1010          cp = kmem_cache_alloc(process_cache, KM_SLEEP);
 983 1011          bzero(cp, sizeof (proc_t));
 984 1012  
 985 1013          /*
 986 1014           * Make proc entry for child process
 987 1015           */
 988 1016          mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
 989 1017          mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
 990 1018          mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
 991 1019  #if defined(__x86)
 992 1020          mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
 993 1021  #endif
 994 1022          mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
 995 1023          cp->p_stat = SIDL;
 996 1024          cp->p_mstart = gethrtime();
  
    | 
      ↓ open down ↓ | 
    40 lines elided | 
    
      ↑ open up ↑ | 
  
 997 1025          cp->p_as = &kas;
 998 1026          /*
 999 1027           * p_zone must be set before we call pid_allocate since the process
1000 1028           * will be visible after that and code such as prfind_zone will
1001 1029           * look at the p_zone field.
1002 1030           */
1003 1031          cp->p_zone = pp->p_zone;
1004 1032          cp->p_t1_lgrpid = LGRP_NONE;
1005 1033          cp->p_tr_lgrpid = LGRP_NONE;
1006 1034  
     1035 +        /* Default to native brand initially */
     1036 +        cp->p_brand = &native_brand;
     1037 +
1007 1038          if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1008 1039                  if (nproc == v.v_proc) {
1009 1040                          CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1010 1041                          cmn_err(CE_WARN, "out of processes");
1011 1042                  }
1012 1043                  goto bad;
1013 1044          }
1014 1045  
1015 1046          mutex_enter(&pp->p_lock);
1016 1047          cp->p_exec = pp->p_exec;
1017 1048          cp->p_execdir = pp->p_execdir;
1018 1049          mutex_exit(&pp->p_lock);
1019 1050  
1020 1051          if (cp->p_exec) {
1021 1052                  VN_HOLD(cp->p_exec);
1022 1053                  /*
1023 1054                   * Each VOP_OPEN() must be paired with a corresponding
1024 1055                   * VOP_CLOSE(). In this case, the executable will be
1025 1056                   * closed for the child in either proc_exit() or gexec().
1026 1057                   */
1027 1058                  if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
1028 1059                          VN_RELE(cp->p_exec);
1029 1060                          cp->p_exec = NULLVP;
1030 1061                          cp->p_execdir = NULLVP;
1031 1062                          goto bad;
1032 1063                  }
1033 1064          }
1034 1065          if (cp->p_execdir)
1035 1066                  VN_HOLD(cp->p_execdir);
1036 1067  
1037 1068          /*
1038 1069           * If not privileged make sure that this user hasn't exceeded
1039 1070           * v.v_maxup processes, and that users collectively haven't
1040 1071           * exceeded v.v_maxupttl processes.
1041 1072           */
1042 1073          mutex_enter(&pidlock);
1043 1074          ASSERT(nproc < v.v_proc);       /* otherwise how'd we get our pid? */
1044 1075          cr = CRED();
1045 1076          ruid = crgetruid(cr);
1046 1077          zoneid = crgetzoneid(cr);
1047 1078          if (nproc >= v.v_maxup &&       /* short-circuit; usually false */
1048 1079              (nproc >= v.v_maxupttl ||
1049 1080              upcount_get(ruid, zoneid) >= v.v_maxup) &&
1050 1081              secpolicy_newproc(cr) != 0) {
1051 1082                  mutex_exit(&pidlock);
1052 1083                  zcmn_err(zoneid, CE_NOTE,
1053 1084                      "out of per-user processes for uid %d", ruid);
1054 1085                  goto bad;
1055 1086          }
1056 1087  
1057 1088          /*
1058 1089           * Everything is cool, put the new proc on the active process list.
1059 1090           * It is already on the pid list and in /proc.
1060 1091           * Increment the per uid process count (upcount).
1061 1092           */
1062 1093          nproc++;
1063 1094          upcount_inc(ruid, zoneid);
  
    | 
      ↓ open down ↓ | 
    47 lines elided | 
    
      ↑ open up ↑ | 
  
1064 1095  
1065 1096          cp->p_next = practive;
1066 1097          practive->p_prev = cp;
1067 1098          practive = cp;
1068 1099  
1069 1100          cp->p_ignore = pp->p_ignore;
1070 1101          cp->p_siginfo = pp->p_siginfo;
1071 1102          cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1072 1103          cp->p_sessp = pp->p_sessp;
1073 1104          sess_hold(pp);
1074      -        cp->p_brand = pp->p_brand;
1075      -        if (PROC_IS_BRANDED(pp))
1076      -                BROP(pp)->b_copy_procdata(cp, pp);
1077 1105          cp->p_bssbase = pp->p_bssbase;
1078 1106          cp->p_brkbase = pp->p_brkbase;
1079 1107          cp->p_brksize = pp->p_brksize;
1080 1108          cp->p_brkpageszc = pp->p_brkpageszc;
1081 1109          cp->p_stksize = pp->p_stksize;
1082 1110          cp->p_stkpageszc = pp->p_stkpageszc;
1083 1111          cp->p_stkprot = pp->p_stkprot;
1084 1112          cp->p_datprot = pp->p_datprot;
1085 1113          cp->p_usrstack = pp->p_usrstack;
1086 1114          cp->p_model = pp->p_model;
1087 1115          cp->p_ppid = pp->p_pid;
1088 1116          cp->p_ancpid = pp->p_pid;
1089 1117          cp->p_portcnt = pp->p_portcnt;
1090 1118  
1091 1119          /*
1092 1120           * Initialize watchpoint structures
1093 1121           */
1094 1122          avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1095 1123              offsetof(struct watched_area, wa_link));
1096 1124  
1097 1125          /*
1098 1126           * Initialize immediate resource control values.
1099 1127           */
1100 1128          cp->p_stk_ctl = pp->p_stk_ctl;
1101 1129          cp->p_fsz_ctl = pp->p_fsz_ctl;
1102 1130          cp->p_vmem_ctl = pp->p_vmem_ctl;
1103 1131          cp->p_fno_ctl = pp->p_fno_ctl;
1104 1132  
1105 1133          /*
1106 1134           * Link up to parent-child-sibling chain.  No need to lock
1107 1135           * in general since only a call to freeproc() (done by the
1108 1136           * same parent as newproc()) diddles with the child chain.
1109 1137           */
1110 1138          cp->p_sibling = pp->p_child;
1111 1139          if (pp->p_child)
1112 1140                  pp->p_child->p_psibling = cp;
1113 1141  
1114 1142          cp->p_parent = pp;
1115 1143          pp->p_child = cp;
1116 1144  
1117 1145          cp->p_child_ns = NULL;
1118 1146          cp->p_sibling_ns = NULL;
1119 1147  
1120 1148          cp->p_nextorph = pp->p_orphan;
1121 1149          cp->p_nextofkin = pp;
1122 1150          pp->p_orphan = cp;
1123 1151  
1124 1152          /*
1125 1153           * Inherit profiling state; do not inherit REALPROF profiling state.
1126 1154           */
1127 1155          cp->p_prof = pp->p_prof;
1128 1156          cp->p_rprof_cyclic = CYCLIC_NONE;
1129 1157  
1130 1158          /*
1131 1159           * Inherit pool pointer from the parent.  Kernel processes are
1132 1160           * always bound to the default pool.
1133 1161           */
1134 1162          mutex_enter(&pp->p_lock);
1135 1163          if (flags & GETPROC_KERNEL) {
1136 1164                  cp->p_pool = pool_default;
1137 1165                  cp->p_flag |= SSYS;
1138 1166          } else {
1139 1167                  cp->p_pool = pp->p_pool;
1140 1168          }
1141 1169          atomic_inc_32(&cp->p_pool->pool_ref);
1142 1170          mutex_exit(&pp->p_lock);
1143 1171  
1144 1172          /*
1145 1173           * Add the child process to the current task.  Kernel processes
  
    | 
      ↓ open down ↓ | 
    59 lines elided | 
    
      ↑ open up ↑ | 
  
1146 1174           * are always attached to task0.
1147 1175           */
1148 1176          mutex_enter(&cp->p_lock);
1149 1177          if (flags & GETPROC_KERNEL)
1150 1178                  task_attach(task0p, cp);
1151 1179          else
1152 1180                  task_attach(pp->p_task, cp);
1153 1181          mutex_exit(&cp->p_lock);
1154 1182          mutex_exit(&pidlock);
1155 1183  
     1184 +        if (PROC_IS_BRANDED(pp)) {
     1185 +                /*
     1186 +                 * The only reason why process branding should fail is when
     1187 +                 * the procedure is complicated by multiple LWPs on the scene.
     1188 +                 * With an LWP count of 0, this newly allocated process has no
     1189 +                 * reason to fail branding.
     1190 +                 */
     1191 +                VERIFY0(brand_setbrand(cp, B_FALSE));
     1192 +
     1193 +                BROP(pp)->b_copy_procdata(cp, pp);
     1194 +        }
     1195 +
1156 1196          avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1157 1197              offsetof(contract_t, ct_ctlist));
1158 1198  
1159 1199          /*
1160 1200           * Duplicate any audit information kept in the process table
1161 1201           */
1162 1202          if (audit_active)       /* copy audit data to cp */
1163 1203                  audit_newproc(cp);
1164 1204  
1165 1205          crhold(cp->p_cred = cr);
1166 1206  
1167 1207          /*
1168 1208           * Bump up the counts on the file structures pointed at by the
1169 1209           * parent's file table since the child will point at them too.
1170 1210           */
1171 1211          fcnt_add(P_FINFO(pp), 1);
1172 1212  
1173 1213          if (PTOU(pp)->u_cdir) {
1174 1214                  VN_HOLD(PTOU(pp)->u_cdir);
1175 1215          } else {
1176 1216                  ASSERT(pp == &p0);
1177 1217                  /*
1178 1218                   * We must be at or before vfs_mountroot(); it will take care of
1179 1219                   * assigning our current directory.
1180 1220                   */
1181 1221          }
1182 1222          if (PTOU(pp)->u_rdir)
1183 1223                  VN_HOLD(PTOU(pp)->u_rdir);
1184 1224          if (PTOU(pp)->u_cwd)
1185 1225                  refstr_hold(PTOU(pp)->u_cwd);
1186 1226  
1187 1227          /*
1188 1228           * copy the parent's uarea.
1189 1229           */
1190 1230          uarea = PTOU(cp);
1191 1231          bcopy(PTOU(pp), uarea, sizeof (*uarea));
1192 1232          flist_fork(P_FINFO(pp), P_FINFO(cp));
1193 1233  
1194 1234          gethrestime(&uarea->u_start);
1195 1235          uarea->u_ticks = ddi_get_lbolt();
1196 1236          uarea->u_mem = rm_asrss(pp->p_as);
1197 1237          uarea->u_acflag = AFORK;
1198 1238  
1199 1239          /*
1200 1240           * If inherit-on-fork, copy /proc tracing flags to child.
1201 1241           */
1202 1242          if ((pp->p_proc_flag & P_PR_FORK) != 0) {
1203 1243                  cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
1204 1244                  cp->p_sigmask = pp->p_sigmask;
1205 1245                  cp->p_fltmask = pp->p_fltmask;
1206 1246          } else {
1207 1247                  sigemptyset(&cp->p_sigmask);
1208 1248                  premptyset(&cp->p_fltmask);
1209 1249                  uarea->u_systrap = 0;
1210 1250                  premptyset(&uarea->u_entrymask);
1211 1251                  premptyset(&uarea->u_exitmask);
1212 1252          }
1213 1253          /*
1214 1254           * If microstate accounting is being inherited, mark child
1215 1255           */
1216 1256          if ((pp->p_flag & SMSFORK) != 0)
1217 1257                  cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
1218 1258  
1219 1259          /*
1220 1260           * Inherit fixalignment flag from the parent
1221 1261           */
1222 1262          cp->p_fixalignment = pp->p_fixalignment;
1223 1263  
1224 1264          *cpp = cp;
1225 1265          return (0);
1226 1266  
                   /*
                    * Failure after cp was allocated: tear down the partially built
                    * proc and back out the nprocs charges taken above.
                    *
                    * NOTE(review): p_splock and p_maplock are initialized above but
                    * no mutex_destroy() for them is visible here -- confirm.
                    *
                    * NOTE(review): when the per-user limit check jumps here, the
                    * VN_HOLD()/VOP_OPEN() on p_exec and the VN_HOLD() on p_execdir
                    * have already been taken and nothing visible here undoes them
                    * -- confirm whether that is a leak.
                    */
1227 1267  bad:
1228 1268          ASSERT(MUTEX_NOT_HELD(&pidlock));
1229 1269  
1230 1270          mutex_destroy(&cp->p_crlock);
1231 1271          mutex_destroy(&cp->p_pflock);
1232 1272  #if defined(__x86)
1233 1273          mutex_destroy(&cp->p_ldtlock);
1234 1274  #endif
                   /* A pid was allocated only if pid_allocate() succeeded. */
1235 1275          if (newpid != -1) {
1236 1276                  proc_entry_free(cp->p_pidp);
1237 1277                  (void) pid_rele(cp->p_pidp);
1238 1278          }
1239 1279          kmem_cache_free(process_cache, cp);
1240 1280  
1241 1281          mutex_enter(&zone->zone_nlwps_lock);
1242 1282          task->tk_nprocs--;
1243 1283          proj->kpj_nprocs--;
1244 1284          zone->zone_nprocs--;
1245 1285          mutex_exit(&zone->zone_nlwps_lock);
1246 1286          atomic_inc_32(&zone->zone_ffnoproc);
1247 1287  
1248 1288  punish:
1249 1289          /*
1250 1290           * We most likely got into this situation because some process is
1251 1291           * forking out of control.  As punishment, put it to sleep for a
1252 1292           * bit so it can't eat the machine alive.  Sleep interval is chosen
1253 1293           * to allow no more than one fork failure per cpu per clock tick
1254 1294           * on average (yes, I just made this up).  This has two desirable
1255 1295           * properties: (1) it sets a constant limit on the fork failure
1256 1296           * rate, and (2) the busier the system is, the harsher the penalty
1257 1297           * for abusing it becomes.
1258 1298           */
1259 1299          INCR_COUNT(&fork_fail_pending, &pidlock);
1260 1300          delay(fork_fail_pending / ncpus + 1);
1261 1301          DECR_COUNT(&fork_fail_pending, &pidlock);
1262 1302  
1263 1303          return (-1); /* out of memory or proc slots */
1264 1304  }
1265 1305  
1266 1306  /*
1267 1307   * Release virtual memory.
1268 1308   * In the case of vfork(), the child was given exclusive access to its
1269 1309   * parent's address space.  The parent is waiting in vfwait() for the
1270 1310   * child to release its exclusive claim via relvm().
1271 1311   */
1272 1312  void
1273 1313  relvm()
1274 1314  {
1275 1315          proc_t *p = curproc;
1276 1316  
1277 1317          ASSERT((unsigned)p->p_lwpcnt <= 1);
1278 1318  
1279 1319          prrelvm();      /* inform /proc */
1280 1320  
1281 1321          if (p->p_flag & SVFORK) {
1282 1322                  proc_t *pp = p->p_parent;
1283 1323                  /*
1284 1324                   * The child process is either exec'ing or exit'ing.
1285 1325                   * The child is now separated from the parent's address
1286 1326                   * space.  The parent process is made dispatchable.
1287 1327                   *
1288 1328                   * This is a delicate locking maneuver, involving
1289 1329                   * both the parent's p_lock and the child's p_lock.
1290 1330                   * As soon as the SVFORK flag is turned off, the
1291 1331                   * parent is free to run, but it must not run until
1292 1332                   * we wake it up using its p_cv because it might
1293 1333                   * exit and we would be referencing invalid memory.
1294 1334                   * Therefore, we hold the parent with its p_lock
1295 1335                   * while protecting our p_flags with our own p_lock.
1296 1336                   */
1297 1337  try_again:
1298 1338                  mutex_enter(&p->p_lock);        /* grab child's lock first */
1299 1339                  prbarrier(p);           /* make sure /proc is blocked out */
1300 1340                  mutex_enter(&pp->p_lock);
1301 1341  
1302 1342                  /*
1303 1343                   * Check if parent is locked by /proc.
1304 1344                   */
1305 1345                  if (pp->p_proc_flag & P_PR_LOCK) {
1306 1346                          /*
1307 1347                           * Delay until /proc is done with the parent.
1308 1348                           * We must drop our (the child's) p->p_lock, wait
1309 1349                           * via prbarrier() on the parent, then start over.
1310 1350                           */
1311 1351                          mutex_exit(&p->p_lock);
1312 1352                          prbarrier(pp);
1313 1353                          mutex_exit(&pp->p_lock);
1314 1354                          goto try_again;
1315 1355                  }
1316 1356                  p->p_flag &= ~SVFORK;
1317 1357                  kpreempt_disable();
1318 1358                  p->p_as = &kas;
1319 1359  
1320 1360                  /*
1321 1361                   * notify hat of change in thread's address space
1322 1362                   */
1323 1363                  hat_thread_exit(curthread);
1324 1364                  kpreempt_enable();
1325 1365  
1326 1366                  /*
1327 1367                   * child sizes are copied back to parent because
1328 1368                   * child may have grown.
1329 1369                   */
1330 1370                  pp->p_brkbase = p->p_brkbase;
1331 1371                  pp->p_brksize = p->p_brksize;
1332 1372                  pp->p_stksize = p->p_stksize;
1333 1373  
1334 1374                  /*
1335 1375                   * Copy back the shm accounting information
1336 1376                   * to the parent process.
1337 1377                   */
1338 1378                  pp->p_segacct = p->p_segacct;
1339 1379                  p->p_segacct = NULL;
1340 1380  
1341 1381                  /*
1342 1382                   * The parent is no longer waiting for the vfork()d child.
1343 1383                   * Restore the parent's watched pages, if any.  This is
1344 1384                   * safe because we know the parent is not locked by /proc
1345 1385                   */
1346 1386                  pp->p_flag &= ~SVFWAIT;
1347 1387                  if (avl_numnodes(&pp->p_wpage) != 0) {
1348 1388                          pp->p_as->a_wpage = pp->p_wpage;
1349 1389                          avl_create(&pp->p_wpage, wp_compare,
1350 1390                              sizeof (struct watched_page),
1351 1391                              offsetof(struct watched_page, wp_link));
1352 1392                  }
1353 1393                  cv_signal(&pp->p_cv);
1354 1394                  mutex_exit(&pp->p_lock);
1355 1395                  mutex_exit(&p->p_lock);
1356 1396          } else {
1357 1397                  if (p->p_as != &kas) {
1358 1398                          struct as *as;
1359 1399  
1360 1400                          if (p->p_segacct)
1361 1401                                  shmexit(p);
1362 1402  
1363 1403                          /*
1364 1404                           * We grab p_lock for the benefit of /proc
1365 1405                           */
1366 1406                          kpreempt_disable();
1367 1407                          mutex_enter(&p->p_lock);
1368 1408                          prbarrier(p);   /* make sure /proc is blocked out */
1369 1409                          as = p->p_as;
1370 1410                          p->p_as = &kas;
1371 1411                          mutex_exit(&p->p_lock);
1372 1412  
1373 1413                          /*
1374 1414                           * notify hat of change in thread's address space
1375 1415                           */
1376 1416                          hat_thread_exit(curthread);
1377 1417                          kpreempt_enable();
1378 1418  
1379 1419                          as_free(as);
1380 1420                          p->p_tr_lgrpid = LGRP_NONE;
1381 1421                  }
1382 1422          }
1383 1423  }
1384 1424  
1385 1425  /*
1386 1426   * Wait for child to exec or exit.
1387 1427   * Called by parent of vfork'ed process.
1388 1428   * See important comments in relvm(), above.
1389 1429   */
1390 1430  void
1391 1431  vfwait(pid_t pid)
1392 1432  {
1393 1433          int signalled = 0;
1394 1434          proc_t *pp = ttoproc(curthread);
1395 1435          proc_t *cp;
1396 1436  
1397 1437          /*
1398 1438           * Wait for child to exec or exit.
1399 1439           */
1400 1440          for (;;) {
1401 1441                  mutex_enter(&pidlock);
1402 1442                  cp = prfind(pid);
1403 1443                  if (cp == NULL || cp->p_parent != pp) {
1404 1444                          /*
1405 1445                           * Child has exit()ed.
1406 1446                           */
1407 1447                          mutex_exit(&pidlock);
1408 1448                          break;
1409 1449                  }
1410 1450                  /*
1411 1451                   * Grab the child's p_lock before releasing pidlock.
1412 1452                   * Otherwise, the child could exit and we would be
1413 1453                   * referencing invalid memory.
1414 1454                   */
1415 1455                  mutex_enter(&cp->p_lock);
1416 1456                  mutex_exit(&pidlock);
1417 1457                  if (!(cp->p_flag & SVFORK)) {
1418 1458                          /*
1419 1459                           * Child has exec()ed or is exit()ing.
1420 1460                           */
1421 1461                          mutex_exit(&cp->p_lock);
1422 1462                          break;
1423 1463                  }
1424 1464                  mutex_enter(&pp->p_lock);
1425 1465                  mutex_exit(&cp->p_lock);
1426 1466                  /*
1427 1467                   * We might be waked up spuriously from the cv_wait().
1428 1468                   * We have to do the whole operation over again to be
1429 1469                   * sure the child's SVFORK flag really is turned off.
1430 1470                   * We cannot make reference to the child because it can
1431 1471                   * exit before we return and we would be referencing
1432 1472                   * invalid memory.
1433 1473                   *
1434 1474                   * Because this is potentially a very long-term wait,
1435 1475                   * we call cv_wait_sig() (for its jobcontrol and /proc
1436 1476                   * side-effects) unless there is a current signal, in
1437 1477                   * which case we use cv_wait() because we cannot return
1438 1478                   * from this function until the child has released the
1439 1479                   * address space.  Calling cv_wait_sig() with a current
1440 1480                   * signal would lead to an indefinite loop here because
1441 1481                   * cv_wait_sig() returns immediately in this case.
1442 1482                   */
1443 1483                  if (signalled)
1444 1484                          cv_wait(&pp->p_cv, &pp->p_lock);
1445 1485                  else
1446 1486                          signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
1447 1487                  mutex_exit(&pp->p_lock);
1448 1488          }
1449 1489  
1450 1490          /* restore watchpoints to parent */
1451 1491          if (pr_watch_active(pp)) {
1452 1492                  struct as *as = pp->p_as;
1453 1493                  AS_LOCK_ENTER(as, RW_WRITER);
1454 1494                  as_setwatch(as);
1455 1495                  AS_LOCK_EXIT(as);
1456 1496          }
1457 1497  
1458 1498          mutex_enter(&pp->p_lock);
1459 1499          prbarrier(pp);  /* barrier against /proc locking */
1460 1500          continuelwps(pp);
1461 1501          mutex_exit(&pp->p_lock);
1462 1502  }
  
    | 
      ↓ open down ↓ | 
    297 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX