Print this page
14019 Allow more control over zone init exit actions (fix mismerge)
14019 Allow more control over zone init exit actions
Portions contributed by: Joshua M. Clulow <jmc@joyent.com>
Portions contributed by: Andy Fiddaman <andy@omnios.org>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Gordon Ross <Gordon.W.Ross@gmail.com>
Approved by: Robert Mustacchi <rm@fingolfin.org>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/os/exit.c
+++ new/usr/src/uts/common/os/exit.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2018 Joyent, Inc.
25 25 * Copyright 2020 Oxide Computer Company
26 26 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
27 27 */
28 28
29 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 30
31 31 #include <sys/types.h>
32 32 #include <sys/param.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/systm.h>
35 35 #include <sys/cred.h>
36 36 #include <sys/user.h>
37 37 #include <sys/errno.h>
38 38 #include <sys/proc.h>
39 39 #include <sys/ucontext.h>
40 40 #include <sys/procfs.h>
41 41 #include <sys/vnode.h>
42 42 #include <sys/acct.h>
43 43 #include <sys/var.h>
44 44 #include <sys/cmn_err.h>
45 45 #include <sys/debug.h>
46 46 #include <sys/wait.h>
47 47 #include <sys/siginfo.h>
48 48 #include <sys/procset.h>
49 49 #include <sys/class.h>
50 50 #include <sys/file.h>
51 51 #include <sys/session.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/vtrace.h>
54 54 #include <sys/prsystm.h>
55 55 #include <sys/ipc.h>
56 56 #include <sys/sem_impl.h>
57 57 #include <c2/audit.h>
58 58 #include <sys/aio_impl.h>
59 59 #include <vm/as.h>
60 60 #include <sys/poll.h>
61 61 #include <sys/door.h>
62 62 #include <sys/lwpchan_impl.h>
63 63 #include <sys/utrap.h>
64 64 #include <sys/task.h>
65 65 #include <sys/exacct.h>
66 66 #include <sys/cyclic.h>
67 67 #include <sys/schedctl.h>
68 68 #include <sys/rctl.h>
69 69 #include <sys/contract_impl.h>
70 70 #include <sys/contract/process_impl.h>
71 71 #include <sys/list.h>
72 72 #include <sys/dtrace.h>
73 73 #include <sys/pool.h>
74 74 #include <sys/sdt.h>
75 75 #include <sys/corectl.h>
76 76 #include <sys/core.h>
77 77 #include <sys/brand.h>
78 78 #include <sys/libc_kernel.h>
79 79
80 80 /*
81 81 * convert code/data pair into old style wait status
82 82 */
83 83 int
84 84 wstat(int code, int data)
85 85 {
86 86 int stat = (data & 0377);
87 87
88 88 switch (code) {
89 89 case CLD_EXITED:
90 90 stat <<= 8;
91 91 break;
92 92 case CLD_DUMPED:
93 93 stat |= WCOREFLG;
94 94 break;
95 95 case CLD_KILLED:
96 96 break;
97 97 case CLD_TRAPPED:
98 98 case CLD_STOPPED:
99 99 stat <<= 8;
100 100 stat |= WSTOPFLG;
101 101 break;
102 102 case CLD_CONTINUED:
103 103 stat = WCONTFLG;
104 104 break;
105 105 default:
106 106 cmn_err(CE_PANIC, "wstat: bad code");
107 107 /* NOTREACHED */
108 108 }
109 109 return (stat);
110 110 }
111 111
/*
 * Render a human-readable description of an exit (why/what pair) into
 * the caller-supplied buffer, returning a pointer to that buffer.
 */
static char *
exit_reason(char *buf, size_t bufsz, int what, int why)
{
	if (why == CLD_EXITED) {
		(void) snprintf(buf, bufsz, "exited with status %d", what);
	} else if (why == CLD_KILLED) {
		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
	} else if (why == CLD_DUMPED) {
		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
	} else {
		(void) snprintf(buf, bufsz, "encountered unknown error "
		    "(%d, %d)", why, what);
	}

	return (buf);
}
133 133
134 134 /*
135 135 * exit system call: pass back caller's arg.
136 136 */
void
rexit(int rval)
{
	/* A voluntary exit: report CLD_EXITED with the caller's status. */
	exit(CLD_EXITED, rval);
}
142 142
143 143 /*
144 144 * Bump the init_restarts kstat and let interested parties know about the
145 145 * restart.
146 146 */
147 147 static void
148 148 restart_init_notify(zone_t *zone)
149 149 {
150 150 nvlist_t *nvl = NULL;
151 151
152 152 zone->zone_proc_init_restarts++;
153 153
154 154 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 &&
155 155 nvlist_add_uint32(nvl, ZONE_CB_RESTARTS,
156 156 zone->zone_proc_init_restarts) == 0) {
157 157 zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS,
158 158 ZONE_EVENT_INIT_RESTART_SC, nvl);
|
↓ open down ↓ |
158 lines elided |
↑ open up ↑ |
159 159 }
160 160
161 161 nvlist_free(nvl);
162 162 }
163 163
164 164 /*
165 165 * Called by proc_exit() when a zone's init exits, presumably because
166 166 * it failed. As long as the given zone is still in the "running"
167 167 * state, we will re-exec() init, but first we need to reset things
168 168 * which are usually inherited across exec() but will break init's
169 - * assumption that it is being exec()'d from a virgin process. Most
169 + * assumption that it is being exec()'d from a virgin process. Most
170 170 * importantly this includes closing all file descriptors (exec only
171 171 * closes those marked close-on-exec) and resetting signals (exec only
172 172 * resets handled signals, and we need to clear any signals which
173 173 * killed init). Anything else that exec(2) says would be inherited,
174 174 * but would affect the execution of init, needs to be reset.
175 175 */
static int
restart_init(int what, int why)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	/* zsched is used below as the template for creds and secflags. */
	proc_t *pp = p->p_zone->zone_zsched;
	user_t *up = PTOU(p);

	vnode_t *oldcd, *oldrd;
	int i, err;
	char reason_buf[64];

	/*
	 * Let zone admin (and global zone admin if this is for a non-global
	 * zone) know that init has failed and will be restarted.
	 */
	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "init(1M) %s: restarting automatically",
	    exit_reason(reason_buf, sizeof (reason_buf), what, why));

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
		    "restarting automatically",
		    p->p_zone->zone_name, p->p_pid, reason_buf);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 * Then close all open file descriptors in the process.
	 */
	pollcleanup();
	closeall(P_FINFO(p));

	/*
	 * Grab p_lock and begin clearing miscellaneous global process
	 * state that needs to be reset before we exec the new init(1M).
	 */

	mutex_enter(&p->p_lock);
	prbarrier(p);

	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
	up->u_cmask = CMASK;

	/* Clear held and pending signal sets on the remaining thread. */
	sigemptyset(&t->t_hold);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);

	/* ... and on the process itself. */
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);

	/* Discard any queued siginfo for the thread and the process. */
	sigdelq(p, t, 0);
	sigdelq(p, NULL, 0);

	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	/*
	 * Reset any signals that are ignored back to the default disposition.
	 * Other u_signal members will be cleared when exec calls sigdefault().
	 */
	for (i = 1; i < NSIG; i++) {
		if (up->u_signal[i - 1] == SIG_IGN) {
			up->u_signal[i - 1] = SIG_DFL;
			sigemptyset(&up->u_sigmask[i - 1]);
		}
	}

	/*
	 * Clear the current signal, any signal info associated with it, and
	 * any signal information from contracts and/or contract templates.
	 */
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	if (lwp->lwp_curinfo != NULL) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}
	lwp_ctmpl_clear(lwp, B_FALSE);

	/*
	 * Reset both the process root directory and the current working
	 * directory to the root of the zone just as we do during boot.
	 * The old vnodes are released after p_lock is dropped, below.
	 */
	VN_HOLD(p->p_zone->zone_rootvp);
	oldrd = up->u_rdir;
	up->u_rdir = p->p_zone->zone_rootvp;

	VN_HOLD(p->p_zone->zone_rootvp);
	oldcd = up->u_cdir;
	up->u_cdir = p->p_zone->zone_rootvp;

	if (up->u_cwd != NULL) {
		refstr_rele(up->u_cwd);
		up->u_cwd = NULL;
	}

	/* Reset security flags from the zone's zsched process. */
	mutex_enter(&pp->p_lock);
	p->p_secflags = pp->p_secflags;
	mutex_exit(&pp->p_lock);

	mutex_exit(&p->p_lock);

	/* Now that p_lock is dropped, release the old root and cwd vnodes. */
	if (oldrd != NULL)
		VN_RELE(oldrd);
	if (oldcd != NULL)
		VN_RELE(oldcd);

	/*
	 * It's possible that a zone's init will have become privilege aware
	 * and modified privilege sets; reset them.
	 */
	cred_t *oldcr, *newcr;

	/* Take a reference to zsched's cred while both cred locks are held. */
	mutex_enter(&p->p_crlock);
	oldcr = p->p_cred;
	mutex_enter(&pp->p_crlock);
	crhold(newcr = p->p_cred = pp->p_cred);
	mutex_exit(&pp->p_crlock);
	mutex_exit(&p->p_crlock);
	crfree(oldcr);
	/* Additional hold for the current thread - expected by crset() */
	crhold(newcr);
	crset(p, newcr);

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

	restart_init_notify(p->p_zone);

	/*
	 * Now exec() the new init(1M) on top of the current process.  If we
	 * succeed, the caller will treat this like a successful system call.
	 * If we fail, we issue messages and the caller will proceed with exit.
	 */
	err = exec_init(p->p_zone->zone_initname, NULL);

	if (err == 0)
		return (0);

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "failed to restart init(1M) (err=%d): system reboot required", err);

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
		    "(pid %d, err=%d): zoneadm(1M) boot required",
		    p->p_zone->zone_name, p->p_pid, err);
	}

	return (-1);
}
333 333
/*
 * Release resources.
 * Enter zombie state.
 * Wake up parent and init processes,
 * and dispose of children.
 */
340 340 void
341 341 exit(int why, int what)
342 342 {
343 343 /*
344 344 * If proc_exit() fails, then some other lwp in the process
345 345 * got there first. We just have to call lwp_exit() to allow
346 - * the other lwp to finish exiting the process. Otherwise we're
346 + * the other lwp to finish exiting the process. Otherwise we're
347 347 * restarting init, and should return.
348 348 */
349 349 if (proc_exit(why, what) != 0) {
350 350 mutex_enter(&curproc->p_lock);
351 351 ASSERT(curproc->p_flag & SEXITLWPS);
352 352 lwp_exit();
353 353 /* NOTREACHED */
354 354 }
355 355 }
356 356
357 357 /*
358 358 * Set the SEXITING flag on the process, after making sure /proc does
359 - * not have it locked. This is done in more places than proc_exit(),
359 + * not have it locked. This is done in more places than proc_exit(),
360 360 * so it is a separate function.
361 361 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	/* Wait for /proc to release the process before marking it. */
	prbarrier(p);
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}
370 370
/*
 * Return true if zone's init is restarted, false if exit processing should
 * proceed.
 */
static boolean_t
zone_init_exit(zone_t *z, int why, int what)
{
	/*
	 * Typically we don't let the zone's init exit unless zone_start_init()
	 * failed its exec, or we are shutting down the zone or the machine,
	 * although the various flags handled within this function will control
	 * the behavior.
	 *
	 * Since we are single threaded, we don't need to lock the following
	 * accesses to zone_proc_initpid.
	 */
	if (z->zone_boot_err != 0 ||
	    zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN ||
	    zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Clear the zone's init pid and proceed with exit processing.
		 */
		z->zone_proc_initpid = -1;
		return (B_FALSE);
	}

	/*
	 * There are a variety of configuration flags on the zone to control
	 * init exit behavior.
	 *
	 * If the init process should be restarted, the "zone_restart_init"
	 * member will be set.
	 */
	if (!z->zone_restart_init) {
		/*
		 * The zone has been setup to halt when init exits.  Record
		 * init's wait status for later retrieval before shutting down.
		 */
		z->zone_init_status = wstat(why, what);
		(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
		z->zone_proc_initpid = -1;
		return (B_FALSE);
	}

	/*
	 * At this point we know we're configured to restart init, but there
	 * are various modifiers to that behavior.
	 */

	if (z->zone_reboot_on_init_exit) {
		/*
		 * Some init programs in branded zones do not tolerate a
		 * restart in the traditional manner; setting
		 * "zone_reboot_on_init_exit" will cause the entire zone to be
		 * rebooted instead.
		 */

		if (z->zone_restart_init_0) {
			/*
			 * Some init programs in branded zones only want to
			 * restart if they exit 0, otherwise the zone should
			 * shutdown.  Setting the "zone_restart_init_0" member
			 * controls this behavior.
			 */
			if (why == CLD_EXITED && what == 0) {
				/* Trigger a zone reboot */
				(void) zone_kadmin(A_REBOOT, 0, NULL,
				    zone_kcred());
			} else {
				/* Shutdown instead of reboot */
				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
				    zone_kcred());
			}
		} else {
			/* Trigger a zone reboot */
			(void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred());
		}

		z->zone_init_status = wstat(why, what);
		z->zone_proc_initpid = -1;
		return (B_FALSE);
	}

	if (z->zone_restart_init_0) {
		/*
		 * Some init programs in branded zones only want to restart if
		 * they exit 0, otherwise the zone should shutdown. Setting the
		 * "zone_restart_init_0" member controls this behavior.
		 *
		 * In this case we only restart init if it exited successfully.
		 */
		if (why == CLD_EXITED && what == 0 &&
		    restart_init(what, why) == 0) {
			return (B_TRUE);
		}
	} else {
		/*
		 * No restart modifiers on the zone, attempt to restart init.
		 */
		if (restart_init(what, why) == 0) {
			return (B_TRUE);
		}
	}

	/*
	 * The restart failed, or the criteria for a restart are not met;
	 * the zone will shut down.
	 */
	z->zone_init_status = wstat(why, what);
	(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
	z->zone_proc_initpid = -1;
	return (B_FALSE);
}
483 483
484 484 /*
485 485 * Return value:
486 486 * 1 - exitlwps() failed, call (or continue) lwp_exit()
487 487 * 0 - restarting init. Return through system call path
488 488 */
489 489 int
490 490 proc_exit(int why, int what)
491 491 {
492 492 kthread_t *t = curthread;
493 493 klwp_t *lwp = ttolwp(t);
494 494 proc_t *p = ttoproc(t);
495 495 zone_t *z = p->p_zone;
496 496 timeout_id_t tmp_id;
497 497 int rv;
498 498 proc_t *q;
499 499 task_t *tk;
500 500 vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
501 501 sigqueue_t *sqp;
502 502 lwpdir_t *lwpdir;
|
↓ open down ↓ |
16 lines elided |
↑ open up ↑ |
503 503 uint_t lwpdir_sz;
504 504 tidhash_t *tidhash;
505 505 uint_t tidhash_sz;
506 506 ret_tidhash_t *ret_tidhash;
507 507 refstr_t *cwd;
508 508 hrtime_t hrutime, hrstime;
509 509 int evaporate;
510 510
511 511 /*
512 512 * Stop and discard the process's lwps except for the current one,
513 - * unless some other lwp beat us to it. If exitlwps() fails then
513 + * unless some other lwp beat us to it. If exitlwps() fails then
514 514 * return and the calling lwp will call (or continue in) lwp_exit().
515 515 */
516 516 proc_is_exiting(p);
517 517 if (exitlwps(0) != 0)
518 518 return (1);
519 519
520 520 mutex_enter(&p->p_lock);
521 521 if (p->p_ttime > 0) {
522 522 /*
523 523 * Account any remaining ticks charged to this process
524 524 * on its way out.
525 525 */
526 526 (void) task_cpu_time_incr(p->p_task, p->p_ttime);
527 527 p->p_ttime = 0;
528 528 }
529 529 mutex_exit(&p->p_lock);
530 530
531 + /*
532 + * Don't let init exit unless zone_start_init() failed its exec, or
533 + * we are shutting down the zone or the machine.
534 + *
535 + * Since we are single threaded, we don't need to lock the
536 + * following accesses to zone_proc_initpid.
537 + */
531 538 if (p->p_pid == z->zone_proc_initpid) {
532 539 /* If zone's init restarts, we're done here. */
533 540 if (zone_init_exit(z, why, what))
534 541 return (0);
535 542 }
536 543
537 544 /*
538 545 * Delay firing probes (and performing brand cleanup) until after the
539 546 * zone_proc_initpid check. Cases which result in zone shutdown or
540 547 * restart via zone_kadmin eventually result in a call back to
541 548 * proc_exit.
542 549 */
543 550 DTRACE_PROC(lwp__exit);
544 551 DTRACE_PROC1(exit, int, why);
545 552
546 553 /*
547 554 * Will perform any brand specific proc exit processing. Since this
548 555 * is always the last lwp, will also perform lwp exit/free and proc
549 556 * exit. Brand data will be freed when the process is reaped.
550 557 */
551 558 if (PROC_IS_BRANDED(p)) {
552 559 BROP(p)->b_lwpexit(lwp);
553 560 BROP(p)->b_proc_exit(p);
554 561 /*
555 562 * To ensure that b_proc_exit has access to brand-specific data
556 563 * contained by the one remaining lwp, call the freelwp hook as
557 564 * the last part of this clean-up process.
558 565 */
559 566 BROP(p)->b_freelwp(lwp);
560 567 lwp_detach_brand_hdlrs(lwp);
561 568 }
562 569
563 570 lwp_pcb_exit();
564 571
565 572 /*
566 573 * Allocate a sigqueue now, before we grab locks.
567 574 * It will be given to sigcld(), below.
568 575 * Special case: If we will be making the process disappear
569 576 * without a trace because it is either:
570 577 * * an exiting SSYS process, or
571 578 * * a posix_spawn() vfork child who requests it,
572 579 * we don't bother to allocate a useless sigqueue.
573 580 */
574 581 evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
575 582 why == CLD_EXITED && what == _EVAPORATE);
576 583 if (!evaporate)
577 584 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
578 585
579 586 /*
580 587 * revoke any doors created by the process.
581 588 */
582 589 if (p->p_door_list)
583 590 door_exit();
584 591
585 592 /*
586 593 * Release schedctl data structures.
587 594 */
588 595 if (p->p_pagep)
589 596 schedctl_proc_cleanup();
590 597
591 598 /*
592 599 * make sure all pending kaio has completed.
593 600 */
594 601 if (p->p_aio)
595 602 aio_cleanup_exit();
596 603
597 604 /*
598 605 * discard the lwpchan cache.
599 606 */
600 607 if (p->p_lcp != NULL)
601 608 lwpchan_destroy_cache(0);
602 609
603 610 /*
604 611 * Clean up any DTrace helper actions or probes for the process.
605 612 */
606 613 if (p->p_dtrace_helpers != NULL) {
607 614 ASSERT(dtrace_helpers_cleanup != NULL);
608 615 (*dtrace_helpers_cleanup)(p);
609 616 }
610 617
611 618 /*
612 619 * Clean up any signalfd state for the process.
613 620 */
614 621 if (p->p_sigfd != NULL) {
615 622 VERIFY(sigfd_exit_helper != NULL);
616 623 (*sigfd_exit_helper)();
617 624 }
618 625
619 626 /* untimeout the realtime timers */
620 627 if (p->p_itimer != NULL)
621 628 timer_exit();
622 629
623 630 if ((tmp_id = p->p_alarmid) != 0) {
624 631 p->p_alarmid = 0;
625 632 (void) untimeout(tmp_id);
626 633 }
627 634
628 635 /*
629 636 * If we had generated any upanic(2) state, free that now.
630 637 */
631 638 if (p->p_upanic != NULL) {
632 639 kmem_free(p->p_upanic, PRUPANIC_BUFLEN);
633 640 p->p_upanic = NULL;
634 641 }
635 642
636 643 /*
637 644 * Remove any fpollinfo_t's for this (last) thread from our file
638 645 * descriptors so closeall() can ASSERT() that they're all gone.
639 646 */
640 647 pollcleanup();
641 648
642 649 if (p->p_rprof_cyclic != CYCLIC_NONE) {
643 650 mutex_enter(&cpu_lock);
644 651 cyclic_remove(p->p_rprof_cyclic);
645 652 mutex_exit(&cpu_lock);
646 653 }
647 654
648 655 mutex_enter(&p->p_lock);
649 656
650 657 /*
651 658 * Clean up any DTrace probes associated with this process.
652 659 */
653 660 if (p->p_dtrace_probes) {
654 661 ASSERT(dtrace_fasttrap_exit_ptr != NULL);
655 662 dtrace_fasttrap_exit_ptr(p);
656 663 }
657 664
658 665 while ((tmp_id = p->p_itimerid) != 0) {
659 666 p->p_itimerid = 0;
660 667 mutex_exit(&p->p_lock);
661 668 (void) untimeout(tmp_id);
662 669 mutex_enter(&p->p_lock);
663 670 }
664 671
665 672 lwp_cleanup();
666 673
667 674 /*
668 675 * We are about to exit; prevent our resource associations from
669 676 * being changed.
670 677 */
671 678 pool_barrier_enter();
672 679
673 680 /*
674 681 * Block the process against /proc now that we have really
675 682 * acquired p->p_lock (to manipulate p_tlist at least).
676 683 */
677 684 prbarrier(p);
678 685
679 686 sigfillset(&p->p_ignore);
680 687 sigemptyset(&p->p_siginfo);
681 688 sigemptyset(&p->p_sig);
682 689 sigemptyset(&p->p_extsig);
683 690 sigemptyset(&t->t_sig);
684 691 sigemptyset(&t->t_extsig);
685 692 sigemptyset(&p->p_sigmask);
686 693 sigdelq(p, t, 0);
687 694 lwp->lwp_cursig = 0;
688 695 lwp->lwp_extsig = 0;
689 696 p->p_flag &= ~(SKILLED | SEXTKILLED);
690 697 if (lwp->lwp_curinfo) {
691 698 siginfofree(lwp->lwp_curinfo);
692 699 lwp->lwp_curinfo = NULL;
693 700 }
694 701
695 702 t->t_proc_flag |= TP_LWPEXIT;
696 703 ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
697 704 prlwpexit(t); /* notify /proc */
698 705 lwp_hash_out(p, t->t_tid);
699 706 prexit(p);
700 707
701 708 p->p_lwpcnt = 0;
702 709 p->p_tlist = NULL;
703 710 sigqfree(p);
704 711 term_mstate(t);
705 712 p->p_mterm = gethrtime();
706 713
707 714 exec_vp = p->p_exec;
708 715 execdir_vp = p->p_execdir;
709 716 p->p_exec = NULLVP;
710 717 p->p_execdir = NULLVP;
711 718 mutex_exit(&p->p_lock);
712 719
713 720 pr_free_watched_pages(p);
714 721
715 722 closeall(P_FINFO(p));
716 723
717 724 /* Free the controlling tty. (freectty() always assumes curproc.) */
718 725 ASSERT(p == curproc);
719 726 (void) freectty(B_TRUE);
720 727
721 728 #if defined(__sparc)
722 729 if (p->p_utraps != NULL)
723 730 utrap_free(p);
724 731 #endif
725 732 if (p->p_semacct) /* IPC semaphore exit */
726 733 semexit(p);
727 734 rv = wstat(why, what);
728 735
729 736 acct(rv);
730 737 exacct_commit_proc(p, rv);
731 738
732 739 /*
733 740 * Release any resources associated with C2 auditing
734 741 */
735 742 if (AU_AUDITING()) {
736 743 /*
737 744 * audit exit system call
738 745 */
739 746 audit_exit(why, what);
740 747 }
741 748
742 749 /*
743 750 * Free address space.
744 751 */
745 752 relvm();
746 753
747 754 if (exec_vp) {
748 755 /*
749 756 * Close this executable which has been opened when the process
750 757 * was created by getproc().
751 758 */
752 759 (void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
753 760 VN_RELE(exec_vp);
754 761 }
755 762 if (execdir_vp)
756 763 VN_RELE(execdir_vp);
757 764
758 765 /*
759 766 * Release held contracts.
760 767 */
761 768 contract_exit(p);
762 769
763 770 /*
764 771 * Depart our encapsulating process contract.
765 772 */
766 773 if ((p->p_flag & SSYS) == 0) {
767 774 ASSERT(p->p_ct_process);
768 775 contract_process_exit(p->p_ct_process, p, rv);
769 776 }
770 777
771 778 /*
772 779 * Remove pool association, and block if requested by pool_do_bind.
773 780 */
774 781 mutex_enter(&p->p_lock);
775 782 ASSERT(p->p_pool->pool_ref > 0);
776 783 atomic_dec_32(&p->p_pool->pool_ref);
777 784 p->p_pool = pool_default;
778 785 /*
779 786 * Now that our address space has been freed and all other threads
780 787 * in this process have exited, set the PEXITED pool flag. This
781 788 * tells the pools subsystems to ignore this process if it was
782 789 * requested to rebind this process to a new pool.
783 790 */
784 791 p->p_poolflag |= PEXITED;
785 792 pool_barrier_exit();
786 793 mutex_exit(&p->p_lock);
787 794
788 795 mutex_enter(&pidlock);
789 796
790 797 /*
791 798 * Delete this process from the newstate list of its parent. We
792 799 * will put it in the right place in the sigcld in the end.
793 800 */
794 801 delete_ns(p->p_parent, p);
795 802
796 803 /*
797 804 * Reassign the orphans to the next of kin.
798 805 * Don't rearrange init's orphanage.
799 806 */
800 807 if ((q = p->p_orphan) != NULL && p != proc_init) {
801 808
802 809 proc_t *nokp = p->p_nextofkin;
803 810
804 811 for (;;) {
805 812 q->p_nextofkin = nokp;
806 813 if (q->p_nextorph == NULL)
807 814 break;
808 815 q = q->p_nextorph;
809 816 }
810 817 q->p_nextorph = nokp->p_orphan;
811 818 nokp->p_orphan = p->p_orphan;
812 819 p->p_orphan = NULL;
813 820 }
814 821
815 822 /*
816 823 * Reassign the children to init.
817 824 * Don't try to assign init's children to init.
818 825 */
819 826 if ((q = p->p_child) != NULL && p != proc_init) {
820 827 struct proc *np;
821 828 struct proc *initp = proc_init;
822 829 pid_t zone_initpid = 1;
823 830 struct proc *zoneinitp = NULL;
824 831 boolean_t setzonetop = B_FALSE;
825 832
826 833 if (!INGLOBALZONE(curproc)) {
827 834 zone_initpid = curproc->p_zone->zone_proc_initpid;
828 835
829 836 ASSERT(MUTEX_HELD(&pidlock));
830 837 zoneinitp = prfind(zone_initpid);
831 838 if (zoneinitp != NULL) {
832 839 initp = zoneinitp;
833 840 } else {
834 841 zone_initpid = 1;
835 842 setzonetop = B_TRUE;
836 843 }
837 844 }
838 845
839 846 pgdetach(p);
840 847
841 848 do {
842 849 np = q->p_sibling;
843 850 /*
844 851 * Delete it from its current parent new state
845 852 * list and add it to init new state list
846 853 */
847 854 delete_ns(q->p_parent, q);
848 855
849 856 q->p_ppid = zone_initpid;
850 857
851 858 q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
852 859 if (setzonetop) {
853 860 mutex_enter(&q->p_lock);
854 861 q->p_flag |= SZONETOP;
855 862 mutex_exit(&q->p_lock);
856 863 }
857 864 q->p_parent = initp;
858 865
859 866 /*
860 867 * Since q will be the first child,
861 868 * it will not have a previous sibling.
862 869 */
863 870 q->p_psibling = NULL;
864 871 if (initp->p_child) {
865 872 initp->p_child->p_psibling = q;
866 873 }
867 874 q->p_sibling = initp->p_child;
868 875 initp->p_child = q;
869 876 if (q->p_proc_flag & P_PR_PTRACE) {
870 877 mutex_enter(&q->p_lock);
871 878 sigtoproc(q, NULL, SIGKILL);
872 879 mutex_exit(&q->p_lock);
873 880 }
874 881 /*
875 882 * sigcld() will add the child to parents
876 883 * newstate list.
877 884 */
878 885 if (q->p_stat == SZOMB)
879 886 sigcld(q, NULL);
880 887 } while ((q = np) != NULL);
881 888
882 889 p->p_child = NULL;
883 890 ASSERT(p->p_child_ns == NULL);
884 891 }
885 892
886 893 TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);
887 894
888 895 mutex_enter(&p->p_lock);
889 896 CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */
890 897
891 898 /*
892 899 * Have our task accummulate our resource usage data before they
893 900 * become contaminated by p_cacct etc., and before we renounce
894 901 * membership of the task.
895 902 *
896 903 * We do this regardless of whether or not task accounting is active.
897 904 * This is to avoid having nonsense data reported for this task if
898 905 * task accounting is subsequently enabled. The overhead is minimal;
899 906 * by this point, this process has accounted for the usage of all its
900 907 * LWPs. We nonetheless do the work here, and under the protection of
901 908 * pidlock, so that the movement of the process's usage to the task
902 909 * happens at the same time as the removal of the process from the
903 910 * task, from the point of view of exacct_snapshot_task_usage().
904 911 */
905 912 exacct_update_task_mstate(p);
906 913
907 914 hrutime = mstate_aggr_state(p, LMS_USER);
908 915 hrstime = mstate_aggr_state(p, LMS_SYSTEM);
909 916 p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
910 917 p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;
911 918
912 919 p->p_acct[LMS_USER] += p->p_cacct[LMS_USER];
913 920 p->p_acct[LMS_SYSTEM] += p->p_cacct[LMS_SYSTEM];
914 921 p->p_acct[LMS_TRAP] += p->p_cacct[LMS_TRAP];
915 922 p->p_acct[LMS_TFAULT] += p->p_cacct[LMS_TFAULT];
916 923 p->p_acct[LMS_DFAULT] += p->p_cacct[LMS_DFAULT];
917 924 p->p_acct[LMS_KFAULT] += p->p_cacct[LMS_KFAULT];
918 925 p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
919 926 p->p_acct[LMS_SLEEP] += p->p_cacct[LMS_SLEEP];
920 927 p->p_acct[LMS_WAIT_CPU] += p->p_cacct[LMS_WAIT_CPU];
921 928 p->p_acct[LMS_STOPPED] += p->p_cacct[LMS_STOPPED];
922 929
923 930 p->p_ru.minflt += p->p_cru.minflt;
924 931 p->p_ru.majflt += p->p_cru.majflt;
925 932 p->p_ru.nswap += p->p_cru.nswap;
926 933 p->p_ru.inblock += p->p_cru.inblock;
927 934 p->p_ru.oublock += p->p_cru.oublock;
928 935 p->p_ru.msgsnd += p->p_cru.msgsnd;
929 936 p->p_ru.msgrcv += p->p_cru.msgrcv;
930 937 p->p_ru.nsignals += p->p_cru.nsignals;
931 938 p->p_ru.nvcsw += p->p_cru.nvcsw;
932 939 p->p_ru.nivcsw += p->p_cru.nivcsw;
933 940 p->p_ru.sysc += p->p_cru.sysc;
934 941 p->p_ru.ioch += p->p_cru.ioch;
935 942
936 943 p->p_stat = SZOMB;
937 944 p->p_proc_flag &= ~P_PR_PTRACE;
938 945 p->p_wdata = what;
939 946 p->p_wcode = (char)why;
940 947
941 948 cdir = PTOU(p)->u_cdir;
942 949 rdir = PTOU(p)->u_rdir;
943 950 cwd = PTOU(p)->u_cwd;
944 951
945 952 ASSERT(cdir != NULL || p->p_parent == &p0);
946 953
947 954 /*
948 955 * Release resource controls, as they are no longer enforceable.
949 956 */
950 957 rctl_set_free(p->p_rctls);
951 958
952 959 /*
953 960 * Decrement tk_nlwps counter for our task.max-lwps resource control.
954 961 * An extended accounting record, if that facility is active, is
955 962 * scheduled to be written. We cannot give up task and project
956 963 * membership at this point because that would allow zombies to escape
957 964 * from the max-processes resource controls. Zombies stay in their
958 965 * current task and project until the process table slot is released
959 966 * in freeproc().
960 967 */
961 968 tk = p->p_task;
962 969
963 970 mutex_enter(&p->p_zone->zone_nlwps_lock);
964 971 tk->tk_nlwps--;
965 972 tk->tk_proj->kpj_nlwps--;
966 973 p->p_zone->zone_nlwps--;
967 974 mutex_exit(&p->p_zone->zone_nlwps_lock);
968 975
969 976 /*
970 977 * Clear the lwp directory and the lwpid hash table
971 978 * now that /proc can't bother us any more.
972 979 * We free the memory below, after dropping p->p_lock.
973 980 */
974 981 lwpdir = p->p_lwpdir;
975 982 lwpdir_sz = p->p_lwpdir_sz;
976 983 tidhash = p->p_tidhash;
977 984 tidhash_sz = p->p_tidhash_sz;
978 985 ret_tidhash = p->p_ret_tidhash;
979 986 p->p_lwpdir = NULL;
980 987 p->p_lwpfree = NULL;
981 988 p->p_lwpdir_sz = 0;
982 989 p->p_tidhash = NULL;
983 990 p->p_tidhash_sz = 0;
984 991 p->p_ret_tidhash = NULL;
985 992
986 993 /*
987 994 * If the process has context ops installed, call the exit routine
988 995 * on behalf of this last remaining thread. Normally exitpctx() is
989 996 * called during thread_exit() or lwp_exit(), but because this is the
990 997 * last thread in the process, we must call it here. By the time
991 998 * thread_exit() is called (below), the association with the relevant
992 999 * process has been lost.
993 1000 *
994 1001 * We also free the context here.
995 1002 */
996 1003 if (p->p_pctx) {
997 1004 kpreempt_disable();
|
↓ open down ↓ |
457 lines elided |
↑ open up ↑ |
998 1005 exitpctx(p);
999 1006 kpreempt_enable();
1000 1007
1001 1008 freepctx(p, 0);
1002 1009 }
1003 1010
1004 1011 /*
1005 1012 * curthread's proc pointer is changed to point to the 'sched'
1006 1013 * process for the corresponding zone, except in the case when
1007 1014 * the exiting process is in fact a zsched instance, in which
1008 - * case the proc pointer is set to p0. We do so, so that the
1015 + * case the proc pointer is set to p0. We do so, so that the
1009 1016 * process still points at the right zone when we call the VN_RELE()
1010 1017 * below.
1011 1018 *
1012 1019 * This is because curthread's original proc pointer can be freed as
1013 1020 * soon as the child sends a SIGCLD to its parent. We use zsched so
1014 1021 * that for user processes, even in the final moments of death, the
1015 1022 * process is still associated with its zone.
1016 1023 */
1017 1024 if (p != t->t_procp->p_zone->zone_zsched)
1018 1025 t->t_procp = t->t_procp->p_zone->zone_zsched;
1019 1026 else
1020 1027 t->t_procp = &p0;
1021 1028
1022 1029 mutex_exit(&p->p_lock);
1023 1030 if (!evaporate) {
1024 1031 /*
1025 1032 * The brand specific code only happens when the brand has a
1026 1033 * function to call in place of sigcld and the parent of the
1027 1034 * exiting process is not the global zone init. If the parent
1028 1035 * is the global zone init, then the process was reparented,
1029 1036 * and we don't want brand code delivering possibly strange
1030 1037 * signals to init. Also, init is not branded, so any brand
1031 1038 * specific exit data will not be picked up by init anyway.
1032 1039 */
1033 1040 if (PROC_IS_BRANDED(p) &&
1034 1041 BROP(p)->b_exit_with_sig != NULL &&
1035 1042 p->p_ppid != 1) {
1036 1043 /*
1037 1044 * The code for _fini that could unload the brand_t
1038 1045 * blocks until the count of zones using the module
1039 1046 * reaches zero. Zones decrement the refcount on their
1040 1047 * brands only after all user tasks in that zone have
1041 1048 * exited and been waited on. The decrement on the
1042 1049 * brand's refcount happen in zone_destroy(). That
1043 1050 * depends on zone_shutdown() having been completed.
1044 1051 * zone_shutdown() includes a call to zone_empty(),
1045 1052 * where the zone waits for itself to reach the state
1046 1053 * ZONE_IS_EMPTY. This state is only set in either
1047 1054 * zone_shutdown(), when there are no user processes as
1048 1055 * the zone enters this function, or in
1049 1056 * zone_task_rele(). zone_task_rele() is called from
1050 1057 * code triggered by waiting on processes, not by the
1051 1058 * processes exiting through proc_exit(). This means
1052 1059 * all the branded processes that could exist for a
1053 1060 * specific brand_t must exit and get reaped before the
1054 1061 * refcount on the brand_t can reach 0. _fini will
1055 1062 * never unload the corresponding brand module before
1056 1063 * proc_exit finishes execution for all processes
1057 1064 * branded with a particular brand_t, which makes the
1058 1065 * operation below safe to do. Brands that wish to use
1059 1066 * this mechanism must wait in _fini as described
1060 1067 * above.
1061 1068 */
1062 1069 BROP(p)->b_exit_with_sig(p, sqp);
1063 1070 } else {
1064 1071 p->p_pidflag &= ~CLDPEND;
1065 1072 sigcld(p, sqp);
1066 1073 }
1067 1074
1068 1075 } else {
1069 1076 /*
1070 1077 * Do what sigcld() would do if the disposition
1071 1078 * of the SIGCHLD signal were set to be ignored.
1072 1079 */
1073 1080 cv_broadcast(&p->p_srwchan_cv);
1074 1081 freeproc(p);
1075 1082 }
1076 1083 mutex_exit(&pidlock);
1077 1084
1078 1085 /*
1079 1086 * We don't release u_cdir and u_rdir until SZOMB is set.
1080 1087 * This protects us against dofusers().
1081 1088 */
|
↓ open down ↓ |
63 lines elided |
↑ open up ↑ |
1082 1089 if (cdir)
1083 1090 VN_RELE(cdir);
1084 1091 if (rdir)
1085 1092 VN_RELE(rdir);
1086 1093 if (cwd)
1087 1094 refstr_rele(cwd);
1088 1095
1089 1096 /*
1090 1097 * task_rele() may ultimately cause the zone to go away (or
1091 1098 * may cause the last user process in a zone to go away, which
1092 - * signals zsched to go away). So prior to this call, we must
1099 + * signals zsched to go away). So prior to this call, we must
1093 1100 * no longer point at zsched.
1094 1101 */
1095 1102 t->t_procp = &p0;
1096 1103
1097 1104 kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
1098 1105 kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
1099 1106 while (ret_tidhash != NULL) {
1100 1107 ret_tidhash_t *next = ret_tidhash->rth_next;
1101 1108 kmem_free(ret_tidhash->rth_tidhash,
1102 1109 ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
1103 1110 kmem_free(ret_tidhash, sizeof (*ret_tidhash));
1104 1111 ret_tidhash = next;
1105 1112 }
1106 1113
1107 1114 thread_exit();
1108 1115 /* NOTREACHED */
1109 1116 }
1110 1117
1111 1118 /*
1112 1119 * Format siginfo structure for wait system calls.
1113 1120 */
1114 1121 void
1115 1122 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
1116 1123 {
1117 1124 ASSERT(MUTEX_HELD(&pidlock));
1118 1125
1119 1126 bzero(ip, sizeof (k_siginfo_t));
1120 1127 ip->si_signo = SIGCLD;
1121 1128 ip->si_code = pp->p_wcode;
1122 1129 ip->si_pid = pp->p_pid;
1123 1130 ip->si_ctid = PRCTID(pp);
1124 1131 ip->si_zoneid = pp->p_zone->zone_id;
1125 1132 ip->si_status = pp->p_wdata;
1126 1133 ip->si_stime = pp->p_stime;
1127 1134 ip->si_utime = pp->p_utime;
1128 1135
1129 1136 if (waitflag) {
1130 1137 pp->p_wcode = 0;
1131 1138 pp->p_wdata = 0;
1132 1139 pp->p_pidflag &= ~CLDPEND;
1133 1140 }
1134 1141 }
1135 1142
/*
 * Wait system call.
 * Search for a terminated (zombie) child,
 * finally lay it to rest, and collect its status.
 * Look also for stopped children,
 * and pass back status from them.
 *
 * idtype/id select the children of interest (one pid, one process
 * group, or all children); options is a mask of W* flags (WEXITED,
 * WSTOPPED, WTRAPPED, WCONTINUED, WNOHANG, WNOWAIT).  On success the
 * child's status is written to *ip and 0 is returned; otherwise an
 * errno value (EINVAL, ECHILD, EINTR) is returned directly.
 */
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
	proc_t *cp, *pp;
	/* WNOWAIT means "peek": leave the child waitable afterwards. */
	int waitflag = !(options & WNOWAIT);
	boolean_t have_brand_helper = B_FALSE;

	/*
	 * Obsolete flag, defined here only for binary compatibility
	 * with old statically linked executables. Delete this when
	 * we no longer care about these old and broken applications.
	 */
#define	_WNOCHLD	0400
	options &= ~_WNOCHLD;

	/* At least one event must be requested and no unknown bits set. */
	if (options == 0 || (options & ~WOPTMASK))
		return (EINVAL);

	switch (idtype) {
	case P_PID:
	case P_PGID:
		if (id < 0 || id >= maxpid)
			return (EINVAL);
		/* FALLTHROUGH */
	case P_ALL:
		break;
	default:
		return (EINVAL);
	}

	pp = ttoproc(curthread);

	/*
	 * Anytime you are looking for a process, you take pidlock to prevent
	 * things from changing as you look.
	 */
	mutex_enter(&pidlock);

	/*
	 * if we are only looking for exited processes and child_ns list
	 * is empty no reason to look at all children.
	 */
	if (idtype == P_ALL &&
	    (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
	    pp->p_child_ns == NULL) {
		if (pp->p_child) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			return (0);
		}
		mutex_exit(&pidlock);
		return (ECHILD);
	}

	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
		have_brand_helper = B_TRUE;
	}

	/*
	 * Loop until a matching status is found, WNOHANG short-circuits,
	 * there is provably nothing to wait for, or a signal arrives.
	 */
	while (pp->p_child != NULL || have_brand_helper) {
		boolean_t brand_wants_wait = B_FALSE;
		int proc_gone = 0;
		int found = 0;

		/*
		 * Give the brand a chance to return synthetic results from
		 * this waitid() call before we do the real thing.
		 */
		if (have_brand_helper) {
			int ret;

			if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
			    &brand_wants_wait, &ret) == 0) {
				mutex_exit(&pidlock);
				return (ret);
			}

			if (pp->p_child == NULL) {
				goto no_real_children;
			}
		}

		/*
		 * Look for interesting children in the newstate list.
		 */
		VERIFY(pp->p_child != NULL);
		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
				continue;
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;
			if (PROC_IS_BRANDED(pp)) {
				if (BROP(pp)->b_wait_filter != NULL &&
				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
					continue;
			}

			switch (cp->p_wcode) {

			case CLD_TRAPPED:
			case CLD_STOPPED:
			case CLD_CONTINUED:
				/* Only exit states belong on p_child_ns. */
				cmn_err(CE_PANIC,
				    "waitid: wrong state %d on the p_newstate"
				    " list", cp->p_wcode);
				break;

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (!(options & WEXITED)) {
					/*
					 * Count how many are already gone
					 * for good.
					 */
					proc_gone++;
					break;
				}
				if (!waitflag) {
					winfo(cp, ip, 0);
				} else {
					winfo(cp, ip, 1);
					freeproc(cp);
				}
				mutex_exit(&pidlock);
				if (waitflag) {	/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);
			}

			if (idtype == P_PID)
				break;
		}

		/*
		 * Wow! None of the threads on the p_sibling_ns list were
		 * interesting threads. Check all the kids!
		 */
		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;
			if (PROC_IS_BRANDED(pp)) {
				if (BROP(pp)->b_wait_filter != NULL &&
				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
					continue;
			}

			switch (cp->p_wcode) {
			case CLD_TRAPPED:
				if (!(options & WTRAPPED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {	/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_STOPPED:
				if (!(options & WSTOPPED))
					break;
				/* Is it still stopped? */
				mutex_enter(&cp->p_lock);
				if (!jobstopped(cp)) {
					mutex_exit(&cp->p_lock);
					break;
				}
				mutex_exit(&cp->p_lock);
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {	/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_CONTINUED:
				if (!(options & WCONTINUED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {	/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (idtype != P_PID &&
				    (cp->p_pidflag & CLDWAITPID))
					continue;
				/*
				 * Don't complain if a process was found in
				 * the first loop but we broke out of the loop
				 * because of the arguments passed to us.
				 */
				if (proc_gone == 0) {
					cmn_err(CE_PANIC,
					    "waitid: wrong state on the"
					    " p_child list");
				} else {
					break;
				}
			}

			found++;

			if (idtype == P_PID)
				break;
		}

no_real_children:
		/*
		 * If we found no interesting processes at all,
		 * break out and return ECHILD.
		 */
		if (!brand_wants_wait && (found + proc_gone == 0))
			break;

		if (options & WNOHANG) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			/*
			 * We should set ip->si_signo = SIGCLD,
			 * but there is an SVVS test that expects
			 * ip->si_signo to be zero in this case.
			 */
			return (0);
		}

		/*
		 * If we found no processes of interest that could
		 * change state while we wait, we don't wait at all.
		 * Get out with ECHILD according to SVID.
		 */
		if (!brand_wants_wait && (found == proc_gone))
			break;

		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
			mutex_exit(&pidlock);
			return (EINTR);
		}
	}
	mutex_exit(&pidlock);
	return (ECHILD);
}
1397 1404
1398 1405 int
1399 1406 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1400 1407 {
1401 1408 int error;
1402 1409 k_siginfo_t info;
1403 1410
1404 1411 if (error = waitid(idtype, id, &info, options))
1405 1412 return (set_errno(error));
1406 1413 if (copyout(&info, infop, sizeof (k_siginfo_t)))
1407 1414 return (set_errno(EFAULT));
1408 1415 return (0);
1409 1416 }
1410 1417
1411 1418 #ifdef _SYSCALL32_IMPL
1412 1419
1413 1420 int
1414 1421 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1415 1422 {
1416 1423 int error;
1417 1424 k_siginfo_t info;
1418 1425 siginfo32_t info32;
1419 1426
1420 1427 if (error = waitid(idtype, id, &info, options))
1421 1428 return (set_errno(error));
1422 1429 siginfo_kto32(&info, &info32);
1423 1430 if (copyout(&info32, infop, sizeof (info32)))
1424 1431 return (set_errno(EFAULT));
1425 1432 return (0);
1426 1433 }
1427 1434
1428 1435 #endif /* _SYSCALL32_IMPL */
1429 1436
1430 1437 void
1431 1438 proc_detach(proc_t *p)
1432 1439 {
1433 1440 proc_t *q;
1434 1441
1435 1442 ASSERT(MUTEX_HELD(&pidlock));
1436 1443
1437 1444 q = p->p_parent;
1438 1445 ASSERT(q != NULL);
1439 1446
1440 1447 /*
1441 1448 * Take it off the newstate list of its parent
1442 1449 */
1443 1450 delete_ns(q, p);
1444 1451
1445 1452 if (q->p_child == p) {
1446 1453 q->p_child = p->p_sibling;
1447 1454 /*
1448 1455 * If the parent has no children, it better not
1449 1456 * have any with new states either!
1450 1457 */
1451 1458 ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1452 1459 }
1453 1460
1454 1461 if (p->p_sibling) {
1455 1462 p->p_sibling->p_psibling = p->p_psibling;
1456 1463 }
1457 1464
1458 1465 if (p->p_psibling) {
1459 1466 p->p_psibling->p_sibling = p->p_sibling;
1460 1467 }
1461 1468 }
1462 1469
/*
 * Remove zombie children from the process table.
 *
 * Called (with pidlock held) once a zombie's status has been accepted:
 * releases its remaining per-process state, rolls its resource usage
 * into its next-of-kin, detaches it from parent/orphan lists, and
 * frees its pid and proc-table slot.
 */
void
freeproc(proc_t *p)
{
	proc_t *q;
	task_t *tk;

	ASSERT(p->p_stat == SZOMB);
	ASSERT(p->p_tlist == NULL);
	ASSERT(MUTEX_HELD(&pidlock));

	/* Discard any still-queued signals, including a pending kill. */
	sigdelq(p, NULL, 0);
	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	/* Clear any remaining brand data */
	if (PROC_IS_BRANDED(p)) {
		brand_clearbrand(p, B_FALSE);
	}


	prfree(p);	/* inform /proc */

	/*
	 * Don't free the init processes.
	 * Other dying processes will access it.
	 */
	if (p == proc_init)
		return;


	/*
	 * We wait until now to free the cred structure because a
	 * zombie process's credentials may be examined by /proc.
	 * No cred locking needed because there are no threads at this point.
	 */
	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
	crfree(p->p_cred);
	if (p->p_corefile != NULL) {
		corectl_path_rele(p->p_corefile);
		p->p_corefile = NULL;
	}
	if (p->p_content != NULL) {
		corectl_content_rele(p->p_content);
		p->p_content = NULL;
	}

	/*
	 * Roll the zombie's CPU time, microstate accounting and resource
	 * usage into its next-of-kin's child totals, unless the kin has
	 * disclaimed interest (SNOWAIT or SIGCLD set to SIG_IGN).
	 */
	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
		/*
		 * This should still do the right thing since p_utime/stime
		 * get set to the correct value on process exit, so it
		 * should get properly updated
		 */
		p->p_nextofkin->p_cutime += p->p_utime;
		p->p_nextofkin->p_cstime += p->p_stime;

		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
		    += p->p_acct[LMS_USER_LOCK];
		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
		    += p->p_acct[LMS_WAIT_CPU];
		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];

		p->p_nextofkin->p_cru.minflt += p->p_ru.minflt;
		p->p_nextofkin->p_cru.majflt += p->p_ru.majflt;
		p->p_nextofkin->p_cru.nswap += p->p_ru.nswap;
		p->p_nextofkin->p_cru.inblock += p->p_ru.inblock;
		p->p_nextofkin->p_cru.oublock += p->p_ru.oublock;
		p->p_nextofkin->p_cru.msgsnd += p->p_ru.msgsnd;
		p->p_nextofkin->p_cru.msgrcv += p->p_ru.msgrcv;
		p->p_nextofkin->p_cru.nsignals += p->p_ru.nsignals;
		p->p_nextofkin->p_cru.nvcsw += p->p_ru.nvcsw;
		p->p_nextofkin->p_cru.nivcsw += p->p_ru.nivcsw;
		p->p_nextofkin->p_cru.sysc += p->p_ru.sysc;
		p->p_nextofkin->p_cru.ioch += p->p_ru.ioch;

	}

	/* Remove p from its next-of-kin's singly-linked orphan list. */
	q = p->p_nextofkin;
	if (q && q->p_orphan == p)
		q->p_orphan = p->p_nextorph;
	else if (q) {
		for (q = q->p_orphan; q; q = q->p_nextorph)
			if (q->p_nextorph == p)
				break;
		ASSERT(q && q->p_nextorph == p);
		q->p_nextorph = p->p_nextorph;
	}

	/*
	 * The process table slot is being freed, so it is now safe to give up
	 * task and project membership.
	 */
	mutex_enter(&p->p_lock);
	tk = p->p_task;
	task_detach(p);
	mutex_exit(&p->p_lock);

	proc_detach(p);
	pid_exit(p, tk);	/* frees pid and proc structure */

	task_rele(tk);
}
1577 1584
1578 1585 /*
1579 1586 * Delete process "child" from the newstate list of process "parent"
1580 1587 */
1581 1588 void
1582 1589 delete_ns(proc_t *parent, proc_t *child)
1583 1590 {
1584 1591 proc_t **ns;
1585 1592
1586 1593 ASSERT(MUTEX_HELD(&pidlock));
1587 1594 ASSERT(child->p_parent == parent);
1588 1595 for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1589 1596 if (*ns == child) {
1590 1597
1591 1598 ASSERT((*ns)->p_parent == parent);
1592 1599
1593 1600 *ns = child->p_sibling_ns;
1594 1601 child->p_sibling_ns = NULL;
1595 1602 return;
1596 1603 }
1597 1604 }
1598 1605 }
1599 1606
1600 1607 /*
1601 1608 * Add process "child" to the new state list of process "parent"
1602 1609 */
1603 1610 void
1604 1611 add_ns(proc_t *parent, proc_t *child)
1605 1612 {
1606 1613 ASSERT(child->p_sibling_ns == NULL);
1607 1614 child->p_sibling_ns = parent->p_child_ns;
1608 1615 parent->p_child_ns = child;
1609 1616 }
|
↓ open down ↓ |
507 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX