Print this page
OS-4818 contract template disappears on exec
OS-4460 exec brands processes that still have multiple threads
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4151 setbrand hooks should be sane during fork
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4144 panic in lx_freelwp during zone shutdown
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4129 lxbrand should not abuse p_brand_data for storing exit signal
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3140 In LX zone 'ps fax' does not show all processes
OS-3429 Expose zone's init exit status
OS-3149 lx brand always sends SIGCHLD to parent processes, regardless of how clone was invoked
OS-2887 lxbrand add WALL, WCLONE, WNOTHREAD support to waitid
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/os/exit.c
          +++ new/usr/src/uts/common/os/exit.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright (c) 2011, Joyent, Inc. All rights reserved.
       24 + * Copyright 2014 Joyent, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/param.h>
  31   31  #include <sys/sysmacros.h>
  32   32  #include <sys/systm.h>
  33   33  #include <sys/cred.h>
  34   34  #include <sys/user.h>
↓ open down ↓ 188 lines elided ↑ open up ↑
 223  223          /*
 224  224           * Clear the current signal, any signal info associated with it, and
 225  225           * any signal information from contracts and/or contract templates.
 226  226           */
 227  227          lwp->lwp_cursig = 0;
 228  228          lwp->lwp_extsig = 0;
 229  229          if (lwp->lwp_curinfo != NULL) {
 230  230                  siginfofree(lwp->lwp_curinfo);
 231  231                  lwp->lwp_curinfo = NULL;
 232  232          }
 233      -        lwp_ctmpl_clear(lwp);
      233 +        lwp_ctmpl_clear(lwp, B_FALSE);
 234  234  
 235  235          /*
 236  236           * Reset both the process root directory and the current working
 237  237           * directory to the root of the zone just as we do during boot.
 238  238           */
 239  239          VN_HOLD(p->p_zone->zone_rootvp);
 240  240          oldrd = up->u_rdir;
 241  241          up->u_rdir = p->p_zone->zone_rootvp;
 242  242  
 243  243          VN_HOLD(p->p_zone->zone_rootvp);
↓ open down ↓ 115 lines elided ↑ open up ↑
 359  359          if (p->p_ttime > 0) {
 360  360                  /*
 361  361                   * Account any remaining ticks charged to this process
 362  362                   * on its way out.
 363  363                   */
 364  364                  (void) task_cpu_time_incr(p->p_task, p->p_ttime);
 365  365                  p->p_ttime = 0;
 366  366          }
 367  367          mutex_exit(&p->p_lock);
 368  368  
 369      -        DTRACE_PROC(lwp__exit);
 370      -        DTRACE_PROC1(exit, int, why);
 371      -
 372  369          /*
 373      -         * Will perform any brand specific proc exit processing, since this
 374      -         * is always the last lwp, will also perform lwp_exit and free brand
 375      -         * data
 376      -         */
 377      -        if (PROC_IS_BRANDED(p)) {
 378      -                lwp_detach_brand_hdlrs(lwp);
 379      -                brand_clearbrand(p, B_FALSE);
 380      -        }
 381      -
 382      -        /*
 383  370           * Don't let init exit unless zone_start_init() failed its exec, or
 384  371           * we are shutting down the zone or the machine.
 385  372           *
 386  373           * Since we are single threaded, we don't need to lock the
 387  374           * following accesses to zone_proc_initpid.
 388  375           */
 389  376          if (p->p_pid == z->zone_proc_initpid) {
 390  377                  if (z->zone_boot_err == 0 &&
 391  378                      zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
 392  379                      zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
 393      -                        if (z->zone_restart_init == B_TRUE) {
 394      -                                if (restart_init(what, why) == 0)
 395      -                                        return (0);
      380 +
      381 +                        /*
      382 +                         * If the init process should be restarted, the
      383 +                         * "zone_restart_init" member will be set.  Some init
      384 +                         * programs in branded zones do not tolerate a restart
      385 +                         * in the traditional manner; setting the
      386 +                         * "zone_reboot_on_init_exit" member will cause the
      387 +                         * entire zone to be rebooted instead.  If neither of
      388 +                         * these flags is set the zone will shut down.
      389 +                         */
      390 +                        if (z->zone_reboot_on_init_exit == B_TRUE &&
      391 +                            z->zone_restart_init == B_TRUE) {
      392 +                                /*
      393 +                                 * Trigger a zone reboot and continue
      394 +                                 * with exit processing.
      395 +                                 */
      396 +                                z->zone_init_status = wstat(why, what);
      397 +                                (void) zone_kadmin(A_REBOOT, 0, NULL,
      398 +                                    zone_kcred());
      399 +
 396  400                          } else {
      401 +                                if (z->zone_restart_init == B_TRUE) {
      402 +                                        if (restart_init(what, why) == 0)
      403 +                                                return (0);
      404 +                                }
      405 +
      406 +                                z->zone_init_status = wstat(why, what);
 397  407                                  (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
 398      -                                    CRED());
      408 +                                    zone_kcred());
 399  409                          }
 400  410                  }
 401  411  
 402  412                  /*
 403  413                   * Since we didn't or couldn't restart init, we clear
 404  414                   * the zone's init state and proceed with exit
 405  415                   * processing.
 406  416                   */
 407  417                  z->zone_proc_initpid = -1;
 408  418          }
 409  419  
      420 +        /*
      421 +         * Delay firing probes (and performing brand cleanup) until after the
      422 +         * zone_proc_initpid check. Cases which result in zone shutdown or
      423 +         * restart via zone_kadmin eventually result in a call back to
      424 +         * proc_exit.
      425 +         */
      426 +        DTRACE_PROC(lwp__exit);
      427 +        DTRACE_PROC1(exit, int, why);
      428 +
      429 +        /*
      430 +         * Will perform any brand specific proc exit processing. Since this
      431 +         * is always the last lwp, will also perform lwp exit/free and proc
      432 +         * exit. Brand data will be freed when the process is reaped.
      433 +         */
      434 +        if (PROC_IS_BRANDED(p)) {
      435 +                BROP(p)->b_lwpexit(lwp);
      436 +                BROP(p)->b_proc_exit(p);
      437 +                /*
      438 +                 * To ensure that b_proc_exit has access to brand-specific data
      439 +                 * contained by the one remaining lwp, call the freelwp hook as
      440 +                 * the last part of this clean-up process.
      441 +                 */
      442 +                BROP(p)->b_freelwp(lwp);
      443 +                lwp_detach_brand_hdlrs(lwp);
      444 +        }
      445 +
 410  446          lwp_pcb_exit();
 411  447  
 412  448          /*
 413  449           * Allocate a sigqueue now, before we grab locks.
 414  450           * It will be given to sigcld(), below.
 415  451           * Special case:  If we will be making the process disappear
 416  452           * without a trace because it is either:
 417  453           *      * an exiting SSYS process, or
 418  454           *      * a posix_spawn() vfork child who requests it,
 419  455           * we don't bother to allocate a useless sigqueue.
↓ open down ↓ 231 lines elided ↑ open up ↑
 651  687                  p->p_orphan = NULL;
 652  688          }
 653  689  
 654  690          /*
 655  691           * Reassign the children to init.
 656  692           * Don't try to assign init's children to init.
 657  693           */
 658  694          if ((q = p->p_child) != NULL && p != proc_init) {
 659  695                  struct proc     *np;
 660  696                  struct proc     *initp = proc_init;
      697 +                pid_t           zone_initpid = 1;
      698 +                struct proc     *zoneinitp = NULL;
 661  699                  boolean_t       setzonetop = B_FALSE;
 662  700  
 663      -                if (!INGLOBALZONE(curproc))
 664      -                        setzonetop = B_TRUE;
      701 +                if (!INGLOBALZONE(curproc)) {
      702 +                        zone_initpid = curproc->p_zone->zone_proc_initpid;
 665  703  
      704 +                        ASSERT(MUTEX_HELD(&pidlock));
      705 +                        zoneinitp = prfind(zone_initpid);
      706 +                        if (zoneinitp != NULL) {
      707 +                                initp = zoneinitp;
      708 +                        } else {
      709 +                                zone_initpid = 1;
      710 +                                setzonetop = B_TRUE;
      711 +                        }
      712 +                }
      713 +
 666  714                  pgdetach(p);
 667  715  
 668  716                  do {
 669  717                          np = q->p_sibling;
 670  718                          /*
 671  719                           * Delete it from its current parent new state
 672  720                           * list and add it to init new state list
 673  721                           */
 674  722                          delete_ns(q->p_parent, q);
 675  723  
 676      -                        q->p_ppid = 1;
      724 +                        q->p_ppid = zone_initpid;
      725 +
 677  726                          q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
 678  727                          if (setzonetop) {
 679  728                                  mutex_enter(&q->p_lock);
 680  729                                  q->p_flag |= SZONETOP;
 681  730                                  mutex_exit(&q->p_lock);
 682  731                          }
 683  732                          q->p_parent = initp;
 684  733  
 685  734                          /*
 686  735                           * Since q will be the first child,
↓ open down ↓ 153 lines elided ↑ open up ↑
 840  889           * that for user processes, even in the final moments of death, the
 841  890           * process is still associated with its zone.
 842  891           */
 843  892          if (p != t->t_procp->p_zone->zone_zsched)
 844  893                  t->t_procp = t->t_procp->p_zone->zone_zsched;
 845  894          else
 846  895                  t->t_procp = &p0;
 847  896  
 848  897          mutex_exit(&p->p_lock);
 849  898          if (!evaporate) {
 850      -                p->p_pidflag &= ~CLDPEND;
 851      -                sigcld(p, sqp);
      899 +                /*
      900 +                 * The brand specific code only happens when the brand has a
      901 +                 * function to call in place of sigcld and the parent of the
      902 +                 * exiting process is not the global zone init. If the parent
      903 +                 * is the global zone init, then the process was reparented,
      904 +                 * and we don't want brand code delivering possibly strange
      905 +                 * signals to init. Also, init is not branded, so any brand
      906 +                 * specific exit data will not be picked up by init anyway.
      907 +                 */
      908 +                if (PROC_IS_BRANDED(p) &&
      909 +                    BROP(p)->b_exit_with_sig != NULL &&
      910 +                    p->p_ppid != 1) {
      911 +                        /*
      912 +                         * The code for _fini that could unload the brand_t
      913 +                         * blocks until the count of zones using the module
      914 +                         * reaches zero. Zones decrement the refcount on their
      915 +                         * brands only after all user tasks in that zone have
      916 +                         * exited and been waited on. The decrement on the
      917 +                         * brand's refcount happen in zone_destroy(). That
      918 +                         * depends on zone_shutdown() having been completed.
      919 +                         * zone_shutdown() includes a call to zone_empty(),
      920 +                         * where the zone waits for itself to reach the state
      921 +                         * ZONE_IS_EMPTY. This state is only set in either
      922 +                         * zone_shutdown(), when there are no user processes as
      923 +                         * the zone enters this function, or in
      924 +                         * zone_task_rele(). zone_task_rele() is called from
      925 +                         * code triggered by waiting on processes, not by the
      926 +                         * processes exiting through proc_exit().  This means
      927 +                         * all the branded processes that could exist for a
      928 +                         * specific brand_t must exit and get reaped before the
      929 +                         * refcount on the brand_t can reach 0. _fini will
      930 +                         * never unload the corresponding brand module before
      931 +                         * proc_exit finishes execution for all processes
      932 +                         * branded with a particular brand_t, which makes the
      933 +                         * operation below safe to do. Brands that wish to use
      934 +                         * this mechanism must wait in _fini as described
      935 +                         * above.
      936 +                         */
      937 +                        BROP(p)->b_exit_with_sig(p, sqp);
      938 +                } else {
      939 +                        p->p_pidflag &= ~CLDPEND;
      940 +                        sigcld(p, sqp);
      941 +                }
      942 +
 852  943          } else {
 853  944                  /*
 854  945                   * Do what sigcld() would do if the disposition
 855  946                   * of the SIGCHLD signal were set to be ignored.
 856  947                   */
 857  948                  cv_broadcast(&p->p_srwchan_cv);
 858  949                  freeproc(p);
 859  950          }
 860  951          mutex_exit(&pidlock);
 861  952  
↓ open down ↓ 58 lines elided ↑ open up ↑
 920 1011  /*
 921 1012   * Wait system call.
 922 1013   * Search for a terminated (zombie) child,
 923 1014   * finally lay it to rest, and collect its status.
 924 1015   * Look also for stopped children,
 925 1016   * and pass back status from them.
 926 1017   */
 927 1018  int
 928 1019  waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 929 1020  {
 930      -        int found;
 931 1021          proc_t *cp, *pp;
 932      -        int proc_gone;
 933 1022          int waitflag = !(options & WNOWAIT);
     1023 +        boolean_t have_brand_helper = B_FALSE;
 934 1024  
 935 1025          /*
 936 1026           * Obsolete flag, defined here only for binary compatibility
 937 1027           * with old statically linked executables.  Delete this when
 938 1028           * we no longer care about these old and broken applications.
 939 1029           */
 940 1030  #define _WNOCHLD        0400
 941 1031          options &= ~_WNOCHLD;
 942 1032  
 943 1033          if (options == 0 || (options & ~WOPTMASK))
↓ open down ↓ 7 lines elided ↑ open up ↑
 951 1041                  /* FALLTHROUGH */
 952 1042          case P_ALL:
 953 1043                  break;
 954 1044          default:
 955 1045                  return (EINVAL);
 956 1046          }
 957 1047  
 958 1048          pp = ttoproc(curthread);
 959 1049  
 960 1050          /*
 961      -         * lock parent mutex so that sibling chain can be searched.
     1051 +         * Anytime you are looking for a process, you take pidlock to prevent
     1052 +         * things from changing as you look.
 962 1053           */
 963 1054          mutex_enter(&pidlock);
 964 1055  
 965 1056          /*
 966 1057           * if we are only looking for exited processes and child_ns list
 967 1058           * is empty no reason to look at all children.
 968 1059           */
 969 1060          if (idtype == P_ALL &&
 970 1061              (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
 971 1062              pp->p_child_ns == NULL) {
 972 1063                  if (pp->p_child) {
 973 1064                          mutex_exit(&pidlock);
 974 1065                          bzero(ip, sizeof (k_siginfo_t));
 975 1066                          return (0);
 976 1067                  }
 977 1068                  mutex_exit(&pidlock);
 978 1069                  return (ECHILD);
 979 1070          }
 980 1071  
 981      -        while (pp->p_child != NULL) {
     1072 +        if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
     1073 +                have_brand_helper = B_TRUE;
     1074 +        }
 982 1075  
 983      -                proc_gone = 0;
     1076 +        while (pp->p_child != NULL || have_brand_helper) {
     1077 +                boolean_t brand_wants_wait = B_FALSE;
     1078 +                int proc_gone = 0;
     1079 +                int found = 0;
 984 1080  
     1081 +                /*
     1082 +                 * Give the brand a chance to return synthetic results from
     1083 +                 * this waitid() call before we do the real thing.
     1084 +                 */
     1085 +                if (have_brand_helper) {
     1086 +                        int ret;
     1087 +
     1088 +                        if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
     1089 +                            &brand_wants_wait, &ret) == 0) {
     1090 +                                mutex_exit(&pidlock);
     1091 +                                return (ret);
     1092 +                        }
     1093 +
     1094 +                        if (pp->p_child == NULL) {
     1095 +                                goto no_real_children;
     1096 +                        }
     1097 +                }
     1098 +
     1099 +                /*
     1100 +                 * Look for interesting children in the newstate list.
     1101 +                 */
     1102 +                VERIFY(pp->p_child != NULL);
 985 1103                  for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
 986 1104                          if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
 987 1105                                  continue;
 988 1106                          if (idtype == P_PID && id != cp->p_pid)
 989 1107                                  continue;
 990 1108                          if (idtype == P_PGID && id != cp->p_pgrp)
 991 1109                                  continue;
     1110 +                        if (PROC_IS_BRANDED(pp)) {
     1111 +                                if (BROP(pp)->b_wait_filter != NULL &&
     1112 +                                    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
     1113 +                                        continue;
     1114 +                        }
 992 1115  
 993 1116                          switch (cp->p_wcode) {
 994 1117  
 995 1118                          case CLD_TRAPPED:
 996 1119                          case CLD_STOPPED:
 997 1120                          case CLD_CONTINUED:
 998 1121                                  cmn_err(CE_PANIC,
 999 1122                                      "waitid: wrong state %d on the p_newstate"
1000 1123                                      " list", cp->p_wcode);
1001 1124                                  break;
↓ open down ↓ 24 lines elided ↑ open up ↑
1026 1149                          }
1027 1150  
1028 1151                          if (idtype == P_PID)
1029 1152                                  break;
1030 1153                  }
1031 1154  
1032 1155                  /*
 1033 1156                   * Wow! None of the processes on the p_sibling_ns list were
 1034 1157                   * interesting. Check all the kids!
1035 1158                   */
1036      -                found = 0;
1037 1159                  for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
1038 1160                          if (idtype == P_PID && id != cp->p_pid)
1039 1161                                  continue;
1040 1162                          if (idtype == P_PGID && id != cp->p_pgrp)
1041 1163                                  continue;
     1164 +                        if (PROC_IS_BRANDED(pp)) {
     1165 +                                if (BROP(pp)->b_wait_filter != NULL &&
     1166 +                                    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
     1167 +                                        continue;
     1168 +                        }
1042 1169  
1043 1170                          switch (cp->p_wcode) {
1044 1171                          case CLD_TRAPPED:
1045 1172                                  if (!(options & WTRAPPED))
1046 1173                                          break;
1047 1174                                  winfo(cp, ip, waitflag);
1048 1175                                  mutex_exit(&pidlock);
1049 1176                                  if (waitflag) {         /* accept SIGCLD */
1050 1177                                          sigcld_delete(ip);
1051 1178                                          sigcld_repost();
↓ open down ↓ 48 lines elided ↑ open up ↑
1100 1227                                          break;
1101 1228                                  }
1102 1229                          }
1103 1230  
1104 1231                          found++;
1105 1232  
1106 1233                          if (idtype == P_PID)
1107 1234                                  break;
1108 1235                  }
1109 1236  
     1237 +no_real_children:
1110 1238                  /*
1111 1239                   * If we found no interesting processes at all,
1112 1240                   * break out and return ECHILD.
1113 1241                   */
1114      -                if (found + proc_gone == 0)
     1242 +                if (!brand_wants_wait && (found + proc_gone == 0))
1115 1243                          break;
1116 1244  
1117 1245                  if (options & WNOHANG) {
1118 1246                          mutex_exit(&pidlock);
1119 1247                          bzero(ip, sizeof (k_siginfo_t));
1120 1248                          /*
1121 1249                           * We should set ip->si_signo = SIGCLD,
1122 1250                           * but there is an SVVS test that expects
1123 1251                           * ip->si_signo to be zero in this case.
1124 1252                           */
1125 1253                          return (0);
1126 1254                  }
1127 1255  
1128 1256                  /*
1129 1257                   * If we found no processes of interest that could
1130 1258                   * change state while we wait, we don't wait at all.
1131 1259                   * Get out with ECHILD according to SVID.
1132 1260                   */
1133      -                if (found == proc_gone)
     1261 +                if (!brand_wants_wait && (found == proc_gone))
1134 1262                          break;
1135 1263  
1136 1264                  if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1137 1265                          mutex_exit(&pidlock);
1138 1266                          return (EINTR);
1139 1267                  }
1140 1268          }
1141 1269          mutex_exit(&pidlock);
1142 1270          return (ECHILD);
1143 1271  }
↓ open down ↓ 75 lines elided ↑ open up ↑
1219 1347          ASSERT(p->p_stat == SZOMB);
1220 1348          ASSERT(p->p_tlist == NULL);
1221 1349          ASSERT(MUTEX_HELD(&pidlock));
1222 1350  
1223 1351          sigdelq(p, NULL, 0);
1224 1352          if (p->p_killsqp) {
1225 1353                  siginfofree(p->p_killsqp);
1226 1354                  p->p_killsqp = NULL;
1227 1355          }
1228 1356  
     1357 +        /* Clear any remaining brand data */
     1358 +        if (PROC_IS_BRANDED(p)) {
     1359 +                brand_clearbrand(p, B_FALSE);
     1360 +        }
     1361 +
     1362 +
1229 1363          prfree(p);      /* inform /proc */
1230 1364  
1231 1365          /*
1232 1366           * Don't free the init processes.
1233 1367           * Other dying processes will access it.
1234 1368           */
1235 1369          if (p == proc_init)
1236 1370                  return;
1237 1371  
1238 1372  
↓ open down ↓ 112 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX