Print this page
OS-5330 zoneadm mounting an lx or joyent branded zone fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
(NOTE: Manual port, because of divergence from SmartOS.)
OS-3831 lxbrand /proc/cmdline should reflect zone boot arguments
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
Remove most KEBE comments and accompanying unused code or variables/fields.
Merge cleanup from previous six commits
OS-200 need a better mechanism for storing persistent zone_did
OS-2564 zone boot failed: could not start zoneadmd
OS-1763 mount of /etc/svc/volatile failed: Device busy
OS-511 make zonecfg device resource extensible, like the net resource
OS-224 add more zonecfg net properties
Reduce lint
Add zfd.c to zoneadmd's Makefile, a bit more not-yet ifdef-out.
zoneadmd mismerge (we don't support debug yet)
OS-4932 zoneadm boot args not passed to lx init
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4253 lxbrand ubuntu 15.04 won't boot because /sbin/init is a symlink
OS-3524 in order to support interaction with docker containers, need to be able to connect to stdio for init from GZ
OS-3525 in order to support 'docker logs' need to be able to get stdio from zone to log file
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3077 restarted zoneadmd uses invalid zlogp
OS-3075 zone long boot args aren't passed through
OS-11 rcapd behaves poorly when under extreme load

@@ -20,10 +20,11 @@
  */
 
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
  * zoneadmd manages zones; one zoneadmd process is launched for each
  * non-global zone on the system.  This daemon juggles four jobs:

@@ -66,10 +67,11 @@
 #include <sys/param.h>
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
+#include <sys/time.h>
 
 #include <bsm/adt.h>
 #include <bsm/adt_event.h>
 
 #include <alloca.h>

@@ -106,10 +108,12 @@
 #include <zonestat_impl.h>
 #include "zoneadmd.h"
 
 static char *progname;
 char *zone_name;        /* zone which we are managing */
+zone_dochandle_t snap_hndl;     /* handle for snapshot created when ready */
+char zonepath[MAXNAMELEN];
 char pool_name[MAXNAMELEN];
 char default_brand[MAXNAMELEN];
 char brand_name[MAXNAMELEN];
 boolean_t zone_isnative;
 boolean_t zone_iscluster;

@@ -139,10 +143,13 @@
 #define TEXT_DOMAIN     "SYS_TEST"      /* Use this only if it wasn't */
 #endif
 
 #define DEFAULT_LOCALE  "C"
 
+#define RSRC_NET        "net"
+#define RSRC_DEV        "device"
+
 static const char *
 z_cmd_name(zone_cmd_t zcmd)
 {
         /* This list needs to match the enum in sys/zone.h */
         static const char *zcmdstr[] = {

@@ -255,38 +262,47 @@
                 zlogp->loglen -= copylen;
         }
 }
 
 /*
- * Emit a warning for any boot arguments which are unrecognized.  Since
- * Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we
+ * Append src to dest, prefixing src with a space if dest is non-empty and
+ * truncating to the n-byte buffer. strlcat() avoids an overlapping snprintf().
+ */
+static void
+strnappend(char *dest, size_t n, const char *src)
+{
+        (void) strlcat(dest, dest[0] == '\0' ? "" : " ", n);
+        (void) strlcat(dest, src, n);
+}
+
+/*
+ * Since illumos boot arguments are getopt(3c) compatible (see kernel(1m)), we
  * put the arguments into an argv style array, use getopt to process them,
- * and put the resultant argument string back into outargs.
+ * and put the resultant argument string back into outargs. Non-native brands
+ * may support alternate forms of boot arguments so we must handle that as well.
  *
  * During the filtering, we pull out any arguments which are truly "boot"
  * arguments, leaving only those which are to be passed intact to the
  * progenitor process.  The one we support at the moment is -i, which
  * indicates to the kernel which program should be launched as 'init'.
  *
- * A return of Z_INVAL indicates specifically that the arguments are
- * not valid; this is a non-fatal error.  Except for Z_OK, all other return
- * values are treated as fatal.
+ * Except for Z_OK, all other return values are treated as fatal.
  */
 static int
 filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
-    char *init_file, char *badarg)
+    char *init_file)
 {
         int argc = 0, argc_save;
         int i;
         int err;
         char *arg, *lasts, **argv = NULL, **argv_save;
         char zonecfg_args[BOOTARGS_MAX];
         char scratchargs[BOOTARGS_MAX], *sargs;
+        char scratchopt[3];
         char c;
 
         bzero(outargs, BOOTARGS_MAX);
-        bzero(badarg, BOOTARGS_MAX);
 
         /*
          * If the user didn't specify transient boot arguments, check
          * to see if there were any specified in the zone configuration,
          * and use them if applicable.

@@ -345,26 +361,34 @@
                 }
                 i++;
         }
 
         /*
-         * We preserve compatibility with the Solaris system boot behavior,
+         * We preserve compatibility with the illumos system boot behavior,
          * which allows:
          *
          *      # reboot kernel/unix -s -m verbose
          *
-         * In this example, kernel/unix tells the booter what file to
-         * boot.  We don't want reboot in a zone to be gratuitously different,
-         * so we silently ignore the boot file, if necessary.
+         * In this example, kernel/unix tells the booter what file to boot. The
+         * original intent of this was that we didn't want reboot in a zone to
+         * be gratuitously different, so we would silently ignore the boot
+         * file, if necessary. However, this usage is archaic and has never
+         * been common, since it is impossible to boot a zone onto a different
+         * kernel. Ignoring the first argument breaks for non-native brands
+         * which pass boot arguments in a different style. e.g.
+         *      systemd.log_level=debug
+         * Thus, for backward compatibility we only ignore the first argument
+         * if it appears to be in the illumos form and attempting to specify a
+         * kernel.
          */
         if (argv[0] == NULL)
                 goto done;
 
         assert(argv[0][0] != ' ');
         assert(argv[0][0] != '\t');
 
-        if (argv[0][0] != '-' && argv[0][0] != '\0') {
+        if (strncmp(argv[0], "kernel/", 7) == 0) {
                 argv = &argv[1];
                 argc--;
         }
 
         optind = 0;

@@ -383,46 +407,40 @@
                         /* This has already been processed by zoneadm */
                         break;
                 case 'm':
                 case 's':
                         /* These pass through unmolested */
-                        (void) snprintf(outargs, BOOTARGS_MAX,
-                            "%s -%c %s ", outargs, c, optarg ? optarg : "");
+                        (void) snprintf(scratchopt, sizeof (scratchopt),
+                            "-%c", c);
+                        strnappend(outargs, BOOTARGS_MAX, scratchopt);
+                        if (optarg != NULL)
+                                strnappend(outargs, BOOTARGS_MAX, optarg);
                         break;
                 case '?':
                         /*
-                         * We warn about unknown arguments but pass them
-                         * along anyway-- if someone wants to develop their
-                         * own init replacement, they can pass it whatever
-                         * args they want.
+                         * If a brand has its own init, we need to pass along
+                         * whatever the user provides. We use the entire
+                         * unknown string here so that we correctly handle
+                         * unknown long options (e.g. --debug).
                          */
-                        err = Z_INVAL;
-                        (void) snprintf(outargs, BOOTARGS_MAX,
-                            "%s -%c", outargs, optopt);
-                        (void) snprintf(badarg, BOOTARGS_MAX,
-                            "%s -%c", badarg, optopt);
+                        strnappend(outargs, BOOTARGS_MAX, argv[optind - 1]);
                         break;
                 }
         }
 
         /*
-         * For Solaris Zones we warn about and discard non-option arguments.
-         * Hence 'boot foo bar baz gub' --> 'boot'.  However, to be similar
-         * to the kernel, we concat up all the other remaining boot args.
-         * and warn on them as a group.
+         * We need to pass along everything else since we don't know what
+         * the brand's init is expecting. For example, an argument list like:
+         *   --confdir /foo --debug
+         * will cause the getopt parsing to stop at '/foo' but we need to pass
+         * that on, along with the '--debug'. This does mean that we require
+         * any of our known options (-ifms) to precede the brand-specific ones.
          */
-        if (optind < argc) {
-                err = Z_INVAL;
                 while (optind < argc) {
-                        (void) snprintf(badarg, BOOTARGS_MAX, "%s%s%s",
-                            badarg, strlen(badarg) > 0 ? " " : "",
-                            argv[optind]);
+                strnappend(outargs, BOOTARGS_MAX, argv[optind]);
                         optind++;
                 }
-                zerror(zlogp, B_FALSE, "WARNING: Unused or invalid boot "
-                    "arguments `%s'.", badarg);
-        }
 
 done:
         for (i = 0; i < argc_save; i++) {
                 if (argv_save[i] != NULL)
                         free(argv_save[i]);

@@ -535,11 +553,12 @@
 static int
 zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate)
 {
         int err;
 
-        if (brand_prestatechg(zlogp, zstate, Z_READY) != 0)
+        if (!ALT_MOUNT(mount_cmd) &&
+            brand_prestatechg(zlogp, zstate, Z_READY) != 0)
                 return (-1);
 
         if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
                 zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
                     zonecfg_strerror(err));

@@ -559,20 +578,22 @@
                         zerror(zlogp, B_FALSE, "destroying snapshot: %s",
                             zonecfg_strerror(err));
                 goto bad;
         }
 
-        if (brand_poststatechg(zlogp, zstate, Z_READY) != 0)
+        if (!ALT_MOUNT(mount_cmd) &&
+            brand_poststatechg(zlogp, zstate, Z_READY) != 0)
                 goto bad;
 
         return (0);
 
 bad:
         /*
          * If something goes wrong, we up the zones's state to the target
          * state, READY, and then invoke the hook as if we're halting.
          */
+        if (!ALT_MOUNT(mount_cmd))
         (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT);
         return (-1);
 }
 
 int

@@ -621,19 +642,12 @@
         int rv;
         ctid_t ct;
 
         /* determine the zone rootpath */
         if (mount_cmd) {
-                char zonepath[MAXPATHLEN];
                 char luroot[MAXPATHLEN];
 
-                if (zone_get_zonepath(zone_name,
-                    zonepath, sizeof (zonepath)) != Z_OK) {
-                        zerror(zlogp, B_FALSE, "unable to determine zone path");
-                        return (-1);
-                }
-
                 (void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
                 resolve_lofs(zlogp, luroot, sizeof (luroot));
                 (void) strlcpy(rootpath, luroot, sizeof (rootpath));
         } else {
                 if (zone_get_rootpath(zone_name,

@@ -684,10 +698,12 @@
 
         } else if (child == 0) {        /* child */
                 char opt_buf[MAX_MNTOPT_STR];
                 int optlen = 0;
                 int mflag = MS_DATA;
+                int i;
+                int ret;
 
                 (void) ct_tmpl_clear(tmpl_fd);
                 /*
                  * Even though there are no procs running in the zone, we
                  * do this for paranoia's sake.

@@ -711,14 +727,31 @@
                         (void) strlcpy(opt_buf, opt, sizeof (opt_buf));
                         opt = opt_buf;
                         optlen = MAX_MNTOPT_STR;
                         mflag = MS_OPTIONSTR;
                 }
-                if (mount(spec, dir, mflag, fstype, NULL, 0, opt, optlen) != 0)
-                        _exit(errno);
-                _exit(0);
+
+                /*
+                 * There is an obscure race condition which can cause mount
+                 * to return EBUSY. This happens for example on the mount
+                 * of the zone's /etc/svc/volatile file system if there is
+                 * a GZ process running svcs -Z, which will touch the
+                 * mountpoint, just as we're trying to do the mount. To cope
+                 * with this, we retry up to 3 times to let this transient
+                 * process get out of the way.
+                 */
+                for (i = 0; i < 3; i++) {
+                        ret = 0;
+                        if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
+                            optlen) != 0)
+                                ret = errno;
+                        if (ret != EBUSY)
+                                break;
+                        (void) sleep(1);
         }
+                _exit(ret);
+        }
 
         /* parent */
         if (contract_latest(&ct) == -1)
                 ct = -1;
         (void) ct_tmpl_clear(tmpl_fd);

@@ -737,10 +770,119 @@
 
         return (0);
 }
 
 /*
+ * env variable name format
+ *      _ZONECFG_{resource name}_{identifying attr. name}_{property name}
+ */
+static void
+set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
+{
+        /* Two MAXNAMELEN components plus slack for the prefix and '_'s. */
+        char envname[2 * MAXNAMELEN + 32];
+        char *dash;
+
+        if (attr != NULL) {
+                (void) snprintf(envname, sizeof (envname),
+                    "_ZONECFG_%s_%s_%s", rsrc, attr, name);
+        } else {
+                (void) snprintf(envname, sizeof (envname),
+                    "_ZONECFG_%s_%s", rsrc, name);
+        }
+
+        /* Replace every '-' in the assembled name with '_'. */
+        for (dash = envname; (dash = strchr(dash, '-')) != NULL; dash++)
+                *dash = '_';
+        (void) setenv(envname, val, 1);
+}
+
+/*
+ * Export zonecfg network and device properties into environment for the boot
+ * and state change hooks (the brand hook debug variable is not exported).
+ * Returns Z_OK, or the zonecfg error from loading the handle or starting an
+ * enumeration; exits with Z_NOMEM if no handle can be allocated.
+ * We could export more of the config in the future, as necessary.
+ */
+static int
+setup_subproc_env(void)
+{
+        int res;
+        zone_dochandle_t handle;
+        struct zone_nwiftab ntab;
+        struct zone_devtab dtab;
+        char net_resources[MAXNAMELEN * 2] = "";  /* start empty for strlcat */
+        char dev_resources[MAXNAMELEN * 2] = "";  /* start empty for strlcat */
+
+        if ((handle = zonecfg_init_handle()) == NULL)
+                exit(Z_NOMEM);
+
+        if ((res = zonecfg_get_handle(zone_name, handle)) != Z_OK)
+                goto done;
+
+        if ((res = zonecfg_setnwifent(handle)) != Z_OK)
+                goto done;
+
+        while (zonecfg_getnwifent(handle, &ntab) == Z_OK) {
+                struct zone_res_attrtab *rap;
+                char *phys;
+
+                phys = ntab.zone_nwif_physical;
+                /* NOTE(review): net_resources is built but never exported. */
+                (void) strlcat(net_resources, phys, sizeof (net_resources));
+                (void) strlcat(net_resources, " ", sizeof (net_resources));
+
+                set_zonecfg_env(RSRC_NET, phys, "physical", phys);
+
+                set_zonecfg_env(RSRC_NET, phys, "address",
+                    ntab.zone_nwif_address);
+                set_zonecfg_env(RSRC_NET, phys, "allowed-address",
+                    ntab.zone_nwif_allowed_address);
+                set_zonecfg_env(RSRC_NET, phys, "defrouter",
+                    ntab.zone_nwif_defrouter);
+                set_zonecfg_env(RSRC_NET, phys, "global-nic",
+                    ntab.zone_nwif_gnic);
+                set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
+                set_zonecfg_env(RSRC_NET, phys, "vlan-id",
+                    ntab.zone_nwif_vlan_id);
+
+                for (rap = ntab.zone_nwif_attrp; rap != NULL;
+                    rap = rap->zone_res_attr_next)
+                        set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
+                            rap->zone_res_attr_value);
+        }
+
+        (void) zonecfg_endnwifent(handle);
+
+        if ((res = zonecfg_setdevent(handle)) != Z_OK)
+                goto done;
+
+        while (zonecfg_getdevent(handle, &dtab) == Z_OK) {
+                struct zone_res_attrtab *rap;
+                char *match;
+
+                match = dtab.zone_dev_match;
+                /* NOTE(review): dev_resources is built but never exported. */
+                (void) strlcat(dev_resources, match, sizeof (dev_resources));
+                (void) strlcat(dev_resources, " ", sizeof (dev_resources));
+
+                for (rap = dtab.zone_dev_attrp; rap != NULL;
+                    rap = rap->zone_res_attr_next)
+                        set_zonecfg_env(RSRC_DEV, match,
+                            rap->zone_res_attr_name, rap->zone_res_attr_value);
+        }
+
+        (void) zonecfg_enddevent(handle);
+
+        res = Z_OK;
+
+done:
+        zonecfg_fini_handle(handle);
+        return (res);
+}
+
+/*
  * If retstr is not NULL, the output of the subproc is returned in the str,
  * otherwise it is output using zerror().  Any memory allocated for retstr
  * should be freed by the caller.
  */
 int

@@ -761,10 +903,15 @@
                 rd_cnt = 0;
         } else {
                 inbuf = buf;
         }
 
+        if (setup_subproc_env() != Z_OK) {
+                zerror(zlogp, B_FALSE, "failed to setup environment");
+                return (-1);
+        }
+
         file = popen(cmdbuf, "r");
         if (file == NULL) {
                 zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf);
                 return (-1);
         }

@@ -800,26 +947,52 @@
                 return (-1);
         }
         return (WEXITSTATUS(status));
 }
 
+/*
+ * Get the app-svc-dependent flag for this zone's init process. This is a
+ * zone-specific attr which controls the type of contract we create for the
+ * zone's init. When true, the contract will include CT_PR_EV_EXIT in the fatal
+ * set, so that when any service which is in the same contract exits, the init
+ * application will be terminated.
+ *
+ * We use the global "snap_hndl", so no parameters get passed here.
+ */
+static boolean_t
+is_app_svc_dep(void)
+{
+        struct zone_attrtab attr;
+
+        /* Start from an all-zero entry and fill in only the lookup key. */
+        bzero(&attr, sizeof (attr));
+        (void) strlcpy(attr.zone_attr_name, "app-svc-dependent",
+            sizeof (attr.zone_attr_name));
+
+        /* A failed lookup is treated the same as an attr that isn't "true". */
+        if (zonecfg_lookup_attr(snap_hndl, &attr) != Z_OK)
+                return (B_FALSE);
+
+        return (strcmp(attr.zone_attr_value, "true") == 0 ? B_TRUE : B_FALSE);
+}
+
 static int
 zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 {
         zoneid_t zoneid;
         struct stat st;
-        char zpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
+        char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
         char nbootargs[BOOTARGS_MAX];
         char cmdbuf[MAXPATHLEN];
         fs_callback_t cb;
         brand_handle_t bh;
         zone_iptype_t iptype;
-        boolean_t links_loaded = B_FALSE;
         dladm_status_t status;
         char errmsg[DLADM_STRSIZE];
         int err;
         boolean_t restart_init;
+        boolean_t app_svc_dep;
 
         if (brand_prestatechg(zlogp, zstate, Z_BOOT) != 0)
                 return (-1);
 
         if ((zoneid = getzoneidbyname(zone_name)) == -1) {

@@ -850,17 +1023,12 @@
         }
 
         /*
          * Get the brand's boot callback if it exists.
          */
-        if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
-                zerror(zlogp, B_FALSE, "unable to determine zone path");
-                brand_close(bh);
-                goto bad;
-        }
         (void) strcpy(cmdbuf, EXEC_PREFIX);
-        if (brand_get_boot(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+        if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
             sizeof (cmdbuf) - EXEC_LEN) != 0) {
                 zerror(zlogp, B_FALSE,
                     "unable to determine branded zone's boot callback");
                 brand_close(bh);
                 goto bad;

@@ -875,35 +1043,45 @@
         }
 
         /* See if this zone's brand should restart init if it dies. */
         restart_init = brand_restartinit(bh);
 
+        /*
+         * See if we need to setup contract dependencies between the zone's
+         * primary application and any of its services.
+         */
+        app_svc_dep = is_app_svc_dep();
+
         brand_close(bh);
 
-        err = filter_bootargs(zlogp, bootargs, nbootargs, init_file,
-            bad_boot_arg);
-        if (err == Z_INVAL)
-                eventstream_write(Z_EVT_ZONE_BADARGS);
-        else if (err != Z_OK)
+        err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
+        if (err != Z_OK)
                 goto bad;
 
         assert(init_file[0] != '\0');
 
-        /* Try to anticipate possible problems: Make sure init is executable. */
-        if (zone_get_rootpath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
+        /*
+         * Try to anticipate possible problems: If possible, make sure init is
+         * executable.
+         */
+        if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
                 zerror(zlogp, B_FALSE, "unable to determine zone root");
                 goto bad;
         }
 
-        (void) snprintf(initpath, sizeof (initpath), "%s%s", zpath, init_file);
+        (void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);
 
-        if (stat(initpath, &st) == -1) {
+        if (lstat(initpath, &st) == -1) {
                 zerror(zlogp, B_TRUE, "could not stat %s", initpath);
                 goto bad;
         }
 
-        if ((st.st_mode & S_IXUSR) == 0) {
+        /*
+         * If a symlink, we'll have to wait and resolve when we boot,
+         * otherwise check the executable bits now.
+         */
+        if ((st.st_mode & S_IFMT) != S_IFLNK && (st.st_mode & S_IXUSR) == 0) {
                 zerror(zlogp, B_FALSE, "%s is not executable", initpath);
                 goto bad;
         }
 
         /*

@@ -917,11 +1095,10 @@
                 if (status != DLADM_STATUS_OK) {
                         zerror(zlogp, B_FALSE, "unable to load zone datalinks: "
                             " %s", dladm_status2str(status, errmsg));
                         goto bad;
                 }
-                links_loaded = B_TRUE;
         }
 
         /*
          * If there is a brand 'boot' callback, execute it now to give the
          * brand one last chance to do any additional setup before the zone

@@ -947,10 +1124,16 @@
             NULL, 0) == -1) {
                 zerror(zlogp, B_TRUE, "could not set zone init-no-restart");
                 goto bad;
         }
 
+        if (app_svc_dep && zone_setattr(zoneid, ZONE_ATTR_APP_SVC_CT,
+            (void *)B_TRUE, sizeof (boolean_t)) == -1) {
+                zerror(zlogp, B_TRUE, "could not set zone app-die");
+                goto bad;
+        }
+
         /*
          * Inform zonestatd of a new zone so that it can install a door for
          * the zone to contact it.
          */
         notify_zonestatd(zone_id);

@@ -961,44 +1144,58 @@
         }
 
         if (brand_poststatechg(zlogp, zstate, Z_BOOT) != 0)
                 goto bad;
 
+        /* Startup a thread to perform zfd logging/tty svc for the zone. */
+        create_log_thread(zlogp, zone_id);
+
+        /* Startup a thread to perform memory capping for the zone. */
+        create_mcap_thread(zlogp, zone_id);
+
         return (0);
 
 bad:
         /*
          * If something goes wrong, we up the zones's state to the target
          * state, RUNNING, and then invoke the hook as if we're halting.
          */
         (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT);
-        if (links_loaded)
-                (void) dladm_zone_halt(dld_handle, zoneid);
+
         return (-1);
 }
 
 static int
 zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate)
 {
         int err;
 
-        if (brand_prestatechg(zlogp, zstate, Z_HALT) != 0)
+        if (unmount_cmd == B_FALSE &&
+            brand_prestatechg(zlogp, zstate, Z_HALT) != 0)
                 return (-1);
 
+        /* Shutting down, stop the memcap thread */
+        destroy_mcap_thread();
+
         if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) {
                 if (!bringup_failure_recovery)
                         zerror(zlogp, B_FALSE, "unable to destroy zone");
+                destroy_log_thread();
                 return (-1);
         }
 
+        /* Shut down is done, stop the log thread */
+        destroy_log_thread();
+
+        if (unmount_cmd == B_FALSE &&
+            brand_poststatechg(zlogp, zstate, Z_HALT) != 0)
+                return (-1);
+
         if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
                 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
                     zonecfg_strerror(err));
 
-        if (brand_poststatechg(zlogp, zstate, Z_HALT) != 0)
-                return (-1);
-
         return (0);
 }
 
 static int
 zone_graceful_shutdown(zlog_t *zlogp)

@@ -1005,11 +1202,10 @@
 {
         zoneid_t zoneid;
         pid_t child;
         char cmdbuf[MAXPATHLEN];
         brand_handle_t bh = NULL;
-        char zpath[MAXPATHLEN];
         ctid_t ct;
         int tmpl_fd;
         int child_status;
 
         if (shutdown_in_progress) {

@@ -1026,22 +1222,16 @@
         if ((bh = brand_open(brand_name)) == NULL) {
                 zerror(zlogp, B_FALSE, "unable to determine zone brand");
                 return (-1);
         }
 
-        if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
-                zerror(zlogp, B_FALSE, "unable to determine zone path");
-                brand_close(bh);
-                return (-1);
-        }
-
         /*
          * If there is a brand 'shutdown' callback, execute it now to give the
          * brand a chance to cleanup any custom configuration.
          */
         (void) strcpy(cmdbuf, EXEC_PREFIX);
-        if (brand_get_shutdown(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+        if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
             sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
                 (void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
         }
         brand_close(bh);
 

@@ -1175,10 +1365,40 @@
 
         (void) adt_end_session(ah);
 }
 
 /*
+ * Log the exit time and status of the zone's init process into
+ * {zonepath}/lastexited. If the zone shut down normally, the exit status will
+ * be -1, otherwise it will be the exit status as described in wait.3c.
+ * If the zone is configured to restart init, then nothing will be logged if
+ * init exits unexpectedly (the kernel will never upcall in this case).
+ */
+static void
+log_init_exit(int status)
+{
+        char p[MAXPATHLEN];
+        char buf[128];
+        struct timeval t;
+        int fd;
+
+        /* snprintf() returning >= the buffer size means it truncated. */
+        if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) >= sizeof (p))
+                return;
+        if (gettimeofday(&t, NULL) != 0)
+                return;
+        if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec,
+            status) >= sizeof (buf))
+                return;
+        if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
+                return;
+        /* Best effort: a failed or short write is deliberately ignored. */
+        (void) write(fd, buf, strlen(buf));
+        (void) close(fd);
+}
+
+/*
  * The main routine for the door server that deals with zone state transitions.
  */
 /* ARGSUSED */
 static void
 server(void *cookie, char *args, size_t alen, door_desc_t *dp,

@@ -1187,10 +1407,11 @@
         ucred_t *uc = NULL;
         const priv_set_t *eset;
 
         zone_state_t zstate;
         zone_cmd_t cmd;
+        int init_status;
         zone_cmd_arg_t *zargp;
 
         boolean_t kernelcall;
 
         int rval = -1;

@@ -1239,10 +1460,11 @@
                     "unexpected (expected %d bytes)", alen,
                     sizeof (zone_cmd_arg_t));
                 goto out;
         }
         cmd = zargp->cmd;
+        init_status = zargp->status;
 
         if (door_ucred(&uc) != 0) {
                 zerror(&logsys, B_TRUE, "door_ucred");
                 goto out;
         }

@@ -1348,10 +1570,11 @@
                 switch (cmd) {
                 case Z_READY:
                         rval = zone_ready(zlogp, Z_MNT_BOOT, zstate);
                         if (rval == 0)
                                 eventstream_write(Z_EVT_ZONE_READIED);
+                        zcons_statechanged();
                         break;
                 case Z_BOOT:
                 case Z_FORCEBOOT:
                         eventstream_write(Z_EVT_ZONE_BOOTING);
                         if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))

@@ -1358,10 +1581,11 @@
                             == 0) {
                                 rval = zone_bootup(zlogp, zargp->bootbuf,
                                     zstate);
                         }
                         audit_put_record(zlogp, uc, rval, "boot");
+                        zcons_statechanged();
                         if (rval != 0) {
                                 bringup_failure_recovery = B_TRUE;
                                 (void) zone_halt(zlogp, B_FALSE, B_FALSE,
                                     zstate);
                                 eventstream_write(Z_EVT_ZONE_BOOTFAILED);

@@ -1480,10 +1704,11 @@
                         (void) strlcpy(boot_args, zargp->bootbuf,
                             sizeof (boot_args));
                         eventstream_write(Z_EVT_ZONE_BOOTING);
                         rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
                         audit_put_record(zlogp, uc, rval, "boot");
+                        zcons_statechanged();
                         if (rval != 0) {
                                 bringup_failure_recovery = B_TRUE;
                                 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
                                     zstate);
                                 eventstream_write(Z_EVT_ZONE_BOOTFAILED);

@@ -1494,10 +1719,11 @@
                         if (kernelcall) /* Invalid; can't happen */
                                 abort();
                         if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
                             != 0)
                                 break;
+                        zcons_statechanged();
                         eventstream_write(Z_EVT_ZONE_HALTED);
                         break;
                 case Z_SHUTDOWN:
                 case Z_REBOOT:
                 case Z_NOTE_UNINSTALLING:

@@ -1541,10 +1767,11 @@
                 switch (cmd) {
                 case Z_READY:
                         if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
                             != 0)
                                 break;
+                        zcons_statechanged();
                         if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) == 0)
                                 eventstream_write(Z_EVT_ZONE_READIED);
                         else
                                 eventstream_write(Z_EVT_ZONE_HALTED);
                         break;

@@ -1557,14 +1784,20 @@
                          */
                         zerror(zlogp, B_FALSE, "zone is already booted");
                         rval = 0;
                         break;
                 case Z_HALT:
+                        if (kernelcall) {
+                                log_init_exit(init_status);
+                        } else {
+                                log_init_exit(-1);
+                        }
                         if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
                             != 0)
                                 break;
                         eventstream_write(Z_EVT_ZONE_HALTED);
+                        zcons_statechanged();
                         break;
                 case Z_REBOOT:
                         (void) strlcpy(boot_args, zargp->bootbuf,
                             sizeof (boot_args));
                         eventstream_write(Z_EVT_ZONE_REBOOTING);

@@ -1572,12 +1805,13 @@
                             != 0) {
                                 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
                                 boot_args[0] = '\0';
                                 break;
                         }
-                        if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
-                            != 0) {
+                        zcons_statechanged();
+                        if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) !=
+                            0) {
                                 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
                                 boot_args[0] = '\0';
                                 break;
                         }
                         rval = zone_bootup(zlogp, zargp->bootbuf, zstate);

@@ -1757,17 +1991,44 @@
                  * limited to times when zoneadmd is picking back up from a
                  * zoneadmd that died while the zone was in some non-trivial
                  * state.
                  */
                 if (zstate > ZONE_STATE_INSTALLED) {
+                        static zoneid_t zid;
+
                         zerror(zlogp, B_FALSE,
                             "zone '%s': WARNING: zone is in state '%s', but "
                             "zoneadmd does not appear to be available; "
                             "restarted zoneadmd to recover.",
                             zone_name, zone_state_str(zstate));
+
+                        /*
+                         * Start up a thread to perform the zfd logging/tty svc
+                         * and a thread to perform memory capping for the
+                         * zone. zlogp won't be valid for much longer so use
+                         * logsys.
+                         */
+                        if ((zid = getzoneidbyname(zone_name)) != -1) {
+                                create_log_thread(&logsys, zid);
+                                create_mcap_thread(&logsys, zid);
                 }
 
+                        /* recover the global configuration snapshot */
+                        if (snap_hndl == NULL) {
+                                if ((snap_hndl = zonecfg_init_handle())
+                                    == NULL ||
+                                    zonecfg_create_snapshot(zone_name)
+                                    != Z_OK ||
+                                    zonecfg_get_snapshot_handle(zone_name,
+                                    snap_hndl) != Z_OK) {
+                                        zerror(zlogp, B_FALSE, "recovering "
+                                            "zone configuration handle");
+                                        goto out;
+                                }
+                        }
+                }
+
                 (void) fdetach(zone_door_path);
                 (void) close(doorfd);
                 goto top;
         }
         ret = 0;

@@ -1781,19 +2042,14 @@
  * query callback, if any of these exist.
  */
 static int
 brand_callback_init(brand_handle_t bh, char *zone_name)
 {
-        char zpath[MAXPATHLEN];
-
-        if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK)
-                return (-1);
-
         (void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
             sizeof (pre_statechg_hook));
 
-        if (brand_get_prestatechange(bh, zone_name, zpath,
+        if (brand_get_prestatechange(bh, zone_name, zonepath,
             pre_statechg_hook + EXEC_LEN,
             sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
                 return (-1);
 
         if (strlen(pre_statechg_hook) <= EXEC_LEN)

@@ -1800,11 +2056,11 @@
                 pre_statechg_hook[0] = '\0';
 
         (void) strlcpy(post_statechg_hook, EXEC_PREFIX,
             sizeof (post_statechg_hook));
 
-        if (brand_get_poststatechange(bh, zone_name, zpath,
+        if (brand_get_poststatechange(bh, zone_name, zonepath,
             post_statechg_hook + EXEC_LEN,
             sizeof (post_statechg_hook) - EXEC_LEN) != 0)
                 return (-1);
 
         if (strlen(post_statechg_hook) <= EXEC_LEN)

@@ -1811,11 +2067,11 @@
                 post_statechg_hook[0] = '\0';
 
         (void) strlcpy(query_hook, EXEC_PREFIX,
             sizeof (query_hook));
 
-        if (brand_get_query(bh, zone_name, zpath, query_hook + EXEC_LEN,
+        if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
             sizeof (query_hook) - EXEC_LEN) != 0)
                 return (-1);
 
         if (strlen(query_hook) <= EXEC_LEN)
                 query_hook[0] = '\0';

@@ -1939,10 +2195,15 @@
                     "cannot manage a zone which is in state '%s'",
                     zone_state_str(zstate));
                 return (1);
         }
 
+        if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
+                zerror(zlogp, B_FALSE, "unable to determine zone path");
+                return (-1);
+        }
+
         if (zonecfg_default_brand(default_brand,
             sizeof (default_brand)) != Z_OK) {
                 zerror(zlogp, B_FALSE, "unable to determine default brand");
                 return (1);
         }