1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2021 Joyent, Inc.
26 * Copyright (c) 2016 by Delphix. All rights reserved.
27 */
28
29 /*
30 * zoneadmd manages zones; one zoneadmd process is launched for each
31 * non-global zone on the system. This daemon juggles four jobs:
32 *
33 * - Implement setup and teardown of the zone "virtual platform": mount and
34 * unmount filesystems; create and destroy network interfaces; communicate
35 * with devfsadmd to lay out devices for the zone; instantiate the zone
36 * console device; configure process runtime attributes such as resource
37 * controls, pool bindings, fine-grained privileges.
38 *
39 * - Launch the zone's init(1M) process.
40 *
41 * - Implement a door server; clients (like zoneadm) connect to the door
42 * server and request zone state changes. The kernel is also a client of
43 * this door server. A request to halt or reboot the zone which originates
44 * *inside* the zone results in a door upcall from the kernel into zoneadmd.
45 *
46 * One minor problem is that messages emitted by zoneadmd need to be passed
47 * back to the zoneadm process making the request. These messages need to
48 * be rendered in the client's locale; so, this is passed in as part of the
49 * request. The exception is the kernel upcall to zoneadmd, in which case
50 * messages are syslog'd.
51 *
52 * To make all of this work, the Makefile adds -a to xgettext to extract *all*
53 * strings, and an exclusion file (zoneadmd.xcl) is used to exclude those
54 * strings which do not need to be translated.
55 *
56 * - Act as a console server for zlogin -C processes; see comments in zcons.c
57 * for more information about the zone console architecture.
58 *
59 * DESIGN NOTES
60 *
61 * Restart:
62 * A chief design constraint of zoneadmd is that it should be restartable in
63 * the case that the administrator kills it off, or it suffers a fatal error,
64 * without the running zone being impacted; this is akin to being able to
65 * reboot the service processor of a server without affecting the OS instance.
66 */
67
68 #include <sys/param.h>
69 #include <sys/mman.h>
70 #include <sys/types.h>
71 #include <sys/stat.h>
72 #include <sys/sysmacros.h>
73 #include <sys/time.h>
74
75 #include <bsm/adt.h>
76 #include <bsm/adt_event.h>
77
78 #include <alloca.h>
79 #include <assert.h>
80 #include <errno.h>
81 #include <door.h>
82 #include <fcntl.h>
83 #include <locale.h>
84 #include <signal.h>
85 #include <stdarg.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <string.h>
89 #include <strings.h>
90 #include <synch.h>
91 #include <syslog.h>
92 #include <thread.h>
93 #include <unistd.h>
94 #include <wait.h>
95 #include <limits.h>
96 #include <zone.h>
97 #include <libbrand.h>
98 #include <sys/brand.h>
99 #include <libcontract.h>
100 #include <libcontract_priv.h>
101 #include <sys/brand.h>
102 #include <sys/contract/process.h>
103 #include <sys/ctfs.h>
104 #include <libdladm.h>
105 #include <sys/dls_mgmt.h>
106 #include <libscf.h>
107 #include <uuid/uuid.h>
108 #include <libppt.h>
109
110 #include <libzonecfg.h>
111 #include <zonestat_impl.h>
112 #include "zoneadmd.h"
113
static char *progname;		/* program name printed by usage() */
char *zone_name;	/* zone which we are managing */
zone_dochandle_t snap_hndl;	/* handle for snapshot created when ready */
char zonepath[MAXNAMELEN];	/* used to build "<zonepath>/lu" for mounts */
char pool_name[MAXNAMELEN];
char default_brand[MAXNAMELEN];
char brand_name[MAXNAMELEN];
boolean_t zone_isnative;
boolean_t zone_iscluster;
boolean_t zone_islabeled;
boolean_t shutdown_in_progress;
static zoneid_t zone_id;	/* set from vplat_create() in zone_ready() */
static zoneid_t zone_did = 0;	/* 0 until looked up via zone_get_did() */
dladm_handle_t dld_handle = NULL;

/*
 * Brand hook command lines.  An empty pre/post state change hook string
 * means "no hook"; see brand_prestatechg() and brand_poststatechg().
 */
char pre_statechg_hook[2 * MAXPATHLEN];
char post_statechg_hook[2 * MAXPATHLEN];
char query_hook[2 * MAXPATHLEN];

zlog_t logsys;		/* log to syslog */
zlog_t logplat;		/* log to platform.log */

mutex_t lock = DEFAULTMUTEX;	/* to serialize stuff */
mutex_t msglock = DEFAULTMUTEX;	/* for calling setlocale() */

static sema_t scratch_sem;	/* for scratch zones */

static char zone_door_path[MAXPATHLEN];
static int zone_door = -1;

boolean_t in_death_throes = B_FALSE;	/* daemon is dying */
boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */

static int platloghdl = -1;	/* Handle for <zonepath>/logs/platform.log */

#if !defined(TEXT_DOMAIN)		/* should be defined by cc -D */
#define	TEXT_DOMAIN	"SYS_TEST"	/* Use this only if it wasn't */
#endif

/* Locale that localize_msg() restores after translating a message. */
#define	DEFAULT_LOCALE	"C"

/* Resource names used when building _ZONECFG_* environment variable names. */
#define	RSRC_NET	"net"
#define	RSRC_DEV	"device"
157
158 static const char *
159 z_cmd_name(zone_cmd_t zcmd)
160 {
161 /* This list needs to match the enum in sys/zone.h */
162 static const char *zcmdstr[] = {
163 "ready", "boot", "forceboot", "reboot", "halt",
164 "note_uninstalling", "mount", "forcemount", "unmount",
165 "shutdown"
166 };
167
168 if (zcmd >= sizeof (zcmdstr) / sizeof (*zcmdstr))
169 return ("unknown");
170 else
171 return (zcmdstr[(int)zcmd]);
172 }
173
/*
 * Return a pointer to the final path component of execfullname.
 *
 * Trailing '/' characters are stripped by overwriting them with '\0', so the
 * caller's string may be modified.  The returned pointer refers into
 * execfullname; nothing is allocated.
 */
static char *
get_execbasename(char *execfullname)
{
	char *slash;

	while ((slash = strrchr(execfullname, '/')) != NULL) {
		/* Something follows the last slash: that is the basename. */
		if (slash[1] != '\0')
			return (slash + 1);

		/* guard against '/' at end of command invocation */
		*slash = '\0';
	}

	/* No slash (left): the whole remaining string is the basename. */
	return (execfullname);
}
196
197 static void
198 usage(void)
199 {
200 (void) fprintf(stderr, gettext("Usage: %s -z zonename\n"), progname);
201 (void) fprintf(stderr,
202 gettext("\tNote: %s should not be run directly.\n"), progname);
203 exit(2);
204 }
205
/*
 * SIGCHLD handler that deliberately does nothing; the signal number is
 * unused.
 * NOTE(review): presumably installed so SIGCHLD is caught rather than
 * ignored -- confirm at the site where the handler is registered.
 */
/* ARGSUSED */
static void
sigchld(int sig)
{
}
211
212 char *
213 localize_msg(char *locale, const char *msg)
214 {
215 char *out;
216
217 (void) mutex_lock(&msglock);
218 (void) setlocale(LC_MESSAGES, locale);
219 out = gettext(msg);
220 (void) setlocale(LC_MESSAGES, DEFAULT_LOCALE);
221 (void) mutex_unlock(&msglock);
222 return (out);
223 }
224
225 /* PRINTFLIKE3 */
226 void
227 zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...)
228 {
229 va_list alist;
230 char buf[MAXPATHLEN * 2]; /* enough space for err msg with a path */
231 char *bp, *bp_nozone;
232 int saved_errno = errno;
233
234 if (zlogp == &logsys)
235 (void) snprintf(buf, sizeof (buf), "[zone '%s'] ", zone_name);
236 else
237 buf[0] = '\0';
238 bp = bp_nozone = &(buf[strlen(buf)]);
239
240 /*
241 * In theory, the locale pointer should be set to either "C" or a
242 * char array, so it should never be NULL
243 */
244 assert(zlogp->locale != NULL);
245 /* Locale is per process, but we are multi-threaded... */
246 fmt = localize_msg(zlogp->locale, fmt);
247
248 va_start(alist, fmt);
249 (void) vsnprintf(bp, sizeof (buf) - (bp - buf), fmt, alist);
250 va_end(alist);
251 bp = &(buf[strlen(buf)]);
252 if (use_strerror)
253 (void) snprintf(bp, sizeof (buf) - (bp - buf), ": %s",
254 strerror(saved_errno));
255
256 (void) strlcat(buf, "\n", sizeof (buf));
257
258 /*
259 * If we don't have the platform log, we are in a child process, and
260 * should log to stderr (which is a pipe) instead of the file.
261 */
262 if (logging_poisoned) {
263 (void) fprintf(stderr, "%s", buf);
264
265 if (zlogp != &logsys && zlogp->logfile == stderr)
266 return;
267 } else {
268 logstream_write(platloghdl, bp_nozone, strlen(bp_nozone));
269
270 if (zlogp == &logplat)
271 return;
272 }
273
274 if (zlogp == &logsys) {
275 bp = strrchr(buf, '\n');
276 if (bp != NULL && bp[1] == '\0') {
277 *bp = '\0';
278 }
279 (void) syslog(LOG_ERR, "%s", buf);
280 } else if (zlogp->logfile != NULL) {
281 (void) fprintf(zlogp->logfile, "%s", buf);
282 } else {
283 size_t buflen;
284 size_t copylen;
285
286 buflen = snprintf(zlogp->log, zlogp->loglen, "%s", buf);
287 copylen = MIN(buflen, zlogp->loglen);
288 zlogp->log += copylen;
289 zlogp->loglen -= copylen;
290 }
291 }
292
/*
 * Append src to dest, which is a buffer of destsize bytes.  If dest already
 * holds text, a single space is inserted before src.
 *
 * The result is always '\0'-terminated.  src is silently truncated if it does
 * not fit.  Nothing is changed when:
 *   - destsize is 0, or dest has no '\0' within destsize bytes;
 *   - src is empty;
 *   - there is no room for at least one byte of src (plus the separator when
 *     one is needed), so that dest never ends up with a dangling space.
 */
static void
strnappend(char *dest, size_t destsize, const char *src)
{
	const char *nul;
	size_t used;		/* bytes of dest currently occupied */
	size_t room;		/* bytes left, including the terminator */
	size_t srclen;

	if (destsize == 0 || src[0] == '\0')
		return;

	nul = memchr(dest, '\0', destsize);
	if (nul == NULL) {
		/* dest is not terminated within destsize; leave it alone. */
		return;
	}
	used = (size_t)(nul - dest);
	room = destsize - used;

	if (used > 0) {
		/* Need the separator, one byte of src, and the terminator. */
		if (room < 3)
			return;
		dest[used++] = ' ';
		room--;
	} else if (room < 2) {
		/* Need one byte of src and the terminator. */
		return;
	}

	srclen = strlen(src);
	if (srclen > room - 1)
		srclen = room - 1;

	(void) memcpy(dest + used, src, srclen);
	dest[used + srclen] = '\0';
}
317
318 /*
319 * Since illumos boot arguments are getopt(3c) compatible (see kernel(1m)), we
320 * put the arguments into an argv style array, use getopt to process them,
321 * and put the resultant argument string back into outargs. Non-native brands
322 * may support alternate forms of boot arguments so we must handle that as well.
323 *
324 * During the filtering, we pull out any arguments which are truly "boot"
325 * arguments, leaving only those which are to be passed intact to the
326 * progenitor process. The one we support at the moment is -i, which
327 * indicates to the kernel which program should be launched as 'init'.
328 *
329 * Except for Z_OK, all other return values are treated as fatal.
330 */
331 static int
332 filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
333 char *init_file)
334 {
335 int argc = 0, argc_save;
336 int i;
337 int err = Z_OK;
338 char *arg, *lasts, **argv = NULL, **argv_save;
339 char zonecfg_args[BOOTARGS_MAX];
340 char scratchargs[BOOTARGS_MAX], *sargs;
341 char scratchopt[3];
342 char c;
343
344 bzero(outargs, BOOTARGS_MAX);
345
346 /*
347 * If the user didn't specify transient boot arguments, check
348 * to see if there were any specified in the zone configuration,
349 * and use them if applicable.
350 */
351 if (inargs == NULL || inargs[0] == '\0') {
352 bzero(zonecfg_args, sizeof (zonecfg_args));
353 (void) zonecfg_get_bootargs(snap_hndl, zonecfg_args,
354 sizeof (zonecfg_args));
355 inargs = zonecfg_args;
356 }
357
358 if (strlen(inargs) >= BOOTARGS_MAX) {
359 zerror(zlogp, B_FALSE, "boot argument string too long");
360 return (Z_INVAL);
361 }
362
363 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
364 sargs = scratchargs;
365 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
366 sargs = NULL;
367 argc++;
368 }
369
370 if ((argv = calloc(argc + 1, sizeof (char *))) == NULL) {
371 zerror(zlogp, B_FALSE, "memory allocation failed");
372 return (Z_NOMEM);
373 }
374
375 argv_save = argv;
376 argc_save = argc;
377
378 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
379 sargs = scratchargs;
380 i = 0;
381 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
382 sargs = NULL;
383 if ((argv[i] = strdup(arg)) == NULL) {
384 err = Z_NOMEM;
385 zerror(zlogp, B_FALSE, "memory allocation failed");
386 goto done;
387 }
388 i++;
389 }
390
391 /*
392 * We preserve compatibility with the illumos system boot behavior,
393 * which allows:
394 *
395 * # reboot kernel/unix -s -m verbose
396 *
397 * In this example, kernel/unix tells the booter what file to boot. The
398 * original intent of this was that we didn't want reboot in a zone to
399 * be gratuitously different, so we would silently ignore the boot
400 * file, if necessary. However, this usage is archaic and has never
401 * been common, since it is impossible to boot a zone onto a different
402 * kernel. Ignoring the first argument breaks for non-native brands
403 * which pass boot arguments in a different style. e.g.
404 * systemd.log_level=debug
405 * Thus, for backward compatibility we only ignore the first argument
406 * if it appears to be in the illumos form and attempting to specify a
407 * kernel.
408 */
409 if (argv[0] == NULL)
410 goto done;
411
412 assert(argv[0][0] != ' ');
413 assert(argv[0][0] != '\t');
414
415 if (strncmp(argv[0], "kernel/", 7) == 0) {
416 argv = &argv[1];
417 argc--;
418 }
419
420 optind = 0;
421 opterr = 0;
422 err = Z_OK;
423 while ((c = getopt(argc, argv, "fi:m:s")) != -1) {
424 switch (c) {
425 case 'i':
426 /*
427 * -i is handled by the runtime and is not passed
428 * along to userland
429 */
430 (void) strlcpy(init_file, optarg, MAXPATHLEN);
431 break;
432 case 'f':
433 /* This has already been processed by zoneadm */
434 break;
435 case 'm':
436 case 's':
437 /* These pass through unmolested */
438 (void) snprintf(scratchopt, sizeof (scratchopt),
439 "-%c", c);
440 strnappend(outargs, BOOTARGS_MAX, scratchopt);
441 if (optarg != NULL)
442 strnappend(outargs, BOOTARGS_MAX, optarg);
443 break;
444 case '?':
445 /*
446 * If a brand has its own init, we need to pass along
447 * whatever the user provides. We use the entire
448 * unknown string here so that we correctly handle
449 * unknown long options (e.g. --debug).
450 */
451 strnappend(outargs, BOOTARGS_MAX, argv[optind - 1]);
452 break;
453 }
454 }
455
456 /*
457 * We need to pass along everything else since we don't know what
458 * the brand's init is expecting. For example, an argument list like:
459 * --confdir /foo --debug
460 * will cause the getopt parsing to stop at '/foo' but we need to pass
461 * that on, along with the '--debug'. This does mean that we require
462 * any of our known options (-ifms) to preceed the brand-specific ones.
463 */
464 while (optind < argc) {
465 strnappend(outargs, BOOTARGS_MAX, argv[optind]);
466 optind++;
467 }
468
469 done:
470 for (i = 0; i < argc_save; i++) {
471 if (argv_save[i] != NULL)
472 free(argv_save[i]);
473 }
474 free(argv_save);
475 return (err);
476 }
477
478
479 static int
480 mkzonedir(zlog_t *zlogp)
481 {
482 struct stat st;
483 /*
484 * We must create and lock everyone but root out of ZONES_TMPDIR
485 * since anyone can open any UNIX domain socket, regardless of
486 * its file system permissions. Sigh...
487 */
488 if (mkdir(ZONES_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
489 zerror(zlogp, B_TRUE, "could not mkdir '%s'", ZONES_TMPDIR);
490 return (-1);
491 }
492 /* paranoia */
493 if ((stat(ZONES_TMPDIR, &st) < 0) || !S_ISDIR(st.st_mode)) {
494 zerror(zlogp, B_TRUE, "'%s' is not a directory", ZONES_TMPDIR);
495 return (-1);
496 }
497 (void) chmod(ZONES_TMPDIR, S_IRWXU);
498 return (0);
499 }
500
501 /*
502 * Run the brand's pre-state change callback, if it exists.
503 */
504 static int
505 brand_prestatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
506 {
507 char cmdbuf[2 * MAXPATHLEN];
508 const char *altroot;
509
510 if (pre_statechg_hook[0] == '\0')
511 return (0);
512
513 altroot = zonecfg_get_root();
514 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", pre_statechg_hook,
515 state, cmd, altroot) > sizeof (cmdbuf))
516 return (-1);
517
518 if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
519 return (-1);
520
521 return (0);
522 }
523
524 /*
525 * Run the brand's post-state change callback, if it exists.
526 */
527 static int
528 brand_poststatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
529 {
530 char cmdbuf[2 * MAXPATHLEN];
531 const char *altroot;
532
533 if (post_statechg_hook[0] == '\0')
534 return (0);
535
536 altroot = zonecfg_get_root();
537 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook,
538 state, cmd, altroot) > sizeof (cmdbuf))
539 return (-1);
540
541 if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
542 return (-1);
543
544 return (0);
545 }
546
547 /*
548 * Notify zonestatd of the new zone. If zonestatd is not running, this
549 * will do nothing.
550 */
551 static void
552 notify_zonestatd(zoneid_t zoneid)
553 {
554 int cmd[2];
555 int fd;
556 door_arg_t params;
557
558 fd = open(ZS_DOOR_PATH, O_RDONLY);
559 if (fd < 0)
560 return;
561
562 cmd[0] = ZSD_CMD_NEW_ZONE;
563 cmd[1] = zoneid;
564 params.data_ptr = (char *)&cmd;
565 params.data_size = sizeof (cmd);
566 params.desc_ptr = NULL;
567 params.desc_num = 0;
568 params.rbuf = NULL;
569 params.rsize = 0;
570 (void) door_call(fd, ¶ms);
571 (void) close(fd);
572 }
573
574 /*
575 * Bring a zone up to the pre-boot "ready" stage. The mount_cmd argument is
576 * 'true' if this is being invoked as part of the processing for the "mount"
577 * subcommand.
578 *
579 * If a scratch zone mount (ALT_MOUNT) is being performed then do not
580 * call the state change hooks.
581 */
582 static int
583 zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate, boolean_t debug)
584 {
585 int err;
586 boolean_t snapped = B_FALSE;
587
588 if ((snap_hndl = zonecfg_init_handle()) == NULL) {
589 zerror(zlogp, B_TRUE, "getting zone configuration handle");
590 goto bad;
591 }
592 if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
593 zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
594 zonecfg_strerror(err));
595 goto bad;
596 }
597 snapped = B_TRUE;
598
599 if (zonecfg_get_snapshot_handle(zone_name, snap_hndl) != Z_OK) {
600 zerror(zlogp, B_FALSE, "invalid configuration snapshot");
601 goto bad;
602 }
603
604 if (zone_did == 0)
605 zone_did = zone_get_did(zone_name);
606
607 if (!ALT_MOUNT(mount_cmd) &&
608 brand_prestatechg(zlogp, zstate, Z_READY, debug) != 0)
609 goto bad;
610
611 if ((zone_id = vplat_create(zlogp, mount_cmd, zone_did)) == -1)
612 goto bad;
613
614 if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
615 bringup_failure_recovery = B_TRUE;
616 (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE,
617 debug);
618 goto bad;
619 }
620
621 if (!ALT_MOUNT(mount_cmd) &&
622 brand_poststatechg(zlogp, zstate, Z_READY, debug) != 0)
623 goto bad;
624
625 return (0);
626
627 bad:
628 /*
629 * If something goes wrong, we up the zones's state to the target
630 * state, READY, and then invoke the hook as if we're halting.
631 */
632 if (!ALT_MOUNT(mount_cmd))
633 (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT,
634 debug);
635
636 if (snapped)
637 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
638 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
639 zonecfg_strerror(err));
640 zonecfg_fini_handle(snap_hndl);
641 snap_hndl = NULL;
642 return (-1);
643 }
644
645 int
646 init_template(void)
647 {
648 int fd;
649 int err = 0;
650
651 fd = open64(CTFS_ROOT "/process/template", O_RDWR);
652 if (fd == -1)
653 return (-1);
654
655 /*
656 * For now, zoneadmd doesn't do anything with the contract.
657 * Deliver no events, don't inherit, and allow it to be orphaned.
658 */
659 err |= ct_tmpl_set_critical(fd, 0);
660 err |= ct_tmpl_set_informative(fd, 0);
661 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
662 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
663 if (err || ct_tmpl_activate(fd)) {
664 (void) close(fd);
665 return (-1);
666 }
667
668 return (fd);
669 }
670
/*
 * Arguments that mount_early_fs() receives through its opaque data pointer.
 */
typedef struct fs_callback {
	zlog_t *zlogp;		/* log used to report errors */
	zoneid_t zoneid;	/* zone the mounting child process enters */
	boolean_t mount_cmd;	/* if set, the root is <zonepath>/lu */
} fs_callback_t;
676
677 static int
678 mount_early_fs(void *data, const char *spec, const char *dir,
679 const char *fstype, const char *opt)
680 {
681 zlog_t *zlogp = ((fs_callback_t *)data)->zlogp;
682 zoneid_t zoneid = ((fs_callback_t *)data)->zoneid;
683 boolean_t mount_cmd = ((fs_callback_t *)data)->mount_cmd;
684 char rootpath[MAXPATHLEN];
685 pid_t child;
686 int child_status;
687 int tmpl_fd;
688 int rv;
689 ctid_t ct;
690
691 /* determine the zone rootpath */
692 if (mount_cmd) {
693 char luroot[MAXPATHLEN];
694
695 (void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
696 resolve_lofs(zlogp, luroot, sizeof (luroot));
697 (void) strlcpy(rootpath, luroot, sizeof (rootpath));
698 } else {
699 if (zone_get_rootpath(zone_name,
700 rootpath, sizeof (rootpath)) != Z_OK) {
701 zerror(zlogp, B_FALSE, "unable to determine zone root");
702 return (-1);
703 }
704 }
705
706 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir, fstype)) < 0) {
707 zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
708 rootpath, dir);
709 return (-1);
710 } else if (rv > 0) {
711 /* The mount point path doesn't exist, create it now. */
712 if (make_one_dir(zlogp, rootpath, dir,
713 DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
714 DEFAULT_DIR_GROUP) != 0) {
715 zerror(zlogp, B_FALSE, "failed to create mount point");
716 return (-1);
717 }
718
719 /*
720 * Now this might seem weird, but we need to invoke
721 * valid_mount_path() again. Why? Because it checks
722 * to make sure that the mount point path is canonical,
723 * which it can only do if the path exists, so now that
724 * we've created the path we have to verify it again.
725 */
726 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir,
727 fstype)) < 0) {
728 zerror(zlogp, B_FALSE,
729 "%s%s is not a valid mount point", rootpath, dir);
730 return (-1);
731 }
732 }
733
734 if ((tmpl_fd = init_template()) == -1) {
735 zerror(zlogp, B_TRUE, "failed to create contract");
736 return (-1);
737 }
738
739 if ((child = fork()) == -1) {
740 (void) ct_tmpl_clear(tmpl_fd);
741 (void) close(tmpl_fd);
742 zerror(zlogp, B_TRUE, "failed to fork");
743 return (-1);
744
745 } else if (child == 0) { /* child */
746 char opt_buf[MAX_MNTOPT_STR];
747 int optlen = 0;
748 int mflag = MS_DATA;
749 int i;
750 int ret;
751
752 (void) ct_tmpl_clear(tmpl_fd);
753 /*
754 * Even though there are no procs running in the zone, we
755 * do this for paranoia's sake.
756 */
757 (void) closefrom(0);
758
759 if (zone_enter(zoneid) == -1) {
760 _exit(errno);
761 }
762 if (opt != NULL) {
763 /*
764 * The mount() system call is incredibly annoying.
765 * If options are specified, we need to copy them
766 * into a temporary buffer since the mount() system
767 * call will overwrite the options string. It will
768 * also fail if the new option string it wants to
769 * write is bigger than the one we passed in, so
770 * you must pass in a buffer of the maximum possible
771 * option string length. sigh.
772 */
773 (void) strlcpy(opt_buf, opt, sizeof (opt_buf));
774 opt = opt_buf;
775 optlen = MAX_MNTOPT_STR;
776 mflag = MS_OPTIONSTR;
777 }
778
779 /*
780 * There is an obscure race condition which can cause mount
781 * to return EBUSY. This happens for example on the mount
782 * of the zone's /etc/svc/volatile file system if there is
783 * a GZ process running svcs -Z, which will touch the
784 * mountpoint, just as we're trying to do the mount. To cope
785 * with this, we retry up to 3 times to let this transient
786 * process get out of the way.
787 */
788 for (i = 0; i < 3; i++) {
789 ret = 0;
790 if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
791 optlen) != 0)
792 ret = errno;
793 if (ret != EBUSY)
794 break;
795 (void) sleep(1);
796 }
797 _exit(ret);
798 }
799
800 /* parent */
801 if (contract_latest(&ct) == -1)
802 ct = -1;
803 (void) ct_tmpl_clear(tmpl_fd);
804 (void) close(tmpl_fd);
805 if (waitpid(child, &child_status, 0) != child) {
806 /* unexpected: we must have been signalled */
807 (void) contract_abandon_id(ct);
808 return (-1);
809 }
810 (void) contract_abandon_id(ct);
811 if (WEXITSTATUS(child_status) != 0) {
812 errno = WEXITSTATUS(child_status);
813 zerror(zlogp, B_TRUE, "mount of %s failed", dir);
814 return (-1);
815 }
816
817 return (0);
818 }
819
/*
 * Replace characters other than [A-Za-z0-9_] with '_' so that the string is a
 * valid environment variable name.  The string is modified in place.
 *
 * The ranges are tested explicitly rather than with isalnum(): the result
 * must not depend on the current locale, and bytes with the high bit set
 * (negative when char is signed) must be handled safely.
 */
static void
sanitize_env_var_name(char *var)
{
	for (char *p = var; *p != '\0'; p++) {
		const char c = *p;

		if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
		    (c >= '0' && c <= '9'))
			continue;

		/* Everything else, including '_' itself, becomes '_'. */
		*p = '_';
	}
}
833
834 /*
835 * env variable name format
836 * _ZONECFG_{resource name}_{identifying attr. name}_{property name}
837 * Any dashes (-) in the property names are replaced with underscore (_).
838 */
839 static void
840 set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
841 {
842 /* Enough for maximal name, rsrc + attr, & slop for ZONECFG & _'s */
843 char nm[2 * MAXNAMELEN + 32];
844
845 if (attr == NULL)
846 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s", rsrc,
847 name);
848 else
849 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s_%s", rsrc,
850 attr, name);
851
852 sanitize_env_var_name(nm);
853
854 (void) setenv(nm, val, 1);
855 }
856
857 /*
858 * Resolve a device:match value to a path. This is only different for PPT
859 * devices, where we expect the match property to be a /devices/... path, and
860 * configured for PPT already.
861 */
862 int
863 resolve_device_match(zlog_t *zlogp, struct zone_devtab *dtab,
864 char *path, size_t len)
865 {
866 struct zone_res_attrtab *rap;
867
868 for (rap = dtab->zone_dev_attrp; rap != NULL;
869 rap = rap->zone_res_attr_next) {
870 if (strcmp(rap->zone_res_attr_name, "model") == 0 &&
871 strcmp(rap->zone_res_attr_value, "passthru") == 0)
872 break;
873 }
874
875 if (rap == NULL) {
876 if (strlcpy(path, dtab->zone_dev_match, len) >= len)
877 return (Z_INVAL);
878 return (Z_OK);
879 }
880
881 if (strncmp(dtab->zone_dev_match, "/devices",
882 strlen("/devices")) != 0) {
883 zerror(zlogp, B_FALSE, "invalid passthru match value '%s'",
884 dtab->zone_dev_match);
885 return (Z_INVAL);
886 }
887
888 if (ppt_devpath_to_dev(dtab->zone_dev_match, path, len) != 0) {
889 zerror(zlogp, B_TRUE, "failed to resolve passthru device %s",
890 dtab->zone_dev_match);
891 return (Z_INVAL);
892 }
893
894 return (Z_OK);
895 }
896
897 /*
898 * Export various zonecfg properties into environment for the boot and state
899 * change hooks.
900 *
901 * If debug is true, _ZONEADMD_brand_debug is set to 1, else it is set to an
902 * empty string. Brand hooks consider any non-empty string as an indication
903 * that debug output is requested.
904 *
905 * We could export more of the config in the future, as necessary. A better
906 * solution would be to make it so brand-specific behavior is handled by
907 * brand-specific callbacks written in C. Then the normal libzonecfg interfaces
908 * can be used for accessing any parts of the configuration that are needed.
909 *
910 * All of the environment variables set by this function are specific to
911 * SmartOS.
912 */
913 static int
914 setup_subproc_env(zlog_t *zlogp, boolean_t debug)
915 {
916 int res;
917 struct zone_nwiftab ntab;
918 struct zone_devtab dtab;
919 struct zone_attrtab atab;
920 char net_resources[MAXNAMELEN * 2];
921 char dev_resources[MAXNAMELEN * 2];
922 char didstr[16];
923 char uuidstr[UUID_PRINTABLE_STRING_LENGTH];
924 uuid_t uuid;
925
926 /* snap_hndl is null when called through the set_brand_env code path */
927 if (snap_hndl == NULL)
928 return (Z_OK);
929
930 if ((res = zonecfg_get_uuid(zone_name, uuid)) != Z_OK)
931 return (res);
932
933 uuid_unparse(uuid, uuidstr);
934 (void) setenv("_ZONECFG_uuid", uuidstr, 1);
935
936 (void) snprintf(didstr, sizeof (didstr), "%d", zone_did);
937 (void) setenv("_ZONECFG_did", didstr, 1);
938
939 /*
940 * "net" resources are exported because zoneadmd does not handle
941 * automatic configuration of vnics and so that the bhyve boot hook
942 * can generate the argument list for the brand's init program. At such
943 * a time as vnic creation is handled in zoneadmd and brand callbacks
944 * can be executed as part of the zoneadmd process this should be
945 * removed.
946 */
947 net_resources[0] = '\0';
948 if ((res = zonecfg_setnwifent(snap_hndl)) != Z_OK)
949 goto done;
950
951 while (zonecfg_getnwifent(snap_hndl, &ntab) == Z_OK) {
952 struct zone_res_attrtab *rap;
953 char *phys;
954
955 phys = ntab.zone_nwif_physical;
956
957 (void) strlcat(net_resources, phys, sizeof (net_resources));
958 (void) strlcat(net_resources, " ", sizeof (net_resources));
959
960 set_zonecfg_env(RSRC_NET, phys, "physical", phys);
961
962 set_zonecfg_env(RSRC_NET, phys, "address",
963 ntab.zone_nwif_address);
964 set_zonecfg_env(RSRC_NET, phys, "allowed-address",
965 ntab.zone_nwif_allowed_address);
966 set_zonecfg_env(RSRC_NET, phys, "defrouter",
967 ntab.zone_nwif_defrouter);
968 set_zonecfg_env(RSRC_NET, phys, "global-nic",
969 ntab.zone_nwif_gnic);
970 set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
971 set_zonecfg_env(RSRC_NET, phys, "vlan-id",
972 ntab.zone_nwif_vlan_id);
973
974 for (rap = ntab.zone_nwif_attrp; rap != NULL;
975 rap = rap->zone_res_attr_next)
976 set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
977 rap->zone_res_attr_value);
978 nwifent_free_attrs(&ntab);
979 }
980
981 (void) setenv("_ZONECFG_net_resources", net_resources, 1);
982
983 (void) zonecfg_endnwifent(snap_hndl);
984
985 /*
986 * "device" resources are exported because the bhyve boot brand callback
987 * needs them to generate the argument list for the brand's init
988 * program. At such a time as brand callbacks can be executed as part
989 * of the zoneadmd process, this should be removed.
990 *
991 * The bhyve brand only supports disk-like and ppt devices and does not
992 * support regular expressions.
993 */
994 if ((res = zonecfg_setdevent(snap_hndl)) != Z_OK)
995 goto done;
996
997 dev_resources[0] = '\0';
998 while (zonecfg_getdevent(snap_hndl, &dtab) == Z_OK) {
999 char *match = dtab.zone_dev_match;
1000 struct zone_res_attrtab *rap;
1001 char path[MAXPATHLEN];
1002
1003 res = resolve_device_match(zlogp, &dtab, path, sizeof (path));
1004 if (res != Z_OK)
1005 goto done;
1006
1007 /*
1008 * Even if not modified, the match path will be mangled in the
1009 * environment variable name, so we always store the value here.
1010 */
1011 set_zonecfg_env(RSRC_DEV, match, "path", path);
1012
1013 for (rap = dtab.zone_dev_attrp; rap != NULL;
1014 rap = rap->zone_res_attr_next) {
1015 set_zonecfg_env(RSRC_DEV, match,
1016 rap->zone_res_attr_name, rap->zone_res_attr_value);
1017 }
1018
1019 /*
1020 * _ZONECFG_device_resources will contain a space separated list
1021 * of devices that have _ZONECFG_device_<device>* environment
1022 * variables. So that each element of the list matches up with
1023 * <device>, each list item needs to be sanitized in the same
1024 * way that environment variable names are sanitized.
1025 */
1026 sanitize_env_var_name(match);
1027 (void) strlcat(dev_resources, match, sizeof (dev_resources));
1028 (void) strlcat(dev_resources, " ", sizeof (dev_resources));
1029 }
1030 (void) zonecfg_enddevent(snap_hndl);
1031
1032 (void) setenv("_ZONECFG_device_resources", dev_resources, 1);
1033
1034 /*
1035 * "attr" resources are exported because the bhyve brand's boot hook
1036 * needs access to the "ram", "cpu", "bootrom", etc. to form the
1037 * argument list for the brand's init program. Once the bhyve brand is
1038 * configured via proper resources and properties, this should be
1039 * removed.
1040 */
1041 if ((res = zonecfg_setattrent(snap_hndl)) != Z_OK)
1042 goto done;
1043
1044 while (zonecfg_getattrent(snap_hndl, &atab) == Z_OK) {
1045 set_zonecfg_env("attr", NULL, atab.zone_attr_name,
1046 atab.zone_attr_value);
1047 }
1048
1049 (void) zonecfg_endattrent(snap_hndl);
1050
1051 if (debug)
1052 (void) setenv("_ZONEADMD_brand_debug", "1", 1);
1053 else
1054 (void) setenv("_ZONEADMD_brand_debug", "", 1);
1055
1056 res = Z_OK;
1057
1058 done:
1059 return (res);
1060 }
1061
1062 void
1063 nwifent_free_attrs(struct zone_nwiftab *np)
1064 {
1065 struct zone_res_attrtab *rap;
1066
1067 for (rap = np->zone_nwif_attrp; rap != NULL; ) {
1068 struct zone_res_attrtab *tp = rap;
1069
1070 rap = rap->zone_res_attr_next;
1071 free(tp);
1072 }
1073 }
1074
1075 /*
1076 * If retstr is not NULL, the output of the subproc is returned in the str,
1077 * otherwise it is output using zerror(). Any memory allocated for retstr
1078 * should be freed by the caller.
1079 */
1080 int
1081 do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr, boolean_t debug)
1082 {
1083 char buf[1024]; /* arbitrary large amount */
1084 char *inbuf;
1085 FILE *file;
1086 int status;
1087 int rd_cnt;
1088 int fds[2];
1089 pid_t child;
1090
1091 if (retstr != NULL) {
1092 if ((*retstr = malloc(1024)) == NULL) {
1093 zerror(zlogp, B_FALSE, "out of memory");
1094 return (-1);
1095 }
1096 inbuf = *retstr;
1097 rd_cnt = 0;
1098 } else {
1099 inbuf = buf;
1100 }
1101
1102 if (pipe(fds) != 0) {
1103 zerror(zlogp, B_TRUE, "failed to create pipe for subprocess");
1104 return (-1);
1105 }
1106
1107 if ((child = fork()) == 0) {
1108 int in;
1109
1110 /*
1111 * SIGINT is currently ignored. It probably shouldn't be so
1112 * hard to kill errant children, so we revert to SIG_DFL.
1113 * SIGHUP and SIGUSR1 are used to perform log rotation. We
1114 * leave those as-is because we don't want a 'pkill -HUP
1115 * zoneadmd' to kill this child process before exec(). On
1116 * exec(), SIGHUP and SIGUSR1 will become SIG_DFL.
1117 */
1118 (void) sigset(SIGINT, SIG_DFL);
1119
1120 /*
1121 * Set up a pipe for the child to log to.
1122 */
1123 if (dup2(fds[1], STDERR_FILENO) == -1) {
1124 (void) snprintf(buf, sizeof (buf),
1125 "subprocess failed to dup2(STDERR_FILENO): %s\n",
1126 strerror(errno));
1127 (void) write(fds[1], buf, strlen(buf));
1128 _exit(127);
1129 }
1130 if (dup2(fds[1], STDOUT_FILENO) == -1) {
1131 perror("subprocess failed to dup2(STDOUT_FILENO)");
1132 _exit(127);
1133 }
1134 /*
1135 * Some naughty children may try to read from stdin. Be sure
1136 * that the first file that a child opens doesn't get stdin's
1137 * file descriptor.
1138 */
1139 if ((in = open("/dev/null", O_RDONLY)) == -1 ||
1140 dup2(in, STDIN_FILENO) == -1) {
1141 zerror(zlogp, B_TRUE,
1142 "subprocess failed to set up STDIN_FILENO");
1143 _exit(127);
1144 }
1145 closefrom(STDERR_FILENO + 1);
1146
1147 if (setup_subproc_env(zlogp, debug) != Z_OK) {
1148 zerror(zlogp, B_FALSE, "failed to setup environment");
1149 _exit(127);
1150 }
1151
1152 (void) execl("/bin/sh", "sh", "-c", cmdbuf, NULL);
1153
1154 zerror(zlogp, B_TRUE, "subprocess execl failed");
1155 _exit(127);
1156 } else if (child == -1) {
1157 zerror(zlogp, B_TRUE, "failed to create subprocess for '%s'",
1158 cmdbuf);
1159 (void) close(fds[0]);
1160 (void) close(fds[1]);
1161 return (-1);
1162 }
1163
1164 (void) close(fds[1]);
1165
1166 file = fdopen(fds[0], "r");
1167 while (fgets(inbuf, 1024, file) != NULL) {
1168 if (retstr == NULL) {
1169 if (zlogp != &logsys) {
1170 int last = strlen(inbuf) - 1;
1171
1172 if (inbuf[last] == '\n')
1173 inbuf[last] = '\0';
1174 zerror(zlogp, B_FALSE, "%s", inbuf);
1175 }
1176 } else {
1177 char *p;
1178
1179 rd_cnt += 1024 - 1;
1180 if ((p = realloc(*retstr, rd_cnt + 1024)) == NULL) {
1181 zerror(zlogp, B_FALSE, "out of memory");
1182 break;
1183 }
1184
1185 *retstr = p;
1186 inbuf = *retstr + rd_cnt;
1187 }
1188 }
1189
1190 while (fclose(file) != 0) {
1191 assert(errno == EINTR);
1192 }
1193 while (waitpid(child, &status, 0) == -1) {
1194 if (errno != EINTR) {
1195 zerror(zlogp, B_TRUE,
1196 "failed to get exit status of '%s'", cmdbuf);
1197 return (-1);
1198 }
1199 }
1200
1201 if (WIFSIGNALED(status)) {
1202 zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
1203 "signal %d", cmdbuf, WTERMSIG(status));
1204 return (-1);
1205 }
1206 assert(WIFEXITED(status));
1207 if (WEXITSTATUS(status) == ZEXIT_EXEC) {
1208 zerror(zlogp, B_FALSE, "failed to exec %s", cmdbuf);
1209 return (-1);
1210 }
1211 return (WEXITSTATUS(status));
1212 }
1213
1214 /*
1215 * Get the path for this zone's init(1M) (or equivalent) process. First look
1216 * for a zone-specific init-name attr, then get it from the brand.
1217 */
1218 static int
1219 get_initname(brand_handle_t bh, char *initname, int len)
1220 {
1221 struct zone_attrtab a;
1222
1223 bzero(&a, sizeof (a));
1224 (void) strlcpy(a.zone_attr_name, "init-name",
1225 sizeof (a.zone_attr_name));
1226
1227 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
1228 (void) strlcpy(initname, a.zone_attr_value, len);
1229 return (0);
1230 }
1231
1232 return (brand_get_initname(bh, initname, len));
1233 }
1234
1235 /*
1236 * Get the restart-init flag for this zone's init(1M) (or equivalent) process.
1237 * First look for a zone-specific restart-init attr, then get it from the brand.
1238 */
1239 static boolean_t
1240 restartinit(brand_handle_t bh)
1241 {
1242 struct zone_attrtab a;
1243
1244 bzero(&a, sizeof (a));
1245 (void) strlcpy(a.zone_attr_name, "restart-init",
1246 sizeof (a.zone_attr_name));
1247
1248 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
1249 if (strcmp(a.zone_attr_value, "false") == 0)
1250 return (B_FALSE);
1251 return (B_TRUE);
1252 }
1253
1254 return (brand_restartinit(bh));
1255 }
1256
1257 /*
1258 * Get the app-svc-dependent flag for this zone's init process. This is a
1259 * zone-specific attr which controls the type of contract we create for the
1260 * zone's init. When true, the contract will include CT_PR_EV_EXIT in the fatal
1261 * set, so that when any service which is in the same contract exits, the init
1262 * application will be terminated.
1263 */
1264 static boolean_t
1265 is_app_svc_dep(void)
1266 {
1267 struct zone_attrtab a;
1268
1269 bzero(&a, sizeof (a));
1270 (void) strlcpy(a.zone_attr_name, "app-svc-dependent",
1271 sizeof (a.zone_attr_name));
1272
1273 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK &&
1274 strcmp(a.zone_attr_value, "true") == 0) {
1275 return (B_TRUE);
1276 }
1277
1278 return (B_FALSE);
1279 }
1280
/*
 * Boot the zone: run the brand's pre-state-change hook, mount the brand's
 * early filesystems, validate the init program, run the optional brand boot
 * callback, hand the init name, boot arguments and restart policy to the
 * kernel with zone_setattr(), start the log thread and finally call
 * zone_boot().
 *
 * zlogp	where errors are reported
 * bootargs	boot arguments supplied by the requester; they are filtered
 *		into nbootargs (and may influence init_file) by
 *		filter_bootargs()
 * zstate	zone state passed through to the brand state-change hooks
 * debug	passed through to the brand hooks and to do_subproc()
 *
 * Returns 0 on success.  Returns -1 on failure; except when the
 * pre-state-change hook itself fails, the failure path runs the brand
 * post-state-change hook as if a running zone were being halted.
 */
static int
zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
{
	zoneid_t zoneid;
	struct stat st;
	char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
	char nbootargs[BOOTARGS_MAX];
	char cmdbuf[MAXPATHLEN];
	fs_callback_t cb;
	brand_handle_t bh;
	zone_iptype_t iptype;
	dladm_status_t status;
	char errmsg[DLADM_STRSIZE];
	int err;
	boolean_t app_svc_dep;
	boolean_t restart_init, restart_init0, restart_initreboot;

	/* Nothing has been done yet, so no halt hook is needed on failure. */
	if (brand_prestatechg(zlogp, zstate, Z_BOOT, debug) != 0)
		return (-1);

	if ((zoneid = getzoneidbyname(zone_name)) == -1) {
		zerror(zlogp, B_TRUE, "unable to get zoneid");
		goto bad;
	}

	/* Context handed to mount_early_fs() for every brand mount. */
	cb.zlogp = zlogp;
	cb.zoneid = zoneid;
	cb.mount_cmd = B_FALSE;

	/* Get a handle to the brand info for this zone */
	if ((bh = brand_open(brand_name)) == NULL) {
		zerror(zlogp, B_FALSE, "unable to determine zone brand");
		goto bad;
	}

	/*
	 * Get the list of filesystems to mount from the brand
	 * configuration.  These mounts are done via a thread that will
	 * enter the zone, so they are done from within the context of the
	 * zone.
	 */
	if (brand_platform_iter_mounts(bh, mount_early_fs, &cb) != 0) {
		zerror(zlogp, B_FALSE, "unable to mount filesystems");
		brand_close(bh);
		goto bad;
	}

	/*
	 * Get the brand's boot callback if it exists.  The command is built
	 * after EXEC_PREFIX; if nothing is appended, cmdbuf stays at
	 * EXEC_LEN characters and the callback is skipped further below.
	 */
	(void) strcpy(cmdbuf, EXEC_PREFIX);
	if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
	    sizeof (cmdbuf) - EXEC_LEN) != 0) {
		zerror(zlogp, B_FALSE,
		    "unable to determine branded zone's boot callback");
		brand_close(bh);
		goto bad;
	}

	/* Get the path for this zone's init(1M) (or equivalent) process. */
	if (get_initname(bh, init_file, MAXPATHLEN) != 0) {
		zerror(zlogp, B_FALSE,
		    "unable to determine zone's init(1M) location");
		brand_close(bh);
		goto bad;
	}

	/* See if this zone's brand should restart init if it dies. */
	restart_init = restartinit(bh);
	restart_init0 = brand_restartinit0(bh);
	restart_initreboot = brand_restartinitreboot(bh);

	/*
	 * See if we need to setup contract dependencies between the zone's
	 * primary application and any of its services.
	 */
	app_svc_dep = is_app_svc_dep();

	/* All brand queries are done; the handle is not used past here. */
	brand_close(bh);

	err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
	if (err != Z_OK)
		goto bad;

	assert(init_file[0] != '\0');

	/*
	 * Try to anticipate possible problems: If possible, make sure init is
	 * executable.
	 */
	if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
		zerror(zlogp, B_FALSE, "unable to determine zone root");
		goto bad;
	}

	/* Path of init as seen from outside the zone: zone root + init. */
	(void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);

	if (lstat(initpath, &st) == -1) {
		zerror(zlogp, B_TRUE, "could not stat %s", initpath);
		goto bad;
	}

	/* LINTED: E_NOP_IF_STMT */
	if ((st.st_mode & S_IFMT) == S_IFLNK) {
		/* symlink, we'll have to wait and resolve when we boot */
	} else if ((st.st_mode & S_IXUSR) == 0) {
		zerror(zlogp, B_FALSE, "%s is not executable", initpath);
		goto bad;
	}

	/*
	 * Exclusive stack zones interact with the dlmgmtd running in the
	 * global zone.  dladm_zone_boot() tells dlmgmtd that this zone is
	 * booting, and loads its datalinks from the zone's datalink
	 * configuration file.
	 */
	if (vplat_get_iptype(zlogp, &iptype) == 0 && iptype == ZS_EXCLUSIVE) {
		status = dladm_zone_boot(dld_handle, zoneid);
		if (status != DLADM_STATUS_OK) {
			zerror(zlogp, B_FALSE, "unable to load zone datalinks: "
			    " %s", dladm_status2str(status, errmsg));
			goto bad;
		}
	}

	/*
	 * If there is a brand 'boot' callback, execute it now to give the
	 * brand one last chance to do any additional setup before the zone
	 * is booted.
	 */
	if ((strlen(cmdbuf) > EXEC_LEN) &&
	    (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
		goto bad;
	}

	/* Hand the init program and the filtered boot arguments over. */
	if (zone_setattr(zoneid, ZONE_ATTR_INITNAME, init_file, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone boot file");
		goto bad;
	}

	if (zone_setattr(zoneid, ZONE_ATTR_BOOTARGS, nbootargs, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone boot arguments");
		goto bad;
	}

	/* Each init restart attribute is only set when it applies. */
	if (!restart_init && zone_setattr(zoneid, ZONE_ATTR_INITNORESTART,
	    NULL, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone init-no-restart");
		goto bad;
	}
	if (restart_init0 && zone_setattr(zoneid, ZONE_ATTR_INITRESTART0,
	    NULL, 0) == -1) {
		zerror(zlogp, B_TRUE,
		    "could not set zone init-restart-on-exit-0");
		goto bad;
	}
	if (restart_initreboot && zone_setattr(zoneid, ZONE_ATTR_INITREBOOT,
	    NULL, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone reboot-on-init-exit");
		goto bad;
	}

	/*
	 * NOTE(review): the value B_TRUE is passed in the pointer argument
	 * itself rather than through a buffer; presumably the receiving side
	 * interprets the argument as the value for this attribute -- confirm
	 * before changing.
	 */
	if (app_svc_dep && zone_setattr(zoneid, ZONE_ATTR_APP_SVC_CT,
	    (void *)B_TRUE, sizeof (boolean_t)) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone app-die");
		goto bad;
	}

	/*
	 * Inform zonestatd of a new zone so that it can install a door for
	 * the zone to contact it.
	 */
	notify_zonestatd(zone_id);

	/* Startup a thread to perform zfd logging/tty svc for the zone. */
	create_log_thread(zlogp);

	/* From here on, every failure path must also stop the log thread. */
	if (zone_boot(zoneid) == -1) {
		zerror(zlogp, B_TRUE, "unable to boot zone");
		destroy_log_thread(zlogp);
		goto bad;
	}

	if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0) {
		destroy_log_thread(zlogp);
		goto bad;
	}

	return (0);

bad:
	/*
	 * If something goes wrong, we up the zones's state to the target
	 * state, RUNNING, and then invoke the hook as if we're halting.
	 */
	(void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT, debug);

	return (-1);
}
1481
1482 static int
1483 zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate,
1484 boolean_t debug)
1485 {
1486 int err;
1487
1488 /*
1489 * If performing a scratch zone unmount then do not call the
1490 * state change hooks.
1491 */
1492 if (unmount_cmd == B_FALSE &&
1493 brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0)
1494 return (-1);
1495
1496 if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) {
1497 if (!bringup_failure_recovery)
1498 zerror(zlogp, B_FALSE, "unable to destroy zone");
1499 destroy_log_thread(zlogp);
1500 return (-1);
1501 }
1502
1503 /* Shut down is done, stop the log thread */
1504 destroy_log_thread(zlogp);
1505
1506 if (unmount_cmd == B_FALSE &&
1507 brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0)
1508 return (-1);
1509
1510 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
1511 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
1512 zonecfg_strerror(err));
1513
1514 zonecfg_fini_handle(snap_hndl);
1515 snap_hndl = NULL;
1516
1517 return (0);
1518 }
1519
1520 static int
1521 zone_graceful_shutdown(zlog_t *zlogp)
1522 {
1523 zoneid_t zoneid;
1524 pid_t child;
1525 char cmdbuf[MAXPATHLEN];
1526 brand_handle_t bh = NULL;
1527 ctid_t ct;
1528 int tmpl_fd;
1529 int child_status;
1530
1531 if (shutdown_in_progress) {
1532 zerror(zlogp, B_FALSE, "shutdown already in progress");
1533 return (-1);
1534 }
1535
1536 if ((zoneid = getzoneidbyname(zone_name)) == -1) {
1537 zerror(zlogp, B_TRUE, "unable to get zoneid");
1538 return (-1);
1539 }
1540
1541 /* Get a handle to the brand info for this zone */
1542 if ((bh = brand_open(brand_name)) == NULL) {
1543 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1544 return (-1);
1545 }
1546
1547 /*
1548 * If there is a brand 'shutdown' callback, execute it now to give the
1549 * brand a chance to cleanup any custom configuration.
1550 */
1551 (void) strcpy(cmdbuf, EXEC_PREFIX);
1552 if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
1553 sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
1554 (void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
1555 }
1556 brand_close(bh);
1557
1558 if ((tmpl_fd = init_template()) == -1) {
1559 zerror(zlogp, B_TRUE, "failed to create contract");
1560 return (-1);
1561 }
1562
1563 if ((child = fork()) == -1) {
1564 (void) ct_tmpl_clear(tmpl_fd);
1565 (void) close(tmpl_fd);
1566 zerror(zlogp, B_TRUE, "failed to fork");
1567 return (-1);
1568 } else if (child == 0) {
1569 (void) ct_tmpl_clear(tmpl_fd);
1570 if (zone_enter(zoneid) == -1) {
1571 _exit(errno);
1572 }
1573 _exit(execl("/bin/sh", "sh", "-c", cmdbuf, (char *)NULL));
1574 }
1575
1576 if (contract_latest(&ct) == -1)
1577 ct = -1;
1578 (void) ct_tmpl_clear(tmpl_fd);
1579 (void) close(tmpl_fd);
1580
1581 if (waitpid(child, &child_status, 0) != child) {
1582 /* unexpected: we must have been signalled */
1583 (void) contract_abandon_id(ct);
1584 return (-1);
1585 }
1586
1587 (void) contract_abandon_id(ct);
1588 if (WEXITSTATUS(child_status) != 0) {
1589 errno = WEXITSTATUS(child_status);
1590 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1591 return (-1);
1592 }
1593
1594 shutdown_in_progress = B_TRUE;
1595
1596 return (0);
1597 }
1598
1599 static int
1600 zone_wait_shutdown(zlog_t *zlogp)
1601 {
1602 zone_state_t zstate;
1603 uint64_t *tm = NULL;
1604 scf_simple_prop_t *prop = NULL;
1605 int timeout;
1606 int tries;
1607 int rc = -1;
1608
1609 /* Get default stop timeout from SMF framework */
1610 timeout = SHUTDOWN_WAIT;
1611 if ((prop = scf_simple_prop_get(NULL, SHUTDOWN_FMRI, "stop",
1612 SCF_PROPERTY_TIMEOUT)) != NULL) {
1613 if ((tm = scf_simple_prop_next_count(prop)) != NULL) {
1614 if (tm != 0)
1615 timeout = *tm;
1616 }
1617 scf_simple_prop_free(prop);
1618 }
1619
1620 /* allow time for zone to shutdown cleanly */
1621 for (tries = 0; tries < timeout; tries ++) {
1622 (void) sleep(1);
1623 if (zone_get_state(zone_name, &zstate) == Z_OK &&
1624 zstate == ZONE_STATE_INSTALLED) {
1625 rc = 0;
1626 break;
1627 }
1628 }
1629
1630 if (rc != 0)
1631 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1632
1633 shutdown_in_progress = B_FALSE;
1634
1635 return (rc);
1636 }
1637
1638
1639
1640 /*
1641 * Generate AUE_zone_state for a command that boots a zone.
1642 */
1643 static void
1644 audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
1645 char *new_state)
1646 {
1647 adt_session_data_t *ah;
1648 adt_event_data_t *event;
1649 int pass_fail, fail_reason;
1650
1651 if (!adt_audit_enabled())
1652 return;
1653
1654 if (return_val == 0) {
1655 pass_fail = ADT_SUCCESS;
1656 fail_reason = ADT_SUCCESS;
1657 } else {
1658 pass_fail = ADT_FAILURE;
1659 fail_reason = ADT_FAIL_VALUE_PROGRAM;
1660 }
1661
1662 if (adt_start_session(&ah, NULL, 0)) {
1663 zerror(zlogp, B_TRUE, gettext("audit failure."));
1664 return;
1665 }
1666 if (adt_set_from_ucred(ah, uc, ADT_NEW)) {
1667 zerror(zlogp, B_TRUE, gettext("audit failure."));
1668 (void) adt_end_session(ah);
1669 return;
1670 }
1671
1672 event = adt_alloc_event(ah, ADT_zone_state);
1673 if (event == NULL) {
1674 zerror(zlogp, B_TRUE, gettext("audit failure."));
1675 (void) adt_end_session(ah);
1676 return;
1677 }
1678 event->adt_zone_state.zonename = zone_name;
1679 event->adt_zone_state.new_state = new_state;
1680
1681 if (adt_put_event(event, pass_fail, fail_reason))
1682 zerror(zlogp, B_TRUE, gettext("audit failure."));
1683
1684 adt_free_event(event);
1685
1686 (void) adt_end_session(ah);
1687 }
1688
1689 /*
1690 * Log the exit time and status of the zone's init process into
1691 * {zonepath}/lastexited. If the zone shutdown normally, the exit status will
1692 * be -1, otherwise it will be the exit status as described in wait.3c.
1693 * If the zone is configured to restart init, then nothing will be logged if
1694 * init exits unexpectedly (the kernel will never upcall in this case).
1695 */
1696 static void
1697 log_init_exit(int status)
1698 {
1699 char p[MAXPATHLEN];
1700 char buf[128];
1701 struct timeval t;
1702 int fd;
1703
1704 if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) > sizeof (p))
1705 return;
1706 if (gettimeofday(&t, NULL) != 0)
1707 return;
1708 if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec,
1709 status) > sizeof (buf))
1710 return;
1711 if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
1712 return;
1713
1714 (void) write(fd, buf, strlen(buf));
1715
1716 (void) close(fd);
1717 }
1718
1719 /*
1720 * The main routine for the door server that deals with zone state transitions.
1721 */
1722 /* ARGSUSED */
1723 static void
1724 server(void *cookie, char *args, size_t alen, door_desc_t *dp,
1725 uint_t n_desc)
1726 {
1727 ucred_t *uc = NULL;
1728 const priv_set_t *eset;
1729
1730 zone_state_t zstate;
1731 zone_cmd_t cmd;
1732 boolean_t debug;
1733 int init_status;
1734 zone_cmd_arg_t *zargp;
1735
1736 boolean_t kernelcall = B_TRUE;
1737
1738 int rval = -1;
1739 uint64_t uniqid;
1740 zoneid_t zoneid = -1;
1741 zlog_t zlog;
1742 zlog_t *zlogp;
1743 zone_cmd_rval_t *rvalp;
1744 size_t rlen = getpagesize(); /* conservative */
1745 fs_callback_t cb;
1746 brand_handle_t bh;
1747 boolean_t wait_shut = B_FALSE;
1748
1749 /* LINTED E_BAD_PTR_CAST_ALIGN */
1750 zargp = (zone_cmd_arg_t *)args;
1751
1752 /*
1753 * When we get the door unref message, we've fdetach'd the door, and
1754 * it is time for us to shut down zoneadmd.
1755 */
1756 if (zargp == DOOR_UNREF_DATA) {
1757 logstream_close(platloghdl, B_TRUE);
1758
1759 /*
1760 * See comment at end of main() for info on the last rites.
1761 */
1762 exit(0);
1763 }
1764
1765 if (zargp == NULL) {
1766 (void) door_return(NULL, 0, 0, 0);
1767 }
1768
1769 rvalp = alloca(rlen);
1770 bzero(rvalp, rlen);
1771 zlog.logfile = NULL;
1772 zlog.buflen = zlog.loglen = rlen - sizeof (zone_cmd_rval_t) + 1;
1773 zlog.buf = rvalp->errbuf;
1774 zlog.log = zlog.buf;
1775 /* defer initialization of zlog.locale until after credential check */
1776 zlogp = &zlog;
1777
1778 if (alen != sizeof (zone_cmd_arg_t)) {
1779 /*
1780 * This really shouldn't be happening.
1781 */
1782 zerror(&logsys, B_FALSE, "argument size (%d bytes) "
1783 "unexpected (expected %d bytes)", alen,
1784 sizeof (zone_cmd_arg_t));
1785 goto out;
1786 }
1787 cmd = zargp->cmd;
1788 debug = zargp->debug;
1789 init_status = zargp->status;
1790
1791 if (door_ucred(&uc) != 0) {
1792 zerror(&logsys, B_TRUE, "door_ucred");
1793 goto out;
1794 }
1795 eset = ucred_getprivset(uc, PRIV_EFFECTIVE);
1796 if (ucred_getzoneid(uc) != GLOBAL_ZONEID ||
1797 (eset != NULL ? !priv_ismember(eset, PRIV_SYS_CONFIG) :
1798 ucred_geteuid(uc) != 0)) {
1799 zerror(&logsys, B_FALSE, "insufficient privileges");
1800 goto out;
1801 }
1802
1803 kernelcall = ucred_getpid(uc) == 0;
1804
1805 /*
1806 * This is safe because we only use a zlog_t throughout the
1807 * duration of a door call; i.e., by the time the pointer
1808 * might become invalid, the door call would be over.
1809 */
1810 zlog.locale = kernelcall ? DEFAULT_LOCALE : zargp->locale;
1811
1812 (void) mutex_lock(&lock);
1813
1814 /*
1815 * Once we start to really die off, we don't want more connections.
1816 */
1817 if (in_death_throes) {
1818 (void) mutex_unlock(&lock);
1819 ucred_free(uc);
1820 (void) door_return(NULL, 0, 0, 0);
1821 thr_exit(NULL);
1822 }
1823
1824 /*
1825 * Check for validity of command.
1826 */
1827 if (cmd != Z_READY && cmd != Z_BOOT && cmd != Z_FORCEBOOT &&
1828 cmd != Z_REBOOT && cmd != Z_SHUTDOWN && cmd != Z_HALT &&
1829 cmd != Z_NOTE_UNINSTALLING && cmd != Z_MOUNT &&
1830 cmd != Z_FORCEMOUNT && cmd != Z_UNMOUNT) {
1831 zerror(&logsys, B_FALSE, "invalid command %d", (int)cmd);
1832 goto out;
1833 }
1834
1835 if (kernelcall && (cmd != Z_HALT && cmd != Z_REBOOT)) {
1836 /*
1837 * Can't happen
1838 */
1839 zerror(&logsys, B_FALSE, "received unexpected kernel upcall %d",
1840 cmd);
1841 goto out;
1842 }
1843 /*
1844 * We ignore the possibility of someone calling zone_create(2)
1845 * explicitly; all requests must come through zoneadmd.
1846 */
1847 if (zone_get_state(zone_name, &zstate) != Z_OK) {
1848 /*
1849 * Something terribly wrong happened
1850 */
1851 zerror(&logsys, B_FALSE, "unable to determine state of zone");
1852 goto out;
1853 }
1854
1855 if (kernelcall) {
1856 /*
1857 * Kernel-initiated requests may lose their validity if the
1858 * zone_t the kernel was referring to has gone away.
1859 */
1860 if ((zoneid = getzoneidbyname(zone_name)) == -1 ||
1861 zone_getattr(zoneid, ZONE_ATTR_UNIQID, &uniqid,
1862 sizeof (uniqid)) == -1 || uniqid != zargp->uniqid) {
1863 /*
1864 * We're not talking about the same zone. The request
1865 * must have arrived too late. Return error.
1866 */
1867 rval = -1;
1868 goto out;
1869 }
1870 zlogp = &logplat; /* Log errors to platform.log */
1871 }
1872
1873 /*
1874 * If we are being asked to forcibly mount or boot a zone, we
1875 * pretend that an INCOMPLETE zone is actually INSTALLED.
1876 */
1877 if (zstate == ZONE_STATE_INCOMPLETE &&
1878 (cmd == Z_FORCEBOOT || cmd == Z_FORCEMOUNT))
1879 zstate = ZONE_STATE_INSTALLED;
1880
1881 switch (zstate) {
1882 case ZONE_STATE_CONFIGURED:
1883 case ZONE_STATE_INCOMPLETE:
1884 /*
1885 * Not our area of expertise; we just print a nice message
1886 * and die off.
1887 */
1888 zerror(zlogp, B_FALSE,
1889 "%s operation is invalid for zones in state '%s'",
1890 z_cmd_name(cmd), zone_state_str(zstate));
1891 break;
1892
1893 case ZONE_STATE_INSTALLED:
1894 switch (cmd) {
1895 case Z_READY:
1896 rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, debug);
1897 if (rval == 0)
1898 eventstream_write(Z_EVT_ZONE_READIED);
1899 zcons_statechanged();
1900 break;
1901 case Z_BOOT:
1902 case Z_FORCEBOOT:
1903 eventstream_write(Z_EVT_ZONE_BOOTING);
1904 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
1905 debug)) == 0) {
1906 rval = zone_bootup(zlogp, zargp->bootbuf,
1907 zstate, debug);
1908 }
1909 audit_put_record(zlogp, uc, rval, "boot");
1910 zcons_statechanged();
1911 if (rval != 0) {
1912 bringup_failure_recovery = B_TRUE;
1913 (void) zone_halt(zlogp, B_FALSE, B_FALSE,
1914 zstate, debug);
1915 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1916 }
1917 break;
1918 case Z_SHUTDOWN:
1919 case Z_HALT:
1920 if (kernelcall) /* Invalid; can't happen */
1921 abort();
1922 /*
1923 * We could have two clients racing to halt this
1924 * zone; the second client loses, but its request
1925 * doesn't fail, since the zone is now in the desired
1926 * state.
1927 */
1928 zerror(zlogp, B_FALSE, "zone is already halted");
1929 rval = 0;
1930 break;
1931 case Z_REBOOT:
1932 if (kernelcall) /* Invalid; can't happen */
1933 abort();
1934 zerror(zlogp, B_FALSE, "%s operation is invalid "
1935 "for zones in state '%s'", z_cmd_name(cmd),
1936 zone_state_str(zstate));
1937 rval = -1;
1938 break;
1939 case Z_NOTE_UNINSTALLING:
1940 if (kernelcall) /* Invalid; can't happen */
1941 abort();
1942 /*
1943 * Tell the console to print out a message about this.
1944 * Once it does, we will be in_death_throes.
1945 */
1946 eventstream_write(Z_EVT_ZONE_UNINSTALLING);
1947 break;
1948 case Z_MOUNT:
1949 case Z_FORCEMOUNT:
1950 if (kernelcall) /* Invalid; can't happen */
1951 abort();
1952 if (!zone_isnative && !zone_iscluster &&
1953 !zone_islabeled) {
1954 /*
1955 * -U mounts the zone without lofs mounting
1956 * zone file systems back into the scratch
1957 * zone. This is required when mounting
1958 * non-native branded zones.
1959 */
1960 (void) strlcpy(zargp->bootbuf, "-U",
1961 BOOTARGS_MAX);
1962 }
1963
1964 rval = zone_ready(zlogp,
1965 strcmp(zargp->bootbuf, "-U") == 0 ?
1966 Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate, debug);
1967 if (rval != 0)
1968 break;
1969
1970 eventstream_write(Z_EVT_ZONE_READIED);
1971
1972 /*
1973 * Get a handle to the default brand info.
1974 * We must always use the default brand file system
1975 * list when mounting the zone.
1976 */
1977 if ((bh = brand_open(default_brand)) == NULL) {
1978 rval = -1;
1979 break;
1980 }
1981
1982 /*
1983 * Get the list of filesystems to mount from
1984 * the brand configuration. These mounts are done
1985 * via a thread that will enter the zone, so they
1986 * are done from within the context of the zone.
1987 */
1988 cb.zlogp = zlogp;
1989 cb.zoneid = zone_id;
1990 cb.mount_cmd = B_TRUE;
1991 rval = brand_platform_iter_mounts(bh,
1992 mount_early_fs, &cb);
1993
1994 brand_close(bh);
1995
1996 /*
1997 * Ordinarily, /dev/fd would be mounted inside the zone
1998 * by svc:/system/filesystem/usr:default, but since
1999 * we're not booting the zone, we need to do this
2000 * manually.
2001 */
2002 if (rval == 0)
2003 rval = mount_early_fs(&cb,
2004 "fd", "/dev/fd", "fd", NULL);
2005 break;
2006 case Z_UNMOUNT:
2007 if (kernelcall) /* Invalid; can't happen */
2008 abort();
2009 zerror(zlogp, B_FALSE, "zone is already unmounted");
2010 rval = 0;
2011 break;
2012 }
2013 break;
2014
2015 case ZONE_STATE_READY:
2016 switch (cmd) {
2017 case Z_READY:
2018 /*
2019 * We could have two clients racing to ready this
2020 * zone; the second client loses, but its request
2021 * doesn't fail, since the zone is now in the desired
2022 * state.
2023 */
2024 zerror(zlogp, B_FALSE, "zone is already ready");
2025 rval = 0;
2026 break;
2027 case Z_BOOT:
2028 case Z_FORCEBOOT:
2029 (void) strlcpy(boot_args, zargp->bootbuf,
2030 sizeof (boot_args));
2031 eventstream_write(Z_EVT_ZONE_BOOTING);
2032 rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
2033 debug);
2034 audit_put_record(zlogp, uc, rval, "boot");
2035 zcons_statechanged();
2036 if (rval != 0) {
2037 bringup_failure_recovery = B_TRUE;
2038 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
2039 zstate, debug);
2040 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
2041 }
2042 boot_args[0] = '\0';
2043 break;
2044 case Z_HALT:
2045 if (kernelcall) /* Invalid; can't happen */
2046 abort();
2047 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
2048 debug)) != 0)
2049 break;
2050 zcons_statechanged();
2051 eventstream_write(Z_EVT_ZONE_HALTED);
2052 break;
2053 case Z_SHUTDOWN:
2054 case Z_REBOOT:
2055 case Z_NOTE_UNINSTALLING:
2056 case Z_MOUNT:
2057 case Z_FORCEMOUNT:
2058 case Z_UNMOUNT:
2059 if (kernelcall) /* Invalid; can't happen */
2060 abort();
2061 zerror(zlogp, B_FALSE, "%s operation is invalid "
2062 "for zones in state '%s'", z_cmd_name(cmd),
2063 zone_state_str(zstate));
2064 rval = -1;
2065 break;
2066 }
2067 break;
2068
2069 case ZONE_STATE_MOUNTED:
2070 switch (cmd) {
2071 case Z_UNMOUNT:
2072 if (kernelcall) /* Invalid; can't happen */
2073 abort();
2074 rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate, debug);
2075 if (rval == 0) {
2076 eventstream_write(Z_EVT_ZONE_HALTED);
2077 (void) sema_post(&scratch_sem);
2078 }
2079 break;
2080 default:
2081 if (kernelcall) /* Invalid; can't happen */
2082 abort();
2083 zerror(zlogp, B_FALSE, "%s operation is invalid "
2084 "for zones in state '%s'", z_cmd_name(cmd),
2085 zone_state_str(zstate));
2086 rval = -1;
2087 break;
2088 }
2089 break;
2090
2091 case ZONE_STATE_RUNNING:
2092 case ZONE_STATE_SHUTTING_DOWN:
2093 case ZONE_STATE_DOWN:
2094 switch (cmd) {
2095 case Z_READY:
2096 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
2097 debug)) != 0)
2098 break;
2099 zcons_statechanged();
2100 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
2101 debug)) == 0)
2102 eventstream_write(Z_EVT_ZONE_READIED);
2103 else
2104 eventstream_write(Z_EVT_ZONE_HALTED);
2105 break;
2106 case Z_BOOT:
2107 case Z_FORCEBOOT:
2108 /*
2109 * We could have two clients racing to boot this
2110 * zone; the second client loses, but its request
2111 * doesn't fail, since the zone is now in the desired
2112 * state.
2113 */
2114 zerror(zlogp, B_FALSE, "zone is already booted");
2115 rval = 0;
2116 break;
2117 case Z_HALT:
2118 if (kernelcall) {
2119 log_init_exit(init_status);
2120 } else {
2121 log_init_exit(-1);
2122 }
2123 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
2124 debug)) != 0)
2125 break;
2126 eventstream_write(Z_EVT_ZONE_HALTED);
2127 zcons_statechanged();
2128 break;
2129 case Z_REBOOT:
2130 (void) strlcpy(boot_args, zargp->bootbuf,
2131 sizeof (boot_args));
2132 eventstream_write(Z_EVT_ZONE_REBOOTING);
2133 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
2134 debug)) != 0) {
2135 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
2136 boot_args[0] = '\0';
2137 break;
2138 }
2139 zcons_statechanged();
2140 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
2141 debug)) != 0) {
2142 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
2143 boot_args[0] = '\0';
2144 break;
2145 }
2146 rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
2147 debug);
2148 audit_put_record(zlogp, uc, rval, "reboot");
2149 if (rval != 0) {
2150 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
2151 zstate, debug);
2152 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
2153 }
2154 boot_args[0] = '\0';
2155 break;
2156 case Z_SHUTDOWN:
2157 if ((rval = zone_graceful_shutdown(zlogp)) == 0) {
2158 wait_shut = B_TRUE;
2159 }
2160 break;
2161 case Z_NOTE_UNINSTALLING:
2162 case Z_MOUNT:
2163 case Z_FORCEMOUNT:
2164 case Z_UNMOUNT:
2165 zerror(zlogp, B_FALSE, "%s operation is invalid "
2166 "for zones in state '%s'", z_cmd_name(cmd),
2167 zone_state_str(zstate));
2168 rval = -1;
2169 break;
2170 }
2171 break;
2172 default:
2173 abort();
2174 }
2175
2176 /*
2177 * Because the state of the zone may have changed, we make sure
2178 * to wake the console poller, which is in charge of initiating
2179 * the shutdown procedure as necessary.
2180 */
2181 eventstream_write(Z_EVT_NULL);
2182
2183 out:
2184 (void) mutex_unlock(&lock);
2185
2186 /* Wait for the Z_SHUTDOWN commands to complete */
2187 if (wait_shut)
2188 rval = zone_wait_shutdown(zlogp);
2189
2190 if (kernelcall) {
2191 rvalp = NULL;
2192 rlen = 0;
2193 } else {
2194 rvalp->rval = rval;
2195 }
2196 if (uc != NULL)
2197 ucred_free(uc);
2198 (void) door_return((char *)rvalp, rlen, NULL, 0);
2199 thr_exit(NULL);
2200 }
2201
2202 static int
2203 setup_door(zlog_t *zlogp)
2204 {
2205 if ((zone_door = door_create(server, NULL,
2206 DOOR_UNREF | DOOR_REFUSE_DESC | DOOR_NO_CANCEL)) < 0) {
2207 zerror(zlogp, B_TRUE, "%s failed", "door_create");
2208 return (-1);
2209 }
2210 (void) fdetach(zone_door_path);
2211
2212 if (fattach(zone_door, zone_door_path) != 0) {
2213 zerror(zlogp, B_TRUE, "fattach to %s failed", zone_door_path);
2214 (void) door_revoke(zone_door);
2215 (void) fdetach(zone_door_path);
2216 zone_door = -1;
2217 return (-1);
2218 }
2219 return (0);
2220 }
2221
2222 /*
2223 * zoneadm(1m) will start zoneadmd if it thinks it isn't running; this
2224 * is where zoneadmd itself will check to see that another instance of
2225 * zoneadmd isn't already controlling this zone.
2226 *
2227 * The idea here is that we want to open the path to which we will
2228 * attach our door, lock it, and then make sure that no-one has beat us
2229 * to fattach(3c)ing onto it.
2230 *
2231 * fattach(3c) is really a mount, so there are actually two possible
2232 * vnodes we could be dealing with. Our strategy is as follows:
2233 *
2234 * - If the file we opened is a regular file (common case):
2235 * There is no fattach(3c)ed door, so we have a chance of becoming
2236 * the managing zoneadmd. We attempt to lock the file: if it is
2237 * already locked, that means someone else raced us here, so we
2238 * lose and give up. zoneadm(1m) will try to contact the zoneadmd
2239 * that beat us to it.
2240 *
2241 * - If the file we opened is a namefs file:
2242 * This means there is already an established door fattach(3c)'ed
2243 * to the rendezvous path. We've lost the race, so we give up.
2244 * Note that in this case we also try to grab the file lock, and
2245 * will succeed in acquiring it since the vnode locked by the
2246 * "winning" zoneadmd was a regular one, and the one we locked was
2247 * the fattach(3c)'ed door node. At any rate, no harm is done, and
2248 * we just return to zoneadm(1m) which knows to retry.
2249 */
2250 static int
2251 make_daemon_exclusive(zlog_t *zlogp)
2252 {
2253 int doorfd = -1;
2254 int err, ret = -1;
2255 struct stat st;
2256 struct flock flock;
2257 zone_state_t zstate;
2258
2259 top:
2260 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
2261 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
2262 zonecfg_strerror(err));
2263 goto out;
2264 }
2265 if ((doorfd = open(zone_door_path, O_CREAT|O_RDWR,
2266 S_IREAD|S_IWRITE)) < 0) {
2267 zerror(zlogp, B_TRUE, "failed to open %s", zone_door_path);
2268 goto out;
2269 }
2270 if (fstat(doorfd, &st) < 0) {
2271 zerror(zlogp, B_TRUE, "failed to stat %s", zone_door_path);
2272 goto out;
2273 }
2274 /*
2275 * Lock the file to synchronize with other zoneadmd
2276 */
2277 flock.l_type = F_WRLCK;
2278 flock.l_whence = SEEK_SET;
2279 flock.l_start = (off_t)0;
2280 flock.l_len = (off_t)0;
2281 if (fcntl(doorfd, F_SETLK, &flock) < 0) {
2282 /*
2283 * Someone else raced us here and grabbed the lock file
2284 * first. A warning here is inappropriate since nothing
2285 * went wrong.
2286 */
2287 goto out;
2288 }
2289
2290 if (strcmp(st.st_fstype, "namefs") == 0) {
2291 struct door_info info;
2292
2293 /*
2294 * There is already something fattach()'ed to this file.
2295 * Lets see what the door is up to.
2296 */
2297 if (door_info(doorfd, &info) == 0 && info.di_target != -1) {
2298 /*
2299 * Another zoneadmd process seems to be in
2300 * control of the situation and we don't need to
2301 * be here. A warning here is inappropriate
2302 * since nothing went wrong.
2303 *
2304 * If the door has been revoked, the zoneadmd
2305 * process currently managing the zone is going
2306 * away. We'll return control to zoneadm(1m)
2307 * which will try again (by which time zoneadmd
2308 * will hopefully have exited).
2309 */
2310 goto out;
2311 }
2312
2313 /*
2314 * If we got this far, there's a fattach(3c)'ed door
2315 * that belongs to a process that has exited, which can
2316 * happen if the previous zoneadmd died unexpectedly.
2317 *
2318 * Let user know that something is amiss, but that we can
2319 * recover; if the zone is in the installed state, then don't
2320 * message, since having a running zoneadmd isn't really
2321 * expected/needed. We want to keep occurences of this message
2322 * limited to times when zoneadmd is picking back up from a
2323 * zoneadmd that died while the zone was in some non-trivial
2324 * state.
2325 */
2326 if (zstate > ZONE_STATE_INSTALLED) {
2327 zerror(zlogp, B_FALSE,
2328 "zone '%s': WARNING: zone is in state '%s', but "
2329 "zoneadmd does not appear to be available; "
2330 "restarted zoneadmd to recover.",
2331 zone_name, zone_state_str(zstate));
2332
2333 /*
2334 * Startup a thread to perform the zfd logging/tty svc
2335 * for the zone. zlogp won't be valid for much longer
2336 * so use logplat.
2337 */
2338 if (getzoneidbyname(zone_name) != -1) {
2339 create_log_thread(&logplat);
2340 }
2341
2342 /* recover the global configuration snapshot */
2343 if (snap_hndl == NULL) {
2344 if ((snap_hndl = zonecfg_init_handle())
2345 == NULL ||
2346 zonecfg_create_snapshot(zone_name)
2347 != Z_OK ||
2348 zonecfg_get_snapshot_handle(zone_name,
2349 snap_hndl) != Z_OK) {
2350 zerror(zlogp, B_FALSE, "recovering "
2351 "zone configuration handle");
2352 goto out;
2353 }
2354 }
2355 }
2356
2357 (void) fdetach(zone_door_path);
2358 (void) close(doorfd);
2359 goto top;
2360 }
2361 ret = 0;
2362 out:
2363 (void) close(doorfd);
2364 return (ret);
2365 }
2366
2367 /*
2368 * Run the query hook with the 'env' parameter. It should return a
2369 * string of tab-delimited key-value pairs, each of which should be set
2370 * in the environment.
2371 *
2372 * Because the env_vars string values become part of the environment, the
2373 * string is static and we don't free it.
2374 *
2375 * This function is always called before zoneadmd forks and makes itself
2376 * exclusive, so it is possible there could more than one instance of zoneadmd
2377 * running in parallel at this point. Thus, we have no zonecfg snapshot and
2378 * shouldn't take one yet (i.e. snap_hndl is NULL). Thats ok, since we don't
2379 * need any zonecfg info to query for a brand-specific env value.
2380 */
2381 static int
2382 set_brand_env(zlog_t *zlogp)
2383 {
2384 int ret = 0;
2385 static char *env_vars = NULL;
2386 char buf[2 * MAXPATHLEN];
2387
2388 if (query_hook[0] == '\0' || env_vars != NULL)
2389 return (0);
2390
2391 if (snprintf(buf, sizeof (buf), "%s env", query_hook) > sizeof (buf))
2392 return (-1);
2393
2394 if (do_subproc(zlogp, buf, &env_vars, B_FALSE) != 0)
2395 return (-1);
2396
2397 if (env_vars != NULL) {
2398 char *sp;
2399
2400 sp = strtok(env_vars, "\t");
2401 while (sp != NULL) {
2402 if (putenv(sp) != 0) {
2403 ret = -1;
2404 break;
2405 }
2406 sp = strtok(NULL, "\t");
2407 }
2408 }
2409
2410 return (ret);
2411 }
2412
2413 /*
2414 * Setup the brand's pre and post state change callbacks, as well as the
2415 * query callback, if any of these exist.
2416 */
2417 static int
2418 brand_callback_init(brand_handle_t bh, char *zone_name)
2419 {
2420 (void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
2421 sizeof (pre_statechg_hook));
2422
2423 if (brand_get_prestatechange(bh, zone_name, zonepath,
2424 pre_statechg_hook + EXEC_LEN,
2425 sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
2426 return (-1);
2427
2428 if (strlen(pre_statechg_hook) <= EXEC_LEN)
2429 pre_statechg_hook[0] = '\0';
2430
2431 (void) strlcpy(post_statechg_hook, EXEC_PREFIX,
2432 sizeof (post_statechg_hook));
2433
2434 if (brand_get_poststatechange(bh, zone_name, zonepath,
2435 post_statechg_hook + EXEC_LEN,
2436 sizeof (post_statechg_hook) - EXEC_LEN) != 0)
2437 return (-1);
2438
2439 if (strlen(post_statechg_hook) <= EXEC_LEN)
2440 post_statechg_hook[0] = '\0';
2441
2442 (void) strlcpy(query_hook, EXEC_PREFIX,
2443 sizeof (query_hook));
2444
2445 if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
2446 sizeof (query_hook) - EXEC_LEN) != 0)
2447 return (-1);
2448
2449 if (strlen(query_hook) <= EXEC_LEN)
2450 query_hook[0] = '\0';
2451
2452 return (0);
2453 }
2454
2455 int
2456 main(int argc, char *argv[])
2457 {
2458 int opt;
2459 zoneid_t zid;
2460 priv_set_t *privset;
2461 zone_state_t zstate;
2462 char parents_locale[MAXPATHLEN];
2463 brand_handle_t bh;
2464 int err;
2465
2466 pid_t pid;
2467 sigset_t blockset;
2468 sigset_t block_cld;
2469
2470 struct {
2471 sema_t sem;
2472 int status;
2473 zlog_t log;
2474 } *shstate;
2475 size_t shstatelen = getpagesize();
2476
2477 zlog_t errlog;
2478 zlog_t *zlogp;
2479
2480 int ctfd;
2481
2482 progname = get_execbasename(argv[0]);
2483
2484 /*
2485 * Make sure stderr is unbuffered
2486 */
2487 (void) setbuffer(stderr, NULL, 0);
2488
2489 /*
2490 * Get out of the way of mounted filesystems, since we will daemonize
2491 * soon.
2492 */
2493 (void) chdir("/");
2494
2495 /*
2496 * Use the default system umask per PSARC 1998/110 rather than
2497 * anything that may have been set by the caller.
2498 */
2499 (void) umask(CMASK);
2500
2501 /*
2502 * Initially we want to use our parent's locale.
2503 */
2504 (void) setlocale(LC_ALL, "");
2505 (void) textdomain(TEXT_DOMAIN);
2506 (void) strlcpy(parents_locale, setlocale(LC_MESSAGES, NULL),
2507 sizeof (parents_locale));
2508
2509 /*
2510 * This zlog_t is used for writing to stderr
2511 */
2512 errlog.logfile = stderr;
2513 errlog.buflen = errlog.loglen = 0;
2514 errlog.buf = errlog.log = NULL;
2515 errlog.locale = parents_locale;
2516
2517 /*
2518 * We start off writing to stderr until we're ready to daemonize.
2519 */
2520 zlogp = &errlog;
2521
2522 /*
2523 * Process options.
2524 */
2525 while ((opt = getopt(argc, argv, "R:z:")) != EOF) {
2526 switch (opt) {
2527 case 'R':
2528 zonecfg_set_root(optarg);
2529 break;
2530 case 'z':
2531 zone_name = optarg;
2532 break;
2533 default:
2534 usage();
2535 }
2536 }
2537
2538 if (zone_name == NULL)
2539 usage();
2540
2541 /*
2542 * Because usage() prints directly to stderr, it has gettext()
2543 * wrapping, which depends on the locale. But since zerror() calls
2544 * localize() which tweaks the locale, it is not safe to call zerror()
2545 * until after the last call to usage(). Fortunately, the last call
2546 * to usage() is just above and the first call to zerror() is just
2547 * below. Don't mess this up.
2548 */
2549 if (strcmp(zone_name, GLOBAL_ZONENAME) == 0) {
2550 zerror(zlogp, B_FALSE, "cannot manage the %s zone",
2551 GLOBAL_ZONENAME);
2552 return (1);
2553 }
2554
2555 if (zone_get_id(zone_name, &zid) != 0) {
2556 zerror(zlogp, B_FALSE, "could not manage %s: %s", zone_name,
2557 zonecfg_strerror(Z_NO_ZONE));
2558 return (1);
2559 }
2560
2561 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
2562 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
2563 zonecfg_strerror(err));
2564 return (1);
2565 }
2566 if (zstate < ZONE_STATE_INCOMPLETE) {
2567 zerror(zlogp, B_FALSE,
2568 "cannot manage a zone which is in state '%s'",
2569 zone_state_str(zstate));
2570 return (1);
2571 }
2572
2573 if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
2574 zerror(zlogp, B_FALSE, "unable to determine zone path");
2575 return (-1);
2576 }
2577
2578 if (zonecfg_default_brand(default_brand,
2579 sizeof (default_brand)) != Z_OK) {
2580 zerror(zlogp, B_FALSE, "unable to determine default brand");
2581 return (1);
2582 }
2583
2584 /* Get a handle to the brand info for this zone */
2585 if (zone_get_brand(zone_name, brand_name, sizeof (brand_name))
2586 != Z_OK) {
2587 zerror(zlogp, B_FALSE, "unable to determine zone brand");
2588 return (1);
2589 }
2590 zone_isnative = (strcmp(brand_name, NATIVE_BRAND_NAME) == 0);
2591 zone_islabeled = (strcmp(brand_name, LABELED_BRAND_NAME) == 0);
2592
2593 /*
2594 * In the alternate root environment, the only supported
2595 * operations are mount and unmount. In this case, just treat
2596 * the zone as native if it is cluster. Cluster zones can be
2597 * native for the purpose of LU or upgrade, and the cluster
2598 * brand may not exist in the miniroot (such as in net install
2599 * upgrade).
2600 */
2601 if (strcmp(brand_name, CLUSTER_BRAND_NAME) == 0) {
2602 zone_iscluster = B_TRUE;
2603 if (zonecfg_in_alt_root()) {
2604 (void) strlcpy(brand_name, default_brand,
2605 sizeof (brand_name));
2606 }
2607 } else {
2608 zone_iscluster = B_FALSE;
2609 }
2610
2611 if ((bh = brand_open(brand_name)) == NULL) {
2612 zerror(zlogp, B_FALSE, "unable to open zone brand");
2613 return (1);
2614 }
2615
2616 /* Get state change brand hooks. */
2617 if (brand_callback_init(bh, zone_name) == -1) {
2618 zerror(zlogp, B_TRUE,
2619 "failed to initialize brand state change hooks");
2620 brand_close(bh);
2621 return (1);
2622 }
2623
2624 brand_close(bh);
2625
2626 /*
2627 * Check that we have all privileges. It would be nice to pare
2628 * this down, but this is at least a first cut.
2629 */
2630 if ((privset = priv_allocset()) == NULL) {
2631 zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
2632 return (1);
2633 }
2634
2635 if (getppriv(PRIV_EFFECTIVE, privset) != 0) {
2636 zerror(zlogp, B_TRUE, "%s failed", "getppriv");
2637 priv_freeset(privset);
2638 return (1);
2639 }
2640
2641 if (priv_isfullset(privset) == B_FALSE) {
2642 zerror(zlogp, B_FALSE, "You lack sufficient privilege to "
2643 "run this command (all privs required)");
2644 priv_freeset(privset);
2645 return (1);
2646 }
2647 priv_freeset(privset);
2648
2649 if (set_brand_env(zlogp) != 0) {
2650 zerror(zlogp, B_FALSE, "Unable to setup brand's environment");
2651 return (1);
2652 }
2653
2654 if (mkzonedir(zlogp) != 0)
2655 return (1);
2656
2657 /*
2658 * Pre-fork: setup shared state
2659 */
2660 if ((shstate = (void *)mmap(NULL, shstatelen,
2661 PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, (off_t)0)) ==
2662 MAP_FAILED) {
2663 zerror(zlogp, B_TRUE, "%s failed", "mmap");
2664 return (1);
2665 }
2666 if (sema_init(&shstate->sem, 0, USYNC_PROCESS, NULL) != 0) {
2667 zerror(zlogp, B_TRUE, "%s failed", "sema_init()");
2668 (void) munmap((char *)shstate, shstatelen);
2669 return (1);
2670 }
2671 shstate->log.logfile = NULL;
2672 shstate->log.buflen = shstatelen - sizeof (*shstate);
2673 shstate->log.loglen = shstate->log.buflen;
2674 shstate->log.buf = (char *)shstate + sizeof (*shstate);
2675 shstate->log.log = shstate->log.buf;
2676 shstate->log.locale = parents_locale;
2677 shstate->status = -1;
2678
2679 /*
2680 * We need a SIGCHLD handler so the sema_wait() below will wake
2681 * up if the child dies without doing a sema_post().
2682 */
2683 (void) sigset(SIGCHLD, sigchld);
2684 /*
2685 * We must mask SIGCHLD until after we've coped with the fork
2686 * sufficiently to deal with it; otherwise we can race and
2687 * receive the signal before pid has been initialized
2688 * (yes, this really happens).
2689 */
2690 (void) sigemptyset(&block_cld);
2691 (void) sigaddset(&block_cld, SIGCHLD);
2692 (void) sigprocmask(SIG_BLOCK, &block_cld, NULL);
2693
2694 /*
2695 * The parent only needs stderr after the fork, so close other fd's
2696 * that we inherited from zoneadm so that the parent doesn't have those
2697 * open while waiting. The child will close the rest after the fork.
2698 */
2699 closefrom(3);
2700
2701 if ((ctfd = init_template()) == -1) {
2702 zerror(zlogp, B_TRUE, "failed to create contract");
2703 return (1);
2704 }
2705
2706 /*
2707 * Do not let another thread localize a message while we are forking.
2708 */
2709 (void) mutex_lock(&msglock);
2710 pid = fork();
2711 (void) mutex_unlock(&msglock);
2712
2713 /*
2714 * In all cases (parent, child, and in the event of an error) we
2715 * don't want to cause creation of contracts on subsequent fork()s.
2716 */
2717 (void) ct_tmpl_clear(ctfd);
2718 (void) close(ctfd);
2719
2720 if (pid == -1) {
2721 zerror(zlogp, B_TRUE, "could not fork");
2722 return (1);
2723
2724 } else if (pid > 0) { /* parent */
2725 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2726 /*
2727 * This marks a window of vulnerability in which we receive
2728 * the SIGCLD before falling into sema_wait (normally we would
2729 * get woken up from sema_wait with EINTR upon receipt of
2730 * SIGCLD). So we may need to use some other scheme like
2731 * sema_posting in the sigcld handler.
2732 * blech
2733 */
2734 (void) sema_wait(&shstate->sem);
2735 (void) sema_destroy(&shstate->sem);
2736 if (shstate->status != 0)
2737 (void) waitpid(pid, NULL, WNOHANG);
2738 /*
2739 * It's ok if we die with SIGPIPE. It's not like we could have
2740 * done anything about it.
2741 */
2742 (void) fprintf(stderr, "%s", shstate->log.buf);
2743 _exit(shstate->status == 0 ? 0 : 1);
2744 }
2745
2746 /*
2747 * The child charges on.
2748 */
2749 (void) sigset(SIGCHLD, SIG_DFL);
2750 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2751
2752 /*
2753 * SIGPIPE can be delivered if we write to a socket for which the
2754 * peer endpoint is gone. That can lead to too-early termination
2755 * of zoneadmd, and that's not good eats.
2756 */
2757 (void) sigset(SIGPIPE, SIG_IGN);
2758 /*
2759 * Stop using stderr
2760 */
2761 zlogp = &shstate->log;
2762
2763 /*
2764 * We don't need stdout/stderr from now on.
2765 */
2766 closefrom(0);
2767
2768 /*
2769 * Initialize the syslog zlog_t. This needs to be done after
2770 * the call to closefrom().
2771 */
2772 logsys.buf = logsys.log = NULL;
2773 logsys.buflen = logsys.loglen = 0;
2774 logsys.logfile = NULL;
2775 logsys.locale = DEFAULT_LOCALE;
2776
2777 openlog("zoneadmd", LOG_PID, LOG_DAEMON);
2778
2779 /*
2780 * Allow logging to <zonepath>/logs/<file>.
2781 */
2782 logstream_init(zlogp);
2783 platloghdl = logstream_open("platform.log", "zoneadmd", 0);
2784
2785 /* logplat looks the same as logsys, but logs to platform.log */
2786 logplat = logsys;
2787
2788 /*
2789 * The eventstream is used to publish state changes in the zone
2790 * from the door threads to the console I/O poller.
2791 */
2792 if (eventstream_init() == -1) {
2793 zerror(zlogp, B_TRUE, "unable to create eventstream");
2794 goto child_out;
2795 }
2796
2797 (void) snprintf(zone_door_path, sizeof (zone_door_path),
2798 "%s" ZONE_DOOR_PATH, zonecfg_get_root(), zone_name);
2799
2800 /*
2801 * See if another zoneadmd is running for this zone. If not, then we
2802 * can now modify system state.
2803 */
2804 if (make_daemon_exclusive(zlogp) == -1)
2805 goto child_out;
2806
2807 /*
2808 * Create/join a new session; we need to be careful of what we do with
2809 * the console from now on so we don't end up being the session leader
2810 * for the terminal we're going to be handing out.
2811 */
2812 (void) setsid();
2813
2814 /*
2815 * This thread shouldn't be receiving any signals; in particular,
2816 * SIGCHLD should be received by the thread doing the fork(). The
2817 * exceptions are SIGHUP and SIGUSR1 for log rotation, set up by
2818 * logstream_init().
2819 */
2820 (void) sigfillset(&blockset);
2821 (void) sigdelset(&blockset, SIGHUP);
2822 (void) sigdelset(&blockset, SIGUSR1);
2823 (void) thr_sigsetmask(SIG_BLOCK, &blockset, NULL);
2824
2825 /*
2826 * Setup the console device and get ready to serve the console;
2827 * once this has completed, we're ready to let console clients
2828 * make an attempt to connect (they will block until
2829 * serve_console_sock() below gets called, and any pending
2830 * connection is accept()ed).
2831 */
2832 if (!zonecfg_in_alt_root() && init_console(zlogp) < 0)
2833 goto child_out;
2834
2835 /*
2836 * Take the lock now, so that when the door server gets going, we
2837 * are guaranteed that it won't take a request until we are sure
2838 * that everything is completely set up. See the child_out: label
2839 * below to see why this matters.
2840 */
2841 (void) mutex_lock(&lock);
2842
2843 /* Init semaphore for scratch zones. */
2844 if (sema_init(&scratch_sem, 0, USYNC_THREAD, NULL) == -1) {
2845 zerror(zlogp, B_TRUE,
2846 "failed to initialize semaphore for scratch zone");
2847 goto child_out;
2848 }
2849
2850 /* open the dladm handle */
2851 if (dladm_open(&dld_handle) != DLADM_STATUS_OK) {
2852 zerror(zlogp, B_FALSE, "failed to open dladm handle");
2853 goto child_out;
2854 }
2855
2856 /*
2857 * Note: door setup must occur *after* the console is setup.
2858 * This is so that as zlogin tests the door to see if zoneadmd
2859 * is ready yet, we know that the console will get serviced
2860 * once door_info() indicates that the door is "up".
2861 */
2862 if (setup_door(zlogp) == -1)
2863 goto child_out;
2864
2865 /*
2866 * Things seem OK so far; tell the parent process that we're done
2867 * with setup tasks. This will cause the parent to exit, signalling
2868 * to zoneadm, zlogin, or whatever forked it that we are ready to
2869 * service requests.
2870 */
2871 shstate->status = 0;
2872 (void) sema_post(&shstate->sem);
2873 (void) munmap((char *)shstate, shstatelen);
2874 shstate = NULL;
2875
2876 (void) mutex_unlock(&lock);
2877
2878 /*
2879 * zlogp is now invalid, so reset it to the syslog logger.
2880 */
2881 zlogp = &logsys;
2882
2883 /*
2884 * Now that we are free of any parents, switch to the default locale.
2885 */
2886 (void) setlocale(LC_ALL, DEFAULT_LOCALE);
2887
2888 /*
2889 * At this point the setup portion of main() is basically done, so
2890 * we reuse this thread to manage the zone console. When
2891 * serve_console() has returned, we are past the point of no return
2892 * in the life of this zoneadmd.
2893 */
2894 if (zonecfg_in_alt_root()) {
2895 /*
2896 * This is just awful, but mounted scratch zones don't (and
2897 * can't) have consoles. We just wait for unmount instead.
2898 */
2899 while (sema_wait(&scratch_sem) == EINTR)
2900 ;
2901 } else {
2902 serve_console(zlogp);
2903 assert(in_death_throes);
2904 }
2905
2906 /*
2907 * This is the next-to-last part of the exit interlock. Upon calling
2908 * fdetach(), the door will go unreferenced; once any
2909 * outstanding requests (like the door thread doing Z_HALT) are
2910 * done, the door will get an UNREF notification; when it handles
2911 * the UNREF, the door server will cause the exit. It's possible
2912 * that fdetach() can fail because the file is in use, in which
2913 * case we'll retry the operation.
2914 */
2915 assert(!MUTEX_HELD(&lock));
2916 for (;;) {
2917 if ((fdetach(zone_door_path) == 0) || (errno != EBUSY))
2918 break;
2919 yield();
2920 }
2921
2922 for (;;)
2923 (void) pause();
2924
2925 child_out:
2926 assert(pid == 0);
2927
2928 shstate->status = -1;
2929 (void) sema_post(&shstate->sem);
2930 (void) munmap((char *)shstate, shstatelen);
2931
2932 /*
2933 * This might trigger an unref notification, but if so,
2934 * we are still holding the lock, so our call to exit will
2935 * ultimately win the race and will publish the right exit
2936 * code.
2937 */
2938 if (zone_door != -1) {
2939 assert(MUTEX_HELD(&lock));
2940 (void) door_revoke(zone_door);
2941 (void) fdetach(zone_door_path);
2942 }
2943
2944 if (dld_handle != NULL)
2945 dladm_close(dld_handle);
2946
2947 return (1); /* return from main() forcibly exits an MT process */
2948 }