1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2016 Joyent, Inc.
26 */
27
28 /*
29 * zoneadmd manages zones; one zoneadmd process is launched for each
30 * non-global zone on the system. This daemon juggles four jobs:
31 *
32 * - Implement setup and teardown of the zone "virtual platform": mount and
33 * unmount filesystems; create and destroy network interfaces; communicate
34 * with devfsadmd to lay out devices for the zone; instantiate the zone
35 * console device; configure process runtime attributes such as resource
36 * controls, pool bindings, fine-grained privileges.
37 *
38 * - Launch the zone's init(1M) process.
39 *
40 * - Implement a door server; clients (like zoneadm) connect to the door
41 * server and request zone state changes. The kernel is also a client of
42 * this door server. A request to halt or reboot the zone which originates
43 * *inside* the zone results in a door upcall from the kernel into zoneadmd.
44 *
45 * One minor problem is that messages emitted by zoneadmd need to be passed
46 * back to the zoneadm process making the request. These messages need to
47 * be rendered in the client's locale; so, this is passed in as part of the
48 * request. The exception is the kernel upcall to zoneadmd, in which case
49 * messages are syslog'd.
50 *
51 * To make all of this work, the Makefile adds -a to xgettext to extract *all*
52 * strings, and an exclusion file (zoneadmd.xcl) is used to exclude those
53 * strings which do not need to be translated.
54 *
55 * - Act as a console server for zlogin -C processes; see comments in zcons.c
56 * for more information about the zone console architecture.
57 *
58 * DESIGN NOTES
59 *
60 * Restart:
61 * A chief design constraint of zoneadmd is that it should be restartable in
62 * the case that the administrator kills it off, or it suffers a fatal error,
63 * without the running zone being impacted; this is akin to being able to
64 * reboot the service processor of a server without affecting the OS instance.
65 */
66
67 #include <sys/param.h>
68 #include <sys/mman.h>
69 #include <sys/types.h>
70 #include <sys/stat.h>
71 #include <sys/sysmacros.h>
72 #include <sys/time.h>
73
74 #include <bsm/adt.h>
75 #include <bsm/adt_event.h>
76
77 #include <alloca.h>
78 #include <assert.h>
79 #include <errno.h>
80 #include <door.h>
81 #include <fcntl.h>
82 #include <locale.h>
83 #include <signal.h>
84 #include <stdarg.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <strings.h>
89 #include <synch.h>
90 #include <syslog.h>
91 #include <thread.h>
92 #include <unistd.h>
93 #include <wait.h>
94 #include <limits.h>
95 #include <zone.h>
96 #include <libbrand.h>
97 #include <sys/brand.h>
98 #include <libcontract.h>
99 #include <libcontract_priv.h>
100 #include <sys/brand.h>
101 #include <sys/contract/process.h>
102 #include <sys/ctfs.h>
103 #include <libdladm.h>
104 #include <sys/dls_mgmt.h>
105 #include <libscf.h>
106
107 #include <libzonecfg.h>
108 #include <zonestat_impl.h>
109 #include "zoneadmd.h"
110
111 static char *progname;
112 char *zone_name; /* zone which we are managing */
113 zone_dochandle_t snap_hndl; /* handle for snapshot created when ready */
114 char zonepath[MAXNAMELEN];
115 char pool_name[MAXNAMELEN];
116 char default_brand[MAXNAMELEN];
117 char brand_name[MAXNAMELEN];
118 boolean_t zone_isnative;
119 boolean_t zone_iscluster;
120 boolean_t zone_islabeled;
121 boolean_t shutdown_in_progress;
122 static zoneid_t zone_id;
123 static zoneid_t zone_did = 0;
124 dladm_handle_t dld_handle = NULL;
125
126 char pre_statechg_hook[2 * MAXPATHLEN];
127 char post_statechg_hook[2 * MAXPATHLEN];
128 char query_hook[2 * MAXPATHLEN];
129
130 zlog_t logsys;
131
132 mutex_t lock = DEFAULTMUTEX; /* to serialize stuff */
133 mutex_t msglock = DEFAULTMUTEX; /* for calling setlocale() */
134
135 static sema_t scratch_sem; /* for scratch zones */
136
137 static char zone_door_path[MAXPATHLEN];
138 static int zone_door = -1;
139
140 boolean_t in_death_throes = B_FALSE; /* daemon is dying */
141 boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */
142
143 #if !defined(TEXT_DOMAIN) /* should be defined by cc -D */
144 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */
145 #endif
146
147 #define DEFAULT_LOCALE "C"
148
149 #define RSRC_NET "net"
150 #define RSRC_DEV "device"
151
152 static const char *
153 z_cmd_name(zone_cmd_t zcmd)
154 {
155 /* This list needs to match the enum in sys/zone.h */
156 static const char *zcmdstr[] = {
157 "ready", "boot", "forceboot", "reboot", "halt",
158 "note_uninstalling", "mount", "forcemount", "unmount",
159 "shutdown"
160 };
161
162 if (zcmd >= sizeof (zcmdstr) / sizeof (*zcmdstr))
163 return ("unknown");
164 else
165 return (zcmdstr[(int)zcmd]);
166 }
167
/*
 * Return a pointer to the last path component of execfullname.
 *
 * Trailing '/' characters are overwritten with NUL bytes in the caller's
 * buffer, so an invocation such as "dir/prog/" still yields "prog".  The
 * returned pointer always points into execfullname.
 */
static char *
get_execbasename(char *execfullname)
{
	char *slash;

	/* guard against '/' at end of command invocation */
	while ((slash = strrchr(execfullname, '/')) != NULL) {
		if (slash[1] != '\0')
			return (slash + 1);

		/* Nothing follows this slash: chop it off and look again. */
		*slash = '\0';
	}

	/* No slash left at all; the whole string is the base name. */
	return (execfullname);
}
190
191 static void
192 usage(void)
193 {
194 (void) fprintf(stderr, gettext("Usage: %s -z zonename\n"), progname);
195 (void) fprintf(stderr,
196 gettext("\tNote: %s should not be run directly.\n"), progname);
197 exit(2);
198 }
199
/*
 * SIGCHLD handler that deliberately does nothing with the signal.
 */
/* ARGSUSED */
static void
sigchld(int sig)
{
	/* intentionally empty */
}
205
206 char *
207 localize_msg(char *locale, const char *msg)
208 {
209 char *out;
210
211 (void) mutex_lock(&msglock);
212 (void) setlocale(LC_MESSAGES, locale);
213 out = gettext(msg);
214 (void) setlocale(LC_MESSAGES, DEFAULT_LOCALE);
215 (void) mutex_unlock(&msglock);
216 return (out);
217 }
218
219 /* PRINTFLIKE3 */
220 void
221 zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...)
222 {
223 va_list alist;
224 char buf[MAXPATHLEN * 2]; /* enough space for err msg with a path */
225 char *bp;
226 int saved_errno = errno;
227
228 if (zlogp == NULL)
229 return;
230 if (zlogp == &logsys)
231 (void) snprintf(buf, sizeof (buf), "[zone '%s'] ",
232 zone_name);
233 else
234 buf[0] = '\0';
235 bp = &(buf[strlen(buf)]);
236
237 /*
238 * In theory, the locale pointer should be set to either "C" or a
239 * char array, so it should never be NULL
240 */
241 assert(zlogp->locale != NULL);
242 /* Locale is per process, but we are multi-threaded... */
243 fmt = localize_msg(zlogp->locale, fmt);
244
245 va_start(alist, fmt);
246 (void) vsnprintf(bp, sizeof (buf) - (bp - buf), fmt, alist);
247 va_end(alist);
248 bp = &(buf[strlen(buf)]);
249 if (use_strerror)
250 (void) snprintf(bp, sizeof (buf) - (bp - buf), ": %s",
251 strerror(saved_errno));
252 if (zlogp == &logsys) {
253 (void) syslog(LOG_ERR, "%s", buf);
254 } else if (zlogp->logfile != NULL) {
255 (void) fprintf(zlogp->logfile, "%s\n", buf);
256 } else {
257 size_t buflen;
258 size_t copylen;
259
260 buflen = snprintf(zlogp->log, zlogp->loglen, "%s\n", buf);
261 copylen = MIN(buflen, zlogp->loglen);
262 zlogp->log += copylen;
263 zlogp->loglen -= copylen;
264 }
265 }
266
/*
 * Append src to the NUL-terminated string in dest, whose total capacity is
 * n bytes.  A single space is inserted first if dest is non-empty.  The
 * result is truncated as needed and is always NUL-terminated when n > 0.
 *
 * The new text is formatted directly at the current end of dest rather
 * than passing dest to snprintf() as both destination and "%s" argument:
 * snprintf() has undefined behaviour when source and destination overlap.
 */
static void
strnappend(char *dest, size_t n, const char *src)
{
	size_t used;

	if (n == 0)
		return;

	used = strlen(dest);
	if (used >= n) {
		/* dest already overflows the stated capacity; clamp it. */
		dest[n - 1] = '\0';
		return;
	}

	(void) snprintf(dest + used, n - used, "%s%s",
	    used == 0 ? "" : " ", src);
}
277
278 /*
279 * Since illumos boot arguments are getopt(3c) compatible (see kernel(1m)), we
280 * put the arguments into an argv style array, use getopt to process them,
281 * and put the resultant argument string back into outargs. Non-native brands
282 * may support alternate forms of boot arguments so we must handle that as well.
283 *
284 * During the filtering, we pull out any arguments which are truly "boot"
285 * arguments, leaving only those which are to be passed intact to the
286 * progenitor process. The one we support at the moment is -i, which
287 * indicates to the kernel which program should be launched as 'init'.
288 *
289 * Except for Z_OK, all other return values are treated as fatal.
290 */
291 static int
292 filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
293 char *init_file)
294 {
295 int argc = 0, argc_save;
296 int i;
297 int err = Z_OK;
298 char *arg, *lasts, **argv = NULL, **argv_save;
299 char zonecfg_args[BOOTARGS_MAX];
300 char scratchargs[BOOTARGS_MAX], *sargs;
301 char scratchopt[3];
302 char c;
303
304 bzero(outargs, BOOTARGS_MAX);
305
306 /*
307 * If the user didn't specify transient boot arguments, check
308 * to see if there were any specified in the zone configuration,
309 * and use them if applicable.
310 */
311 if (inargs == NULL || inargs[0] == '\0') {
312 bzero(zonecfg_args, sizeof (zonecfg_args));
313 (void) zonecfg_get_bootargs(snap_hndl, zonecfg_args,
314 sizeof (zonecfg_args));
315 inargs = zonecfg_args;
316 }
317
318 if (strlen(inargs) >= BOOTARGS_MAX) {
319 zerror(zlogp, B_FALSE, "boot argument string too long");
320 return (Z_INVAL);
321 }
322
323 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
324 sargs = scratchargs;
325 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
326 sargs = NULL;
327 argc++;
328 }
329
330 if ((argv = calloc(argc + 1, sizeof (char *))) == NULL) {
331 zerror(zlogp, B_FALSE, "memory allocation failed");
332 return (Z_NOMEM);
333 }
334
335 argv_save = argv;
336 argc_save = argc;
337
338 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
339 sargs = scratchargs;
340 i = 0;
341 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
342 sargs = NULL;
343 if ((argv[i] = strdup(arg)) == NULL) {
344 err = Z_NOMEM;
345 zerror(zlogp, B_FALSE, "memory allocation failed");
346 goto done;
347 }
348 i++;
349 }
350
351 /*
352 * We preserve compatibility with the illumos system boot behavior,
353 * which allows:
354 *
355 * # reboot kernel/unix -s -m verbose
356 *
357 * In this example, kernel/unix tells the booter what file to boot. The
358 * original intent of this was that we didn't want reboot in a zone to
359 * be gratuitously different, so we would silently ignore the boot
360 * file, if necessary. However, this usage is archaic and has never
361 * been common, since it is impossible to boot a zone onto a different
362 * kernel. Ignoring the first argument breaks for non-native brands
363 * which pass boot arguments in a different style. e.g.
364 * systemd.log_level=debug
365 * Thus, for backward compatibility we only ignore the first argument
366 * if it appears to be in the illumos form and attempting to specify a
367 * kernel.
368 */
369 if (argv[0] == NULL)
370 goto done;
371
372 assert(argv[0][0] != ' ');
373 assert(argv[0][0] != '\t');
374
375 if (strncmp(argv[0], "kernel/", 7) == 0) {
376 argv = &argv[1];
377 argc--;
378 }
379
380 optind = 0;
381 opterr = 0;
382 err = Z_OK;
383 while ((c = getopt(argc, argv, "fi:m:s")) != -1) {
384 switch (c) {
385 case 'i':
386 /*
387 * -i is handled by the runtime and is not passed
388 * along to userland
389 */
390 (void) strlcpy(init_file, optarg, MAXPATHLEN);
391 break;
392 case 'f':
393 /* This has already been processed by zoneadm */
394 break;
395 case 'm':
396 case 's':
397 /* These pass through unmolested */
398 (void) snprintf(scratchopt, sizeof (scratchopt),
399 "-%c", c);
400 strnappend(outargs, BOOTARGS_MAX, scratchopt);
401 if (optarg != NULL)
402 strnappend(outargs, BOOTARGS_MAX, optarg);
403 break;
404 case '?':
405 /*
406 * If a brand has its own init, we need to pass along
407 * whatever the user provides. We use the entire
408 * unknown string here so that we correctly handle
409 * unknown long options (e.g. --debug).
410 */
411 strnappend(outargs, BOOTARGS_MAX, argv[optind - 1]);
412 break;
413 }
414 }
415
416 /*
417 * We need to pass along everything else since we don't know what
418 * the brand's init is expecting. For example, an argument list like:
419 * --confdir /foo --debug
420 * will cause the getopt parsing to stop at '/foo' but we need to pass
421 * that on, along with the '--debug'. This does mean that we require
422 * any of our known options (-ifms) to preceed the brand-specific ones.
423 */
424 while (optind < argc) {
425 strnappend(outargs, BOOTARGS_MAX, argv[optind]);
426 optind++;
427 }
428
429 done:
430 for (i = 0; i < argc_save; i++) {
431 if (argv_save[i] != NULL)
432 free(argv_save[i]);
433 }
434 free(argv_save);
435 return (err);
436 }
437
438
439 static int
440 mkzonedir(zlog_t *zlogp)
441 {
442 struct stat st;
443 /*
444 * We must create and lock everyone but root out of ZONES_TMPDIR
445 * since anyone can open any UNIX domain socket, regardless of
446 * its file system permissions. Sigh...
447 */
448 if (mkdir(ZONES_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
449 zerror(zlogp, B_TRUE, "could not mkdir '%s'", ZONES_TMPDIR);
450 return (-1);
451 }
452 /* paranoia */
453 if ((stat(ZONES_TMPDIR, &st) < 0) || !S_ISDIR(st.st_mode)) {
454 zerror(zlogp, B_TRUE, "'%s' is not a directory", ZONES_TMPDIR);
455 return (-1);
456 }
457 (void) chmod(ZONES_TMPDIR, S_IRWXU);
458 return (0);
459 }
460
461 /*
462 * Run the brand's pre-state change callback, if it exists.
463 */
464 static int
465 brand_prestatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
466 {
467 char cmdbuf[2 * MAXPATHLEN];
468 const char *altroot;
469
470 if (pre_statechg_hook[0] == '\0')
471 return (0);
472
473 altroot = zonecfg_get_root();
474 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", pre_statechg_hook,
475 state, cmd, altroot) > sizeof (cmdbuf))
476 return (-1);
477
478 if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
479 return (-1);
480
481 return (0);
482 }
483
484 /*
485 * Run the brand's post-state change callback, if it exists.
486 */
487 static int
488 brand_poststatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
489 {
490 char cmdbuf[2 * MAXPATHLEN];
491 const char *altroot;
492
493 if (post_statechg_hook[0] == '\0')
494 return (0);
495
496 altroot = zonecfg_get_root();
497 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook,
498 state, cmd, altroot) > sizeof (cmdbuf))
499 return (-1);
500
501 if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
502 return (-1);
503
504 return (0);
505 }
506
507 /*
508 * Notify zonestatd of the new zone. If zonestatd is not running, this
509 * will do nothing.
510 */
511 static void
512 notify_zonestatd(zoneid_t zoneid)
513 {
514 int cmd[2];
515 int fd;
516 door_arg_t params;
517
518 fd = open(ZS_DOOR_PATH, O_RDONLY);
519 if (fd < 0)
520 return;
521
522 cmd[0] = ZSD_CMD_NEW_ZONE;
523 cmd[1] = zoneid;
524 params.data_ptr = (char *)&cmd;
525 params.data_size = sizeof (cmd);
526 params.desc_ptr = NULL;
527 params.desc_num = 0;
528 params.rbuf = NULL;
529 params.rsize = NULL;
530 (void) door_call(fd, ¶ms);
531 (void) close(fd);
532 }
533
534 /*
535 * Bring a zone up to the pre-boot "ready" stage. The mount_cmd argument is
536 * 'true' if this is being invoked as part of the processing for the "mount"
537 * subcommand.
538 *
539 * If a scratch zone mount (ALT_MOUNT) is being performed then do not
540 * call the state change hooks.
541 */
542 static int
543 zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate, boolean_t debug)
544 {
545 int err;
546 boolean_t snapped = B_FALSE;
547
548 if ((snap_hndl = zonecfg_init_handle()) == NULL) {
549 zerror(zlogp, B_TRUE, "getting zone configuration handle");
550 goto bad;
551 }
552 if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
553 zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
554 zonecfg_strerror(err));
555 goto bad;
556 }
557 snapped = B_TRUE;
558
559 if (zonecfg_get_snapshot_handle(zone_name, snap_hndl) != Z_OK) {
560 zerror(zlogp, B_FALSE, "invalid configuration snapshot");
561 goto bad;
562 }
563
564 if (zone_did == 0)
565 zone_did = zone_get_did(zone_name);
566
567 if (!ALT_MOUNT(mount_cmd) &&
568 brand_prestatechg(zlogp, zstate, Z_READY, debug) != 0)
569 goto bad;
570
571 if ((zone_id = vplat_create(zlogp, mount_cmd, zone_did)) == -1)
572 goto bad;
573
574 if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
575 bringup_failure_recovery = B_TRUE;
576 (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE,
577 debug);
578 goto bad;
579 }
580
581 if (!ALT_MOUNT(mount_cmd) &&
582 brand_poststatechg(zlogp, zstate, Z_READY, debug) != 0)
583 goto bad;
584
585 return (0);
586
587 bad:
588 /*
589 * If something goes wrong, we up the zones's state to the target
590 * state, READY, and then invoke the hook as if we're halting.
591 */
592 if (!ALT_MOUNT(mount_cmd))
593 (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT,
594 debug);
595
596 if (snapped)
597 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
598 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
599 zonecfg_strerror(err));
600 zonecfg_fini_handle(snap_hndl);
601 snap_hndl = NULL;
602 return (-1);
603 }
604
605 int
606 init_template(void)
607 {
608 int fd;
609 int err = 0;
610
611 fd = open64(CTFS_ROOT "/process/template", O_RDWR);
612 if (fd == -1)
613 return (-1);
614
615 /*
616 * For now, zoneadmd doesn't do anything with the contract.
617 * Deliver no events, don't inherit, and allow it to be orphaned.
618 */
619 err |= ct_tmpl_set_critical(fd, 0);
620 err |= ct_tmpl_set_informative(fd, 0);
621 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
622 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
623 if (err || ct_tmpl_activate(fd)) {
624 (void) close(fd);
625 return (-1);
626 }
627
628 return (fd);
629 }
630
631 typedef struct fs_callback {
632 zlog_t *zlogp;
633 zoneid_t zoneid;
634 boolean_t mount_cmd;
635 } fs_callback_t;
636
637 static int
638 mount_early_fs(void *data, const char *spec, const char *dir,
639 const char *fstype, const char *opt)
640 {
641 zlog_t *zlogp = ((fs_callback_t *)data)->zlogp;
642 zoneid_t zoneid = ((fs_callback_t *)data)->zoneid;
643 boolean_t mount_cmd = ((fs_callback_t *)data)->mount_cmd;
644 char rootpath[MAXPATHLEN];
645 pid_t child;
646 int child_status;
647 int tmpl_fd;
648 int rv;
649 ctid_t ct;
650
651 /* determine the zone rootpath */
652 if (mount_cmd) {
653 char luroot[MAXPATHLEN];
654
655 (void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
656 resolve_lofs(zlogp, luroot, sizeof (luroot));
657 (void) strlcpy(rootpath, luroot, sizeof (rootpath));
658 } else {
659 if (zone_get_rootpath(zone_name,
660 rootpath, sizeof (rootpath)) != Z_OK) {
661 zerror(zlogp, B_FALSE, "unable to determine zone root");
662 return (-1);
663 }
664 }
665
666 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir, fstype)) < 0) {
667 zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
668 rootpath, dir);
669 return (-1);
670 } else if (rv > 0) {
671 /* The mount point path doesn't exist, create it now. */
672 if (make_one_dir(zlogp, rootpath, dir,
673 DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
674 DEFAULT_DIR_GROUP) != 0) {
675 zerror(zlogp, B_FALSE, "failed to create mount point");
676 return (-1);
677 }
678
679 /*
680 * Now this might seem weird, but we need to invoke
681 * valid_mount_path() again. Why? Because it checks
682 * to make sure that the mount point path is canonical,
683 * which it can only do if the path exists, so now that
684 * we've created the path we have to verify it again.
685 */
686 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir,
687 fstype)) < 0) {
688 zerror(zlogp, B_FALSE,
689 "%s%s is not a valid mount point", rootpath, dir);
690 return (-1);
691 }
692 }
693
694 if ((tmpl_fd = init_template()) == -1) {
695 zerror(zlogp, B_TRUE, "failed to create contract");
696 return (-1);
697 }
698
699 if ((child = fork()) == -1) {
700 (void) ct_tmpl_clear(tmpl_fd);
701 (void) close(tmpl_fd);
702 zerror(zlogp, B_TRUE, "failed to fork");
703 return (-1);
704
705 } else if (child == 0) { /* child */
706 char opt_buf[MAX_MNTOPT_STR];
707 int optlen = 0;
708 int mflag = MS_DATA;
709 int i;
710 int ret;
711
712 (void) ct_tmpl_clear(tmpl_fd);
713 /*
714 * Even though there are no procs running in the zone, we
715 * do this for paranoia's sake.
716 */
717 (void) closefrom(0);
718
719 if (zone_enter(zoneid) == -1) {
720 _exit(errno);
721 }
722 if (opt != NULL) {
723 /*
724 * The mount() system call is incredibly annoying.
725 * If options are specified, we need to copy them
726 * into a temporary buffer since the mount() system
727 * call will overwrite the options string. It will
728 * also fail if the new option string it wants to
729 * write is bigger than the one we passed in, so
730 * you must pass in a buffer of the maximum possible
731 * option string length. sigh.
732 */
733 (void) strlcpy(opt_buf, opt, sizeof (opt_buf));
734 opt = opt_buf;
735 optlen = MAX_MNTOPT_STR;
736 mflag = MS_OPTIONSTR;
737 }
738
739 /*
740 * There is an obscure race condition which can cause mount
741 * to return EBUSY. This happens for example on the mount
742 * of the zone's /etc/svc/volatile file system if there is
743 * a GZ process running svcs -Z, which will touch the
744 * mountpoint, just as we're trying to do the mount. To cope
745 * with this, we retry up to 3 times to let this transient
746 * process get out of the way.
747 */
748 for (i = 0; i < 3; i++) {
749 ret = 0;
750 if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
751 optlen) != 0)
752 ret = errno;
753 if (ret != EBUSY)
754 break;
755 (void) sleep(1);
756 }
757 _exit(ret);
758 }
759
760 /* parent */
761 if (contract_latest(&ct) == -1)
762 ct = -1;
763 (void) ct_tmpl_clear(tmpl_fd);
764 (void) close(tmpl_fd);
765 if (waitpid(child, &child_status, 0) != child) {
766 /* unexpected: we must have been signalled */
767 (void) contract_abandon_id(ct);
768 return (-1);
769 }
770 (void) contract_abandon_id(ct);
771 if (WEXITSTATUS(child_status) != 0) {
772 errno = WEXITSTATUS(child_status);
773 zerror(zlogp, B_TRUE, "mount of %s failed", dir);
774 return (-1);
775 }
776
777 return (0);
778 }
779
780 /*
781 * env variable name format
782 * _ZONECFG_{resource name}_{identifying attr. name}_{property name}
783 * Any dashes (-) in the property names are replaced with underscore (_).
784 */
785 static void
786 set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
787 {
788 char *p;
789 /* Enough for maximal name, rsrc + attr, & slop for ZONECFG & _'s */
790 char nm[2 * MAXNAMELEN + 32];
791
792 if (attr == NULL)
793 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s", rsrc,
794 name);
795 else
796 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s_%s", rsrc,
797 attr, name);
798
799 p = nm;
800 while ((p = strchr(p, '-')) != NULL)
801 *p++ = '_';
802
803 (void) setenv(nm, val, 1);
804 }
805
806 /*
807 * Export zonecfg network and device properties into environment for the boot
808 * and state change hooks.
809 * If debug is true, export the brand hook debug env. variable as well.
810 *
811 * We could export more of the config in the future, as necessary.
812 */
813 static int
814 setup_subproc_env(boolean_t debug)
815 {
816 int res;
817 struct zone_nwiftab ntab;
818 struct zone_devtab dtab;
819 struct zone_attrtab atab;
820 char net_resources[MAXNAMELEN * 2];
821 char dev_resources[MAXNAMELEN * 2];
822
823 /* snap_hndl is null when called through the set_brand_env code path */
824 if (snap_hndl == NULL)
825 return (Z_OK);
826
827 net_resources[0] = '\0';
828 if ((res = zonecfg_setnwifent(snap_hndl)) != Z_OK)
829 goto done;
830
831 while (zonecfg_getnwifent(snap_hndl, &ntab) == Z_OK) {
832 struct zone_res_attrtab *rap;
833 char *phys;
834
835 phys = ntab.zone_nwif_physical;
836
837 (void) strlcat(net_resources, phys, sizeof (net_resources));
838 (void) strlcat(net_resources, " ", sizeof (net_resources));
839
840 set_zonecfg_env(RSRC_NET, phys, "physical", phys);
841
842 set_zonecfg_env(RSRC_NET, phys, "address",
843 ntab.zone_nwif_address);
844 set_zonecfg_env(RSRC_NET, phys, "allowed-address",
845 ntab.zone_nwif_allowed_address);
846 set_zonecfg_env(RSRC_NET, phys, "defrouter",
847 ntab.zone_nwif_defrouter);
848 set_zonecfg_env(RSRC_NET, phys, "global-nic",
849 ntab.zone_nwif_gnic);
850 set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
851 set_zonecfg_env(RSRC_NET, phys, "vlan-id",
852 ntab.zone_nwif_vlan_id);
853
854 for (rap = ntab.zone_nwif_attrp; rap != NULL;
855 rap = rap->zone_res_attr_next)
856 set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
857 rap->zone_res_attr_value);
858 nwifent_free_attrs(&ntab);
859 }
860
861 (void) setenv("_ZONECFG_net_resources", net_resources, 1);
862
863 (void) zonecfg_endnwifent(snap_hndl);
864
865 if ((res = zonecfg_setdevent(snap_hndl)) != Z_OK)
866 goto done;
867
868 while (zonecfg_getdevent(snap_hndl, &dtab) == Z_OK) {
869 struct zone_res_attrtab *rap;
870 char *match;
871
872 match = dtab.zone_dev_match;
873
874 (void) strlcat(dev_resources, match, sizeof (dev_resources));
875 (void) strlcat(dev_resources, " ", sizeof (dev_resources));
876
877 for (rap = dtab.zone_dev_attrp; rap != NULL;
878 rap = rap->zone_res_attr_next)
879 set_zonecfg_env(RSRC_DEV, match,
880 rap->zone_res_attr_name, rap->zone_res_attr_value);
881 }
882
883 (void) zonecfg_enddevent(snap_hndl);
884
885 if ((res = zonecfg_setattrent(snap_hndl)) != Z_OK)
886 goto done;
887
888 while (zonecfg_getattrent(snap_hndl, &atab) == Z_OK) {
889 set_zonecfg_env("attr", NULL, atab.zone_attr_name,
890 atab.zone_attr_value);
891 }
892
893 (void) zonecfg_endattrent(snap_hndl);
894
895 if (debug)
896 (void) setenv("_ZONEADMD_brand_debug", "1", 1);
897 else
898 (void) setenv("_ZONEADMD_brand_debug", "", 1);
899
900 res = Z_OK;
901
902 done:
903 return (res);
904 }
905
906 void
907 nwifent_free_attrs(struct zone_nwiftab *np)
908 {
909 struct zone_res_attrtab *rap;
910
911 for (rap = np->zone_nwif_attrp; rap != NULL; ) {
912 struct zone_res_attrtab *tp = rap;
913
914 rap = rap->zone_res_attr_next;
915 free(tp);
916 }
917 }
918
919 /*
920 * If retstr is not NULL, the output of the subproc is returned in the str,
921 * otherwise it is output using zerror(). Any memory allocated for retstr
922 * should be freed by the caller.
923 */
924 int
925 do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr, boolean_t debug)
926 {
927 char buf[1024]; /* arbitrary large amount */
928 char *inbuf;
929 FILE *file;
930 int status;
931 int rd_cnt;
932
933 if (retstr != NULL) {
934 if ((*retstr = malloc(1024)) == NULL) {
935 zerror(zlogp, B_FALSE, "out of memory");
936 return (-1);
937 }
938 inbuf = *retstr;
939 rd_cnt = 0;
940 } else {
941 inbuf = buf;
942 }
943
944 if (setup_subproc_env(debug) != Z_OK) {
945 zerror(zlogp, B_FALSE, "failed to setup environment");
946 return (-1);
947 }
948
949 file = popen(cmdbuf, "r");
950 if (file == NULL) {
951 zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf);
952 return (-1);
953 }
954
955 while (fgets(inbuf, 1024, file) != NULL) {
956 if (retstr == NULL) {
957 if (zlogp != &logsys) {
958 int last = strlen(inbuf) - 1;
959
960 if (inbuf[last] == '\n')
961 inbuf[last] = '\0';
962 zerror(zlogp, B_FALSE, "%s", inbuf);
963 }
964 } else {
965 char *p;
966
967 rd_cnt += 1024 - 1;
968 if ((p = realloc(*retstr, rd_cnt + 1024)) == NULL) {
969 zerror(zlogp, B_FALSE, "out of memory");
970 (void) pclose(file);
971 return (-1);
972 }
973
974 *retstr = p;
975 inbuf = *retstr + rd_cnt;
976 }
977 }
978 status = pclose(file);
979
980 if (WIFSIGNALED(status)) {
981 zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
982 "signal %d", cmdbuf, WTERMSIG(status));
983 return (-1);
984 }
985 assert(WIFEXITED(status));
986 if (WEXITSTATUS(status) == ZEXIT_EXEC) {
987 zerror(zlogp, B_FALSE, "failed to exec %s", cmdbuf);
988 return (-1);
989 }
990 return (WEXITSTATUS(status));
991 }
992
993 /*
994 * Get the path for this zone's init(1M) (or equivalent) process. First look
995 * for a zone-specific init-name attr, then get it from the brand.
996 */
997 static int
998 get_initname(brand_handle_t bh, char *initname, int len)
999 {
1000 struct zone_attrtab a;
1001
1002 bzero(&a, sizeof (a));
1003 (void) strlcpy(a.zone_attr_name, "init-name",
1004 sizeof (a.zone_attr_name));
1005
1006 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
1007 (void) strlcpy(initname, a.zone_attr_value, len);
1008 return (0);
1009 }
1010
1011 return (brand_get_initname(bh, initname, len));
1012 }
1013
1014 /*
1015 * Get the restart-init flag for this zone's init(1M) (or equivalent) process.
1016 * First look for a zone-specific restart-init attr, then get it from the brand.
1017 */
1018 static boolean_t
1019 restartinit(brand_handle_t bh)
1020 {
1021 struct zone_attrtab a;
1022
1023 bzero(&a, sizeof (a));
1024 (void) strlcpy(a.zone_attr_name, "restart-init",
1025 sizeof (a.zone_attr_name));
1026
1027 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
1028 if (strcmp(a.zone_attr_value, "false") == 0)
1029 return (B_FALSE);
1030 return (B_TRUE);
1031 }
1032
1033 return (brand_restartinit(bh));
1034 }
1035
1036 /*
1037 * Get the app-svc-dependent flag for this zone's init process. This is a
1038 * zone-specific attr which controls the type of contract we create for the
1039 * zone's init. When true, the contract will include CT_PR_EV_EXIT in the fatal
1040 * set, so that when any service which is in the same contract exits, the init
1041 * application will be terminated.
1042 */
1043 static boolean_t
1044 is_app_svc_dep(brand_handle_t bh)
1045 {
1046 struct zone_attrtab a;
1047
1048 bzero(&a, sizeof (a));
1049 (void) strlcpy(a.zone_attr_name, "app-svc-dependent",
1050 sizeof (a.zone_attr_name));
1051
1052 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK &&
1053 strcmp(a.zone_attr_value, "true") == 0) {
1054 return (B_TRUE);
1055 }
1056
1057 return (B_FALSE);
1058 }
1059
1060 static int
1061 zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
1062 {
1063 zoneid_t zoneid;
1064 struct stat st;
1065 char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
1066 char nbootargs[BOOTARGS_MAX];
1067 char cmdbuf[MAXPATHLEN];
1068 fs_callback_t cb;
1069 brand_handle_t bh;
1070 zone_iptype_t iptype;
1071 dladm_status_t status;
1072 char errmsg[DLADM_STRSIZE];
1073 int err;
1074 boolean_t restart_init;
1075 boolean_t app_svc_dep;
1076
1077 if (brand_prestatechg(zlogp, zstate, Z_BOOT, debug) != 0)
1078 return (-1);
1079
1080 if ((zoneid = getzoneidbyname(zone_name)) == -1) {
1081 zerror(zlogp, B_TRUE, "unable to get zoneid");
1082 goto bad;
1083 }
1084
1085 cb.zlogp = zlogp;
1086 cb.zoneid = zoneid;
1087 cb.mount_cmd = B_FALSE;
1088
1089 /* Get a handle to the brand info for this zone */
1090 if ((bh = brand_open(brand_name)) == NULL) {
1091 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1092 goto bad;
1093 }
1094
1095 /*
1096 * Get the list of filesystems to mount from the brand
1097 * configuration. These mounts are done via a thread that will
1098 * enter the zone, so they are done from within the context of the
1099 * zone.
1100 */
1101 if (brand_platform_iter_mounts(bh, mount_early_fs, &cb) != 0) {
1102 zerror(zlogp, B_FALSE, "unable to mount filesystems");
1103 brand_close(bh);
1104 goto bad;
1105 }
1106
1107 /*
1108 * Get the brand's boot callback if it exists.
1109 */
1110 (void) strcpy(cmdbuf, EXEC_PREFIX);
1111 if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
1112 sizeof (cmdbuf) - EXEC_LEN) != 0) {
1113 zerror(zlogp, B_FALSE,
1114 "unable to determine branded zone's boot callback");
1115 brand_close(bh);
1116 goto bad;
1117 }
1118
1119 /* Get the path for this zone's init(1M) (or equivalent) process. */
1120 if (get_initname(bh, init_file, MAXPATHLEN) != 0) {
1121 zerror(zlogp, B_FALSE,
1122 "unable to determine zone's init(1M) location");
1123 brand_close(bh);
1124 goto bad;
1125 }
1126
1127 /* See if we should restart init if it dies. */
1128 restart_init = restartinit(bh);
1129
1130 /*
1131 * See if we need to setup contract dependencies between the zone's
1132 * primary application and any of its services.
1133 */
1134 app_svc_dep = is_app_svc_dep(bh);
1135
1136 brand_close(bh);
1137
1138 err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
1139 if (err != Z_OK)
1140 goto bad;
1141
1142 assert(init_file[0] != '\0');
1143
1144 /*
1145 * Try to anticipate possible problems: If possible, make sure init is
1146 * executable.
1147 */
1148 if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
1149 zerror(zlogp, B_FALSE, "unable to determine zone root");
1150 goto bad;
1151 }
1152
1153 (void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);
1154
1155 if (lstat(initpath, &st) == -1) {
1156 zerror(zlogp, B_TRUE, "could not stat %s", initpath);
1157 goto bad;
1158 }
1159
1160 if ((st.st_mode & S_IFMT) == S_IFLNK) {
1161 /* symlink, we'll have to wait and resolve when we boot */
1162 } else if ((st.st_mode & S_IXUSR) == 0) {
1163 zerror(zlogp, B_FALSE, "%s is not executable", initpath);
1164 goto bad;
1165 }
1166
1167 /*
1168 * Exclusive stack zones interact with the dlmgmtd running in the
1169 * global zone. dladm_zone_boot() tells dlmgmtd that this zone is
1170 * booting, and loads its datalinks from the zone's datalink
1171 * configuration file.
1172 */
1173 if (vplat_get_iptype(zlogp, &iptype) == 0 && iptype == ZS_EXCLUSIVE) {
1174 status = dladm_zone_boot(dld_handle, zoneid);
1175 if (status != DLADM_STATUS_OK) {
1176 zerror(zlogp, B_FALSE, "unable to load zone datalinks: "
1177 " %s", dladm_status2str(status, errmsg));
1178 goto bad;
1179 }
1180 }
1181
1182 /*
1183 * If there is a brand 'boot' callback, execute it now to give the
1184 * brand one last chance to do any additional setup before the zone
1185 * is booted.
1186 */
1187 if ((strlen(cmdbuf) > EXEC_LEN) &&
1188 (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
1189 zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
1190 goto bad;
1191 }
1192
1193 if (zone_setattr(zoneid, ZONE_ATTR_INITNAME, init_file, 0) == -1) {
1194 zerror(zlogp, B_TRUE, "could not set zone boot file");
1195 goto bad;
1196 }
1197
1198 if (zone_setattr(zoneid, ZONE_ATTR_BOOTARGS, nbootargs, 0) == -1) {
1199 zerror(zlogp, B_TRUE, "could not set zone boot arguments");
1200 goto bad;
1201 }
1202
1203 if (!restart_init && zone_setattr(zoneid, ZONE_ATTR_INITNORESTART,
1204 NULL, 0) == -1) {
1205 zerror(zlogp, B_TRUE, "could not set zone init-no-restart");
1206 goto bad;
1207 }
1208
1209 if (app_svc_dep && zone_setattr(zoneid, ZONE_ATTR_APP_SVC_CT,
1210 (void *)B_TRUE, sizeof (boolean_t)) == -1) {
1211 zerror(zlogp, B_TRUE, "could not set zone app-die");
1212 goto bad;
1213 }
1214
1215 /*
1216 * Inform zonestatd of a new zone so that it can install a door for
1217 * the zone to contact it.
1218 */
1219 notify_zonestatd(zone_id);
1220
1221 if (zone_boot(zoneid) == -1) {
1222 zerror(zlogp, B_TRUE, "unable to boot zone");
1223 goto bad;
1224 }
1225
1226 if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0)
1227 goto bad;
1228
1229 /* Startup a thread to perform zfd logging/tty svc for the zone. */
1230 create_log_thread(zlogp, zone_id);
1231
1232 /* Startup a thread to perform memory capping for the zone. */
1233 create_mcap_thread(zlogp, zone_id);
1234
1235 return (0);
1236
1237 bad:
1238 /*
1239 * If something goes wrong, we up the zones's state to the target
1240 * state, RUNNING, and then invoke the hook as if we're halting.
1241 */
1242 (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT, debug);
1243
1244 return (-1);
1245 }
1246
1247 static int
1248 zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate,
1249 boolean_t debug)
1250 {
1251 int err;
1252
1253 /*
1254 * If performing a scratch zone unmount then do not call the
1255 * state change hooks.
1256 */
1257 if (unmount_cmd == B_FALSE &&
1258 brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0)
1259 return (-1);
1260
1261 /* Shutting down, stop the memcap thread */
1262 destroy_mcap_thread();
1263
1264 if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) {
1265 if (!bringup_failure_recovery)
1266 zerror(zlogp, B_FALSE, "unable to destroy zone");
1267 destroy_log_thread();
1268 return (-1);
1269 }
1270
1271 /* Shut down is done, stop the log thread */
1272 destroy_log_thread();
1273
1274 if (unmount_cmd == B_FALSE &&
1275 brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0)
1276 return (-1);
1277
1278 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
1279 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
1280 zonecfg_strerror(err));
1281
1282 zonecfg_fini_handle(snap_hndl);
1283 snap_hndl = NULL;
1284
1285 return (0);
1286 }
1287
1288 static int
1289 zone_graceful_shutdown(zlog_t *zlogp)
1290 {
1291 zoneid_t zoneid;
1292 pid_t child;
1293 char cmdbuf[MAXPATHLEN];
1294 brand_handle_t bh = NULL;
1295 ctid_t ct;
1296 int tmpl_fd;
1297 int child_status;
1298
1299 if (shutdown_in_progress) {
1300 zerror(zlogp, B_FALSE, "shutdown already in progress");
1301 return (-1);
1302 }
1303
1304 if ((zoneid = getzoneidbyname(zone_name)) == -1) {
1305 zerror(zlogp, B_TRUE, "unable to get zoneid");
1306 return (-1);
1307 }
1308
1309 /* Get a handle to the brand info for this zone */
1310 if ((bh = brand_open(brand_name)) == NULL) {
1311 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1312 return (-1);
1313 }
1314
1315 /*
1316 * If there is a brand 'shutdown' callback, execute it now to give the
1317 * brand a chance to cleanup any custom configuration.
1318 */
1319 (void) strcpy(cmdbuf, EXEC_PREFIX);
1320 if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
1321 sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
1322 (void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
1323 }
1324 brand_close(bh);
1325
1326 if ((tmpl_fd = init_template()) == -1) {
1327 zerror(zlogp, B_TRUE, "failed to create contract");
1328 return (-1);
1329 }
1330
1331 if ((child = fork()) == -1) {
1332 (void) ct_tmpl_clear(tmpl_fd);
1333 (void) close(tmpl_fd);
1334 zerror(zlogp, B_TRUE, "failed to fork");
1335 return (-1);
1336 } else if (child == 0) {
1337 (void) ct_tmpl_clear(tmpl_fd);
1338 if (zone_enter(zoneid) == -1) {
1339 _exit(errno);
1340 }
1341 _exit(execl("/bin/sh", "sh", "-c", cmdbuf, (char *)NULL));
1342 }
1343
1344 if (contract_latest(&ct) == -1)
1345 ct = -1;
1346 (void) ct_tmpl_clear(tmpl_fd);
1347 (void) close(tmpl_fd);
1348
1349 if (waitpid(child, &child_status, 0) != child) {
1350 /* unexpected: we must have been signalled */
1351 (void) contract_abandon_id(ct);
1352 return (-1);
1353 }
1354
1355 (void) contract_abandon_id(ct);
1356 if (WEXITSTATUS(child_status) != 0) {
1357 errno = WEXITSTATUS(child_status);
1358 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1359 return (-1);
1360 }
1361
1362 shutdown_in_progress = B_TRUE;
1363
1364 return (0);
1365 }
1366
1367 static int
1368 zone_wait_shutdown(zlog_t *zlogp)
1369 {
1370 zone_state_t zstate;
1371 uint64_t *tm = NULL;
1372 scf_simple_prop_t *prop = NULL;
1373 int timeout;
1374 int tries;
1375 int rc = -1;
1376
1377 /* Get default stop timeout from SMF framework */
1378 timeout = SHUTDOWN_WAIT;
1379 if ((prop = scf_simple_prop_get(NULL, SHUTDOWN_FMRI, "stop",
1380 SCF_PROPERTY_TIMEOUT)) != NULL) {
1381 if ((tm = scf_simple_prop_next_count(prop)) != NULL) {
1382 if (tm != 0)
1383 timeout = *tm;
1384 }
1385 scf_simple_prop_free(prop);
1386 }
1387
1388 /* allow time for zone to shutdown cleanly */
1389 for (tries = 0; tries < timeout; tries ++) {
1390 (void) sleep(1);
1391 if (zone_get_state(zone_name, &zstate) == Z_OK &&
1392 zstate == ZONE_STATE_INSTALLED) {
1393 rc = 0;
1394 break;
1395 }
1396 }
1397
1398 if (rc != 0)
1399 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1400
1401 shutdown_in_progress = B_FALSE;
1402
1403 return (rc);
1404 }
1405
1406
1407
1408 /*
1409 * Generate AUE_zone_state for a command that boots a zone.
1410 */
1411 static void
1412 audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
1413 char *new_state)
1414 {
1415 adt_session_data_t *ah;
1416 adt_event_data_t *event;
1417 int pass_fail, fail_reason;
1418
1419 if (!adt_audit_enabled())
1420 return;
1421
1422 if (return_val == 0) {
1423 pass_fail = ADT_SUCCESS;
1424 fail_reason = ADT_SUCCESS;
1425 } else {
1426 pass_fail = ADT_FAILURE;
1427 fail_reason = ADT_FAIL_VALUE_PROGRAM;
1428 }
1429
1430 if (adt_start_session(&ah, NULL, 0)) {
1431 zerror(zlogp, B_TRUE, gettext("audit failure."));
1432 return;
1433 }
1434 if (adt_set_from_ucred(ah, uc, ADT_NEW)) {
1435 zerror(zlogp, B_TRUE, gettext("audit failure."));
1436 (void) adt_end_session(ah);
1437 return;
1438 }
1439
1440 event = adt_alloc_event(ah, ADT_zone_state);
1441 if (event == NULL) {
1442 zerror(zlogp, B_TRUE, gettext("audit failure."));
1443 (void) adt_end_session(ah);
1444 return;
1445 }
1446 event->adt_zone_state.zonename = zone_name;
1447 event->adt_zone_state.new_state = new_state;
1448
1449 if (adt_put_event(event, pass_fail, fail_reason))
1450 zerror(zlogp, B_TRUE, gettext("audit failure."));
1451
1452 adt_free_event(event);
1453
1454 (void) adt_end_session(ah);
1455 }
1456
1457 /*
1458 * Log the exit time and status of the zone's init process into
1459 * {zonepath}/lastexited. If the zone shutdown normally, the exit status will
1460 * be -1, otherwise it will be the exit status as described in wait.3c.
1461 * If the zone is configured to restart init, then nothing will be logged if
1462 * init exits unexpectedly (the kernel will never upcall in this case).
1463 */
1464 static void
1465 log_init_exit(int status)
1466 {
1467 char p[MAXPATHLEN];
1468 char buf[128];
1469 struct timeval t;
1470 int fd;
1471
1472 if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) > sizeof (p))
1473 return;
1474 if (gettimeofday(&t, NULL) != 0)
1475 return;
1476 if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec,
1477 status) > sizeof (buf))
1478 return;
1479 if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
1480 return;
1481
1482 (void) write(fd, buf, strlen(buf));
1483
1484 (void) close(fd);
1485 }
1486
1487 /*
1488 * The main routine for the door server that deals with zone state transitions.
1489 */
1490 /* ARGSUSED */
1491 static void
1492 server(void *cookie, char *args, size_t alen, door_desc_t *dp,
1493 uint_t n_desc)
1494 {
1495 ucred_t *uc = NULL;
1496 const priv_set_t *eset;
1497
1498 zone_state_t zstate;
1499 zone_cmd_t cmd;
1500 boolean_t debug;
1501 int init_status;
1502 zone_cmd_arg_t *zargp;
1503
1504 boolean_t kernelcall = B_TRUE;
1505
1506 int rval = -1;
1507 uint64_t uniqid;
1508 zoneid_t zoneid = -1;
1509 zlog_t zlog;
1510 zlog_t *zlogp;
1511 zone_cmd_rval_t *rvalp;
1512 size_t rlen = getpagesize(); /* conservative */
1513 fs_callback_t cb;
1514 brand_handle_t bh;
1515 boolean_t wait_shut = B_FALSE;
1516
1517 /* LINTED E_BAD_PTR_CAST_ALIGN */
1518 zargp = (zone_cmd_arg_t *)args;
1519
1520 /*
1521 * When we get the door unref message, we've fdetach'd the door, and
1522 * it is time for us to shut down zoneadmd.
1523 */
1524 if (zargp == DOOR_UNREF_DATA) {
1525 /*
1526 * See comment at end of main() for info on the last rites.
1527 */
1528 exit(0);
1529 }
1530
1531 if (zargp == NULL) {
1532 (void) door_return(NULL, 0, 0, 0);
1533 }
1534
1535 rvalp = alloca(rlen);
1536 bzero(rvalp, rlen);
1537 zlog.logfile = NULL;
1538 zlog.buflen = zlog.loglen = rlen - sizeof (zone_cmd_rval_t) + 1;
1539 zlog.buf = rvalp->errbuf;
1540 zlog.log = zlog.buf;
1541 /* defer initialization of zlog.locale until after credential check */
1542 zlogp = &zlog;
1543
1544 if (alen != sizeof (zone_cmd_arg_t)) {
1545 /*
1546 * This really shouldn't be happening.
1547 */
1548 zerror(&logsys, B_FALSE, "argument size (%d bytes) "
1549 "unexpected (expected %d bytes)", alen,
1550 sizeof (zone_cmd_arg_t));
1551 goto out;
1552 }
1553 cmd = zargp->cmd;
1554 debug = zargp->debug;
1555 init_status = zargp->status;
1556
1557 if (door_ucred(&uc) != 0) {
1558 zerror(&logsys, B_TRUE, "door_ucred");
1559 goto out;
1560 }
1561 eset = ucred_getprivset(uc, PRIV_EFFECTIVE);
1562 if (ucred_getzoneid(uc) != GLOBAL_ZONEID ||
1563 (eset != NULL ? !priv_ismember(eset, PRIV_SYS_CONFIG) :
1564 ucred_geteuid(uc) != 0)) {
1565 zerror(&logsys, B_FALSE, "insufficient privileges");
1566 goto out;
1567 }
1568
1569 kernelcall = ucred_getpid(uc) == 0;
1570
1571 /*
1572 * This is safe because we only use a zlog_t throughout the
1573 * duration of a door call; i.e., by the time the pointer
1574 * might become invalid, the door call would be over.
1575 */
1576 zlog.locale = kernelcall ? DEFAULT_LOCALE : zargp->locale;
1577
1578 (void) mutex_lock(&lock);
1579
1580 /*
1581 * Once we start to really die off, we don't want more connections.
1582 */
1583 if (in_death_throes) {
1584 (void) mutex_unlock(&lock);
1585 ucred_free(uc);
1586 (void) door_return(NULL, 0, 0, 0);
1587 thr_exit(NULL);
1588 }
1589
1590 /*
1591 * Check for validity of command.
1592 */
1593 if (cmd != Z_READY && cmd != Z_BOOT && cmd != Z_FORCEBOOT &&
1594 cmd != Z_REBOOT && cmd != Z_SHUTDOWN && cmd != Z_HALT &&
1595 cmd != Z_NOTE_UNINSTALLING && cmd != Z_MOUNT &&
1596 cmd != Z_FORCEMOUNT && cmd != Z_UNMOUNT) {
1597 zerror(&logsys, B_FALSE, "invalid command %d", (int)cmd);
1598 goto out;
1599 }
1600
1601 if (kernelcall && (cmd != Z_HALT && cmd != Z_REBOOT)) {
1602 /*
1603 * Can't happen
1604 */
1605 zerror(&logsys, B_FALSE, "received unexpected kernel upcall %d",
1606 cmd);
1607 goto out;
1608 }
1609 /*
1610 * We ignore the possibility of someone calling zone_create(2)
1611 * explicitly; all requests must come through zoneadmd.
1612 */
1613 if (zone_get_state(zone_name, &zstate) != Z_OK) {
1614 /*
1615 * Something terribly wrong happened
1616 */
1617 zerror(&logsys, B_FALSE, "unable to determine state of zone");
1618 goto out;
1619 }
1620
1621 if (kernelcall) {
1622 /*
1623 * Kernel-initiated requests may lose their validity if the
1624 * zone_t the kernel was referring to has gone away.
1625 */
1626 if ((zoneid = getzoneidbyname(zone_name)) == -1 ||
1627 zone_getattr(zoneid, ZONE_ATTR_UNIQID, &uniqid,
1628 sizeof (uniqid)) == -1 || uniqid != zargp->uniqid) {
1629 /*
1630 * We're not talking about the same zone. The request
1631 * must have arrived too late. Return error.
1632 */
1633 rval = -1;
1634 goto out;
1635 }
1636 zlogp = &logsys; /* Log errors to syslog */
1637 }
1638
1639 /*
1640 * If we are being asked to forcibly mount or boot a zone, we
1641 * pretend that an INCOMPLETE zone is actually INSTALLED.
1642 */
1643 if (zstate == ZONE_STATE_INCOMPLETE &&
1644 (cmd == Z_FORCEBOOT || cmd == Z_FORCEMOUNT))
1645 zstate = ZONE_STATE_INSTALLED;
1646
1647 switch (zstate) {
1648 case ZONE_STATE_CONFIGURED:
1649 case ZONE_STATE_INCOMPLETE:
1650 /*
1651 * Not our area of expertise; we just print a nice message
1652 * and die off.
1653 */
1654 zerror(zlogp, B_FALSE,
1655 "%s operation is invalid for zones in state '%s'",
1656 z_cmd_name(cmd), zone_state_str(zstate));
1657 break;
1658
1659 case ZONE_STATE_INSTALLED:
1660 switch (cmd) {
1661 case Z_READY:
1662 rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, debug);
1663 if (rval == 0)
1664 eventstream_write(Z_EVT_ZONE_READIED);
1665 zcons_statechanged();
1666 break;
1667 case Z_BOOT:
1668 case Z_FORCEBOOT:
1669 eventstream_write(Z_EVT_ZONE_BOOTING);
1670 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
1671 debug)) == 0) {
1672 rval = zone_bootup(zlogp, zargp->bootbuf,
1673 zstate, debug);
1674 }
1675 audit_put_record(zlogp, uc, rval, "boot");
1676 zcons_statechanged();
1677 if (rval != 0) {
1678 bringup_failure_recovery = B_TRUE;
1679 (void) zone_halt(zlogp, B_FALSE, B_FALSE,
1680 zstate, debug);
1681 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1682 }
1683 break;
1684 case Z_SHUTDOWN:
1685 case Z_HALT:
1686 if (kernelcall) /* Invalid; can't happen */
1687 abort();
1688 /*
1689 * We could have two clients racing to halt this
1690 * zone; the second client loses, but his request
1691 * doesn't fail, since the zone is now in the desired
1692 * state.
1693 */
1694 zerror(zlogp, B_FALSE, "zone is already halted");
1695 rval = 0;
1696 break;
1697 case Z_REBOOT:
1698 if (kernelcall) /* Invalid; can't happen */
1699 abort();
1700 zerror(zlogp, B_FALSE, "%s operation is invalid "
1701 "for zones in state '%s'", z_cmd_name(cmd),
1702 zone_state_str(zstate));
1703 rval = -1;
1704 break;
1705 case Z_NOTE_UNINSTALLING:
1706 if (kernelcall) /* Invalid; can't happen */
1707 abort();
1708 /*
1709 * Tell the console to print out a message about this.
1710 * Once it does, we will be in_death_throes.
1711 */
1712 eventstream_write(Z_EVT_ZONE_UNINSTALLING);
1713 break;
1714 case Z_MOUNT:
1715 case Z_FORCEMOUNT:
1716 if (kernelcall) /* Invalid; can't happen */
1717 abort();
1718 if (!zone_isnative && !zone_iscluster &&
1719 !zone_islabeled) {
1720 /*
1721 * -U mounts the zone without lofs mounting
1722 * zone file systems back into the scratch
1723 * zone. This is required when mounting
1724 * non-native branded zones.
1725 */
1726 (void) strlcpy(zargp->bootbuf, "-U",
1727 BOOTARGS_MAX);
1728 }
1729
1730 rval = zone_ready(zlogp,
1731 strcmp(zargp->bootbuf, "-U") == 0 ?
1732 Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate, debug);
1733 if (rval != 0)
1734 break;
1735
1736 eventstream_write(Z_EVT_ZONE_READIED);
1737
1738 /*
1739 * Get a handle to the default brand info.
1740 * We must always use the default brand file system
1741 * list when mounting the zone.
1742 */
1743 if ((bh = brand_open(default_brand)) == NULL) {
1744 rval = -1;
1745 break;
1746 }
1747
1748 /*
1749 * Get the list of filesystems to mount from
1750 * the brand configuration. These mounts are done
1751 * via a thread that will enter the zone, so they
1752 * are done from within the context of the zone.
1753 */
1754 cb.zlogp = zlogp;
1755 cb.zoneid = zone_id;
1756 cb.mount_cmd = B_TRUE;
1757 rval = brand_platform_iter_mounts(bh,
1758 mount_early_fs, &cb);
1759
1760 brand_close(bh);
1761
1762 /*
1763 * Ordinarily, /dev/fd would be mounted inside the zone
1764 * by svc:/system/filesystem/usr:default, but since
1765 * we're not booting the zone, we need to do this
1766 * manually.
1767 */
1768 if (rval == 0)
1769 rval = mount_early_fs(&cb,
1770 "fd", "/dev/fd", "fd", NULL);
1771 break;
1772 case Z_UNMOUNT:
1773 if (kernelcall) /* Invalid; can't happen */
1774 abort();
1775 zerror(zlogp, B_FALSE, "zone is already unmounted");
1776 rval = 0;
1777 break;
1778 }
1779 break;
1780
1781 case ZONE_STATE_READY:
1782 switch (cmd) {
1783 case Z_READY:
1784 /*
1785 * We could have two clients racing to ready this
1786 * zone; the second client loses, but his request
1787 * doesn't fail, since the zone is now in the desired
1788 * state.
1789 */
1790 zerror(zlogp, B_FALSE, "zone is already ready");
1791 rval = 0;
1792 break;
1793 case Z_BOOT:
1794 case Z_FORCEBOOT:
1795 (void) strlcpy(boot_args, zargp->bootbuf,
1796 sizeof (boot_args));
1797 eventstream_write(Z_EVT_ZONE_BOOTING);
1798 rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
1799 debug);
1800 audit_put_record(zlogp, uc, rval, "boot");
1801 zcons_statechanged();
1802 if (rval != 0) {
1803 bringup_failure_recovery = B_TRUE;
1804 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
1805 zstate, debug);
1806 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1807 }
1808 boot_args[0] = '\0';
1809 break;
1810 case Z_HALT:
1811 if (kernelcall) /* Invalid; can't happen */
1812 abort();
1813 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
1814 debug)) != 0)
1815 break;
1816 zcons_statechanged();
1817 eventstream_write(Z_EVT_ZONE_HALTED);
1818 break;
1819 case Z_SHUTDOWN:
1820 case Z_REBOOT:
1821 case Z_NOTE_UNINSTALLING:
1822 case Z_MOUNT:
1823 case Z_FORCEMOUNT:
1824 case Z_UNMOUNT:
1825 if (kernelcall) /* Invalid; can't happen */
1826 abort();
1827 zerror(zlogp, B_FALSE, "%s operation is invalid "
1828 "for zones in state '%s'", z_cmd_name(cmd),
1829 zone_state_str(zstate));
1830 rval = -1;
1831 break;
1832 }
1833 break;
1834
1835 case ZONE_STATE_MOUNTED:
1836 switch (cmd) {
1837 case Z_UNMOUNT:
1838 if (kernelcall) /* Invalid; can't happen */
1839 abort();
1840 rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate, debug);
1841 if (rval == 0) {
1842 eventstream_write(Z_EVT_ZONE_HALTED);
1843 (void) sema_post(&scratch_sem);
1844 }
1845 break;
1846 default:
1847 if (kernelcall) /* Invalid; can't happen */
1848 abort();
1849 zerror(zlogp, B_FALSE, "%s operation is invalid "
1850 "for zones in state '%s'", z_cmd_name(cmd),
1851 zone_state_str(zstate));
1852 rval = -1;
1853 break;
1854 }
1855 break;
1856
1857 case ZONE_STATE_RUNNING:
1858 case ZONE_STATE_SHUTTING_DOWN:
1859 case ZONE_STATE_DOWN:
1860 switch (cmd) {
1861 case Z_READY:
1862 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
1863 debug)) != 0)
1864 break;
1865 zcons_statechanged();
1866 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
1867 debug)) == 0)
1868 eventstream_write(Z_EVT_ZONE_READIED);
1869 else
1870 eventstream_write(Z_EVT_ZONE_HALTED);
1871 break;
1872 case Z_BOOT:
1873 case Z_FORCEBOOT:
1874 /*
1875 * We could have two clients racing to boot this
1876 * zone; the second client loses, but his request
1877 * doesn't fail, since the zone is now in the desired
1878 * state.
1879 */
1880 zerror(zlogp, B_FALSE, "zone is already booted");
1881 rval = 0;
1882 break;
1883 case Z_HALT:
1884 if (kernelcall) {
1885 log_init_exit(init_status);
1886 } else {
1887 log_init_exit(-1);
1888 }
1889 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
1890 debug)) != 0)
1891 break;
1892 eventstream_write(Z_EVT_ZONE_HALTED);
1893 zcons_statechanged();
1894 break;
1895 case Z_REBOOT:
1896 (void) strlcpy(boot_args, zargp->bootbuf,
1897 sizeof (boot_args));
1898 eventstream_write(Z_EVT_ZONE_REBOOTING);
1899 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
1900 debug)) != 0) {
1901 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1902 boot_args[0] = '\0';
1903 break;
1904 }
1905 zcons_statechanged();
1906 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
1907 debug)) != 0) {
1908 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1909 boot_args[0] = '\0';
1910 break;
1911 }
1912 rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
1913 debug);
1914 audit_put_record(zlogp, uc, rval, "reboot");
1915 if (rval != 0) {
1916 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
1917 zstate, debug);
1918 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1919 }
1920 boot_args[0] = '\0';
1921 break;
1922 case Z_SHUTDOWN:
1923 if ((rval = zone_graceful_shutdown(zlogp)) == 0) {
1924 wait_shut = B_TRUE;
1925 }
1926 break;
1927 case Z_NOTE_UNINSTALLING:
1928 case Z_MOUNT:
1929 case Z_FORCEMOUNT:
1930 case Z_UNMOUNT:
1931 zerror(zlogp, B_FALSE, "%s operation is invalid "
1932 "for zones in state '%s'", z_cmd_name(cmd),
1933 zone_state_str(zstate));
1934 rval = -1;
1935 break;
1936 }
1937 break;
1938 default:
1939 abort();
1940 }
1941
1942 /*
1943 * Because the state of the zone may have changed, we make sure
1944 * to wake the console poller, which is in charge of initiating
1945 * the shutdown procedure as necessary.
1946 */
1947 eventstream_write(Z_EVT_NULL);
1948
1949 out:
1950 (void) mutex_unlock(&lock);
1951
1952 /* Wait for the Z_SHUTDOWN commands to complete */
1953 if (wait_shut)
1954 rval = zone_wait_shutdown(zlogp);
1955
1956 if (kernelcall) {
1957 rvalp = NULL;
1958 rlen = 0;
1959 } else {
1960 rvalp->rval = rval;
1961 }
1962 if (uc != NULL)
1963 ucred_free(uc);
1964 (void) door_return((char *)rvalp, rlen, NULL, 0);
1965 thr_exit(NULL);
1966 }
1967
1968 static int
1969 setup_door(zlog_t *zlogp)
1970 {
1971 if ((zone_door = door_create(server, NULL,
1972 DOOR_UNREF | DOOR_REFUSE_DESC | DOOR_NO_CANCEL)) < 0) {
1973 zerror(zlogp, B_TRUE, "%s failed", "door_create");
1974 return (-1);
1975 }
1976 (void) fdetach(zone_door_path);
1977
1978 if (fattach(zone_door, zone_door_path) != 0) {
1979 zerror(zlogp, B_TRUE, "fattach to %s failed", zone_door_path);
1980 (void) door_revoke(zone_door);
1981 (void) fdetach(zone_door_path);
1982 zone_door = -1;
1983 return (-1);
1984 }
1985 return (0);
1986 }
1987
1988 /*
1989 * zoneadm(1m) will start zoneadmd if it thinks it isn't running; this
1990 * is where zoneadmd itself will check to see that another instance of
1991 * zoneadmd isn't already controlling this zone.
1992 *
1993 * The idea here is that we want to open the path to which we will
1994 * attach our door, lock it, and then make sure that no-one has beat us
1995 * to fattach(3c)ing onto it.
1996 *
1997 * fattach(3c) is really a mount, so there are actually two possible
1998 * vnodes we could be dealing with. Our strategy is as follows:
1999 *
2000 * - If the file we opened is a regular file (common case):
2001 * There is no fattach(3c)ed door, so we have a chance of becoming
2002 * the managing zoneadmd. We attempt to lock the file: if it is
2003 * already locked, that means someone else raced us here, so we
2004 * lose and give up. zoneadm(1m) will try to contact the zoneadmd
2005 * that beat us to it.
2006 *
2007 * - If the file we opened is a namefs file:
2008 * This means there is already an established door fattach(3c)'ed
2009 * to the rendezvous path. We've lost the race, so we give up.
2010 * Note that in this case we also try to grab the file lock, and
2011 * will succeed in acquiring it since the vnode locked by the
2012 * "winning" zoneadmd was a regular one, and the one we locked was
2013 * the fattach(3c)'ed door node. At any rate, no harm is done, and
2014 * we just return to zoneadm(1m) which knows to retry.
2015 */
2016 static int
2017 make_daemon_exclusive(zlog_t *zlogp)
2018 {
2019 int doorfd = -1;
2020 int err, ret = -1;
2021 struct stat st;
2022 struct flock flock;
2023 zone_state_t zstate;
2024
2025 top:
2026 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
2027 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
2028 zonecfg_strerror(err));
2029 goto out;
2030 }
2031 if ((doorfd = open(zone_door_path, O_CREAT|O_RDWR,
2032 S_IREAD|S_IWRITE)) < 0) {
2033 zerror(zlogp, B_TRUE, "failed to open %s", zone_door_path);
2034 goto out;
2035 }
2036 if (fstat(doorfd, &st) < 0) {
2037 zerror(zlogp, B_TRUE, "failed to stat %s", zone_door_path);
2038 goto out;
2039 }
2040 /*
2041 * Lock the file to synchronize with other zoneadmd
2042 */
2043 flock.l_type = F_WRLCK;
2044 flock.l_whence = SEEK_SET;
2045 flock.l_start = (off_t)0;
2046 flock.l_len = (off_t)0;
2047 if (fcntl(doorfd, F_SETLK, &flock) < 0) {
2048 /*
2049 * Someone else raced us here and grabbed the lock file
2050 * first. A warning here is inappropriate since nothing
2051 * went wrong.
2052 */
2053 goto out;
2054 }
2055
2056 if (strcmp(st.st_fstype, "namefs") == 0) {
2057 struct door_info info;
2058
2059 /*
2060 * There is already something fattach()'ed to this file.
2061 * Lets see what the door is up to.
2062 */
2063 if (door_info(doorfd, &info) == 0 && info.di_target != -1) {
2064 /*
2065 * Another zoneadmd process seems to be in
2066 * control of the situation and we don't need to
2067 * be here. A warning here is inappropriate
2068 * since nothing went wrong.
2069 *
2070 * If the door has been revoked, the zoneadmd
2071 * process currently managing the zone is going
2072 * away. We'll return control to zoneadm(1m)
2073 * which will try again (by which time zoneadmd
2074 * will hopefully have exited).
2075 */
2076 goto out;
2077 }
2078
2079 /*
2080 * If we got this far, there's a fattach(3c)'ed door
2081 * that belongs to a process that has exited, which can
2082 * happen if the previous zoneadmd died unexpectedly.
2083 *
2084 * Let user know that something is amiss, but that we can
2085 * recover; if the zone is in the installed state, then don't
2086 * message, since having a running zoneadmd isn't really
2087 * expected/needed. We want to keep occurences of this message
2088 * limited to times when zoneadmd is picking back up from a
2089 * zoneadmd that died while the zone was in some non-trivial
2090 * state.
2091 */
2092 if (zstate > ZONE_STATE_INSTALLED) {
2093 static zoneid_t zid;
2094
2095 zerror(zlogp, B_FALSE,
2096 "zone '%s': WARNING: zone is in state '%s', but "
2097 "zoneadmd does not appear to be available; "
2098 "restarted zoneadmd to recover.",
2099 zone_name, zone_state_str(zstate));
2100
2101 /*
2102 * Startup a thread to perform the zfd logging/tty svc
2103 * and a thread to perform memory capping for the
2104 * zone. zlogp won't be valid for much longer so use
2105 * logsys.
2106 */
2107 if ((zid = getzoneidbyname(zone_name)) != -1) {
2108 create_log_thread(&logsys, zid);
2109 create_mcap_thread(&logsys, zid);
2110 }
2111
2112 /* recover the global configuration snapshot */
2113 if (snap_hndl == NULL) {
2114 if ((snap_hndl = zonecfg_init_handle())
2115 == NULL ||
2116 zonecfg_create_snapshot(zone_name)
2117 != Z_OK ||
2118 zonecfg_get_snapshot_handle(zone_name,
2119 snap_hndl) != Z_OK) {
2120 zerror(zlogp, B_FALSE, "recovering "
2121 "zone configuration handle");
2122 goto out;
2123 }
2124 }
2125 }
2126
2127 (void) fdetach(zone_door_path);
2128 (void) close(doorfd);
2129 goto top;
2130 }
2131 ret = 0;
2132 out:
2133 (void) close(doorfd);
2134 return (ret);
2135 }
2136
2137 /*
2138 * Run the query hook with the 'env' parameter. It should return a
2139 * string of tab-delimited key-value pairs, each of which should be set
2140 * in the environment.
2141 *
2142 * Because the env_vars string values become part of the environment, the
2143 * string is static and we don't free it.
2144 *
2145 * This function is always called before zoneadmd forks and makes itself
2146 * exclusive, so it is possible there could more than one instance of zoneadmd
2147 * running in parallel at this point. Thus, we have no zonecfg snapshot and
2148 * shouldn't take one yet (i.e. snap_hndl is NULL). Thats ok, since we don't
2149 * need any zonecfg info to query for a brand-specific env value.
2150 */
2151 static int
2152 set_brand_env(zlog_t *zlogp)
2153 {
2154 int ret = 0;
2155 static char *env_vars = NULL;
2156 char buf[2 * MAXPATHLEN];
2157
2158 if (query_hook[0] == '\0' || env_vars != NULL)
2159 return (0);
2160
2161 if (snprintf(buf, sizeof (buf), "%s env", query_hook) > sizeof (buf))
2162 return (-1);
2163
2164 if (do_subproc(zlogp, buf, &env_vars, B_FALSE) != 0)
2165 return (-1);
2166
2167 if (env_vars != NULL) {
2168 char *sp;
2169
2170 sp = strtok(env_vars, "\t");
2171 while (sp != NULL) {
2172 if (putenv(sp) != 0) {
2173 ret = -1;
2174 break;
2175 }
2176 sp = strtok(NULL, "\t");
2177 }
2178 }
2179
2180 return (ret);
2181 }
2182
2183 /*
2184 * Setup the brand's pre and post state change callbacks, as well as the
2185 * query callback, if any of these exist.
2186 */
2187 static int
2188 brand_callback_init(brand_handle_t bh, char *zone_name)
2189 {
2190 (void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
2191 sizeof (pre_statechg_hook));
2192
2193 if (brand_get_prestatechange(bh, zone_name, zonepath,
2194 pre_statechg_hook + EXEC_LEN,
2195 sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
2196 return (-1);
2197
2198 if (strlen(pre_statechg_hook) <= EXEC_LEN)
2199 pre_statechg_hook[0] = '\0';
2200
2201 (void) strlcpy(post_statechg_hook, EXEC_PREFIX,
2202 sizeof (post_statechg_hook));
2203
2204 if (brand_get_poststatechange(bh, zone_name, zonepath,
2205 post_statechg_hook + EXEC_LEN,
2206 sizeof (post_statechg_hook) - EXEC_LEN) != 0)
2207 return (-1);
2208
2209 if (strlen(post_statechg_hook) <= EXEC_LEN)
2210 post_statechg_hook[0] = '\0';
2211
2212 (void) strlcpy(query_hook, EXEC_PREFIX,
2213 sizeof (query_hook));
2214
2215 if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
2216 sizeof (query_hook) - EXEC_LEN) != 0)
2217 return (-1);
2218
2219 if (strlen(query_hook) <= EXEC_LEN)
2220 query_hook[0] = '\0';
2221
2222 return (0);
2223 }
2224
2225 int
2226 main(int argc, char *argv[])
2227 {
2228 int opt;
2229 zoneid_t zid;
2230 priv_set_t *privset;
2231 zone_state_t zstate;
2232 char parents_locale[MAXPATHLEN];
2233 brand_handle_t bh;
2234 int err;
2235
2236 pid_t pid;
2237 sigset_t blockset;
2238 sigset_t block_cld;
2239
2240 struct {
2241 sema_t sem;
2242 int status;
2243 zlog_t log;
2244 } *shstate;
2245 size_t shstatelen = getpagesize();
2246
2247 zlog_t errlog;
2248 zlog_t *zlogp;
2249
2250 int ctfd;
2251
2252 progname = get_execbasename(argv[0]);
2253
2254 /*
2255 * Make sure stderr is unbuffered
2256 */
2257 (void) setbuffer(stderr, NULL, 0);
2258
2259 /*
2260 * Get out of the way of mounted filesystems, since we will daemonize
2261 * soon.
2262 */
2263 (void) chdir("/");
2264
2265 /*
2266 * Use the default system umask per PSARC 1998/110 rather than
2267 * anything that may have been set by the caller.
2268 */
2269 (void) umask(CMASK);
2270
2271 /*
2272 * Initially we want to use our parent's locale.
2273 */
2274 (void) setlocale(LC_ALL, "");
2275 (void) textdomain(TEXT_DOMAIN);
2276 (void) strlcpy(parents_locale, setlocale(LC_MESSAGES, NULL),
2277 sizeof (parents_locale));
2278
2279 /*
2280 * This zlog_t is used for writing to stderr
2281 */
2282 errlog.logfile = stderr;
2283 errlog.buflen = errlog.loglen = 0;
2284 errlog.buf = errlog.log = NULL;
2285 errlog.locale = parents_locale;
2286
2287 /*
2288 * We start off writing to stderr until we're ready to daemonize.
2289 */
2290 zlogp = &errlog;
2291
2292 /*
2293 * Process options.
2294 */
2295 while ((opt = getopt(argc, argv, "R:z:")) != EOF) {
2296 switch (opt) {
2297 case 'R':
2298 zonecfg_set_root(optarg);
2299 break;
2300 case 'z':
2301 zone_name = optarg;
2302 break;
2303 default:
2304 usage();
2305 }
2306 }
2307
2308 if (zone_name == NULL)
2309 usage();
2310
2311 /*
2312 * Because usage() prints directly to stderr, it has gettext()
2313 * wrapping, which depends on the locale. But since zerror() calls
2314 * localize() which tweaks the locale, it is not safe to call zerror()
2315 * until after the last call to usage(). Fortunately, the last call
2316 * to usage() is just above and the first call to zerror() is just
2317 * below. Don't mess this up.
2318 */
2319 if (strcmp(zone_name, GLOBAL_ZONENAME) == 0) {
2320 zerror(zlogp, B_FALSE, "cannot manage the %s zone",
2321 GLOBAL_ZONENAME);
2322 return (1);
2323 }
2324
2325 if (zone_get_id(zone_name, &zid) != 0) {
2326 zerror(zlogp, B_FALSE, "could not manage %s: %s", zone_name,
2327 zonecfg_strerror(Z_NO_ZONE));
2328 return (1);
2329 }
2330
2331 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
2332 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
2333 zonecfg_strerror(err));
2334 return (1);
2335 }
2336 if (zstate < ZONE_STATE_INCOMPLETE) {
2337 zerror(zlogp, B_FALSE,
2338 "cannot manage a zone which is in state '%s'",
2339 zone_state_str(zstate));
2340 return (1);
2341 }
2342
2343 if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
2344 zerror(zlogp, B_FALSE, "unable to determine zone path");
2345 return (-1);
2346 }
2347
2348 if (zonecfg_default_brand(default_brand,
2349 sizeof (default_brand)) != Z_OK) {
2350 zerror(zlogp, B_FALSE, "unable to determine default brand");
2351 return (1);
2352 }
2353
2354 /* Get a handle to the brand info for this zone */
2355 if (zone_get_brand(zone_name, brand_name, sizeof (brand_name))
2356 != Z_OK) {
2357 zerror(zlogp, B_FALSE, "unable to determine zone brand");
2358 return (1);
2359 }
2360 zone_isnative = (strcmp(brand_name, NATIVE_BRAND_NAME) == 0);
2361 zone_islabeled = (strcmp(brand_name, LABELED_BRAND_NAME) == 0);
2362
2363 /*
2364 * In the alternate root environment, the only supported
2365 * operations are mount and unmount. In this case, just treat
2366 * the zone as native if it is cluster. Cluster zones can be
2367 * native for the purpose of LU or upgrade, and the cluster
2368 * brand may not exist in the miniroot (such as in net install
2369 * upgrade).
2370 */
2371 if (strcmp(brand_name, CLUSTER_BRAND_NAME) == 0) {
2372 zone_iscluster = B_TRUE;
2373 if (zonecfg_in_alt_root()) {
2374 (void) strlcpy(brand_name, default_brand,
2375 sizeof (brand_name));
2376 }
2377 } else {
2378 zone_iscluster = B_FALSE;
2379 }
2380
2381 if ((bh = brand_open(brand_name)) == NULL) {
2382 zerror(zlogp, B_FALSE, "unable to open zone brand");
2383 return (1);
2384 }
2385
2386 /* Get state change brand hooks. */
2387 if (brand_callback_init(bh, zone_name) == -1) {
2388 zerror(zlogp, B_TRUE,
2389 "failed to initialize brand state change hooks");
2390 brand_close(bh);
2391 return (1);
2392 }
2393
2394 brand_close(bh);
2395
2396 /*
2397 * Check that we have all privileges. It would be nice to pare
2398 * this down, but this is at least a first cut.
2399 */
2400 if ((privset = priv_allocset()) == NULL) {
2401 zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
2402 return (1);
2403 }
2404
2405 if (getppriv(PRIV_EFFECTIVE, privset) != 0) {
2406 zerror(zlogp, B_TRUE, "%s failed", "getppriv");
2407 priv_freeset(privset);
2408 return (1);
2409 }
2410
2411 if (priv_isfullset(privset) == B_FALSE) {
2412 zerror(zlogp, B_FALSE, "You lack sufficient privilege to "
2413 "run this command (all privs required)");
2414 priv_freeset(privset);
2415 return (1);
2416 }
2417 priv_freeset(privset);
2418
2419 if (set_brand_env(zlogp) != 0) {
2420 zerror(zlogp, B_FALSE, "Unable to setup brand's environment");
2421 return (1);
2422 }
2423
2424 if (mkzonedir(zlogp) != 0)
2425 return (1);
2426
2427 /*
2428 * Pre-fork: setup shared state
2429 */
2430 if ((shstate = (void *)mmap(NULL, shstatelen,
2431 PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, (off_t)0)) ==
2432 MAP_FAILED) {
2433 zerror(zlogp, B_TRUE, "%s failed", "mmap");
2434 return (1);
2435 }
2436 if (sema_init(&shstate->sem, 0, USYNC_PROCESS, NULL) != 0) {
2437 zerror(zlogp, B_TRUE, "%s failed", "sema_init()");
2438 (void) munmap((char *)shstate, shstatelen);
2439 return (1);
2440 }
2441 shstate->log.logfile = NULL;
2442 shstate->log.buflen = shstatelen - sizeof (*shstate);
2443 shstate->log.loglen = shstate->log.buflen;
2444 shstate->log.buf = (char *)shstate + sizeof (*shstate);
2445 shstate->log.log = shstate->log.buf;
2446 shstate->log.locale = parents_locale;
2447 shstate->status = -1;
2448
2449 /*
2450 * We need a SIGCHLD handler so the sema_wait() below will wake
2451 * up if the child dies without doing a sema_post().
2452 */
2453 (void) sigset(SIGCHLD, sigchld);
2454 /*
2455 * We must mask SIGCHLD until after we've coped with the fork
2456 * sufficiently to deal with it; otherwise we can race and
2457 * receive the signal before pid has been initialized
2458 * (yes, this really happens).
2459 */
2460 (void) sigemptyset(&block_cld);
2461 (void) sigaddset(&block_cld, SIGCHLD);
2462 (void) sigprocmask(SIG_BLOCK, &block_cld, NULL);
2463
2464 /*
2465 * The parent only needs stderr after the fork, so close other fd's
2466 * that we inherited from zoneadm so that the parent doesn't have those
2467 * open while waiting. The child will close the rest after the fork.
2468 */
2469 closefrom(3);
2470
2471 if ((ctfd = init_template()) == -1) {
2472 zerror(zlogp, B_TRUE, "failed to create contract");
2473 return (1);
2474 }
2475
2476 /*
2477 * Do not let another thread localize a message while we are forking.
2478 */
2479 (void) mutex_lock(&msglock);
2480 pid = fork();
2481 (void) mutex_unlock(&msglock);
2482
2483 /*
2484 * In all cases (parent, child, and in the event of an error) we
2485 * don't want to cause creation of contracts on subsequent fork()s.
2486 */
2487 (void) ct_tmpl_clear(ctfd);
2488 (void) close(ctfd);
2489
2490 if (pid == -1) {
2491 zerror(zlogp, B_TRUE, "could not fork");
2492 return (1);
2493
2494 } else if (pid > 0) { /* parent */
2495 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2496 /*
2497 * This marks a window of vulnerability in which we receive
2498 * the SIGCLD before falling into sema_wait (normally we would
2499 * get woken up from sema_wait with EINTR upon receipt of
2500 * SIGCLD). So we may need to use some other scheme like
2501 * sema_posting in the sigcld handler.
2502 * blech
2503 */
2504 (void) sema_wait(&shstate->sem);
2505 (void) sema_destroy(&shstate->sem);
2506 if (shstate->status != 0)
2507 (void) waitpid(pid, NULL, WNOHANG);
2508 /*
2509 * It's ok if we die with SIGPIPE. It's not like we could have
2510 * done anything about it.
2511 */
2512 (void) fprintf(stderr, "%s", shstate->log.buf);
2513 _exit(shstate->status == 0 ? 0 : 1);
2514 }
2515
2516 /*
2517 * The child charges on.
2518 */
2519 (void) sigset(SIGCHLD, SIG_DFL);
2520 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2521
2522 /*
2523 * SIGPIPE can be delivered if we write to a socket for which the
2524 * peer endpoint is gone. That can lead to too-early termination
2525 * of zoneadmd, and that's not good eats.
2526 */
2527 (void) sigset(SIGPIPE, SIG_IGN);
2528 /*
2529 * Stop using stderr
2530 */
2531 zlogp = &shstate->log;
2532
2533 /*
2534 * We don't need stdout/stderr from now on.
2535 */
2536 closefrom(0);
2537
2538 /*
2539 * Initialize the syslog zlog_t. This needs to be done after
2540 * the call to closefrom().
2541 */
2542 logsys.buf = logsys.log = NULL;
2543 logsys.buflen = logsys.loglen = 0;
2544 logsys.logfile = NULL;
2545 logsys.locale = DEFAULT_LOCALE;
2546
2547 openlog("zoneadmd", LOG_PID, LOG_DAEMON);
2548
2549 /*
2550 * The eventstream is used to publish state changes in the zone
2551 * from the door threads to the console I/O poller.
2552 */
2553 if (eventstream_init() == -1) {
2554 zerror(zlogp, B_TRUE, "unable to create eventstream");
2555 goto child_out;
2556 }
2557
2558 (void) snprintf(zone_door_path, sizeof (zone_door_path),
2559 "%s" ZONE_DOOR_PATH, zonecfg_get_root(), zone_name);
2560
2561 /*
2562 * See if another zoneadmd is running for this zone. If not, then we
2563 * can now modify system state.
2564 */
2565 if (make_daemon_exclusive(zlogp) == -1)
2566 goto child_out;
2567
2568
2569 /*
2570 * Create/join a new session; we need to be careful of what we do with
2571 * the console from now on so we don't end up being the session leader
2572 * for the terminal we're going to be handing out.
2573 */
2574 (void) setsid();
2575
2576 /*
2577 * This thread shouldn't be receiving any signals; in particular,
2578 * SIGCHLD should be received by the thread doing the fork().
2579 */
2580 (void) sigfillset(&blockset);
2581 (void) thr_sigsetmask(SIG_BLOCK, &blockset, NULL);
2582
2583 /*
2584 * Setup the console device and get ready to serve the console;
2585 * once this has completed, we're ready to let console clients
2586 * make an attempt to connect (they will block until
2587 * serve_console_sock() below gets called, and any pending
2588 * connection is accept()ed).
2589 */
2590 if (!zonecfg_in_alt_root() && init_console(zlogp) < 0)
2591 goto child_out;
2592
2593 /*
2594 * Take the lock now, so that when the door server gets going, we
2595 * are guaranteed that it won't take a request until we are sure
2596 * that everything is completely set up. See the child_out: label
2597 * below to see why this matters.
2598 */
2599 (void) mutex_lock(&lock);
2600
2601 /* Init semaphore for scratch zones. */
2602 if (sema_init(&scratch_sem, 0, USYNC_THREAD, NULL) == -1) {
2603 zerror(zlogp, B_TRUE,
2604 "failed to initialize semaphore for scratch zone");
2605 goto child_out;
2606 }
2607
2608 /* open the dladm handle */
2609 if (dladm_open(&dld_handle) != DLADM_STATUS_OK) {
2610 zerror(zlogp, B_FALSE, "failed to open dladm handle");
2611 goto child_out;
2612 }
2613
2614 /*
2615 * Note: door setup must occur *after* the console is setup.
2616 * This is so that as zlogin tests the door to see if zoneadmd
2617 * is ready yet, we know that the console will get serviced
2618 * once door_info() indicates that the door is "up".
2619 */
2620 if (setup_door(zlogp) == -1)
2621 goto child_out;
2622
2623 /*
2624 * Things seem OK so far; tell the parent process that we're done
2625 * with setup tasks. This will cause the parent to exit, signalling
2626 * to zoneadm, zlogin, or whatever forked it that we are ready to
2627 * service requests.
2628 */
2629 shstate->status = 0;
2630 (void) sema_post(&shstate->sem);
2631 (void) munmap((char *)shstate, shstatelen);
2632 shstate = NULL;
2633
2634 (void) mutex_unlock(&lock);
2635
2636 /*
2637 * zlogp is now invalid, so reset it to the syslog logger.
2638 */
2639 zlogp = &logsys;
2640
2641 /*
2642 * Now that we are free of any parents, switch to the default locale.
2643 */
2644 (void) setlocale(LC_ALL, DEFAULT_LOCALE);
2645
2646 /*
2647 * At this point the setup portion of main() is basically done, so
2648 * we reuse this thread to manage the zone console. When
2649 * serve_console() has returned, we are past the point of no return
2650 * in the life of this zoneadmd.
2651 */
2652 if (zonecfg_in_alt_root()) {
2653 /*
2654 * This is just awful, but mounted scratch zones don't (and
2655 * can't) have consoles. We just wait for unmount instead.
2656 */
2657 while (sema_wait(&scratch_sem) == EINTR)
2658 ;
2659 } else {
2660 serve_console(zlogp);
2661 assert(in_death_throes);
2662 }
2663
2664 /*
2665 * This is the next-to-last part of the exit interlock. Upon calling
2666 * fdetach(), the door will go unreferenced; once any
2667 * outstanding requests (like the door thread doing Z_HALT) are
2668 * done, the door will get an UNREF notification; when it handles
2669 * the UNREF, the door server will cause the exit. It's possible
2670 * that fdetach() can fail because the file is in use, in which
2671 * case we'll retry the operation.
2672 */
2673 assert(!MUTEX_HELD(&lock));
2674 for (;;) {
2675 if ((fdetach(zone_door_path) == 0) || (errno != EBUSY))
2676 break;
2677 yield();
2678 }
2679
2680 for (;;)
2681 (void) pause();
2682
2683 child_out:
2684 assert(pid == 0);
2685 if (shstate != NULL) {
2686 shstate->status = -1;
2687 (void) sema_post(&shstate->sem);
2688 (void) munmap((char *)shstate, shstatelen);
2689 }
2690
2691 /*
2692 * This might trigger an unref notification, but if so,
2693 * we are still holding the lock, so our call to exit will
2694 * ultimately win the race and will publish the right exit
2695 * code.
2696 */
2697 if (zone_door != -1) {
2698 assert(MUTEX_HELD(&lock));
2699 (void) door_revoke(zone_door);
2700 (void) fdetach(zone_door_path);
2701 }
2702
2703 if (dld_handle != NULL)
2704 dladm_close(dld_handle);
2705
2706 return (1); /* return from main() forcibly exits an MT process */
2707 }