1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2016 Joyent, Inc.
26 */
27
28 /*
29 * zoneadmd manages zones; one zoneadmd process is launched for each
30 * non-global zone on the system. This daemon juggles four jobs:
31 *
32 * - Implement setup and teardown of the zone "virtual platform": mount and
33 * unmount filesystems; create and destroy network interfaces; communicate
34 * with devfsadmd to lay out devices for the zone; instantiate the zone
35 * console device; configure process runtime attributes such as resource
36 * controls, pool bindings, fine-grained privileges.
37 *
38 * - Launch the zone's init(1M) process.
39 *
40 * - Implement a door server; clients (like zoneadm) connect to the door
41 * server and request zone state changes. The kernel is also a client of
42 * this door server. A request to halt or reboot the zone which originates
43 * *inside* the zone results in a door upcall from the kernel into zoneadmd.
44 *
45 * One minor problem is that messages emitted by zoneadmd need to be passed
46 * back to the zoneadm process making the request. These messages need to
47 * be rendered in the client's locale; so, this is passed in as part of the
48 * request. The exception is the kernel upcall to zoneadmd, in which case
49 * messages are syslog'd.
50 *
51 * To make all of this work, the Makefile adds -a to xgettext to extract *all*
52 * strings, and an exclusion file (zoneadmd.xcl) is used to exclude those
53 * strings which do not need to be translated.
54 *
55 * - Act as a console server for zlogin -C processes; see comments in zcons.c
56 * for more information about the zone console architecture.
57 *
58 * DESIGN NOTES
59 *
60 * Restart:
61 * A chief design constraint of zoneadmd is that it should be restartable in
62 * the case that the administrator kills it off, or it suffers a fatal error,
63 * without the running zone being impacted; this is akin to being able to
64 * reboot the service processor of a server without affecting the OS instance.
65 */
66
67 #include <sys/param.h>
68 #include <sys/mman.h>
69 #include <sys/types.h>
70 #include <sys/stat.h>
71 #include <sys/sysmacros.h>
72 #include <sys/time.h>
73
74 #include <bsm/adt.h>
75 #include <bsm/adt_event.h>
76
77 #include <alloca.h>
78 #include <assert.h>
79 #include <errno.h>
80 #include <door.h>
81 #include <fcntl.h>
82 #include <locale.h>
83 #include <signal.h>
84 #include <stdarg.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <strings.h>
89 #include <synch.h>
90 #include <syslog.h>
91 #include <thread.h>
92 #include <unistd.h>
93 #include <wait.h>
94 #include <limits.h>
95 #include <zone.h>
96 #include <libbrand.h>
97 #include <sys/brand.h>
98 #include <libcontract.h>
99 #include <libcontract_priv.h>
100 #include <sys/brand.h>
101 #include <sys/contract/process.h>
102 #include <sys/ctfs.h>
103 #include <libdladm.h>
104 #include <sys/dls_mgmt.h>
105 #include <libscf.h>
106
107 #include <libzonecfg.h>
108 #include <zonestat_impl.h>
109 #include "zoneadmd.h"
110
/*
 * Process-wide state.  Objects declared without "static" have external
 * linkage.
 */
static char *progname;		/* program name printed by usage() */
char *zone_name; /* zone which we are managing */
zone_dochandle_t snap_hndl; /* handle for snapshot created when ready */
char zonepath[MAXNAMELEN];	/* used to build "<zonepath>/lu" on mount */
char pool_name[MAXNAMELEN];
char default_brand[MAXNAMELEN];
char brand_name[MAXNAMELEN];	/* handed to brand_open() at boot time */
boolean_t zone_isnative;
boolean_t zone_iscluster;
boolean_t zone_islabeled;
boolean_t shutdown_in_progress;
static zoneid_t zone_id;	/* assigned from vplat_create() in zone_ready() */
dladm_handle_t dld_handle = NULL;	/* handed to dladm_zone_boot() */

/*
 * Brand hook command lines.  For the pre/post state change hooks an empty
 * string means there is nothing to run.
 */
static char pre_statechg_hook[2 * MAXPATHLEN];
static char post_statechg_hook[2 * MAXPATHLEN];
char query_hook[2 * MAXPATHLEN];

/* Log destination whose messages zerror() sends to syslog. */
zlog_t logsys;

mutex_t lock = DEFAULTMUTEX; /* to serialize stuff */
mutex_t msglock = DEFAULTMUTEX; /* for calling setlocale() */

static sema_t scratch_sem; /* for scratch zones */

static char zone_door_path[MAXPATHLEN];
static int zone_door = -1;

boolean_t in_death_throes = B_FALSE; /* daemon is dying */
boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */

#if !defined(TEXT_DOMAIN) /* should be defined by cc -D */
#define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */
#endif

/* Locale restored by localize_msg() after each translation. */
#define DEFAULT_LOCALE "C"

/* Resource names used when exporting zonecfg properties to the environment. */
#define RSRC_NET "net"
#define RSRC_DEV "device"
150
151 static const char *
152 z_cmd_name(zone_cmd_t zcmd)
153 {
154 /* This list needs to match the enum in sys/zone.h */
155 static const char *zcmdstr[] = {
156 "ready", "boot", "forceboot", "reboot", "halt",
157 "note_uninstalling", "mount", "forcemount", "unmount",
158 "shutdown"
159 };
160
161 if (zcmd >= sizeof (zcmdstr) / sizeof (*zcmdstr))
162 return ("unknown");
163 else
164 return (zcmdstr[(int)zcmd]);
165 }
166
/*
 * Return a pointer to the final path component of execfullname.
 *
 * Trailing '/' characters are removed by overwriting them with NUL, so the
 * caller's buffer may be modified.  The returned pointer always refers to
 * storage inside execfullname.
 */
static char *
get_execbasename(char *execfullname)
{
	char *slash;

	/* guard against '/' at end of command invocation */
	while ((slash = strrchr(execfullname, '/')) != NULL) {
		if (slash[1] != '\0')
			return (slash + 1);

		/* Nothing follows this slash: chop it off and look again. */
		*slash = '\0';
	}

	/* No slash left: the whole string is the base name. */
	return (execfullname);
}
189
190 static void
191 usage(void)
192 {
193 (void) fprintf(stderr, gettext("Usage: %s -z zonename\n"), progname);
194 (void) fprintf(stderr,
195 gettext("\tNote: %s should not be run directly.\n"), progname);
196 exit(2);
197 }
198
/*
 * Signal handler with an intentionally empty body; the signal number is
 * ignored.
 * NOTE(review): going by the name this is presumably installed for SIGCHLD;
 * the installation site is not in this part of the file -- confirm.
 */
/* ARGSUSED */
static void
sigchld(int sig)
{
}
204
205 char *
206 localize_msg(char *locale, const char *msg)
207 {
208 char *out;
209
210 (void) mutex_lock(&msglock);
211 (void) setlocale(LC_MESSAGES, locale);
212 out = gettext(msg);
213 (void) setlocale(LC_MESSAGES, DEFAULT_LOCALE);
214 (void) mutex_unlock(&msglock);
215 return (out);
216 }
217
218 /* PRINTFLIKE3 */
219 void
220 zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...)
221 {
222 va_list alist;
223 char buf[MAXPATHLEN * 2]; /* enough space for err msg with a path */
224 char *bp;
225 int saved_errno = errno;
226
227 if (zlogp == NULL)
228 return;
229 if (zlogp == &logsys)
230 (void) snprintf(buf, sizeof (buf), "[zone '%s'] ",
231 zone_name);
232 else
233 buf[0] = '\0';
234 bp = &(buf[strlen(buf)]);
235
236 /*
237 * In theory, the locale pointer should be set to either "C" or a
238 * char array, so it should never be NULL
239 */
240 assert(zlogp->locale != NULL);
241 /* Locale is per process, but we are multi-threaded... */
242 fmt = localize_msg(zlogp->locale, fmt);
243
244 va_start(alist, fmt);
245 (void) vsnprintf(bp, sizeof (buf) - (bp - buf), fmt, alist);
246 va_end(alist);
247 bp = &(buf[strlen(buf)]);
248 if (use_strerror)
249 (void) snprintf(bp, sizeof (buf) - (bp - buf), ": %s",
250 strerror(saved_errno));
251 if (zlogp == &logsys) {
252 (void) syslog(LOG_ERR, "%s", buf);
253 } else if (zlogp->logfile != NULL) {
254 (void) fprintf(zlogp->logfile, "%s\n", buf);
255 } else {
256 size_t buflen;
257 size_t copylen;
258
259 buflen = snprintf(zlogp->log, zlogp->loglen, "%s\n", buf);
260 copylen = MIN(buflen, zlogp->loglen);
261 zlogp->log += copylen;
262 zlogp->loglen -= copylen;
263 }
264 }
265
/*
 * Append src to the NUL-terminated string held in dest, which has room for
 * n bytes in total (terminator included).  A single space is inserted in
 * front of src when dest is non-empty.  The result is truncated to fit and
 * remains NUL-terminated.
 *
 * The new text is formatted directly at the end of dest.  Handing dest to
 * snprintf() both as the output buffer and as a "%s" argument is undefined
 * behavior, because the source and destination objects overlap.
 */
static void
strnappend(char *dest, size_t n, const char *src)
{
	size_t used = strlen(dest);

	/* No space beyond what is already stored: nothing can be added. */
	if (used >= n)
		return;

	(void) snprintf(dest + used, n - used, "%s%s",
	    used == 0 ? "" : " ", src);
}
276
277 /*
278 * Since illumos boot arguments are getopt(3c) compatible (see kernel(1m)), we
279 * put the arguments into an argv style array, use getopt to process them,
280 * and put the resultant argument string back into outargs. Non-native brands
281 * may support alternate forms of boot arguments so we must handle that as well.
282 *
283 * During the filtering, we pull out any arguments which are truly "boot"
284 * arguments, leaving only those which are to be passed intact to the
285 * progenitor process. The one we support at the moment is -i, which
286 * indicates to the kernel which program should be launched as 'init'.
287 *
288 * Except for Z_OK, all other return values are treated as fatal.
289 */
290 static int
291 filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
292 char *init_file)
293 {
294 int argc = 0, argc_save;
295 int i;
296 int err;
297 char *arg, *lasts, **argv = NULL, **argv_save;
298 char zonecfg_args[BOOTARGS_MAX];
299 char scratchargs[BOOTARGS_MAX], *sargs;
300 char scratchopt[3];
301 char c;
302
303 bzero(outargs, BOOTARGS_MAX);
304
305 /*
306 * If the user didn't specify transient boot arguments, check
307 * to see if there were any specified in the zone configuration,
308 * and use them if applicable.
309 */
310 if (inargs == NULL || inargs[0] == '\0') {
311 zone_dochandle_t handle;
312 if ((handle = zonecfg_init_handle()) == NULL) {
313 zerror(zlogp, B_TRUE,
314 "getting zone configuration handle");
315 return (Z_BAD_HANDLE);
316 }
317 err = zonecfg_get_snapshot_handle(zone_name, handle);
318 if (err != Z_OK) {
319 zerror(zlogp, B_FALSE,
320 "invalid configuration snapshot");
321 zonecfg_fini_handle(handle);
322 return (Z_BAD_HANDLE);
323 }
324
325 bzero(zonecfg_args, sizeof (zonecfg_args));
326 (void) zonecfg_get_bootargs(handle, zonecfg_args,
327 sizeof (zonecfg_args));
328 inargs = zonecfg_args;
329 zonecfg_fini_handle(handle);
330 }
331
332 if (strlen(inargs) >= BOOTARGS_MAX) {
333 zerror(zlogp, B_FALSE, "boot argument string too long");
334 return (Z_INVAL);
335 }
336
337 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
338 sargs = scratchargs;
339 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
340 sargs = NULL;
341 argc++;
342 }
343
344 if ((argv = calloc(argc + 1, sizeof (char *))) == NULL) {
345 zerror(zlogp, B_FALSE, "memory allocation failed");
346 return (Z_NOMEM);
347 }
348
349 argv_save = argv;
350 argc_save = argc;
351
352 (void) strlcpy(scratchargs, inargs, sizeof (scratchargs));
353 sargs = scratchargs;
354 i = 0;
355 while ((arg = strtok_r(sargs, " \t", &lasts)) != NULL) {
356 sargs = NULL;
357 if ((argv[i] = strdup(arg)) == NULL) {
358 err = Z_NOMEM;
359 zerror(zlogp, B_FALSE, "memory allocation failed");
360 goto done;
361 }
362 i++;
363 }
364
365 /*
366 * We preserve compatibility with the illumos system boot behavior,
367 * which allows:
368 *
369 * # reboot kernel/unix -s -m verbose
370 *
371 * In this example, kernel/unix tells the booter what file to boot. The
372 * original intent of this was that we didn't want reboot in a zone to
373 * be gratuitously different, so we would silently ignore the boot
374 * file, if necessary. However, this usage is archaic and has never
375 * been common, since it is impossible to boot a zone onto a different
376 * kernel. Ignoring the first argument breaks for non-native brands
377 * which pass boot arguments in a different style. e.g.
378 * systemd.log_level=debug
379 * Thus, for backward compatibility we only ignore the first argument
380 * if it appears to be in the illumos form and attempting to specify a
381 * kernel.
382 */
383 if (argv[0] == NULL)
384 goto done;
385
386 assert(argv[0][0] != ' ');
387 assert(argv[0][0] != '\t');
388
389 if (strncmp(argv[0], "kernel/", 7) == 0) {
390 argv = &argv[1];
391 argc--;
392 }
393
394 optind = 0;
395 opterr = 0;
396 err = Z_OK;
397 while ((c = getopt(argc, argv, "fi:m:s")) != -1) {
398 switch (c) {
399 case 'i':
400 /*
401 * -i is handled by the runtime and is not passed
402 * along to userland
403 */
404 (void) strlcpy(init_file, optarg, MAXPATHLEN);
405 break;
406 case 'f':
407 /* This has already been processed by zoneadm */
408 break;
409 case 'm':
410 case 's':
411 /* These pass through unmolested */
412 (void) snprintf(scratchopt, sizeof (scratchopt),
413 "-%c", c);
414 strnappend(outargs, BOOTARGS_MAX, scratchopt);
415 if (optarg != NULL)
416 strnappend(outargs, BOOTARGS_MAX, optarg);
417 break;
418 case '?':
419 /*
420 * If a brand has its own init, we need to pass along
421 * whatever the user provides. We use the entire
422 * unknown string here so that we correctly handle
423 * unknown long options (e.g. --debug).
424 */
425 strnappend(outargs, BOOTARGS_MAX, argv[optind - 1]);
426 break;
427 }
428 }
429
430 /*
431 * We need to pass along everything else since we don't know what
432 * the brand's init is expecting. For example, an argument list like:
433 * --confdir /foo --debug
434 * will cause the getopt parsing to stop at '/foo' but we need to pass
435 * that on, along with the '--debug'. This does mean that we require
436 * any of our known options (-ifms) to preceed the brand-specific ones.
437 */
438 while (optind < argc) {
439 strnappend(outargs, BOOTARGS_MAX, argv[optind]);
440 optind++;
441 }
442
443 done:
444 for (i = 0; i < argc_save; i++) {
445 if (argv_save[i] != NULL)
446 free(argv_save[i]);
447 }
448 free(argv_save);
449 return (err);
450 }
451
452
453 static int
454 mkzonedir(zlog_t *zlogp)
455 {
456 struct stat st;
457 /*
458 * We must create and lock everyone but root out of ZONES_TMPDIR
459 * since anyone can open any UNIX domain socket, regardless of
460 * its file system permissions. Sigh...
461 */
462 if (mkdir(ZONES_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
463 zerror(zlogp, B_TRUE, "could not mkdir '%s'", ZONES_TMPDIR);
464 return (-1);
465 }
466 /* paranoia */
467 if ((stat(ZONES_TMPDIR, &st) < 0) || !S_ISDIR(st.st_mode)) {
468 zerror(zlogp, B_TRUE, "'%s' is not a directory", ZONES_TMPDIR);
469 return (-1);
470 }
471 (void) chmod(ZONES_TMPDIR, S_IRWXU);
472 return (0);
473 }
474
475 /*
476 * Run the brand's pre-state change callback, if it exists.
477 */
478 static int
479 brand_prestatechg(zlog_t *zlogp, int state, int cmd)
480 {
481 char cmdbuf[2 * MAXPATHLEN];
482 const char *altroot;
483
484 if (pre_statechg_hook[0] == '\0')
485 return (0);
486
487 altroot = zonecfg_get_root();
488 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", pre_statechg_hook,
489 state, cmd, altroot) > sizeof (cmdbuf))
490 return (-1);
491
492 if (do_subproc(zlogp, cmdbuf, NULL) != 0)
493 return (-1);
494
495 return (0);
496 }
497
498 /*
499 * Run the brand's post-state change callback, if it exists.
500 */
501 static int
502 brand_poststatechg(zlog_t *zlogp, int state, int cmd)
503 {
504 char cmdbuf[2 * MAXPATHLEN];
505 const char *altroot;
506
507 if (post_statechg_hook[0] == '\0')
508 return (0);
509
510 altroot = zonecfg_get_root();
511 if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook,
512 state, cmd, altroot) > sizeof (cmdbuf))
513 return (-1);
514
515 if (do_subproc(zlogp, cmdbuf, NULL) != 0)
516 return (-1);
517
518 return (0);
519 }
520
521 /*
522 * Notify zonestatd of the new zone. If zonestatd is not running, this
523 * will do nothing.
524 */
525 static void
526 notify_zonestatd(zoneid_t zoneid)
527 {
528 int cmd[2];
529 int fd;
530 door_arg_t params;
531
532 fd = open(ZS_DOOR_PATH, O_RDONLY);
533 if (fd < 0)
534 return;
535
536 cmd[0] = ZSD_CMD_NEW_ZONE;
537 cmd[1] = zoneid;
538 params.data_ptr = (char *)&cmd;
539 params.data_size = sizeof (cmd);
540 params.desc_ptr = NULL;
541 params.desc_num = 0;
542 params.rbuf = NULL;
543 params.rsize = NULL;
544 (void) door_call(fd, ¶ms);
545 (void) close(fd);
546 }
547
548 /*
549 * Bring a zone up to the pre-boot "ready" stage. The mount_cmd argument is
550 * 'true' if this is being invoked as part of the processing for the "mount"
551 * subcommand.
552 */
553 static int
554 zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate)
555 {
556 int err;
557
558 if (!ALT_MOUNT(mount_cmd) &&
559 brand_prestatechg(zlogp, zstate, Z_READY) != 0)
560 return (-1);
561
562 if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
563 zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
564 zonecfg_strerror(err));
565 goto bad;
566 }
567
568 if ((zone_id = vplat_create(zlogp, mount_cmd)) == -1) {
569 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
570 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
571 zonecfg_strerror(err));
572 goto bad;
573 }
574 if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
575 bringup_failure_recovery = B_TRUE;
576 (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE);
577 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
578 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
579 zonecfg_strerror(err));
580 goto bad;
581 }
582
583 if (!ALT_MOUNT(mount_cmd) &&
584 brand_poststatechg(zlogp, zstate, Z_READY) != 0)
585 goto bad;
586
587 return (0);
588
589 bad:
590 /*
591 * If something goes wrong, we up the zones's state to the target
592 * state, READY, and then invoke the hook as if we're halting.
593 */
594 if (!ALT_MOUNT(mount_cmd))
595 (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT);
596 return (-1);
597 }
598
599 int
600 init_template(void)
601 {
602 int fd;
603 int err = 0;
604
605 fd = open64(CTFS_ROOT "/process/template", O_RDWR);
606 if (fd == -1)
607 return (-1);
608
609 /*
610 * For now, zoneadmd doesn't do anything with the contract.
611 * Deliver no events, don't inherit, and allow it to be orphaned.
612 */
613 err |= ct_tmpl_set_critical(fd, 0);
614 err |= ct_tmpl_set_informative(fd, 0);
615 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
616 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
617 if (err || ct_tmpl_activate(fd)) {
618 (void) close(fd);
619 return (-1);
620 }
621
622 return (fd);
623 }
624
/*
 * Context passed through the opaque "data" argument to mount_early_fs().
 */
typedef struct fs_callback {
	zlog_t *zlogp;		/* where mount_early_fs() reports errors */
	zoneid_t zoneid;	/* zone the mounting child enters */
	boolean_t mount_cmd;	/* true: root is "<zonepath>/lu", not zone root */
} fs_callback_t;
630
631 static int
632 mount_early_fs(void *data, const char *spec, const char *dir,
633 const char *fstype, const char *opt)
634 {
635 zlog_t *zlogp = ((fs_callback_t *)data)->zlogp;
636 zoneid_t zoneid = ((fs_callback_t *)data)->zoneid;
637 boolean_t mount_cmd = ((fs_callback_t *)data)->mount_cmd;
638 char rootpath[MAXPATHLEN];
639 pid_t child;
640 int child_status;
641 int tmpl_fd;
642 int rv;
643 ctid_t ct;
644
645 /* determine the zone rootpath */
646 if (mount_cmd) {
647 char luroot[MAXPATHLEN];
648
649 (void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
650 resolve_lofs(zlogp, luroot, sizeof (luroot));
651 (void) strlcpy(rootpath, luroot, sizeof (rootpath));
652 } else {
653 if (zone_get_rootpath(zone_name,
654 rootpath, sizeof (rootpath)) != Z_OK) {
655 zerror(zlogp, B_FALSE, "unable to determine zone root");
656 return (-1);
657 }
658 }
659
660 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir, fstype)) < 0) {
661 zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
662 rootpath, dir);
663 return (-1);
664 } else if (rv > 0) {
665 /* The mount point path doesn't exist, create it now. */
666 if (make_one_dir(zlogp, rootpath, dir,
667 DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
668 DEFAULT_DIR_GROUP) != 0) {
669 zerror(zlogp, B_FALSE, "failed to create mount point");
670 return (-1);
671 }
672
673 /*
674 * Now this might seem weird, but we need to invoke
675 * valid_mount_path() again. Why? Because it checks
676 * to make sure that the mount point path is canonical,
677 * which it can only do if the path exists, so now that
678 * we've created the path we have to verify it again.
679 */
680 if ((rv = valid_mount_path(zlogp, rootpath, spec, dir,
681 fstype)) < 0) {
682 zerror(zlogp, B_FALSE,
683 "%s%s is not a valid mount point", rootpath, dir);
684 return (-1);
685 }
686 }
687
688 if ((tmpl_fd = init_template()) == -1) {
689 zerror(zlogp, B_TRUE, "failed to create contract");
690 return (-1);
691 }
692
693 if ((child = fork()) == -1) {
694 (void) ct_tmpl_clear(tmpl_fd);
695 (void) close(tmpl_fd);
696 zerror(zlogp, B_TRUE, "failed to fork");
697 return (-1);
698
699 } else if (child == 0) { /* child */
700 char opt_buf[MAX_MNTOPT_STR];
701 int optlen = 0;
702 int mflag = MS_DATA;
703 int i;
704 int ret;
705
706 (void) ct_tmpl_clear(tmpl_fd);
707 /*
708 * Even though there are no procs running in the zone, we
709 * do this for paranoia's sake.
710 */
711 (void) closefrom(0);
712
713 if (zone_enter(zoneid) == -1) {
714 _exit(errno);
715 }
716 if (opt != NULL) {
717 /*
718 * The mount() system call is incredibly annoying.
719 * If options are specified, we need to copy them
720 * into a temporary buffer since the mount() system
721 * call will overwrite the options string. It will
722 * also fail if the new option string it wants to
723 * write is bigger than the one we passed in, so
724 * you must pass in a buffer of the maximum possible
725 * option string length. sigh.
726 */
727 (void) strlcpy(opt_buf, opt, sizeof (opt_buf));
728 opt = opt_buf;
729 optlen = MAX_MNTOPT_STR;
730 mflag = MS_OPTIONSTR;
731 }
732
733 /*
734 * There is an obscure race condition which can cause mount
735 * to return EBUSY. This happens for example on the mount
736 * of the zone's /etc/svc/volatile file system if there is
737 * a GZ process running svcs -Z, which will touch the
738 * mountpoint, just as we're trying to do the mount. To cope
739 * with this, we retry up to 3 times to let this transient
740 * process get out of the way.
741 */
742 for (i = 0; i < 3; i++) {
743 ret = 0;
744 if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
745 optlen) != 0)
746 ret = errno;
747 if (ret != EBUSY)
748 break;
749 (void) sleep(1);
750 }
751 _exit(ret);
752 }
753
754 /* parent */
755 if (contract_latest(&ct) == -1)
756 ct = -1;
757 (void) ct_tmpl_clear(tmpl_fd);
758 (void) close(tmpl_fd);
759 if (waitpid(child, &child_status, 0) != child) {
760 /* unexpected: we must have been signalled */
761 (void) contract_abandon_id(ct);
762 return (-1);
763 }
764 (void) contract_abandon_id(ct);
765 if (WEXITSTATUS(child_status) != 0) {
766 errno = WEXITSTATUS(child_status);
767 zerror(zlogp, B_TRUE, "mount of %s failed", dir);
768 return (-1);
769 }
770
771 return (0);
772 }
773
774 /*
775 * env variable name format
776 * _ZONECFG;{resource name};{identifying attr. name};{property name}
777 */
778 static void
779 set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
780 {
781 char *p;
782 /* Enough for maximal name, rsrc + attr, & slop for ZONECFG & _'s */
783 char nm[2 * MAXNAMELEN + 32];
784
785 if (attr == NULL)
786 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s", rsrc,
787 name);
788 else
789 (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s_%s", rsrc,
790 attr, name);
791
792 p = nm;
793 while ((p = strchr(p, '-')) != NULL)
794 *p++ = '_';
795
796 (void) setenv(nm, val, 1);
797 }
798
799 /*
800 * Export zonecfg network and device properties into environment for the boot
801 * and state change hooks.
802 * If debug is true, export the brand hook debug env. variable as well.
803 *
804 * We could export more of the config in the future, as necessary.
805 */
806 static int
807 setup_subproc_env()
808 {
809 int res;
810 zone_dochandle_t handle;
811 struct zone_nwiftab ntab;
812 struct zone_devtab dtab;
813 char net_resources[MAXNAMELEN * 2];
814 char dev_resources[MAXNAMELEN * 2];
815
816 if ((handle = zonecfg_init_handle()) == NULL)
817 exit(Z_NOMEM);
818
819 if ((res = zonecfg_get_handle(zone_name, handle)) != Z_OK)
820 goto done;
821
822 if ((res = zonecfg_setnwifent(handle)) != Z_OK)
823 goto done;
824
825 while (zonecfg_getnwifent(handle, &ntab) == Z_OK) {
826 struct zone_res_attrtab *rap;
827 char *phys;
828
829 phys = ntab.zone_nwif_physical;
830
831 (void) strlcat(net_resources, phys, sizeof (net_resources));
832 (void) strlcat(net_resources, " ", sizeof (net_resources));
833
834 set_zonecfg_env(RSRC_NET, phys, "physical", phys);
835
836 set_zonecfg_env(RSRC_NET, phys, "address",
837 ntab.zone_nwif_address);
838 set_zonecfg_env(RSRC_NET, phys, "allowed-address",
839 ntab.zone_nwif_allowed_address);
840 set_zonecfg_env(RSRC_NET, phys, "defrouter",
841 ntab.zone_nwif_defrouter);
842 set_zonecfg_env(RSRC_NET, phys, "global-nic",
843 ntab.zone_nwif_gnic);
844 set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
845 set_zonecfg_env(RSRC_NET, phys, "vlan-id",
846 ntab.zone_nwif_vlan_id);
847
848 for (rap = ntab.zone_nwif_attrp; rap != NULL;
849 rap = rap->zone_res_attr_next)
850 set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
851 rap->zone_res_attr_value);
852 }
853
854 (void) zonecfg_endnwifent(handle);
855
856 if ((res = zonecfg_setdevent(handle)) != Z_OK)
857 goto done;
858
859 while (zonecfg_getdevent(handle, &dtab) == Z_OK) {
860 struct zone_res_attrtab *rap;
861 char *match;
862
863 match = dtab.zone_dev_match;
864
865 (void) strlcat(dev_resources, match, sizeof (dev_resources));
866 (void) strlcat(dev_resources, " ", sizeof (dev_resources));
867
868 for (rap = dtab.zone_dev_attrp; rap != NULL;
869 rap = rap->zone_res_attr_next)
870 set_zonecfg_env(RSRC_DEV, match,
871 rap->zone_res_attr_name, rap->zone_res_attr_value);
872 }
873
874 (void) zonecfg_enddevent(handle);
875
876 res = Z_OK;
877
878 done:
879 zonecfg_fini_handle(handle);
880 return (res);
881 }
882
883 /*
884 * If retstr is not NULL, the output of the subproc is returned in the str,
885 * otherwise it is output using zerror(). Any memory allocated for retstr
886 * should be freed by the caller.
887 */
888 int
889 do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
890 {
891 char buf[1024]; /* arbitrary large amount */
892 char *inbuf;
893 FILE *file;
894 int status;
895 int rd_cnt;
896
897 if (retstr != NULL) {
898 if ((*retstr = malloc(1024)) == NULL) {
899 zerror(zlogp, B_FALSE, "out of memory");
900 return (-1);
901 }
902 inbuf = *retstr;
903 rd_cnt = 0;
904 } else {
905 inbuf = buf;
906 }
907
908 if (setup_subproc_env() != Z_OK) {
909 zerror(zlogp, B_FALSE, "failed to setup environment");
910 return (-1);
911 }
912
913 file = popen(cmdbuf, "r");
914 if (file == NULL) {
915 zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf);
916 return (-1);
917 }
918
919 while (fgets(inbuf, 1024, file) != NULL) {
920 if (retstr == NULL) {
921 if (zlogp != &logsys)
922 zerror(zlogp, B_FALSE, "%s", inbuf);
923 } else {
924 char *p;
925
926 rd_cnt += 1024 - 1;
927 if ((p = realloc(*retstr, rd_cnt + 1024)) == NULL) {
928 zerror(zlogp, B_FALSE, "out of memory");
929 (void) pclose(file);
930 return (-1);
931 }
932
933 *retstr = p;
934 inbuf = *retstr + rd_cnt;
935 }
936 }
937 status = pclose(file);
938
939 if (WIFSIGNALED(status)) {
940 zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
941 "signal %d", cmdbuf, WTERMSIG(status));
942 return (-1);
943 }
944 assert(WIFEXITED(status));
945 if (WEXITSTATUS(status) == ZEXIT_EXEC) {
946 zerror(zlogp, B_FALSE, "failed to exec %s", cmdbuf);
947 return (-1);
948 }
949 return (WEXITSTATUS(status));
950 }
951
952 /*
953 * Get the app-svc-dependent flag for this zone's init process. This is a
954 * zone-specific attr which controls the type of contract we create for the
955 * zone's init. When true, the contract will include CT_PR_EV_EXIT in the fatal
956 * set, so that when any service which is in the same contract exits, the init
957 * application will be terminated.
958 *
959 * We use the global "snap_hndl", so no parameters get passed here.
960 */
961 static boolean_t
962 is_app_svc_dep(void)
963 {
964 struct zone_attrtab a;
965
966 bzero(&a, sizeof (a));
967 (void) strlcpy(a.zone_attr_name, "app-svc-dependent",
968 sizeof (a.zone_attr_name));
969
970 if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK &&
971 strcmp(a.zone_attr_value, "true") == 0) {
972 return (B_TRUE);
973 }
974
975 return (B_FALSE);
976 }
977
/*
 * Boot the zone named by the global zone_name.
 *
 * Sequence: brand pre-state-change hook, brand-specified early mounts,
 * lookup of the brand's boot callback and init path, boot argument
 * filtering, a sanity check of the init binary, datalink loading for
 * exclusive-IP zones, the brand boot callback, kernel zone attributes,
 * zonestatd notification, zone_boot(), brand post-state-change hook, and
 * finally creation of the logging and memory-capping threads.
 *
 *   zlogp    - destination for error messages (see zerror()).
 *   bootargs - boot arguments, passed to filter_bootargs().
 *   zstate   - state value handed to the brand state change hooks.
 *
 * Returns 0 on success.  Returns -1 on failure; if the pre hook itself
 * fails nothing else is done, otherwise the post hook is first run as if
 * halting from the RUNNING state.
 */
static int
zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
{
	zoneid_t zoneid;
	struct stat st;
	char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
	char nbootargs[BOOTARGS_MAX];
	char cmdbuf[MAXPATHLEN];
	fs_callback_t cb;
	brand_handle_t bh;
	zone_iptype_t iptype;
	dladm_status_t status;
	char errmsg[DLADM_STRSIZE];
	int err;
	boolean_t restart_init;
	boolean_t app_svc_dep;

	if (brand_prestatechg(zlogp, zstate, Z_BOOT) != 0)
		return (-1);

	if ((zoneid = getzoneidbyname(zone_name)) == -1) {
		zerror(zlogp, B_TRUE, "unable to get zoneid");
		goto bad;
	}

	/* Context handed to mount_early_fs() for each brand mount. */
	cb.zlogp = zlogp;
	cb.zoneid = zoneid;
	cb.mount_cmd = B_FALSE;

	/* Get a handle to the brand info for this zone */
	if ((bh = brand_open(brand_name)) == NULL) {
		zerror(zlogp, B_FALSE, "unable to determine zone brand");
		goto bad;
	}

	/*
	 * Get the list of filesystems to mount from the brand
	 * configuration.  These mounts are done via a thread that will
	 * enter the zone, so they are done from within the context of the
	 * zone.
	 */
	if (brand_platform_iter_mounts(bh, mount_early_fs, &cb) != 0) {
		zerror(zlogp, B_FALSE, "unable to mount filesystems");
		brand_close(bh);
		goto bad;
	}

	/*
	 * Get the brand's boot callback if it exists.
	 * cmdbuf starts with EXEC_PREFIX; the callback, if any, is written
	 * after it.
	 */
	(void) strcpy(cmdbuf, EXEC_PREFIX);
	if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
	    sizeof (cmdbuf) - EXEC_LEN) != 0) {
		zerror(zlogp, B_FALSE,
		    "unable to determine branded zone's boot callback");
		brand_close(bh);
		goto bad;
	}

	/* Get the path for this zone's init(1M) (or equivalent) process. */
	if (brand_get_initname(bh, init_file, MAXPATHLEN) != 0) {
		zerror(zlogp, B_FALSE,
		    "unable to determine zone's init(1M) location");
		brand_close(bh);
		goto bad;
	}

	/* See if this zone's brand should restart init if it dies. */
	restart_init = brand_restartinit(bh);

	/*
	 * See if we need to setup contract dependencies between the zone's
	 * primary application and any of its services.
	 */
	app_svc_dep = is_app_svc_dep();

	brand_close(bh);

	/* A -i boot argument, if present, replaces the brand's init path. */
	err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
	if (err != Z_OK)
		goto bad;

	assert(init_file[0] != '\0');

	/*
	 * Try to anticipate possible problems: If possible, make sure init is
	 * executable.
	 */
	if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
		zerror(zlogp, B_FALSE, "unable to determine zone root");
		goto bad;
	}

	(void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);

	if (lstat(initpath, &st) == -1) {
		zerror(zlogp, B_TRUE, "could not stat %s", initpath);
		goto bad;
	}

	/*
	 * If a symlink, we'll have to wait and resolve when we boot,
	 * otherwise check the executable bits now.
	 */
	if ((st.st_mode & S_IFMT) != S_IFLNK && (st.st_mode & S_IXUSR) == 0) {
		zerror(zlogp, B_FALSE, "%s is not executable", initpath);
		goto bad;
	}

	/*
	 * Exclusive stack zones interact with the dlmgmtd running in the
	 * global zone.  dladm_zone_boot() tells dlmgmtd that this zone is
	 * booting, and loads its datalinks from the zone's datalink
	 * configuration file.
	 */
	if (vplat_get_iptype(zlogp, &iptype) == 0 && iptype == ZS_EXCLUSIVE) {
		status = dladm_zone_boot(dld_handle, zoneid);
		if (status != DLADM_STATUS_OK) {
			zerror(zlogp, B_FALSE, "unable to load zone datalinks: "
			    " %s", dladm_status2str(status, errmsg));
			goto bad;
		}
	}

	/*
	 * If there is a brand 'boot' callback, execute it now to give the
	 * brand one last chance to do any additional setup before the zone
	 * is booted.  A cmdbuf no longer than EXEC_LEN holds only the
	 * prefix, i.e. no callback was supplied.
	 */
	if ((strlen(cmdbuf) > EXEC_LEN) &&
	    (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
		goto bad;
	}

	if (zone_setattr(zoneid, ZONE_ATTR_INITNAME, init_file, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone boot file");
		goto bad;
	}

	if (zone_setattr(zoneid, ZONE_ATTR_BOOTARGS, nbootargs, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone boot arguments");
		goto bad;
	}

	if (!restart_init && zone_setattr(zoneid, ZONE_ATTR_INITNORESTART,
	    NULL, 0) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone init-no-restart");
		goto bad;
	}

	/*
	 * NOTE(review): the value B_TRUE is cast to a pointer here rather
	 * than passing the address of a boolean_t -- confirm this is what
	 * the ZONE_ATTR_APP_SVC_CT handler expects.
	 */
	if (app_svc_dep && zone_setattr(zoneid, ZONE_ATTR_APP_SVC_CT,
	    (void *)B_TRUE, sizeof (boolean_t)) == -1) {
		zerror(zlogp, B_TRUE, "could not set zone app-die");
		goto bad;
	}

	/*
	 * Inform zonestatd of a new zone so that it can install a door for
	 * the zone to contact it.
	 * NOTE(review): this call and the two thread-creation calls below
	 * use the file-scope zone_id rather than the local zoneid looked up
	 * above -- confirm the two always agree.
	 */
	notify_zonestatd(zone_id);

	if (zone_boot(zoneid) == -1) {
		zerror(zlogp, B_TRUE, "unable to boot zone");
		goto bad;
	}

	if (brand_poststatechg(zlogp, zstate, Z_BOOT) != 0)
		goto bad;

	/* Startup a thread to perform zfd logging/tty svc for the zone. */
	create_log_thread(zlogp, zone_id);

	/* Startup a thread to perform memory capping for the zone. */
	create_mcap_thread(zlogp, zone_id);

	return (0);

bad:
	/*
	 * If something goes wrong, we up the zones's state to the target
	 * state, RUNNING, and then invoke the hook as if we're halting.
	 */
	(void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT);

	return (-1);
}
1166
1167 static int
1168 zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate)
1169 {
1170 int err;
1171
1172 if (unmount_cmd == B_FALSE &&
1173 brand_prestatechg(zlogp, zstate, Z_HALT) != 0)
1174 return (-1);
1175
1176 /* Shutting down, stop the memcap thread */
1177 destroy_mcap_thread();
1178
1179 if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) {
1180 if (!bringup_failure_recovery)
1181 zerror(zlogp, B_FALSE, "unable to destroy zone");
1182 destroy_log_thread();
1183 return (-1);
1184 }
1185
1186 /* Shut down is done, stop the log thread */
1187 destroy_log_thread();
1188
1189 if (unmount_cmd == B_FALSE &&
1190 brand_poststatechg(zlogp, zstate, Z_HALT) != 0)
1191 return (-1);
1192
1193 if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
1194 zerror(zlogp, B_FALSE, "destroying snapshot: %s",
1195 zonecfg_strerror(err));
1196
1197 return (0);
1198 }
1199
1200 static int
1201 zone_graceful_shutdown(zlog_t *zlogp)
1202 {
1203 zoneid_t zoneid;
1204 pid_t child;
1205 char cmdbuf[MAXPATHLEN];
1206 brand_handle_t bh = NULL;
1207 ctid_t ct;
1208 int tmpl_fd;
1209 int child_status;
1210
1211 if (shutdown_in_progress) {
1212 zerror(zlogp, B_FALSE, "shutdown already in progress");
1213 return (-1);
1214 }
1215
1216 if ((zoneid = getzoneidbyname(zone_name)) == -1) {
1217 zerror(zlogp, B_TRUE, "unable to get zoneid");
1218 return (-1);
1219 }
1220
1221 /* Get a handle to the brand info for this zone */
1222 if ((bh = brand_open(brand_name)) == NULL) {
1223 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1224 return (-1);
1225 }
1226
1227 /*
1228 * If there is a brand 'shutdown' callback, execute it now to give the
1229 * brand a chance to cleanup any custom configuration.
1230 */
1231 (void) strcpy(cmdbuf, EXEC_PREFIX);
1232 if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
1233 sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
1234 (void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
1235 }
1236 brand_close(bh);
1237
1238 if ((tmpl_fd = init_template()) == -1) {
1239 zerror(zlogp, B_TRUE, "failed to create contract");
1240 return (-1);
1241 }
1242
1243 if ((child = fork()) == -1) {
1244 (void) ct_tmpl_clear(tmpl_fd);
1245 (void) close(tmpl_fd);
1246 zerror(zlogp, B_TRUE, "failed to fork");
1247 return (-1);
1248 } else if (child == 0) {
1249 (void) ct_tmpl_clear(tmpl_fd);
1250 if (zone_enter(zoneid) == -1) {
1251 _exit(errno);
1252 }
1253 _exit(execl("/bin/sh", "sh", "-c", cmdbuf, (char *)NULL));
1254 }
1255
1256 if (contract_latest(&ct) == -1)
1257 ct = -1;
1258 (void) ct_tmpl_clear(tmpl_fd);
1259 (void) close(tmpl_fd);
1260
1261 if (waitpid(child, &child_status, 0) != child) {
1262 /* unexpected: we must have been signalled */
1263 (void) contract_abandon_id(ct);
1264 return (-1);
1265 }
1266
1267 (void) contract_abandon_id(ct);
1268 if (WEXITSTATUS(child_status) != 0) {
1269 errno = WEXITSTATUS(child_status);
1270 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1271 return (-1);
1272 }
1273
1274 shutdown_in_progress = B_TRUE;
1275
1276 return (0);
1277 }
1278
1279 static int
1280 zone_wait_shutdown(zlog_t *zlogp)
1281 {
1282 zone_state_t zstate;
1283 uint64_t *tm = NULL;
1284 scf_simple_prop_t *prop = NULL;
1285 int timeout;
1286 int tries;
1287 int rc = -1;
1288
1289 /* Get default stop timeout from SMF framework */
1290 timeout = SHUTDOWN_WAIT;
1291 if ((prop = scf_simple_prop_get(NULL, SHUTDOWN_FMRI, "stop",
1292 SCF_PROPERTY_TIMEOUT)) != NULL) {
1293 if ((tm = scf_simple_prop_next_count(prop)) != NULL) {
1294 if (tm != 0)
1295 timeout = *tm;
1296 }
1297 scf_simple_prop_free(prop);
1298 }
1299
1300 /* allow time for zone to shutdown cleanly */
1301 for (tries = 0; tries < timeout; tries ++) {
1302 (void) sleep(1);
1303 if (zone_get_state(zone_name, &zstate) == Z_OK &&
1304 zstate == ZONE_STATE_INSTALLED) {
1305 rc = 0;
1306 break;
1307 }
1308 }
1309
1310 if (rc != 0)
1311 zerror(zlogp, B_FALSE, "unable to shutdown zone");
1312
1313 shutdown_in_progress = B_FALSE;
1314
1315 return (rc);
1316 }
1317
1318
1319
1320 /*
1321 * Generate AUE_zone_state for a command that boots a zone.
1322 */
1323 static void
1324 audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
1325 char *new_state)
1326 {
1327 adt_session_data_t *ah;
1328 adt_event_data_t *event;
1329 int pass_fail, fail_reason;
1330
1331 if (!adt_audit_enabled())
1332 return;
1333
1334 if (return_val == 0) {
1335 pass_fail = ADT_SUCCESS;
1336 fail_reason = ADT_SUCCESS;
1337 } else {
1338 pass_fail = ADT_FAILURE;
1339 fail_reason = ADT_FAIL_VALUE_PROGRAM;
1340 }
1341
1342 if (adt_start_session(&ah, NULL, 0)) {
1343 zerror(zlogp, B_TRUE, gettext("audit failure."));
1344 return;
1345 }
1346 if (adt_set_from_ucred(ah, uc, ADT_NEW)) {
1347 zerror(zlogp, B_TRUE, gettext("audit failure."));
1348 (void) adt_end_session(ah);
1349 return;
1350 }
1351
1352 event = adt_alloc_event(ah, ADT_zone_state);
1353 if (event == NULL) {
1354 zerror(zlogp, B_TRUE, gettext("audit failure."));
1355 (void) adt_end_session(ah);
1356 return;
1357 }
1358 event->adt_zone_state.zonename = zone_name;
1359 event->adt_zone_state.new_state = new_state;
1360
1361 if (adt_put_event(event, pass_fail, fail_reason))
1362 zerror(zlogp, B_TRUE, gettext("audit failure."));
1363
1364 adt_free_event(event);
1365
1366 (void) adt_end_session(ah);
1367 }
1368
1369 /*
1370 * Log the exit time and status of the zone's init process into
1371 * {zonepath}/lastexited. If the zone shutdown normally, the exit status will
1372 * be -1, otherwise it will be the exit status as described in wait.3c.
1373 * If the zone is configured to restart init, then nothing will be logged if
1374 * init exits unexpectedly (the kernel will never upcall in this case).
1375 */
1376 static void
1377 log_init_exit(int status)
1378 {
1379 char p[MAXPATHLEN];
1380 char buf[128];
1381 struct timeval t;
1382 int fd;
1383
1384 if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) > sizeof (p))
1385 return;
1386 if (gettimeofday(&t, NULL) != 0)
1387 return;
1388 if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec,
1389 status) > sizeof (buf))
1390 return;
1391 if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
1392 return;
1393
1394 (void) write(fd, buf, strlen(buf));
1395
1396 (void) close(fd);
1397 }
1398
1399 /*
1400 * The main routine for the door server that deals with zone state transitions.
1401 */
1402 /* ARGSUSED */
1403 static void
1404 server(void *cookie, char *args, size_t alen, door_desc_t *dp,
1405 uint_t n_desc)
1406 {
1407 ucred_t *uc = NULL;
1408 const priv_set_t *eset;
1409
1410 zone_state_t zstate;
1411 zone_cmd_t cmd;
1412 int init_status;
1413 zone_cmd_arg_t *zargp;
1414
1415 boolean_t kernelcall;
1416
1417 int rval = -1;
1418 uint64_t uniqid;
1419 zoneid_t zoneid = -1;
1420 zlog_t zlog;
1421 zlog_t *zlogp;
1422 zone_cmd_rval_t *rvalp;
1423 size_t rlen = getpagesize(); /* conservative */
1424 fs_callback_t cb;
1425 brand_handle_t bh;
1426 boolean_t wait_shut = B_FALSE;
1427
1428 /* LINTED E_BAD_PTR_CAST_ALIGN */
1429 zargp = (zone_cmd_arg_t *)args;
1430
1431 /*
1432 * When we get the door unref message, we've fdetach'd the door, and
1433 * it is time for us to shut down zoneadmd.
1434 */
1435 if (zargp == DOOR_UNREF_DATA) {
1436 /*
1437 * See comment at end of main() for info on the last rites.
1438 */
1439 exit(0);
1440 }
1441
1442 if (zargp == NULL) {
1443 (void) door_return(NULL, 0, 0, 0);
1444 }
1445
1446 rvalp = alloca(rlen);
1447 bzero(rvalp, rlen);
1448 zlog.logfile = NULL;
1449 zlog.buflen = zlog.loglen = rlen - sizeof (zone_cmd_rval_t) + 1;
1450 zlog.buf = rvalp->errbuf;
1451 zlog.log = zlog.buf;
1452 /* defer initialization of zlog.locale until after credential check */
1453 zlogp = &zlog;
1454
1455 if (alen != sizeof (zone_cmd_arg_t)) {
1456 /*
1457 * This really shouldn't be happening.
1458 */
1459 zerror(&logsys, B_FALSE, "argument size (%d bytes) "
1460 "unexpected (expected %d bytes)", alen,
1461 sizeof (zone_cmd_arg_t));
1462 goto out;
1463 }
1464 cmd = zargp->cmd;
1465 init_status = zargp->status;
1466
1467 if (door_ucred(&uc) != 0) {
1468 zerror(&logsys, B_TRUE, "door_ucred");
1469 goto out;
1470 }
1471 eset = ucred_getprivset(uc, PRIV_EFFECTIVE);
1472 if (ucred_getzoneid(uc) != GLOBAL_ZONEID ||
1473 (eset != NULL ? !priv_ismember(eset, PRIV_SYS_CONFIG) :
1474 ucred_geteuid(uc) != 0)) {
1475 zerror(&logsys, B_FALSE, "insufficient privileges");
1476 goto out;
1477 }
1478
1479 kernelcall = ucred_getpid(uc) == 0;
1480
1481 /*
1482 * This is safe because we only use a zlog_t throughout the
1483 * duration of a door call; i.e., by the time the pointer
1484 * might become invalid, the door call would be over.
1485 */
1486 zlog.locale = kernelcall ? DEFAULT_LOCALE : zargp->locale;
1487
1488 (void) mutex_lock(&lock);
1489
1490 /*
1491 * Once we start to really die off, we don't want more connections.
1492 */
1493 if (in_death_throes) {
1494 (void) mutex_unlock(&lock);
1495 ucred_free(uc);
1496 (void) door_return(NULL, 0, 0, 0);
1497 thr_exit(NULL);
1498 }
1499
1500 /*
1501 * Check for validity of command.
1502 */
1503 if (cmd != Z_READY && cmd != Z_BOOT && cmd != Z_FORCEBOOT &&
1504 cmd != Z_REBOOT && cmd != Z_SHUTDOWN && cmd != Z_HALT &&
1505 cmd != Z_NOTE_UNINSTALLING && cmd != Z_MOUNT &&
1506 cmd != Z_FORCEMOUNT && cmd != Z_UNMOUNT) {
1507 zerror(&logsys, B_FALSE, "invalid command %d", (int)cmd);
1508 goto out;
1509 }
1510
1511 if (kernelcall && (cmd != Z_HALT && cmd != Z_REBOOT)) {
1512 /*
1513 * Can't happen
1514 */
1515 zerror(&logsys, B_FALSE, "received unexpected kernel upcall %d",
1516 cmd);
1517 goto out;
1518 }
1519 /*
1520 * We ignore the possibility of someone calling zone_create(2)
1521 * explicitly; all requests must come through zoneadmd.
1522 */
1523 if (zone_get_state(zone_name, &zstate) != Z_OK) {
1524 /*
1525 * Something terribly wrong happened
1526 */
1527 zerror(&logsys, B_FALSE, "unable to determine state of zone");
1528 goto out;
1529 }
1530
1531 if (kernelcall) {
1532 /*
1533 * Kernel-initiated requests may lose their validity if the
1534 * zone_t the kernel was referring to has gone away.
1535 */
1536 if ((zoneid = getzoneidbyname(zone_name)) == -1 ||
1537 zone_getattr(zoneid, ZONE_ATTR_UNIQID, &uniqid,
1538 sizeof (uniqid)) == -1 || uniqid != zargp->uniqid) {
1539 /*
1540 * We're not talking about the same zone. The request
1541 * must have arrived too late. Return error.
1542 */
1543 rval = -1;
1544 goto out;
1545 }
1546 zlogp = &logsys; /* Log errors to syslog */
1547 }
1548
1549 /*
1550 * If we are being asked to forcibly mount or boot a zone, we
1551 * pretend that an INCOMPLETE zone is actually INSTALLED.
1552 */
1553 if (zstate == ZONE_STATE_INCOMPLETE &&
1554 (cmd == Z_FORCEBOOT || cmd == Z_FORCEMOUNT))
1555 zstate = ZONE_STATE_INSTALLED;
1556
1557 switch (zstate) {
1558 case ZONE_STATE_CONFIGURED:
1559 case ZONE_STATE_INCOMPLETE:
1560 /*
1561 * Not our area of expertise; we just print a nice message
1562 * and die off.
1563 */
1564 zerror(zlogp, B_FALSE,
1565 "%s operation is invalid for zones in state '%s'",
1566 z_cmd_name(cmd), zone_state_str(zstate));
1567 break;
1568
1569 case ZONE_STATE_INSTALLED:
1570 switch (cmd) {
1571 case Z_READY:
1572 rval = zone_ready(zlogp, Z_MNT_BOOT, zstate);
1573 if (rval == 0)
1574 eventstream_write(Z_EVT_ZONE_READIED);
1575 zcons_statechanged();
1576 break;
1577 case Z_BOOT:
1578 case Z_FORCEBOOT:
1579 eventstream_write(Z_EVT_ZONE_BOOTING);
1580 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
1581 == 0) {
1582 rval = zone_bootup(zlogp, zargp->bootbuf,
1583 zstate);
1584 }
1585 audit_put_record(zlogp, uc, rval, "boot");
1586 zcons_statechanged();
1587 if (rval != 0) {
1588 bringup_failure_recovery = B_TRUE;
1589 (void) zone_halt(zlogp, B_FALSE, B_FALSE,
1590 zstate);
1591 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1592 }
1593 break;
1594 case Z_SHUTDOWN:
1595 case Z_HALT:
1596 if (kernelcall) /* Invalid; can't happen */
1597 abort();
1598 /*
1599 * We could have two clients racing to halt this
1600 * zone; the second client loses, but his request
1601 * doesn't fail, since the zone is now in the desired
1602 * state.
1603 */
1604 zerror(zlogp, B_FALSE, "zone is already halted");
1605 rval = 0;
1606 break;
1607 case Z_REBOOT:
1608 if (kernelcall) /* Invalid; can't happen */
1609 abort();
1610 zerror(zlogp, B_FALSE, "%s operation is invalid "
1611 "for zones in state '%s'", z_cmd_name(cmd),
1612 zone_state_str(zstate));
1613 rval = -1;
1614 break;
1615 case Z_NOTE_UNINSTALLING:
1616 if (kernelcall) /* Invalid; can't happen */
1617 abort();
1618 /*
1619 * Tell the console to print out a message about this.
1620 * Once it does, we will be in_death_throes.
1621 */
1622 eventstream_write(Z_EVT_ZONE_UNINSTALLING);
1623 break;
1624 case Z_MOUNT:
1625 case Z_FORCEMOUNT:
1626 if (kernelcall) /* Invalid; can't happen */
1627 abort();
1628 if (!zone_isnative && !zone_iscluster &&
1629 !zone_islabeled) {
1630 /*
1631 * -U mounts the zone without lofs mounting
1632 * zone file systems back into the scratch
1633 * zone. This is required when mounting
1634 * non-native branded zones.
1635 */
1636 (void) strlcpy(zargp->bootbuf, "-U",
1637 BOOTARGS_MAX);
1638 }
1639
1640 rval = zone_ready(zlogp,
1641 strcmp(zargp->bootbuf, "-U") == 0 ?
1642 Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate);
1643 if (rval != 0)
1644 break;
1645
1646 eventstream_write(Z_EVT_ZONE_READIED);
1647
1648 /*
1649 * Get a handle to the default brand info.
1650 * We must always use the default brand file system
1651 * list when mounting the zone.
1652 */
1653 if ((bh = brand_open(default_brand)) == NULL) {
1654 rval = -1;
1655 break;
1656 }
1657
1658 /*
1659 * Get the list of filesystems to mount from
1660 * the brand configuration. These mounts are done
1661 * via a thread that will enter the zone, so they
1662 * are done from within the context of the zone.
1663 */
1664 cb.zlogp = zlogp;
1665 cb.zoneid = zone_id;
1666 cb.mount_cmd = B_TRUE;
1667 rval = brand_platform_iter_mounts(bh,
1668 mount_early_fs, &cb);
1669
1670 brand_close(bh);
1671
1672 /*
1673 * Ordinarily, /dev/fd would be mounted inside the zone
1674 * by svc:/system/filesystem/usr:default, but since
1675 * we're not booting the zone, we need to do this
1676 * manually.
1677 */
1678 if (rval == 0)
1679 rval = mount_early_fs(&cb,
1680 "fd", "/dev/fd", "fd", NULL);
1681 break;
1682 case Z_UNMOUNT:
1683 if (kernelcall) /* Invalid; can't happen */
1684 abort();
1685 zerror(zlogp, B_FALSE, "zone is already unmounted");
1686 rval = 0;
1687 break;
1688 }
1689 break;
1690
1691 case ZONE_STATE_READY:
1692 switch (cmd) {
1693 case Z_READY:
1694 /*
1695 * We could have two clients racing to ready this
1696 * zone; the second client loses, but his request
1697 * doesn't fail, since the zone is now in the desired
1698 * state.
1699 */
1700 zerror(zlogp, B_FALSE, "zone is already ready");
1701 rval = 0;
1702 break;
1703 case Z_BOOT:
1704 (void) strlcpy(boot_args, zargp->bootbuf,
1705 sizeof (boot_args));
1706 eventstream_write(Z_EVT_ZONE_BOOTING);
1707 rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
1708 audit_put_record(zlogp, uc, rval, "boot");
1709 zcons_statechanged();
1710 if (rval != 0) {
1711 bringup_failure_recovery = B_TRUE;
1712 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
1713 zstate);
1714 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1715 }
1716 boot_args[0] = '\0';
1717 break;
1718 case Z_HALT:
1719 if (kernelcall) /* Invalid; can't happen */
1720 abort();
1721 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
1722 != 0)
1723 break;
1724 zcons_statechanged();
1725 eventstream_write(Z_EVT_ZONE_HALTED);
1726 break;
1727 case Z_SHUTDOWN:
1728 case Z_REBOOT:
1729 case Z_NOTE_UNINSTALLING:
1730 case Z_MOUNT:
1731 case Z_UNMOUNT:
1732 if (kernelcall) /* Invalid; can't happen */
1733 abort();
1734 zerror(zlogp, B_FALSE, "%s operation is invalid "
1735 "for zones in state '%s'", z_cmd_name(cmd),
1736 zone_state_str(zstate));
1737 rval = -1;
1738 break;
1739 }
1740 break;
1741
1742 case ZONE_STATE_MOUNTED:
1743 switch (cmd) {
1744 case Z_UNMOUNT:
1745 if (kernelcall) /* Invalid; can't happen */
1746 abort();
1747 rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate);
1748 if (rval == 0) {
1749 eventstream_write(Z_EVT_ZONE_HALTED);
1750 (void) sema_post(&scratch_sem);
1751 }
1752 break;
1753 default:
1754 if (kernelcall) /* Invalid; can't happen */
1755 abort();
1756 zerror(zlogp, B_FALSE, "%s operation is invalid "
1757 "for zones in state '%s'", z_cmd_name(cmd),
1758 zone_state_str(zstate));
1759 rval = -1;
1760 break;
1761 }
1762 break;
1763
1764 case ZONE_STATE_RUNNING:
1765 case ZONE_STATE_SHUTTING_DOWN:
1766 case ZONE_STATE_DOWN:
1767 switch (cmd) {
1768 case Z_READY:
1769 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
1770 != 0)
1771 break;
1772 zcons_statechanged();
1773 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) == 0)
1774 eventstream_write(Z_EVT_ZONE_READIED);
1775 else
1776 eventstream_write(Z_EVT_ZONE_HALTED);
1777 break;
1778 case Z_BOOT:
1779 /*
1780 * We could have two clients racing to boot this
1781 * zone; the second client loses, but his request
1782 * doesn't fail, since the zone is now in the desired
1783 * state.
1784 */
1785 zerror(zlogp, B_FALSE, "zone is already booted");
1786 rval = 0;
1787 break;
1788 case Z_HALT:
1789 if (kernelcall) {
1790 log_init_exit(init_status);
1791 } else {
1792 log_init_exit(-1);
1793 }
1794 if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
1795 != 0)
1796 break;
1797 eventstream_write(Z_EVT_ZONE_HALTED);
1798 zcons_statechanged();
1799 break;
1800 case Z_REBOOT:
1801 (void) strlcpy(boot_args, zargp->bootbuf,
1802 sizeof (boot_args));
1803 eventstream_write(Z_EVT_ZONE_REBOOTING);
1804 if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
1805 != 0) {
1806 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1807 boot_args[0] = '\0';
1808 break;
1809 }
1810 zcons_statechanged();
1811 if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) !=
1812 0) {
1813 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1814 boot_args[0] = '\0';
1815 break;
1816 }
1817 rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
1818 audit_put_record(zlogp, uc, rval, "reboot");
1819 if (rval != 0) {
1820 (void) zone_halt(zlogp, B_FALSE, B_TRUE,
1821 zstate);
1822 eventstream_write(Z_EVT_ZONE_BOOTFAILED);
1823 }
1824 boot_args[0] = '\0';
1825 break;
1826 case Z_SHUTDOWN:
1827 if ((rval = zone_graceful_shutdown(zlogp)) == 0) {
1828 wait_shut = B_TRUE;
1829 }
1830 break;
1831 case Z_NOTE_UNINSTALLING:
1832 case Z_MOUNT:
1833 case Z_UNMOUNT:
1834 zerror(zlogp, B_FALSE, "%s operation is invalid "
1835 "for zones in state '%s'", z_cmd_name(cmd),
1836 zone_state_str(zstate));
1837 rval = -1;
1838 break;
1839 }
1840 break;
1841 default:
1842 abort();
1843 }
1844
1845 /*
1846 * Because the state of the zone may have changed, we make sure
1847 * to wake the console poller, which is in charge of initiating
1848 * the shutdown procedure as necessary.
1849 */
1850 eventstream_write(Z_EVT_NULL);
1851
1852 out:
1853 (void) mutex_unlock(&lock);
1854
1855 /* Wait for the Z_SHUTDOWN commands to complete */
1856 if (wait_shut)
1857 rval = zone_wait_shutdown(zlogp);
1858
1859 if (kernelcall) {
1860 rvalp = NULL;
1861 rlen = 0;
1862 } else {
1863 rvalp->rval = rval;
1864 }
1865 if (uc != NULL)
1866 ucred_free(uc);
1867 (void) door_return((char *)rvalp, rlen, NULL, 0);
1868 thr_exit(NULL);
1869 }
1870
1871 static int
1872 setup_door(zlog_t *zlogp)
1873 {
1874 if ((zone_door = door_create(server, NULL,
1875 DOOR_UNREF | DOOR_REFUSE_DESC | DOOR_NO_CANCEL)) < 0) {
1876 zerror(zlogp, B_TRUE, "%s failed", "door_create");
1877 return (-1);
1878 }
1879 (void) fdetach(zone_door_path);
1880
1881 if (fattach(zone_door, zone_door_path) != 0) {
1882 zerror(zlogp, B_TRUE, "fattach to %s failed", zone_door_path);
1883 (void) door_revoke(zone_door);
1884 (void) fdetach(zone_door_path);
1885 zone_door = -1;
1886 return (-1);
1887 }
1888 return (0);
1889 }
1890
1891 /*
1892 * zoneadm(1m) will start zoneadmd if it thinks it isn't running; this
1893 * is where zoneadmd itself will check to see that another instance of
1894 * zoneadmd isn't already controlling this zone.
1895 *
1896 * The idea here is that we want to open the path to which we will
1897 * attach our door, lock it, and then make sure that no-one has beat us
1898 * to fattach(3c)ing onto it.
1899 *
1900 * fattach(3c) is really a mount, so there are actually two possible
1901 * vnodes we could be dealing with. Our strategy is as follows:
1902 *
1903 * - If the file we opened is a regular file (common case):
1904 * There is no fattach(3c)ed door, so we have a chance of becoming
1905 * the managing zoneadmd. We attempt to lock the file: if it is
1906 * already locked, that means someone else raced us here, so we
1907 * lose and give up. zoneadm(1m) will try to contact the zoneadmd
1908 * that beat us to it.
1909 *
1910 * - If the file we opened is a namefs file:
1911 * This means there is already an established door fattach(3c)'ed
1912 * to the rendezvous path. We've lost the race, so we give up.
1913 * Note that in this case we also try to grab the file lock, and
1914 * will succeed in acquiring it since the vnode locked by the
1915 * "winning" zoneadmd was a regular one, and the one we locked was
1916 * the fattach(3c)'ed door node. At any rate, no harm is done, and
1917 * we just return to zoneadm(1m) which knows to retry.
1918 */
1919 static int
1920 make_daemon_exclusive(zlog_t *zlogp)
1921 {
1922 int doorfd = -1;
1923 int err, ret = -1;
1924 struct stat st;
1925 struct flock flock;
1926 zone_state_t zstate;
1927
1928 top:
1929 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
1930 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
1931 zonecfg_strerror(err));
1932 goto out;
1933 }
1934 if ((doorfd = open(zone_door_path, O_CREAT|O_RDWR,
1935 S_IREAD|S_IWRITE)) < 0) {
1936 zerror(zlogp, B_TRUE, "failed to open %s", zone_door_path);
1937 goto out;
1938 }
1939 if (fstat(doorfd, &st) < 0) {
1940 zerror(zlogp, B_TRUE, "failed to stat %s", zone_door_path);
1941 goto out;
1942 }
1943 /*
1944 * Lock the file to synchronize with other zoneadmd
1945 */
1946 flock.l_type = F_WRLCK;
1947 flock.l_whence = SEEK_SET;
1948 flock.l_start = (off_t)0;
1949 flock.l_len = (off_t)0;
1950 if (fcntl(doorfd, F_SETLK, &flock) < 0) {
1951 /*
1952 * Someone else raced us here and grabbed the lock file
1953 * first. A warning here is inappropriate since nothing
1954 * went wrong.
1955 */
1956 goto out;
1957 }
1958
1959 if (strcmp(st.st_fstype, "namefs") == 0) {
1960 struct door_info info;
1961
1962 /*
1963 * There is already something fattach()'ed to this file.
1964 * Lets see what the door is up to.
1965 */
1966 if (door_info(doorfd, &info) == 0 && info.di_target != -1) {
1967 /*
1968 * Another zoneadmd process seems to be in
1969 * control of the situation and we don't need to
1970 * be here. A warning here is inappropriate
1971 * since nothing went wrong.
1972 *
1973 * If the door has been revoked, the zoneadmd
1974 * process currently managing the zone is going
1975 * away. We'll return control to zoneadm(1m)
1976 * which will try again (by which time zoneadmd
1977 * will hopefully have exited).
1978 */
1979 goto out;
1980 }
1981
1982 /*
1983 * If we got this far, there's a fattach(3c)'ed door
1984 * that belongs to a process that has exited, which can
1985 * happen if the previous zoneadmd died unexpectedly.
1986 *
1987 * Let user know that something is amiss, but that we can
1988 * recover; if the zone is in the installed state, then don't
1989 * message, since having a running zoneadmd isn't really
1990 * expected/needed. We want to keep occurences of this message
1991 * limited to times when zoneadmd is picking back up from a
1992 * zoneadmd that died while the zone was in some non-trivial
1993 * state.
1994 */
1995 if (zstate > ZONE_STATE_INSTALLED) {
1996 static zoneid_t zid;
1997
1998 zerror(zlogp, B_FALSE,
1999 "zone '%s': WARNING: zone is in state '%s', but "
2000 "zoneadmd does not appear to be available; "
2001 "restarted zoneadmd to recover.",
2002 zone_name, zone_state_str(zstate));
2003
2004 /*
2005 * Startup a thread to perform the zfd logging/tty svc
2006 * and a thread to perform memory capping for the
2007 * zone. zlogp won't be valid for much longer so use
2008 * logsys.
2009 */
2010 if ((zid = getzoneidbyname(zone_name)) != -1) {
2011 create_log_thread(&logsys, zid);
2012 create_mcap_thread(&logsys, zid);
2013 }
2014
2015 /* recover the global configuration snapshot */
2016 if (snap_hndl == NULL) {
2017 if ((snap_hndl = zonecfg_init_handle())
2018 == NULL ||
2019 zonecfg_create_snapshot(zone_name)
2020 != Z_OK ||
2021 zonecfg_get_snapshot_handle(zone_name,
2022 snap_hndl) != Z_OK) {
2023 zerror(zlogp, B_FALSE, "recovering "
2024 "zone configuration handle");
2025 goto out;
2026 }
2027 }
2028 }
2029
2030 (void) fdetach(zone_door_path);
2031 (void) close(doorfd);
2032 goto top;
2033 }
2034 ret = 0;
2035 out:
2036 (void) close(doorfd);
2037 return (ret);
2038 }
2039
2040 /*
2041 * Setup the brand's pre and post state change callbacks, as well as the
2042 * query callback, if any of these exist.
2043 */
2044 static int
2045 brand_callback_init(brand_handle_t bh, char *zone_name)
2046 {
2047 (void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
2048 sizeof (pre_statechg_hook));
2049
2050 if (brand_get_prestatechange(bh, zone_name, zonepath,
2051 pre_statechg_hook + EXEC_LEN,
2052 sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
2053 return (-1);
2054
2055 if (strlen(pre_statechg_hook) <= EXEC_LEN)
2056 pre_statechg_hook[0] = '\0';
2057
2058 (void) strlcpy(post_statechg_hook, EXEC_PREFIX,
2059 sizeof (post_statechg_hook));
2060
2061 if (brand_get_poststatechange(bh, zone_name, zonepath,
2062 post_statechg_hook + EXEC_LEN,
2063 sizeof (post_statechg_hook) - EXEC_LEN) != 0)
2064 return (-1);
2065
2066 if (strlen(post_statechg_hook) <= EXEC_LEN)
2067 post_statechg_hook[0] = '\0';
2068
2069 (void) strlcpy(query_hook, EXEC_PREFIX,
2070 sizeof (query_hook));
2071
2072 if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
2073 sizeof (query_hook) - EXEC_LEN) != 0)
2074 return (-1);
2075
2076 if (strlen(query_hook) <= EXEC_LEN)
2077 query_hook[0] = '\0';
2078
2079 return (0);
2080 }
2081
2082 int
2083 main(int argc, char *argv[])
2084 {
2085 int opt;
2086 zoneid_t zid;
2087 priv_set_t *privset;
2088 zone_state_t zstate;
2089 char parents_locale[MAXPATHLEN];
2090 brand_handle_t bh;
2091 int err;
2092
2093 pid_t pid;
2094 sigset_t blockset;
2095 sigset_t block_cld;
2096
2097 struct {
2098 sema_t sem;
2099 int status;
2100 zlog_t log;
2101 } *shstate;
2102 size_t shstatelen = getpagesize();
2103
2104 zlog_t errlog;
2105 zlog_t *zlogp;
2106
2107 int ctfd;
2108
2109 progname = get_execbasename(argv[0]);
2110
2111 /*
2112 * Make sure stderr is unbuffered
2113 */
2114 (void) setbuffer(stderr, NULL, 0);
2115
2116 /*
2117 * Get out of the way of mounted filesystems, since we will daemonize
2118 * soon.
2119 */
2120 (void) chdir("/");
2121
2122 /*
2123 * Use the default system umask per PSARC 1998/110 rather than
2124 * anything that may have been set by the caller.
2125 */
2126 (void) umask(CMASK);
2127
2128 /*
2129 * Initially we want to use our parent's locale.
2130 */
2131 (void) setlocale(LC_ALL, "");
2132 (void) textdomain(TEXT_DOMAIN);
2133 (void) strlcpy(parents_locale, setlocale(LC_MESSAGES, NULL),
2134 sizeof (parents_locale));
2135
2136 /*
2137 * This zlog_t is used for writing to stderr
2138 */
2139 errlog.logfile = stderr;
2140 errlog.buflen = errlog.loglen = 0;
2141 errlog.buf = errlog.log = NULL;
2142 errlog.locale = parents_locale;
2143
2144 /*
2145 * We start off writing to stderr until we're ready to daemonize.
2146 */
2147 zlogp = &errlog;
2148
2149 /*
2150 * Process options.
2151 */
2152 while ((opt = getopt(argc, argv, "R:z:")) != EOF) {
2153 switch (opt) {
2154 case 'R':
2155 zonecfg_set_root(optarg);
2156 break;
2157 case 'z':
2158 zone_name = optarg;
2159 break;
2160 default:
2161 usage();
2162 }
2163 }
2164
2165 if (zone_name == NULL)
2166 usage();
2167
2168 /*
2169 * Because usage() prints directly to stderr, it has gettext()
2170 * wrapping, which depends on the locale. But since zerror() calls
2171 * localize() which tweaks the locale, it is not safe to call zerror()
2172 * until after the last call to usage(). Fortunately, the last call
2173 * to usage() is just above and the first call to zerror() is just
2174 * below. Don't mess this up.
2175 */
2176 if (strcmp(zone_name, GLOBAL_ZONENAME) == 0) {
2177 zerror(zlogp, B_FALSE, "cannot manage the %s zone",
2178 GLOBAL_ZONENAME);
2179 return (1);
2180 }
2181
2182 if (zone_get_id(zone_name, &zid) != 0) {
2183 zerror(zlogp, B_FALSE, "could not manage %s: %s", zone_name,
2184 zonecfg_strerror(Z_NO_ZONE));
2185 return (1);
2186 }
2187
2188 if ((err = zone_get_state(zone_name, &zstate)) != Z_OK) {
2189 zerror(zlogp, B_FALSE, "failed to get zone state: %s",
2190 zonecfg_strerror(err));
2191 return (1);
2192 }
2193 if (zstate < ZONE_STATE_INCOMPLETE) {
2194 zerror(zlogp, B_FALSE,
2195 "cannot manage a zone which is in state '%s'",
2196 zone_state_str(zstate));
2197 return (1);
2198 }
2199
2200 if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
2201 zerror(zlogp, B_FALSE, "unable to determine zone path");
2202 return (-1);
2203 }
2204
2205 if (zonecfg_default_brand(default_brand,
2206 sizeof (default_brand)) != Z_OK) {
2207 zerror(zlogp, B_FALSE, "unable to determine default brand");
2208 return (1);
2209 }
2210
2211 /* Get a handle to the brand info for this zone */
2212 if (zone_get_brand(zone_name, brand_name, sizeof (brand_name))
2213 != Z_OK) {
2214 zerror(zlogp, B_FALSE, "unable to determine zone brand");
2215 return (1);
2216 }
2217 zone_isnative = (strcmp(brand_name, NATIVE_BRAND_NAME) == 0);
2218 zone_islabeled = (strcmp(brand_name, LABELED_BRAND_NAME) == 0);
2219
2220 /*
2221 * In the alternate root environment, the only supported
2222 * operations are mount and unmount. In this case, just treat
2223 * the zone as native if it is cluster. Cluster zones can be
2224 * native for the purpose of LU or upgrade, and the cluster
2225 * brand may not exist in the miniroot (such as in net install
2226 * upgrade).
2227 */
2228 if (strcmp(brand_name, CLUSTER_BRAND_NAME) == 0) {
2229 zone_iscluster = B_TRUE;
2230 if (zonecfg_in_alt_root()) {
2231 (void) strlcpy(brand_name, default_brand,
2232 sizeof (brand_name));
2233 }
2234 } else {
2235 zone_iscluster = B_FALSE;
2236 }
2237
2238 if ((bh = brand_open(brand_name)) == NULL) {
2239 zerror(zlogp, B_FALSE, "unable to open zone brand");
2240 return (1);
2241 }
2242
2243 /* Get state change brand hooks. */
2244 if (brand_callback_init(bh, zone_name) == -1) {
2245 zerror(zlogp, B_TRUE,
2246 "failed to initialize brand state change hooks");
2247 brand_close(bh);
2248 return (1);
2249 }
2250
2251 brand_close(bh);
2252
2253 /*
2254 * Check that we have all privileges. It would be nice to pare
2255 * this down, but this is at least a first cut.
2256 */
2257 if ((privset = priv_allocset()) == NULL) {
2258 zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
2259 return (1);
2260 }
2261
2262 if (getppriv(PRIV_EFFECTIVE, privset) != 0) {
2263 zerror(zlogp, B_TRUE, "%s failed", "getppriv");
2264 priv_freeset(privset);
2265 return (1);
2266 }
2267
2268 if (priv_isfullset(privset) == B_FALSE) {
2269 zerror(zlogp, B_FALSE, "You lack sufficient privilege to "
2270 "run this command (all privs required)");
2271 priv_freeset(privset);
2272 return (1);
2273 }
2274 priv_freeset(privset);
2275
2276 if (mkzonedir(zlogp) != 0)
2277 return (1);
2278
2279 /*
2280 * Pre-fork: setup shared state
2281 */
2282 if ((shstate = (void *)mmap(NULL, shstatelen,
2283 PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANON, -1, (off_t)0)) ==
2284 MAP_FAILED) {
2285 zerror(zlogp, B_TRUE, "%s failed", "mmap");
2286 return (1);
2287 }
2288 if (sema_init(&shstate->sem, 0, USYNC_PROCESS, NULL) != 0) {
2289 zerror(zlogp, B_TRUE, "%s failed", "sema_init()");
2290 (void) munmap((char *)shstate, shstatelen);
2291 return (1);
2292 }
2293 shstate->log.logfile = NULL;
2294 shstate->log.buflen = shstatelen - sizeof (*shstate);
2295 shstate->log.loglen = shstate->log.buflen;
2296 shstate->log.buf = (char *)shstate + sizeof (*shstate);
2297 shstate->log.log = shstate->log.buf;
2298 shstate->log.locale = parents_locale;
2299 shstate->status = -1;
2300
2301 /*
2302 * We need a SIGCHLD handler so the sema_wait() below will wake
2303 * up if the child dies without doing a sema_post().
2304 */
2305 (void) sigset(SIGCHLD, sigchld);
2306 /*
2307 * We must mask SIGCHLD until after we've coped with the fork
2308 * sufficiently to deal with it; otherwise we can race and
2309 * receive the signal before pid has been initialized
2310 * (yes, this really happens).
2311 */
2312 (void) sigemptyset(&block_cld);
2313 (void) sigaddset(&block_cld, SIGCHLD);
2314 (void) sigprocmask(SIG_BLOCK, &block_cld, NULL);
2315
2316 /*
2317 * The parent only needs stderr after the fork, so close other fd's
2318 * that we inherited from zoneadm so that the parent doesn't have those
2319 * open while waiting. The child will close the rest after the fork.
2320 */
2321 closefrom(3);
2322
2323 if ((ctfd = init_template()) == -1) {
2324 zerror(zlogp, B_TRUE, "failed to create contract");
2325 return (1);
2326 }
2327
2328 /*
2329 * Do not let another thread localize a message while we are forking.
2330 */
2331 (void) mutex_lock(&msglock);
2332 pid = fork();
2333 (void) mutex_unlock(&msglock);
2334
2335 /*
2336 * In all cases (parent, child, and in the event of an error) we
2337 * don't want to cause creation of contracts on subsequent fork()s.
2338 */
2339 (void) ct_tmpl_clear(ctfd);
2340 (void) close(ctfd);
2341
2342 if (pid == -1) {
2343 zerror(zlogp, B_TRUE, "could not fork");
2344 return (1);
2345
2346 } else if (pid > 0) { /* parent */
2347 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2348 /*
2349 * This marks a window of vulnerability in which we receive
2350 * the SIGCLD before falling into sema_wait (normally we would
2351 * get woken up from sema_wait with EINTR upon receipt of
2352 * SIGCLD). So we may need to use some other scheme like
2353 * sema_posting in the sigcld handler.
2354 * blech
2355 */
2356 (void) sema_wait(&shstate->sem);
2357 (void) sema_destroy(&shstate->sem);
2358 if (shstate->status != 0)
2359 (void) waitpid(pid, NULL, WNOHANG);
2360 /*
2361 * It's ok if we die with SIGPIPE. It's not like we could have
2362 * done anything about it.
2363 */
2364 (void) fprintf(stderr, "%s", shstate->log.buf);
2365 _exit(shstate->status == 0 ? 0 : 1);
2366 }
2367
2368 /*
2369 * The child charges on.
2370 */
2371 (void) sigset(SIGCHLD, SIG_DFL);
2372 (void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
2373
2374 /*
2375 * SIGPIPE can be delivered if we write to a socket for which the
2376 * peer endpoint is gone. That can lead to too-early termination
2377 * of zoneadmd, and that's not good eats.
2378 */
2379 (void) sigset(SIGPIPE, SIG_IGN);
2380 /*
2381 * Stop using stderr
2382 */
2383 zlogp = &shstate->log;
2384
2385 /*
2386 * We don't need stdout/stderr from now on.
2387 */
2388 closefrom(0);
2389
2390 /*
2391 * Initialize the syslog zlog_t. This needs to be done after
2392 * the call to closefrom().
2393 */
2394 logsys.buf = logsys.log = NULL;
2395 logsys.buflen = logsys.loglen = 0;
2396 logsys.logfile = NULL;
2397 logsys.locale = DEFAULT_LOCALE;
2398
2399 openlog("zoneadmd", LOG_PID, LOG_DAEMON);
2400
2401 /*
2402 * The eventstream is used to publish state changes in the zone
2403 * from the door threads to the console I/O poller.
2404 */
2405 if (eventstream_init() == -1) {
2406 zerror(zlogp, B_TRUE, "unable to create eventstream");
2407 goto child_out;
2408 }
2409
2410 (void) snprintf(zone_door_path, sizeof (zone_door_path),
2411 "%s" ZONE_DOOR_PATH, zonecfg_get_root(), zone_name);
2412
2413 /*
2414 * See if another zoneadmd is running for this zone. If not, then we
2415 * can now modify system state.
2416 */
2417 if (make_daemon_exclusive(zlogp) == -1)
2418 goto child_out;
2419
2420
2421 /*
2422 * Create/join a new session; we need to be careful of what we do with
2423 * the console from now on so we don't end up being the session leader
2424 * for the terminal we're going to be handing out.
2425 */
2426 (void) setsid();
2427
2428 /*
2429 * This thread shouldn't be receiving any signals; in particular,
2430 * SIGCHLD should be received by the thread doing the fork().
2431 */
2432 (void) sigfillset(&blockset);
2433 (void) thr_sigsetmask(SIG_BLOCK, &blockset, NULL);
2434
2435 /*
2436 * Setup the console device and get ready to serve the console;
2437 * once this has completed, we're ready to let console clients
2438 * make an attempt to connect (they will block until
2439 * serve_console_sock() below gets called, and any pending
2440 * connection is accept()ed).
2441 */
2442 if (!zonecfg_in_alt_root() && init_console(zlogp) < 0)
2443 goto child_out;
2444
2445 /*
2446 * Take the lock now, so that when the door server gets going, we
2447 * are guaranteed that it won't take a request until we are sure
2448 * that everything is completely set up. See the child_out: label
2449 * below to see why this matters.
2450 */
2451 (void) mutex_lock(&lock);
2452
2453 /* Init semaphore for scratch zones. */
2454 if (sema_init(&scratch_sem, 0, USYNC_THREAD, NULL) == -1) {
2455 zerror(zlogp, B_TRUE,
2456 "failed to initialize semaphore for scratch zone");
2457 goto child_out;
2458 }
2459
2460 /* open the dladm handle */
2461 if (dladm_open(&dld_handle) != DLADM_STATUS_OK) {
2462 zerror(zlogp, B_FALSE, "failed to open dladm handle");
2463 goto child_out;
2464 }
2465
2466 /*
2467 * Note: door setup must occur *after* the console is setup.
2468 * This is so that as zlogin tests the door to see if zoneadmd
2469 * is ready yet, we know that the console will get serviced
2470 * once door_info() indicates that the door is "up".
2471 */
2472 if (setup_door(zlogp) == -1)
2473 goto child_out;
2474
2475 /*
2476 * Things seem OK so far; tell the parent process that we're done
2477 * with setup tasks. This will cause the parent to exit, signalling
2478 * to zoneadm, zlogin, or whatever forked it that we are ready to
2479 * service requests.
2480 */
2481 shstate->status = 0;
2482 (void) sema_post(&shstate->sem);
2483 (void) munmap((char *)shstate, shstatelen);
2484 shstate = NULL;
2485
2486 (void) mutex_unlock(&lock);
2487
2488 /*
2489 * zlogp is now invalid, so reset it to the syslog logger.
2490 */
2491 zlogp = &logsys;
2492
2493 /*
2494 * Now that we are free of any parents, switch to the default locale.
2495 */
2496 (void) setlocale(LC_ALL, DEFAULT_LOCALE);
2497
2498 /*
2499 * At this point the setup portion of main() is basically done, so
2500 * we reuse this thread to manage the zone console. When
2501 * serve_console() has returned, we are past the point of no return
2502 * in the life of this zoneadmd.
2503 */
2504 if (zonecfg_in_alt_root()) {
2505 /*
2506 * This is just awful, but mounted scratch zones don't (and
2507 * can't) have consoles. We just wait for unmount instead.
2508 */
2509 while (sema_wait(&scratch_sem) == EINTR)
2510 ;
2511 } else {
2512 serve_console(zlogp);
2513 assert(in_death_throes);
2514 }
2515
2516 /*
2517 * This is the next-to-last part of the exit interlock. Upon calling
2518 * fdetach(), the door will go unreferenced; once any
2519 * outstanding requests (like the door thread doing Z_HALT) are
2520 * done, the door will get an UNREF notification; when it handles
2521 * the UNREF, the door server will cause the exit. It's possible
2522 * that fdetach() can fail because the file is in use, in which
2523 * case we'll retry the operation.
2524 */
2525 assert(!MUTEX_HELD(&lock));
2526 for (;;) {
2527 if ((fdetach(zone_door_path) == 0) || (errno != EBUSY))
2528 break;
2529 yield();
2530 }
2531
2532 for (;;)
2533 (void) pause();
2534
2535 child_out:
2536 assert(pid == 0);
2537 if (shstate != NULL) {
2538 shstate->status = -1;
2539 (void) sema_post(&shstate->sem);
2540 (void) munmap((char *)shstate, shstatelen);
2541 }
2542
2543 /*
2544 * This might trigger an unref notification, but if so,
2545 * we are still holding the lock, so our call to exit will
2546 * ultimately win the race and will publish the right exit
2547 * code.
2548 */
2549 if (zone_door != -1) {
2550 assert(MUTEX_HELD(&lock));
2551 (void) door_revoke(zone_door);
2552 (void) fdetach(zone_door_path);
2553 }
2554
2555 if (dld_handle != NULL)
2556 dladm_close(dld_handle);
2557
2558 return (1); /* return from main() forcibly exits an MT process */
2559 }